apache-airflow-providers-openlineage 1.8.0rc1__py3-none-any.whl → 1.9.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of apache-airflow-providers-openlineage might be problematic. Click here for more details.
- airflow/providers/openlineage/LICENSE +4 -4
- airflow/providers/openlineage/__init__.py +1 -1
- airflow/providers/openlineage/conf.py +16 -1
- airflow/providers/openlineage/facets/AirflowJobFacet.json +40 -0
- airflow/providers/openlineage/facets/AirflowRunFacet.json +261 -0
- airflow/providers/openlineage/facets/AirflowStateRunFacet.json +34 -0
- airflow/providers/openlineage/facets/__init__.py +16 -0
- airflow/providers/openlineage/get_provider_info.py +11 -3
- airflow/providers/openlineage/plugins/adapter.py +61 -14
- airflow/providers/openlineage/plugins/facets.py +44 -3
- airflow/providers/openlineage/plugins/listener.py +128 -33
- airflow/providers/openlineage/plugins/macros.py +1 -1
- airflow/providers/openlineage/sqlparser.py +12 -4
- airflow/providers/openlineage/utils/sql.py +7 -1
- airflow/providers/openlineage/utils/utils.py +179 -21
- {apache_airflow_providers_openlineage-1.8.0rc1.dist-info → apache_airflow_providers_openlineage-1.9.0rc1.dist-info}/METADATA +10 -10
- apache_airflow_providers_openlineage-1.9.0rc1.dist-info/RECORD +28 -0
- apache_airflow_providers_openlineage-1.8.0rc1.dist-info/RECORD +0 -24
- {apache_airflow_providers_openlineage-1.8.0rc1.dist-info → apache_airflow_providers_openlineage-1.9.0rc1.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_openlineage-1.8.0rc1.dist-info → apache_airflow_providers_openlineage-1.9.0rc1.dist-info}/entry_points.txt +0 -0
|
@@ -215,7 +215,7 @@ Third party Apache 2.0 licenses
|
|
|
215
215
|
|
|
216
216
|
The following components are provided under the Apache 2.0 License.
|
|
217
217
|
See project link for details. The text of each license is also included
|
|
218
|
-
at licenses/LICENSE-[project].txt.
|
|
218
|
+
at 3rd-party-licenses/LICENSE-[project].txt.
|
|
219
219
|
|
|
220
220
|
(ALv2 License) hue v4.3.0 (https://github.com/cloudera/hue/)
|
|
221
221
|
(ALv2 License) jqclock v2.3.0 (https://github.com/JohnRDOrazio/jQuery-Clock-Plugin)
|
|
@@ -227,7 +227,7 @@ MIT licenses
|
|
|
227
227
|
========================================================================
|
|
228
228
|
|
|
229
229
|
The following components are provided under the MIT License. See project link for details.
|
|
230
|
-
The text of each license is also included at licenses/LICENSE-[project].txt.
|
|
230
|
+
The text of each license is also included at 3rd-party-licenses/LICENSE-[project].txt.
|
|
231
231
|
|
|
232
232
|
(MIT License) jquery v3.5.1 (https://jquery.org/license/)
|
|
233
233
|
(MIT License) dagre-d3 v0.6.4 (https://github.com/cpettitt/dagre-d3)
|
|
@@ -243,11 +243,11 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
|
|
|
243
243
|
BSD 3-Clause licenses
|
|
244
244
|
========================================================================
|
|
245
245
|
The following components are provided under the BSD 3-Clause license. See project links for details.
|
|
246
|
-
The text of each license is also included at licenses/LICENSE-[project].txt.
|
|
246
|
+
The text of each license is also included at 3rd-party-licenses/LICENSE-[project].txt.
|
|
247
247
|
|
|
248
248
|
(BSD 3 License) d3 v5.16.0 (https://d3js.org)
|
|
249
249
|
(BSD 3 License) d3-shape v2.1.0 (https://github.com/d3/d3-shape)
|
|
250
250
|
(BSD 3 License) cgroupspy 0.2.1 (https://github.com/cloudsigma/cgroupspy)
|
|
251
251
|
|
|
252
252
|
========================================================================
|
|
253
|
-
See licenses/LICENSES-ui.txt for packages used in `/airflow/www`
|
|
253
|
+
See 3rd-party-licenses/LICENSES-ui.txt for packages used in `/airflow/www`
|
|
@@ -29,7 +29,7 @@ from airflow import __version__ as airflow_version
|
|
|
29
29
|
|
|
30
30
|
__all__ = ["__version__"]
|
|
31
31
|
|
|
32
|
-
__version__ = "1.8.0"
|
|
32
|
+
__version__ = "1.9.0"
|
|
33
33
|
|
|
34
34
|
if packaging.version.parse(packaging.version.parse(airflow_version).base_version) < packaging.version.parse(
|
|
35
35
|
"2.7.0"
|
|
@@ -33,7 +33,15 @@ from __future__ import annotations
|
|
|
33
33
|
import os
|
|
34
34
|
from typing import Any
|
|
35
35
|
|
|
36
|
-
|
|
36
|
+
# Disable caching if we're inside tests - this makes config easier to mock.
|
|
37
|
+
if os.getenv("PYTEST_VERSION"):
|
|
38
|
+
|
|
39
|
+
def decorator(func):
|
|
40
|
+
return func
|
|
41
|
+
|
|
42
|
+
cache = decorator
|
|
43
|
+
else:
|
|
44
|
+
from airflow.compat.functools import cache
|
|
37
45
|
from airflow.configuration import conf
|
|
38
46
|
|
|
39
47
|
_CONFIG_SECTION = "openlineage"
|
|
@@ -130,3 +138,10 @@ def dag_state_change_process_pool_size() -> int:
|
|
|
130
138
|
"""[openlineage] dag_state_change_process_pool_size."""
|
|
131
139
|
option = conf.get(_CONFIG_SECTION, "dag_state_change_process_pool_size", fallback="")
|
|
132
140
|
return _safe_int_convert(str(option).strip(), default=1)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@cache
|
|
144
|
+
def execution_timeout() -> int:
|
|
145
|
+
"""[openlineage] execution_timeout."""
|
|
146
|
+
option = conf.get(_CONFIG_SECTION, "execution_timeout", fallback="")
|
|
147
|
+
return _safe_int_convert(str(option).strip(), default=10)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
+
"$defs": {
|
|
4
|
+
"AirflowJobFacet": {
|
|
5
|
+
"allOf": [
|
|
6
|
+
{
|
|
7
|
+
"$ref": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunFacet"
|
|
8
|
+
},
|
|
9
|
+
{
|
|
10
|
+
"type": "object",
|
|
11
|
+
"properties": {
|
|
12
|
+
"taskTree": {
|
|
13
|
+
"description": "The hierarchical structure of tasks in the DAG.",
|
|
14
|
+
"type": "object",
|
|
15
|
+
"additionalProperties": true
|
|
16
|
+
},
|
|
17
|
+
"taskGroups": {
|
|
18
|
+
"description": "Information about all task groups within the DAG.",
|
|
19
|
+
"type": "object",
|
|
20
|
+
"additionalProperties": true
|
|
21
|
+
},
|
|
22
|
+
"tasks": {
|
|
23
|
+
"description": "Details of all individual tasks within the DAG.",
|
|
24
|
+
"type": "object",
|
|
25
|
+
"additionalProperties": true
|
|
26
|
+
}
|
|
27
|
+
},
|
|
28
|
+
"required": ["taskTree", "taskGroups", "tasks"]
|
|
29
|
+
}
|
|
30
|
+
],
|
|
31
|
+
"type": "object"
|
|
32
|
+
}
|
|
33
|
+
},
|
|
34
|
+
"type": "object",
|
|
35
|
+
"properties": {
|
|
36
|
+
"airflow": {
|
|
37
|
+
"$ref": "#/$defs/AirflowJobFacet"
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
}
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
+
"$defs": {
|
|
4
|
+
"AirflowRunFacet": {
|
|
5
|
+
"allOf": [
|
|
6
|
+
{
|
|
7
|
+
"$ref": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunFacet"
|
|
8
|
+
},
|
|
9
|
+
{
|
|
10
|
+
"type": "object",
|
|
11
|
+
"properties": {
|
|
12
|
+
"dag": {
|
|
13
|
+
"$ref": "#/$defs/DAG"
|
|
14
|
+
},
|
|
15
|
+
"dagRun": {
|
|
16
|
+
"$ref": "#/$defs/DagRun"
|
|
17
|
+
},
|
|
18
|
+
"taskInstance": {
|
|
19
|
+
"$ref": "#/$defs/TaskInstance"
|
|
20
|
+
},
|
|
21
|
+
"task": {
|
|
22
|
+
"$ref": "#/$defs/Task"
|
|
23
|
+
},
|
|
24
|
+
"taskUuid": {
|
|
25
|
+
"type": "string"
|
|
26
|
+
}
|
|
27
|
+
},
|
|
28
|
+
"required": [
|
|
29
|
+
"dag",
|
|
30
|
+
"dagRun",
|
|
31
|
+
"taskInstance",
|
|
32
|
+
"task",
|
|
33
|
+
"taskUuid"
|
|
34
|
+
]
|
|
35
|
+
}
|
|
36
|
+
]
|
|
37
|
+
},
|
|
38
|
+
"Task": {
|
|
39
|
+
"type": "object",
|
|
40
|
+
"properties": {
|
|
41
|
+
"depends_on_past": {
|
|
42
|
+
"type": "boolean"
|
|
43
|
+
},
|
|
44
|
+
"downstream_task_ids": {
|
|
45
|
+
"type": "string"
|
|
46
|
+
},
|
|
47
|
+
"execution_timeout": {
|
|
48
|
+
"type": "string"
|
|
49
|
+
},
|
|
50
|
+
"executor_config": {
|
|
51
|
+
"type": "object",
|
|
52
|
+
"additionalProperties": true
|
|
53
|
+
},
|
|
54
|
+
"ignore_first_depends_on_past": {
|
|
55
|
+
"type": "boolean"
|
|
56
|
+
},
|
|
57
|
+
"is_setup": {
|
|
58
|
+
"type": "boolean"
|
|
59
|
+
},
|
|
60
|
+
"is_teardown": {
|
|
61
|
+
"type": "boolean"
|
|
62
|
+
},
|
|
63
|
+
"mapped": {
|
|
64
|
+
"type": "boolean"
|
|
65
|
+
},
|
|
66
|
+
"max_active_tis_per_dag": {
|
|
67
|
+
"type": "integer"
|
|
68
|
+
},
|
|
69
|
+
"max_active_tis_per_dagrun": {
|
|
70
|
+
"type": "integer"
|
|
71
|
+
},
|
|
72
|
+
"max_retry_delay": {
|
|
73
|
+
"type": "string"
|
|
74
|
+
},
|
|
75
|
+
"multiple_outputs": {
|
|
76
|
+
"type": "boolean"
|
|
77
|
+
},
|
|
78
|
+
"operator_class": {
|
|
79
|
+
"description": "Module + class name of the operator",
|
|
80
|
+
"type": "string"
|
|
81
|
+
},
|
|
82
|
+
"owner": {
|
|
83
|
+
"type": "string"
|
|
84
|
+
},
|
|
85
|
+
"priority_weight": {
|
|
86
|
+
"type": "integer"
|
|
87
|
+
},
|
|
88
|
+
"queue": {
|
|
89
|
+
"type": "string"
|
|
90
|
+
},
|
|
91
|
+
"retries": {
|
|
92
|
+
"type": "integer"
|
|
93
|
+
},
|
|
94
|
+
"retry_exponential_backoff": {
|
|
95
|
+
"type": "boolean"
|
|
96
|
+
},
|
|
97
|
+
"run_as_user": {
|
|
98
|
+
"type": "string"
|
|
99
|
+
},
|
|
100
|
+
"sla": {
|
|
101
|
+
"type": "number"
|
|
102
|
+
},
|
|
103
|
+
"task_id": {
|
|
104
|
+
"type": "string"
|
|
105
|
+
},
|
|
106
|
+
"trigger_rule": {
|
|
107
|
+
"type": "string"
|
|
108
|
+
},
|
|
109
|
+
"upstream_task_ids": {
|
|
110
|
+
"type": "string"
|
|
111
|
+
},
|
|
112
|
+
"wait_for_downstream": {
|
|
113
|
+
"type": "boolean"
|
|
114
|
+
},
|
|
115
|
+
"wait_for_past_depends_before_skipping": {
|
|
116
|
+
"type": "boolean"
|
|
117
|
+
},
|
|
118
|
+
"weight_rule": {
|
|
119
|
+
"type": "string"
|
|
120
|
+
},
|
|
121
|
+
"task_group": {
|
|
122
|
+
"description": "Task group related information",
|
|
123
|
+
"type": "object",
|
|
124
|
+
"properties": {
|
|
125
|
+
"group_id": {
|
|
126
|
+
"type": "string"
|
|
127
|
+
},
|
|
128
|
+
"downstream_group_ids": {
|
|
129
|
+
"type": "string"
|
|
130
|
+
},
|
|
131
|
+
"downstream_task_ids": {
|
|
132
|
+
"type": "string"
|
|
133
|
+
},
|
|
134
|
+
"prefix_group_id": {
|
|
135
|
+
"type": "boolean"
|
|
136
|
+
},
|
|
137
|
+
"tooltip": {
|
|
138
|
+
"type": "string"
|
|
139
|
+
},
|
|
140
|
+
"upstream_group_ids": {
|
|
141
|
+
"type": "string"
|
|
142
|
+
},
|
|
143
|
+
"upstream_task_ids": {
|
|
144
|
+
"type": "string"
|
|
145
|
+
}
|
|
146
|
+
},
|
|
147
|
+
"additionalProperties": true,
|
|
148
|
+
"required": ["group_id"]
|
|
149
|
+
}
|
|
150
|
+
},
|
|
151
|
+
"additionalProperties": true,
|
|
152
|
+
"required": [
|
|
153
|
+
"task_id"
|
|
154
|
+
]
|
|
155
|
+
},
|
|
156
|
+
"DAG": {
|
|
157
|
+
"type": "object",
|
|
158
|
+
"properties": {
|
|
159
|
+
"dag_id": {
|
|
160
|
+
"type": "string"
|
|
161
|
+
},
|
|
162
|
+
"description": {
|
|
163
|
+
"type": "string"
|
|
164
|
+
},
|
|
165
|
+
"owner": {
|
|
166
|
+
"type": "string"
|
|
167
|
+
},
|
|
168
|
+
"schedule_interval": {
|
|
169
|
+
"type": "string"
|
|
170
|
+
},
|
|
171
|
+
"start_date": {
|
|
172
|
+
"type": "string",
|
|
173
|
+
"format": "date-time"
|
|
174
|
+
},
|
|
175
|
+
"tags": {
|
|
176
|
+
"type": "string"
|
|
177
|
+
},
|
|
178
|
+
"timetable": {
|
|
179
|
+
"description": "Describes timetable (successor of schedule_interval)",
|
|
180
|
+
"type": "object",
|
|
181
|
+
"additionalProperties": true
|
|
182
|
+
}
|
|
183
|
+
},
|
|
184
|
+
"additionalProperties": true,
|
|
185
|
+
"required": [
|
|
186
|
+
"dag_id",
|
|
187
|
+
"start_date"
|
|
188
|
+
]
|
|
189
|
+
},
|
|
190
|
+
"TaskInstance": {
|
|
191
|
+
"type": "object",
|
|
192
|
+
"properties": {
|
|
193
|
+
"duration": {
|
|
194
|
+
"type": "number"
|
|
195
|
+
},
|
|
196
|
+
"map_index": {
|
|
197
|
+
"type": "integer"
|
|
198
|
+
},
|
|
199
|
+
"pool": {
|
|
200
|
+
"type": "string"
|
|
201
|
+
},
|
|
202
|
+
"try_number": {
|
|
203
|
+
"type": "integer"
|
|
204
|
+
},
|
|
205
|
+
"queued_dttm": {
|
|
206
|
+
"type": "string",
|
|
207
|
+
"format": "date-time"
|
|
208
|
+
}
|
|
209
|
+
},
|
|
210
|
+
"additionalProperties": true,
|
|
211
|
+
"required": [
|
|
212
|
+
"pool",
|
|
213
|
+
"try_number"
|
|
214
|
+
]
|
|
215
|
+
},
|
|
216
|
+
"DagRun": {
|
|
217
|
+
"type": "object",
|
|
218
|
+
"properties": {
|
|
219
|
+
"conf": {
|
|
220
|
+
"type": "object",
|
|
221
|
+
"additionalProperties": true
|
|
222
|
+
},
|
|
223
|
+
"dag_id": {
|
|
224
|
+
"type": "string"
|
|
225
|
+
},
|
|
226
|
+
"data_interval_start": {
|
|
227
|
+
"type": "string",
|
|
228
|
+
"format": "date-time"
|
|
229
|
+
},
|
|
230
|
+
"data_interval_end": {
|
|
231
|
+
"type": "string",
|
|
232
|
+
"format": "date-time"
|
|
233
|
+
},
|
|
234
|
+
"external_trigger": {
|
|
235
|
+
"type": "boolean"
|
|
236
|
+
},
|
|
237
|
+
"run_id": {
|
|
238
|
+
"type": "string"
|
|
239
|
+
},
|
|
240
|
+
"run_type": {
|
|
241
|
+
"type": "string"
|
|
242
|
+
},
|
|
243
|
+
"start_date": {
|
|
244
|
+
"type": "string",
|
|
245
|
+
"format": "date-time"
|
|
246
|
+
}
|
|
247
|
+
},
|
|
248
|
+
"additionalProperties": true,
|
|
249
|
+
"required": [
|
|
250
|
+
"dag_id",
|
|
251
|
+
"run_id"
|
|
252
|
+
]
|
|
253
|
+
}
|
|
254
|
+
},
|
|
255
|
+
"type": "object",
|
|
256
|
+
"properties": {
|
|
257
|
+
"airflow": {
|
|
258
|
+
"$ref": "#/$defs/AirflowRunFacet"
|
|
259
|
+
}
|
|
260
|
+
}
|
|
261
|
+
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
+
"$defs": {
|
|
4
|
+
"AirflowStateRunFacet": {
|
|
5
|
+
"allOf": [
|
|
6
|
+
{
|
|
7
|
+
"$ref": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunFacet"
|
|
8
|
+
},
|
|
9
|
+
{
|
|
10
|
+
"type": "object",
|
|
11
|
+
"properties": {
|
|
12
|
+
"dagRunState": {
|
|
13
|
+
"description": "The final status of the entire DagRun",
|
|
14
|
+
"type": "string"
|
|
15
|
+
},
|
|
16
|
+
"tasksState": {
|
|
17
|
+
"description": "Mapping of task IDs to their respective states",
|
|
18
|
+
"type": "object",
|
|
19
|
+
"additionalProperties": true
|
|
20
|
+
}
|
|
21
|
+
},
|
|
22
|
+
"required": ["dagRunState", "tasksState"]
|
|
23
|
+
}
|
|
24
|
+
],
|
|
25
|
+
"type": "object"
|
|
26
|
+
}
|
|
27
|
+
},
|
|
28
|
+
"type": "object",
|
|
29
|
+
"properties": {
|
|
30
|
+
"airflowState": {
|
|
31
|
+
"$ref": "#/$defs/AirflowStateRunFacet"
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
@@ -28,8 +28,9 @@ def get_provider_info():
|
|
|
28
28
|
"name": "OpenLineage Airflow",
|
|
29
29
|
"description": "`OpenLineage <https://openlineage.io/>`__\n",
|
|
30
30
|
"state": "ready",
|
|
31
|
-
"source-date-epoch":
|
|
31
|
+
"source-date-epoch": 1718605195,
|
|
32
32
|
"versions": [
|
|
33
|
+
"1.9.0",
|
|
33
34
|
"1.8.0",
|
|
34
35
|
"1.7.1",
|
|
35
36
|
"1.7.0",
|
|
@@ -50,8 +51,8 @@ def get_provider_info():
|
|
|
50
51
|
"apache-airflow>=2.7.0",
|
|
51
52
|
"apache-airflow-providers-common-sql>=1.6.0",
|
|
52
53
|
"attrs>=22.2",
|
|
53
|
-
"openlineage-integration-common>=
|
|
54
|
-
"openlineage-python>=
|
|
54
|
+
"openlineage-integration-common>=1.16.0",
|
|
55
|
+
"openlineage-python>=1.16.0",
|
|
55
56
|
],
|
|
56
57
|
"integrations": [
|
|
57
58
|
{
|
|
@@ -134,6 +135,13 @@ def get_provider_info():
|
|
|
134
135
|
"type": "integer",
|
|
135
136
|
"version_added": "1.8.0",
|
|
136
137
|
},
|
|
138
|
+
"execution_timeout": {
|
|
139
|
+
"description": "Maximum amount of time (in seconds) that OpenLineage can spend executing metadata extraction.\n",
|
|
140
|
+
"default": "10",
|
|
141
|
+
"example": None,
|
|
142
|
+
"type": "integer",
|
|
143
|
+
"version_added": "1.9.0",
|
|
144
|
+
},
|
|
137
145
|
},
|
|
138
146
|
}
|
|
139
147
|
},
|
|
@@ -17,7 +17,6 @@
|
|
|
17
17
|
from __future__ import annotations
|
|
18
18
|
|
|
19
19
|
import traceback
|
|
20
|
-
import uuid
|
|
21
20
|
from contextlib import ExitStack
|
|
22
21
|
from typing import TYPE_CHECKING
|
|
23
22
|
|
|
@@ -36,13 +35,19 @@ from openlineage.client.facet import (
|
|
|
36
35
|
SourceCodeLocationJobFacet,
|
|
37
36
|
)
|
|
38
37
|
from openlineage.client.run import Job, Run, RunEvent, RunState
|
|
38
|
+
from openlineage.client.uuid import generate_static_uuid
|
|
39
39
|
|
|
40
40
|
from airflow.providers.openlineage import __version__ as OPENLINEAGE_PROVIDER_VERSION, conf
|
|
41
|
-
from airflow.providers.openlineage.utils.utils import OpenLineageRedactor
|
|
41
|
+
from airflow.providers.openlineage.utils.utils import (
|
|
42
|
+
OpenLineageRedactor,
|
|
43
|
+
get_airflow_state_run_facet,
|
|
44
|
+
)
|
|
42
45
|
from airflow.stats import Stats
|
|
43
46
|
from airflow.utils.log.logging_mixin import LoggingMixin
|
|
44
47
|
|
|
45
48
|
if TYPE_CHECKING:
|
|
49
|
+
from datetime import datetime
|
|
50
|
+
|
|
46
51
|
from airflow.models.dagrun import DagRun
|
|
47
52
|
from airflow.providers.openlineage.extractors import OperatorLineage
|
|
48
53
|
from airflow.utils.log.secrets_masker import SecretsMasker
|
|
@@ -111,15 +116,25 @@ class OpenLineageAdapter(LoggingMixin):
|
|
|
111
116
|
return yaml.safe_load(config_file)
|
|
112
117
|
|
|
113
118
|
@staticmethod
|
|
114
|
-
def build_dag_run_id(dag_id,
|
|
115
|
-
return str(
|
|
119
|
+
def build_dag_run_id(dag_id: str, execution_date: datetime) -> str:
|
|
120
|
+
return str(
|
|
121
|
+
generate_static_uuid(
|
|
122
|
+
instant=execution_date,
|
|
123
|
+
data=f"{conf.namespace()}.{dag_id}".encode(),
|
|
124
|
+
)
|
|
125
|
+
)
|
|
116
126
|
|
|
117
127
|
@staticmethod
|
|
118
|
-
def build_task_instance_run_id(
|
|
128
|
+
def build_task_instance_run_id(
|
|
129
|
+
dag_id: str,
|
|
130
|
+
task_id: str,
|
|
131
|
+
try_number: int,
|
|
132
|
+
execution_date: datetime,
|
|
133
|
+
):
|
|
119
134
|
return str(
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
f"{conf.namespace()}.{dag_id}.{task_id}.{
|
|
135
|
+
generate_static_uuid(
|
|
136
|
+
instant=execution_date,
|
|
137
|
+
data=f"{conf.namespace()}.{dag_id}.{task_id}.{try_number}".encode(),
|
|
123
138
|
)
|
|
124
139
|
)
|
|
125
140
|
|
|
@@ -264,6 +279,7 @@ class OpenLineageAdapter(LoggingMixin):
|
|
|
264
279
|
parent_run_id: str | None,
|
|
265
280
|
end_time: str,
|
|
266
281
|
task: OperatorLineage,
|
|
282
|
+
error: str | BaseException | None = None,
|
|
267
283
|
) -> RunEvent:
|
|
268
284
|
"""
|
|
269
285
|
Emit openlineage event of type FAIL.
|
|
@@ -275,7 +291,16 @@ class OpenLineageAdapter(LoggingMixin):
|
|
|
275
291
|
:param parent_run_id: identifier of job spawning this task
|
|
276
292
|
:param end_time: time of task completion
|
|
277
293
|
:param task: metadata container with information extracted from operator
|
|
294
|
+
:param error: error
|
|
278
295
|
"""
|
|
296
|
+
error_facet = {}
|
|
297
|
+
if error:
|
|
298
|
+
if isinstance(error, BaseException):
|
|
299
|
+
import traceback
|
|
300
|
+
|
|
301
|
+
error = "\\n".join(traceback.format_exception(type(error), error, error.__traceback__))
|
|
302
|
+
error_facet = {"errorMessage": ErrorMessageRunFacet(message=error, programmingLanguage="python")}
|
|
303
|
+
|
|
279
304
|
event = RunEvent(
|
|
280
305
|
eventType=RunState.FAIL,
|
|
281
306
|
eventTime=end_time,
|
|
@@ -284,7 +309,7 @@ class OpenLineageAdapter(LoggingMixin):
|
|
|
284
309
|
job_name=job_name,
|
|
285
310
|
parent_job_name=parent_job_name,
|
|
286
311
|
parent_run_id=parent_run_id,
|
|
287
|
-
run_facets=task.run_facets,
|
|
312
|
+
run_facets={**task.run_facets, **error_facet},
|
|
288
313
|
),
|
|
289
314
|
job=self._build_job(job_name, job_type=_JOB_TYPE_TASK, job_facets=task.job_facets),
|
|
290
315
|
inputs=task.inputs,
|
|
@@ -299,14 +324,24 @@ class OpenLineageAdapter(LoggingMixin):
|
|
|
299
324
|
msg: str,
|
|
300
325
|
nominal_start_time: str,
|
|
301
326
|
nominal_end_time: str,
|
|
327
|
+
job_facets: dict[str, BaseFacet] | None = None, # Custom job facets
|
|
302
328
|
):
|
|
303
329
|
try:
|
|
304
330
|
event = RunEvent(
|
|
305
331
|
eventType=RunState.START,
|
|
306
332
|
eventTime=dag_run.start_date.isoformat(),
|
|
307
|
-
job=self._build_job(
|
|
333
|
+
job=self._build_job(
|
|
334
|
+
job_name=dag_run.dag_id,
|
|
335
|
+
job_type=_JOB_TYPE_DAG,
|
|
336
|
+
job_description=dag_run.dag.description if dag_run.dag else None,
|
|
337
|
+
owners=[x.strip() for x in dag_run.dag.owner.split(",")] if dag_run.dag else None,
|
|
338
|
+
job_facets=job_facets,
|
|
339
|
+
),
|
|
308
340
|
run=self._build_run(
|
|
309
|
-
run_id=self.build_dag_run_id(
|
|
341
|
+
run_id=self.build_dag_run_id(
|
|
342
|
+
dag_id=dag_run.dag_id,
|
|
343
|
+
execution_date=dag_run.execution_date,
|
|
344
|
+
),
|
|
310
345
|
job_name=dag_run.dag_id,
|
|
311
346
|
nominal_start_time=nominal_start_time,
|
|
312
347
|
nominal_end_time=nominal_end_time,
|
|
@@ -328,7 +363,13 @@ class OpenLineageAdapter(LoggingMixin):
|
|
|
328
363
|
eventType=RunState.COMPLETE,
|
|
329
364
|
eventTime=dag_run.end_date.isoformat(),
|
|
330
365
|
job=self._build_job(job_name=dag_run.dag_id, job_type=_JOB_TYPE_DAG),
|
|
331
|
-
run=Run(
|
|
366
|
+
run=Run(
|
|
367
|
+
runId=self.build_dag_run_id(
|
|
368
|
+
dag_id=dag_run.dag_id,
|
|
369
|
+
execution_date=dag_run.execution_date,
|
|
370
|
+
),
|
|
371
|
+
facets={**get_airflow_state_run_facet(dag_run)},
|
|
372
|
+
),
|
|
332
373
|
inputs=[],
|
|
333
374
|
outputs=[],
|
|
334
375
|
producer=_PRODUCER,
|
|
@@ -347,8 +388,14 @@ class OpenLineageAdapter(LoggingMixin):
|
|
|
347
388
|
eventTime=dag_run.end_date.isoformat(),
|
|
348
389
|
job=self._build_job(job_name=dag_run.dag_id, job_type=_JOB_TYPE_DAG),
|
|
349
390
|
run=Run(
|
|
350
|
-
runId=self.build_dag_run_id(
|
|
351
|
-
|
|
391
|
+
runId=self.build_dag_run_id(
|
|
392
|
+
dag_id=dag_run.dag_id,
|
|
393
|
+
execution_date=dag_run.execution_date,
|
|
394
|
+
),
|
|
395
|
+
facets={
|
|
396
|
+
"errorMessage": ErrorMessageRunFacet(message=msg, programmingLanguage="python"),
|
|
397
|
+
**get_airflow_state_run_facet(dag_run),
|
|
398
|
+
},
|
|
352
399
|
),
|
|
353
400
|
inputs=[],
|
|
354
401
|
outputs=[],
|
|
@@ -39,15 +39,56 @@ class AirflowMappedTaskRunFacet(BaseFacet):
|
|
|
39
39
|
|
|
40
40
|
@classmethod
|
|
41
41
|
def from_task_instance(cls, task_instance):
|
|
42
|
-
|
|
43
|
-
from airflow.providers.openlineage.utils.utils import get_operator_class
|
|
42
|
+
from airflow.providers.openlineage.utils.utils import get_fully_qualified_class_name
|
|
44
43
|
|
|
45
44
|
return cls(
|
|
46
45
|
mapIndex=task_instance.map_index,
|
|
47
|
-
operatorClass=
|
|
46
|
+
operatorClass=get_fully_qualified_class_name(task_instance.task),
|
|
48
47
|
)
|
|
49
48
|
|
|
50
49
|
|
|
50
|
+
@define(slots=False)
|
|
51
|
+
class AirflowJobFacet(BaseFacet):
|
|
52
|
+
"""
|
|
53
|
+
Composite Airflow job facet.
|
|
54
|
+
|
|
55
|
+
This facet encapsulates all the necessary information to re-create full scope of an Airflow DAG logic,
|
|
56
|
+
enabling reconstruction, visualization, and analysis of DAGs in a comprehensive manner.
|
|
57
|
+
It includes detailed representations of the tasks, task groups, and their hierarchical relationships,
|
|
58
|
+
making it possible to draw a graph that visually represents the entire DAG structure (like in Airflow UI).
|
|
59
|
+
It also indicates whether a task should emit an OpenLineage (OL) event, enabling consumers to anticipate
|
|
60
|
+
the number of events and identify the tasks from which they can expect these events.
|
|
61
|
+
|
|
62
|
+
Attributes:
|
|
63
|
+
taskTree: A dictionary representing the hierarchical structure of tasks in the DAG.
|
|
64
|
+
taskGroups: A dictionary that contains information about task groups within the DAG.
|
|
65
|
+
tasks: A dictionary detailing individual tasks within the DAG.
|
|
66
|
+
"""
|
|
67
|
+
|
|
68
|
+
taskTree: dict
|
|
69
|
+
taskGroups: dict
|
|
70
|
+
tasks: dict
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@define(slots=False)
|
|
74
|
+
class AirflowStateRunFacet(BaseFacet):
|
|
75
|
+
"""
|
|
76
|
+
Airflow facet providing state information.
|
|
77
|
+
|
|
78
|
+
This facet is designed to be sent at a completion event, offering state information about
|
|
79
|
+
the DAG run and each individual task. This information is crucial for understanding
|
|
80
|
+
the execution flow and comprehensive post-run analysis and debugging, including why certain tasks
|
|
81
|
+
did not emit events, which can occur due to the use of control flow operators like the BranchOperator.
|
|
82
|
+
|
|
83
|
+
Attributes:
|
|
84
|
+
dagRunState: This indicates the final status of the entire DAG run (e.g., "success", "failed").
|
|
85
|
+
tasksState: A dictionary mapping task IDs to their respective states. (e.g., "failed", "skipped").
|
|
86
|
+
"""
|
|
87
|
+
|
|
88
|
+
dagRunState: str
|
|
89
|
+
tasksState: dict[str, str]
|
|
90
|
+
|
|
91
|
+
|
|
51
92
|
@define(slots=False)
|
|
52
93
|
class AirflowRunFacet(BaseFacet):
|
|
53
94
|
"""Composite Airflow run facet."""
|