apache-airflow-providers-openlineage 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of apache-airflow-providers-openlineage might be problematic. Click here for more details.

@@ -215,7 +215,7 @@ Third party Apache 2.0 licenses
215
215
 
216
216
  The following components are provided under the Apache 2.0 License.
217
217
  See project link for details. The text of each license is also included
218
- at licenses/LICENSE-[project].txt.
218
+ at 3rd-party-licenses/LICENSE-[project].txt.
219
219
 
220
220
  (ALv2 License) hue v4.3.0 (https://github.com/cloudera/hue/)
221
221
  (ALv2 License) jqclock v2.3.0 (https://github.com/JohnRDOrazio/jQuery-Clock-Plugin)
@@ -227,7 +227,7 @@ MIT licenses
227
227
  ========================================================================
228
228
 
229
229
  The following components are provided under the MIT License. See project link for details.
230
- The text of each license is also included at licenses/LICENSE-[project].txt.
230
+ The text of each license is also included at 3rd-party-licenses/LICENSE-[project].txt.
231
231
 
232
232
  (MIT License) jquery v3.5.1 (https://jquery.org/license/)
233
233
  (MIT License) dagre-d3 v0.6.4 (https://github.com/cpettitt/dagre-d3)
@@ -243,11 +243,11 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
243
243
  BSD 3-Clause licenses
244
244
  ========================================================================
245
245
  The following components are provided under the BSD 3-Clause license. See project links for details.
246
- The text of each license is also included at licenses/LICENSE-[project].txt.
246
+ The text of each license is also included at 3rd-party-licenses/LICENSE-[project].txt.
247
247
 
248
248
  (BSD 3 License) d3 v5.16.0 (https://d3js.org)
249
249
  (BSD 3 License) d3-shape v2.1.0 (https://github.com/d3/d3-shape)
250
250
  (BSD 3 License) cgroupspy 0.2.1 (https://github.com/cloudsigma/cgroupspy)
251
251
 
252
252
  ========================================================================
253
- See licenses/LICENSES-ui.txt for packages used in `/airflow/www`
253
+ See 3rd-party-licenses/LICENSES-ui.txt for packages used in `/airflow/www`
@@ -29,7 +29,7 @@ from airflow import __version__ as airflow_version
29
29
 
30
30
  __all__ = ["__version__"]
31
31
 
32
- __version__ = "1.8.0"
32
+ __version__ = "1.9.0"
33
33
 
34
34
  if packaging.version.parse(packaging.version.parse(airflow_version).base_version) < packaging.version.parse(
35
35
  "2.7.0"
@@ -33,7 +33,15 @@ from __future__ import annotations
33
33
  import os
34
34
  from typing import Any
35
35
 
36
- from airflow.compat.functools import cache
36
+ # Disable caching if we're inside tests - this makes config easier to mock.
37
+ if os.getenv("PYTEST_VERSION"):
38
+
39
+ def decorator(func):
40
+ return func
41
+
42
+ cache = decorator
43
+ else:
44
+ from airflow.compat.functools import cache
37
45
  from airflow.configuration import conf
38
46
 
39
47
  _CONFIG_SECTION = "openlineage"
@@ -130,3 +138,10 @@ def dag_state_change_process_pool_size() -> int:
130
138
  """[openlineage] dag_state_change_process_pool_size."""
131
139
  option = conf.get(_CONFIG_SECTION, "dag_state_change_process_pool_size", fallback="")
132
140
  return _safe_int_convert(str(option).strip(), default=1)
141
+
142
+
143
+ @cache
144
+ def execution_timeout() -> int:
145
+ """[openlineage] execution_timeout."""
146
+ option = conf.get(_CONFIG_SECTION, "execution_timeout", fallback="")
147
+ return _safe_int_convert(str(option).strip(), default=10)
@@ -41,7 +41,8 @@ class OperatorLineage:
41
41
 
42
42
 
43
43
  class BaseExtractor(ABC, LoggingMixin):
44
- """Abstract base extractor class.
44
+ """
45
+ Abstract base extractor class.
45
46
 
46
47
  This is used mostly to maintain support for custom extractors.
47
48
  """
@@ -55,7 +56,8 @@ class BaseExtractor(ABC, LoggingMixin):
55
56
  @classmethod
56
57
  @abstractmethod
57
58
  def get_operator_classnames(cls) -> list[str]:
58
- """Get a list of operators that extractor works for.
59
+ """
60
+ Get a list of operators that extractor works for.
59
61
 
60
62
  This is an abstract method that subclasses should implement. There are
61
63
  operators that work very similarly and one extractor can cover.
@@ -77,7 +79,8 @@ class DefaultExtractor(BaseExtractor):
77
79
 
78
80
  @classmethod
79
81
  def get_operator_classnames(cls) -> list[str]:
80
- """Assign this extractor to *no* operators.
82
+ """
83
+ Assign this extractor to *no* operators.
81
84
 
82
85
  Default extractor is chosen not on the classname basis, but
83
86
  by existence of get_openlineage_facets method on operator.
@@ -0,0 +1,40 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$defs": {
4
+ "AirflowJobFacet": {
5
+ "allOf": [
6
+ {
7
+ "$ref": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunFacet"
8
+ },
9
+ {
10
+ "type": "object",
11
+ "properties": {
12
+ "taskTree": {
13
+ "description": "The hierarchical structure of tasks in the DAG.",
14
+ "type": "object",
15
+ "additionalProperties": true
16
+ },
17
+ "taskGroups": {
18
+ "description": "Information about all task groups within the DAG.",
19
+ "type": "object",
20
+ "additionalProperties": true
21
+ },
22
+ "tasks": {
23
+ "description": "Details of all individual tasks within the DAG.",
24
+ "type": "object",
25
+ "additionalProperties": true
26
+ }
27
+ },
28
+ "required": ["taskTree", "taskGroups", "tasks"]
29
+ }
30
+ ],
31
+ "type": "object"
32
+ }
33
+ },
34
+ "type": "object",
35
+ "properties": {
36
+ "airflow": {
37
+ "$ref": "#/$defs/AirflowJobFacet"
38
+ }
39
+ }
40
+ }
@@ -0,0 +1,261 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$defs": {
4
+ "AirflowRunFacet": {
5
+ "allOf": [
6
+ {
7
+ "$ref": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunFacet"
8
+ },
9
+ {
10
+ "type": "object",
11
+ "properties": {
12
+ "dag": {
13
+ "$ref": "#/$defs/DAG"
14
+ },
15
+ "dagRun": {
16
+ "$ref": "#/$defs/DagRun"
17
+ },
18
+ "taskInstance": {
19
+ "$ref": "#/$defs/TaskInstance"
20
+ },
21
+ "task": {
22
+ "$ref": "#/$defs/Task"
23
+ },
24
+ "taskUuid": {
25
+ "type": "string"
26
+ }
27
+ },
28
+ "required": [
29
+ "dag",
30
+ "dagRun",
31
+ "taskInstance",
32
+ "task",
33
+ "taskUuid"
34
+ ]
35
+ }
36
+ ]
37
+ },
38
+ "Task": {
39
+ "type": "object",
40
+ "properties": {
41
+ "depends_on_past": {
42
+ "type": "boolean"
43
+ },
44
+ "downstream_task_ids": {
45
+ "type": "string"
46
+ },
47
+ "execution_timeout": {
48
+ "type": "string"
49
+ },
50
+ "executor_config": {
51
+ "type": "object",
52
+ "additionalProperties": true
53
+ },
54
+ "ignore_first_depends_on_past": {
55
+ "type": "boolean"
56
+ },
57
+ "is_setup": {
58
+ "type": "boolean"
59
+ },
60
+ "is_teardown": {
61
+ "type": "boolean"
62
+ },
63
+ "mapped": {
64
+ "type": "boolean"
65
+ },
66
+ "max_active_tis_per_dag": {
67
+ "type": "integer"
68
+ },
69
+ "max_active_tis_per_dagrun": {
70
+ "type": "integer"
71
+ },
72
+ "max_retry_delay": {
73
+ "type": "string"
74
+ },
75
+ "multiple_outputs": {
76
+ "type": "boolean"
77
+ },
78
+ "operator_class": {
79
+ "description": "Module + class name of the operator",
80
+ "type": "string"
81
+ },
82
+ "owner": {
83
+ "type": "string"
84
+ },
85
+ "priority_weight": {
86
+ "type": "integer"
87
+ },
88
+ "queue": {
89
+ "type": "string"
90
+ },
91
+ "retries": {
92
+ "type": "integer"
93
+ },
94
+ "retry_exponential_backoff": {
95
+ "type": "boolean"
96
+ },
97
+ "run_as_user": {
98
+ "type": "string"
99
+ },
100
+ "sla": {
101
+ "type": "number"
102
+ },
103
+ "task_id": {
104
+ "type": "string"
105
+ },
106
+ "trigger_rule": {
107
+ "type": "string"
108
+ },
109
+ "upstream_task_ids": {
110
+ "type": "string"
111
+ },
112
+ "wait_for_downstream": {
113
+ "type": "boolean"
114
+ },
115
+ "wait_for_past_depends_before_skipping": {
116
+ "type": "boolean"
117
+ },
118
+ "weight_rule": {
119
+ "type": "string"
120
+ },
121
+ "task_group": {
122
+ "description": "Task group related information",
123
+ "type": "object",
124
+ "properties": {
125
+ "group_id": {
126
+ "type": "string"
127
+ },
128
+ "downstream_group_ids": {
129
+ "type": "string"
130
+ },
131
+ "downstream_task_ids": {
132
+ "type": "string"
133
+ },
134
+ "prefix_group_id": {
135
+ "type": "boolean"
136
+ },
137
+ "tooltip": {
138
+ "type": "string"
139
+ },
140
+ "upstream_group_ids": {
141
+ "type": "string"
142
+ },
143
+ "upstream_task_ids": {
144
+ "type": "string"
145
+ }
146
+ },
147
+ "additionalProperties": true,
148
+ "required": ["group_id"]
149
+ }
150
+ },
151
+ "additionalProperties": true,
152
+ "required": [
153
+ "task_id"
154
+ ]
155
+ },
156
+ "DAG": {
157
+ "type": "object",
158
+ "properties": {
159
+ "dag_id": {
160
+ "type": "string"
161
+ },
162
+ "description": {
163
+ "type": "string"
164
+ },
165
+ "owner": {
166
+ "type": "string"
167
+ },
168
+ "schedule_interval": {
169
+ "type": "string"
170
+ },
171
+ "start_date": {
172
+ "type": "string",
173
+ "format": "date-time"
174
+ },
175
+ "tags": {
176
+ "type": "string"
177
+ },
178
+ "timetable": {
179
+ "description": "Describes timetable (successor of schedule_interval)",
180
+ "type": "object",
181
+ "additionalProperties": true
182
+ }
183
+ },
184
+ "additionalProperties": true,
185
+ "required": [
186
+ "dag_id",
187
+ "start_date"
188
+ ]
189
+ },
190
+ "TaskInstance": {
191
+ "type": "object",
192
+ "properties": {
193
+ "duration": {
194
+ "type": "number"
195
+ },
196
+ "map_index": {
197
+ "type": "integer"
198
+ },
199
+ "pool": {
200
+ "type": "string"
201
+ },
202
+ "try_number": {
203
+ "type": "integer"
204
+ },
205
+ "queued_dttm": {
206
+ "type": "string",
207
+ "format": "date-time"
208
+ }
209
+ },
210
+ "additionalProperties": true,
211
+ "required": [
212
+ "pool",
213
+ "try_number"
214
+ ]
215
+ },
216
+ "DagRun": {
217
+ "type": "object",
218
+ "properties": {
219
+ "conf": {
220
+ "type": "object",
221
+ "additionalProperties": true
222
+ },
223
+ "dag_id": {
224
+ "type": "string"
225
+ },
226
+ "data_interval_start": {
227
+ "type": "string",
228
+ "format": "date-time"
229
+ },
230
+ "data_interval_end": {
231
+ "type": "string",
232
+ "format": "date-time"
233
+ },
234
+ "external_trigger": {
235
+ "type": "boolean"
236
+ },
237
+ "run_id": {
238
+ "type": "string"
239
+ },
240
+ "run_type": {
241
+ "type": "string"
242
+ },
243
+ "start_date": {
244
+ "type": "string",
245
+ "format": "date-time"
246
+ }
247
+ },
248
+ "additionalProperties": true,
249
+ "required": [
250
+ "dag_id",
251
+ "run_id"
252
+ ]
253
+ }
254
+ },
255
+ "type": "object",
256
+ "properties": {
257
+ "airflow": {
258
+ "$ref": "#/$defs/AirflowRunFacet"
259
+ }
260
+ }
261
+ }
@@ -0,0 +1,34 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$defs": {
4
+ "AirflowStateRunFacet": {
5
+ "allOf": [
6
+ {
7
+ "$ref": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunFacet"
8
+ },
9
+ {
10
+ "type": "object",
11
+ "properties": {
12
+ "dagRunState": {
13
+ "description": "The final status of the entire DagRun",
14
+ "type": "string"
15
+ },
16
+ "tasksState": {
17
+ "description": "Mapping of task IDs to their respective states",
18
+ "type": "object",
19
+ "additionalProperties": true
20
+ }
21
+ },
22
+ "required": ["dagRunState", "tasksState"]
23
+ }
24
+ ],
25
+ "type": "object"
26
+ }
27
+ },
28
+ "type": "object",
29
+ "properties": {
30
+ "airflowState": {
31
+ "$ref": "#/$defs/AirflowStateRunFacet"
32
+ }
33
+ }
34
+ }
@@ -0,0 +1,16 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
@@ -28,8 +28,9 @@ def get_provider_info():
28
28
  "name": "OpenLineage Airflow",
29
29
  "description": "`OpenLineage <https://openlineage.io/>`__\n",
30
30
  "state": "ready",
31
- "source-date-epoch": 1715684338,
31
+ "source-date-epoch": 1718605195,
32
32
  "versions": [
33
+ "1.9.0",
33
34
  "1.8.0",
34
35
  "1.7.1",
35
36
  "1.7.0",
@@ -50,8 +51,8 @@ def get_provider_info():
50
51
  "apache-airflow>=2.7.0",
51
52
  "apache-airflow-providers-common-sql>=1.6.0",
52
53
  "attrs>=22.2",
53
- "openlineage-integration-common>=0.28.0",
54
- "openlineage-python>=0.28.0",
54
+ "openlineage-integration-common>=1.16.0",
55
+ "openlineage-python>=1.16.0",
55
56
  ],
56
57
  "integrations": [
57
58
  {
@@ -134,6 +135,13 @@ def get_provider_info():
134
135
  "type": "integer",
135
136
  "version_added": "1.8.0",
136
137
  },
138
+ "execution_timeout": {
139
+ "description": "Maximum amount of time (in seconds) that OpenLineage can spend executing metadata extraction.\n",
140
+ "default": "10",
141
+ "example": None,
142
+ "type": "integer",
143
+ "version_added": "1.9.0",
144
+ },
137
145
  },
138
146
  }
139
147
  },
@@ -17,7 +17,6 @@
17
17
  from __future__ import annotations
18
18
 
19
19
  import traceback
20
- import uuid
21
20
  from contextlib import ExitStack
22
21
  from typing import TYPE_CHECKING
23
22
 
@@ -36,13 +35,19 @@ from openlineage.client.facet import (
36
35
  SourceCodeLocationJobFacet,
37
36
  )
38
37
  from openlineage.client.run import Job, Run, RunEvent, RunState
38
+ from openlineage.client.uuid import generate_static_uuid
39
39
 
40
40
  from airflow.providers.openlineage import __version__ as OPENLINEAGE_PROVIDER_VERSION, conf
41
- from airflow.providers.openlineage.utils.utils import OpenLineageRedactor
41
+ from airflow.providers.openlineage.utils.utils import (
42
+ OpenLineageRedactor,
43
+ get_airflow_state_run_facet,
44
+ )
42
45
  from airflow.stats import Stats
43
46
  from airflow.utils.log.logging_mixin import LoggingMixin
44
47
 
45
48
  if TYPE_CHECKING:
49
+ from datetime import datetime
50
+
46
51
  from airflow.models.dagrun import DagRun
47
52
  from airflow.providers.openlineage.extractors import OperatorLineage
48
53
  from airflow.utils.log.secrets_masker import SecretsMasker
@@ -111,20 +116,31 @@ class OpenLineageAdapter(LoggingMixin):
111
116
  return yaml.safe_load(config_file)
112
117
 
113
118
  @staticmethod
114
- def build_dag_run_id(dag_id, dag_run_id):
115
- return str(uuid.uuid3(uuid.NAMESPACE_URL, f"{conf.namespace()}.{dag_id}.{dag_run_id}"))
119
+ def build_dag_run_id(dag_id: str, execution_date: datetime) -> str:
120
+ return str(
121
+ generate_static_uuid(
122
+ instant=execution_date,
123
+ data=f"{conf.namespace()}.{dag_id}".encode(),
124
+ )
125
+ )
116
126
 
117
127
  @staticmethod
118
- def build_task_instance_run_id(dag_id, task_id, execution_date, try_number):
128
+ def build_task_instance_run_id(
129
+ dag_id: str,
130
+ task_id: str,
131
+ try_number: int,
132
+ execution_date: datetime,
133
+ ):
119
134
  return str(
120
- uuid.uuid3(
121
- uuid.NAMESPACE_URL,
122
- f"{conf.namespace()}.{dag_id}.{task_id}.{execution_date}.{try_number}",
135
+ generate_static_uuid(
136
+ instant=execution_date,
137
+ data=f"{conf.namespace()}.{dag_id}.{task_id}.{try_number}".encode(),
123
138
  )
124
139
  )
125
140
 
126
141
  def emit(self, event: RunEvent):
127
- """Emit OpenLineage event.
142
+ """
143
+ Emit OpenLineage event.
128
144
 
129
145
  :param event: Event to be emitted.
130
146
  :return: Redacted Event.
@@ -264,6 +280,7 @@ class OpenLineageAdapter(LoggingMixin):
264
280
  parent_run_id: str | None,
265
281
  end_time: str,
266
282
  task: OperatorLineage,
283
+ error: str | BaseException | None = None,
267
284
  ) -> RunEvent:
268
285
  """
269
286
  Emit openlineage event of type FAIL.
@@ -275,7 +292,22 @@ class OpenLineageAdapter(LoggingMixin):
275
292
  :param parent_run_id: identifier of job spawning this task
276
293
  :param end_time: time of task completion
277
294
  :param task: metadata container with information extracted from operator
295
+ :param error: error
278
296
  """
297
+ error_facet = {}
298
+ if error:
299
+ stack_trace = None
300
+ if isinstance(error, BaseException) and error.__traceback__:
301
+ import traceback
302
+
303
+ stack_trace = "\\n".join(traceback.format_exception(type(error), error, error.__traceback__))
304
+
305
+ error_facet = {
306
+ "errorMessage": ErrorMessageRunFacet(
307
+ message=str(error), programmingLanguage="python", stackTrace=stack_trace
308
+ )
309
+ }
310
+
279
311
  event = RunEvent(
280
312
  eventType=RunState.FAIL,
281
313
  eventTime=end_time,
@@ -284,7 +316,7 @@ class OpenLineageAdapter(LoggingMixin):
284
316
  job_name=job_name,
285
317
  parent_job_name=parent_job_name,
286
318
  parent_run_id=parent_run_id,
287
- run_facets=task.run_facets,
319
+ run_facets={**task.run_facets, **error_facet},
288
320
  ),
289
321
  job=self._build_job(job_name, job_type=_JOB_TYPE_TASK, job_facets=task.job_facets),
290
322
  inputs=task.inputs,
@@ -299,14 +331,24 @@ class OpenLineageAdapter(LoggingMixin):
299
331
  msg: str,
300
332
  nominal_start_time: str,
301
333
  nominal_end_time: str,
334
+ job_facets: dict[str, BaseFacet] | None = None, # Custom job facets
302
335
  ):
303
336
  try:
304
337
  event = RunEvent(
305
338
  eventType=RunState.START,
306
339
  eventTime=dag_run.start_date.isoformat(),
307
- job=self._build_job(job_name=dag_run.dag_id, job_type=_JOB_TYPE_DAG),
340
+ job=self._build_job(
341
+ job_name=dag_run.dag_id,
342
+ job_type=_JOB_TYPE_DAG,
343
+ job_description=dag_run.dag.description if dag_run.dag else None,
344
+ owners=[x.strip() for x in dag_run.dag.owner.split(",")] if dag_run.dag else None,
345
+ job_facets=job_facets,
346
+ ),
308
347
  run=self._build_run(
309
- run_id=self.build_dag_run_id(dag_run.dag_id, dag_run.run_id),
348
+ run_id=self.build_dag_run_id(
349
+ dag_id=dag_run.dag_id,
350
+ execution_date=dag_run.execution_date,
351
+ ),
310
352
  job_name=dag_run.dag_id,
311
353
  nominal_start_time=nominal_start_time,
312
354
  nominal_end_time=nominal_end_time,
@@ -328,7 +370,13 @@ class OpenLineageAdapter(LoggingMixin):
328
370
  eventType=RunState.COMPLETE,
329
371
  eventTime=dag_run.end_date.isoformat(),
330
372
  job=self._build_job(job_name=dag_run.dag_id, job_type=_JOB_TYPE_DAG),
331
- run=Run(runId=self.build_dag_run_id(dag_run.dag_id, dag_run.run_id)),
373
+ run=Run(
374
+ runId=self.build_dag_run_id(
375
+ dag_id=dag_run.dag_id,
376
+ execution_date=dag_run.execution_date,
377
+ ),
378
+ facets={**get_airflow_state_run_facet(dag_run)},
379
+ ),
332
380
  inputs=[],
333
381
  outputs=[],
334
382
  producer=_PRODUCER,
@@ -347,8 +395,14 @@ class OpenLineageAdapter(LoggingMixin):
347
395
  eventTime=dag_run.end_date.isoformat(),
348
396
  job=self._build_job(job_name=dag_run.dag_id, job_type=_JOB_TYPE_DAG),
349
397
  run=Run(
350
- runId=self.build_dag_run_id(dag_run.dag_id, dag_run.run_id),
351
- facets={"errorMessage": ErrorMessageRunFacet(message=msg, programmingLanguage="python")},
398
+ runId=self.build_dag_run_id(
399
+ dag_id=dag_run.dag_id,
400
+ execution_date=dag_run.execution_date,
401
+ ),
402
+ facets={
403
+ "errorMessage": ErrorMessageRunFacet(message=msg, programmingLanguage="python"),
404
+ **get_airflow_state_run_facet(dag_run),
405
+ },
352
406
  ),
353
407
  inputs=[],
354
408
  outputs=[],