apache-airflow-providers-openlineage 2.1.1rc1__tar.gz → 2.1.2rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of apache-airflow-providers-openlineage might be problematic. Click here for more details.

Files changed (31) hide show
  1. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/PKG-INFO +8 -8
  2. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/README.rst +4 -4
  3. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/pyproject.toml +5 -5
  4. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/__init__.py +1 -1
  5. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/extractors/base.py +37 -37
  6. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/extractors/manager.py +21 -12
  7. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/get_provider_info.py +2 -1
  8. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/plugins/adapter.py +1 -1
  9. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/plugins/listener.py +16 -5
  10. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/plugins/macros.py +15 -4
  11. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/utils/spark.py +64 -26
  12. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/utils/utils.py +12 -7
  13. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/LICENSE +0 -0
  14. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/conf.py +0 -0
  15. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/extractors/__init__.py +0 -0
  16. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/extractors/bash.py +0 -0
  17. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/extractors/python.py +0 -0
  18. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/facets/AirflowDagRunFacet.json +0 -0
  19. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/facets/AirflowDebugRunFacet.json +0 -0
  20. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/facets/AirflowJobFacet.json +0 -0
  21. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/facets/AirflowRunFacet.json +0 -0
  22. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/facets/AirflowStateRunFacet.json +0 -0
  23. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/facets/__init__.py +0 -0
  24. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/plugins/__init__.py +0 -0
  25. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/plugins/facets.py +0 -0
  26. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/plugins/openlineage.py +0 -0
  27. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/sqlparser.py +0 -0
  28. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/utils/__init__.py +0 -0
  29. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/utils/selective_enable.py +0 -0
  30. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/utils/sql.py +0 -0
  31. {apache_airflow_providers_openlineage-2.1.1rc1 → apache_airflow_providers_openlineage-2.1.2rc1}/src/airflow/providers/openlineage/version_compat.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: apache-airflow-providers-openlineage
3
- Version: 2.1.1rc1
3
+ Version: 2.1.2rc1
4
4
  Summary: Provider package apache-airflow-providers-openlineage for Apache Airflow
5
5
  Keywords: airflow-provider,openlineage,airflow,integration
6
6
  Author-email: Apache Software Foundation <dev@airflow.apache.org>
@@ -27,11 +27,11 @@ Requires-Dist: attrs>=22.2
27
27
  Requires-Dist: openlineage-integration-common>=1.24.2
28
28
  Requires-Dist: openlineage-python>=1.24.2
29
29
  Project-URL: Bug Tracker, https://github.com/apache/airflow/issues
30
- Project-URL: Changelog, https://airflow.apache.org/docs/apache-airflow-providers-openlineage/2.1.1/changelog.html
31
- Project-URL: Documentation, https://airflow.apache.org/docs/apache-airflow-providers-openlineage/2.1.1
30
+ Project-URL: Changelog, https://airflow.apache.org/docs/apache-airflow-providers-openlineage/2.1.2/changelog.html
31
+ Project-URL: Documentation, https://airflow.apache.org/docs/apache-airflow-providers-openlineage/2.1.2
32
+ Project-URL: Mastodon, https://fosstodon.org/@airflow
32
33
  Project-URL: Slack Chat, https://s.apache.org/airflow-slack
33
34
  Project-URL: Source Code, https://github.com/apache/airflow
34
- Project-URL: Twitter, https://x.com/ApacheAirflow
35
35
  Project-URL: YouTube, https://www.youtube.com/channel/UCSXwxpWZQ7XZ1WL3wqevChA/
36
36
 
37
37
 
@@ -59,7 +59,7 @@ Project-URL: YouTube, https://www.youtube.com/channel/UCSXwxpWZQ7XZ1WL3wqevChA/
59
59
 
60
60
  Package ``apache-airflow-providers-openlineage``
61
61
 
62
- Release: ``2.1.1``
62
+ Release: ``2.1.2``
63
63
 
64
64
 
65
65
  `OpenLineage <https://openlineage.io/>`__
@@ -72,7 +72,7 @@ This is a provider package for ``openlineage`` provider. All classes for this pr
72
72
  are in ``airflow.providers.openlineage`` python package.
73
73
 
74
74
  You can find package information and changelog for the provider
75
- in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-openlineage/2.1.1/>`_.
75
+ in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-openlineage/2.1.2/>`_.
76
76
 
77
77
  Installation
78
78
  ------------
@@ -101,7 +101,7 @@ Cross provider package dependencies
101
101
  -----------------------------------
102
102
 
103
103
  Those are dependencies that might be needed in order to use all the features of the package.
104
- You need to install the specified provider packages in order to use them.
104
+ You need to install the specified providers in order to use them.
105
105
 
106
106
  You can install such cross-provider dependencies when installing from PyPI. For example:
107
107
 
@@ -118,5 +118,5 @@ Dependent package
118
118
  ================================================================================================================== =================
119
119
 
120
120
  The changelog for the provider package can be found in the
121
- `changelog <https://airflow.apache.org/docs/apache-airflow-providers-openlineage/2.1.1/changelog.html>`_.
121
+ `changelog <https://airflow.apache.org/docs/apache-airflow-providers-openlineage/2.1.2/changelog.html>`_.
122
122
 
@@ -23,7 +23,7 @@
23
23
 
24
24
  Package ``apache-airflow-providers-openlineage``
25
25
 
26
- Release: ``2.1.1``
26
+ Release: ``2.1.2``
27
27
 
28
28
 
29
29
  `OpenLineage <https://openlineage.io/>`__
@@ -36,7 +36,7 @@ This is a provider package for ``openlineage`` provider. All classes for this pr
36
36
  are in ``airflow.providers.openlineage`` python package.
37
37
 
38
38
  You can find package information and changelog for the provider
39
- in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-openlineage/2.1.1/>`_.
39
+ in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-openlineage/2.1.2/>`_.
40
40
 
41
41
  Installation
42
42
  ------------
@@ -65,7 +65,7 @@ Cross provider package dependencies
65
65
  -----------------------------------
66
66
 
67
67
  Those are dependencies that might be needed in order to use all the features of the package.
68
- You need to install the specified provider packages in order to use them.
68
+ You need to install the specified providers in order to use them.
69
69
 
70
70
  You can install such cross-provider dependencies when installing from PyPI. For example:
71
71
 
@@ -82,4 +82,4 @@ Dependent package
82
82
  ================================================================================================================== =================
83
83
 
84
84
  The changelog for the provider package can be found in the
85
- `changelog <https://airflow.apache.org/docs/apache-airflow-providers-openlineage/2.1.1/changelog.html>`_.
85
+ `changelog <https://airflow.apache.org/docs/apache-airflow-providers-openlineage/2.1.2/changelog.html>`_.
@@ -20,12 +20,12 @@
20
20
  # IF YOU WANT TO MODIFY THIS FILE EXCEPT DEPENDENCIES, YOU SHOULD MODIFY THE TEMPLATE
21
21
  # `pyproject_TEMPLATE.toml.jinja2` IN the `dev/breeze/src/airflow_breeze/templates` DIRECTORY
22
22
  [build-system]
23
- requires = ["flit_core==3.11.0"]
23
+ requires = ["flit_core==3.12.0"]
24
24
  build-backend = "flit_core.buildapi"
25
25
 
26
26
  [project]
27
27
  name = "apache-airflow-providers-openlineage"
28
- version = "2.1.1.rc1"
28
+ version = "2.1.2.rc1"
29
29
  description = "Provider package apache-airflow-providers-openlineage for Apache Airflow"
30
30
  readme = "README.rst"
31
31
  authors = [
@@ -87,12 +87,12 @@ apache-airflow-providers-fab = {workspace = true}
87
87
  apache-airflow-providers-standard = {workspace = true}
88
88
 
89
89
  [project.urls]
90
- "Documentation" = "https://airflow.apache.org/docs/apache-airflow-providers-openlineage/2.1.1"
91
- "Changelog" = "https://airflow.apache.org/docs/apache-airflow-providers-openlineage/2.1.1/changelog.html"
90
+ "Documentation" = "https://airflow.apache.org/docs/apache-airflow-providers-openlineage/2.1.2"
91
+ "Changelog" = "https://airflow.apache.org/docs/apache-airflow-providers-openlineage/2.1.2/changelog.html"
92
92
  "Bug Tracker" = "https://github.com/apache/airflow/issues"
93
93
  "Source Code" = "https://github.com/apache/airflow"
94
94
  "Slack Chat" = "https://s.apache.org/airflow-slack"
95
- "Twitter" = "https://x.com/ApacheAirflow"
95
+ "Mastodon" = "https://fosstodon.org/@airflow"
96
96
  "YouTube" = "https://www.youtube.com/channel/UCSXwxpWZQ7XZ1WL3wqevChA/"
97
97
 
98
98
  [project.entry-points."apache_airflow_provider"]
@@ -29,7 +29,7 @@ from airflow import __version__ as airflow_version
29
29
 
30
30
  __all__ = ["__version__"]
31
31
 
32
- __version__ = "2.1.1"
32
+ __version__ = "2.1.2"
33
33
 
34
34
  if packaging.version.parse(packaging.version.parse(airflow_version).base_version) < packaging.version.parse(
35
35
  "2.9.0"
@@ -29,14 +29,16 @@ with warnings.catch_warnings():
29
29
  from openlineage.client.facet import BaseFacet as BaseFacet_V1
30
30
  from openlineage.client.facet_v2 import JobFacet, RunFacet
31
31
 
32
- from airflow.providers.openlineage.utils.utils import AIRFLOW_V_2_10_PLUS
33
32
  from airflow.utils.log.logging_mixin import LoggingMixin
34
- from airflow.utils.state import TaskInstanceState
35
33
 
36
34
  # this is not to break static checks compatibility with v1 OpenLineage facet classes
37
35
  DatasetSubclass = TypeVar("DatasetSubclass", bound=OLDataset)
38
36
  BaseFacetSubclass = TypeVar("BaseFacetSubclass", bound=Union[BaseFacet_V1, RunFacet, JobFacet])
39
37
 
38
+ OL_METHOD_NAME_START = "get_openlineage_facets_on_start"
39
+ OL_METHOD_NAME_COMPLETE = "get_openlineage_facets_on_complete"
40
+ OL_METHOD_NAME_FAIL = "get_openlineage_facets_on_failure"
41
+
40
42
 
41
43
  @define
42
44
  class OperatorLineage(Generic[DatasetSubclass, BaseFacetSubclass]):
@@ -81,6 +83,9 @@ class BaseExtractor(ABC, LoggingMixin):
81
83
  def extract_on_complete(self, task_instance) -> OperatorLineage | None:
82
84
  return self.extract()
83
85
 
86
+ def extract_on_failure(self, task_instance) -> OperatorLineage | None:
87
+ return self.extract()
88
+
84
89
 
85
90
  class DefaultExtractor(BaseExtractor):
86
91
  """Extractor that uses `get_openlineage_facets_on_start/complete/failure` methods."""
@@ -96,46 +101,41 @@ class DefaultExtractor(BaseExtractor):
96
101
  return []
97
102
 
98
103
  def _execute_extraction(self) -> OperatorLineage | None:
99
- # OpenLineage methods are optional - if there's no method, return None
100
- try:
101
- self.log.debug(
102
- "Trying to execute `get_openlineage_facets_on_start` for %s.", self.operator.task_type
103
- )
104
- return self._get_openlineage_facets(self.operator.get_openlineage_facets_on_start) # type: ignore
105
- except ImportError:
106
- self.log.error(
107
- "OpenLineage provider method failed to import OpenLineage integration. "
108
- "This should not happen. Please report this bug to developers."
109
- )
110
- return None
111
- except AttributeError:
104
+ method = getattr(self.operator, OL_METHOD_NAME_START, None)
105
+ if callable(method):
112
106
  self.log.debug(
113
- "Operator %s does not have the get_openlineage_facets_on_start method.",
114
- self.operator.task_type,
107
+ "Trying to execute '%s' method of '%s'.", OL_METHOD_NAME_START, self.operator.task_type
115
108
  )
116
- return OperatorLineage()
109
+ return self._get_openlineage_facets(method)
110
+ self.log.debug(
111
+ "Operator '%s' does not have '%s' method.", self.operator.task_type, OL_METHOD_NAME_START
112
+ )
113
+ return OperatorLineage()
117
114
 
118
115
  def extract_on_complete(self, task_instance) -> OperatorLineage | None:
119
- failed_states = [TaskInstanceState.FAILED, TaskInstanceState.UP_FOR_RETRY]
120
- if not AIRFLOW_V_2_10_PLUS: # todo: remove when min airflow version >= 2.10.0
121
- # Before fix (#41053) implemented in Airflow 2.10 TaskInstance's state was still RUNNING when
122
- # being passed to listener's on_failure method. Since `extract_on_complete()` is only called
123
- # after task completion, RUNNING state means that we are dealing with FAILED task in < 2.10
124
- failed_states = [TaskInstanceState.RUNNING]
125
-
126
- if task_instance.state in failed_states:
127
- on_failed = getattr(self.operator, "get_openlineage_facets_on_failure", None)
128
- if on_failed and callable(on_failed):
129
- self.log.debug(
130
- "Executing `get_openlineage_facets_on_failure` for %s.", self.operator.task_type
131
- )
132
- return self._get_openlineage_facets(on_failed, task_instance)
133
- on_complete = getattr(self.operator, "get_openlineage_facets_on_complete", None)
134
- if on_complete and callable(on_complete):
135
- self.log.debug("Executing `get_openlineage_facets_on_complete` for %s.", self.operator.task_type)
136
- return self._get_openlineage_facets(on_complete, task_instance)
116
+ method = getattr(self.operator, OL_METHOD_NAME_COMPLETE, None)
117
+ if callable(method):
118
+ self.log.debug(
119
+ "Trying to execute '%s' method of '%s'.", OL_METHOD_NAME_COMPLETE, self.operator.task_type
120
+ )
121
+ return self._get_openlineage_facets(method, task_instance)
122
+ self.log.debug(
123
+ "Operator '%s' does not have '%s' method.", self.operator.task_type, OL_METHOD_NAME_COMPLETE
124
+ )
137
125
  return self.extract()
138
126
 
127
+ def extract_on_failure(self, task_instance) -> OperatorLineage | None:
128
+ method = getattr(self.operator, OL_METHOD_NAME_FAIL, None)
129
+ if callable(method):
130
+ self.log.debug(
131
+ "Trying to execute '%s' method of '%s'.", OL_METHOD_NAME_FAIL, self.operator.task_type
132
+ )
133
+ return self._get_openlineage_facets(method, task_instance)
134
+ self.log.debug(
135
+ "Operator '%s' does not have '%s' method.", self.operator.task_type, OL_METHOD_NAME_FAIL
136
+ )
137
+ return self.extract_on_complete(task_instance)
138
+
139
139
  def _get_openlineage_facets(self, get_facets_method, *args) -> OperatorLineage | None:
140
140
  try:
141
141
  facets: OperatorLineage = get_facets_method(*args)
@@ -153,5 +153,5 @@ class DefaultExtractor(BaseExtractor):
153
153
  "This should not happen."
154
154
  )
155
155
  except Exception:
156
- self.log.warning("OpenLineage provider method failed to extract data from provider. ")
156
+ self.log.warning("OpenLineage provider method failed to extract data from provider.")
157
157
  return None
@@ -24,7 +24,11 @@ from airflow.providers.common.compat.openlineage.utils.utils import (
24
24
  )
25
25
  from airflow.providers.openlineage import conf
26
26
  from airflow.providers.openlineage.extractors import BaseExtractor, OperatorLineage
27
- from airflow.providers.openlineage.extractors.base import DefaultExtractor
27
+ from airflow.providers.openlineage.extractors.base import (
28
+ OL_METHOD_NAME_COMPLETE,
29
+ OL_METHOD_NAME_START,
30
+ DefaultExtractor,
31
+ )
28
32
  from airflow.providers.openlineage.extractors.bash import BashExtractor
29
33
  from airflow.providers.openlineage.extractors.python import PythonExtractor
30
34
  from airflow.providers.openlineage.utils.utils import (
@@ -32,6 +36,7 @@ from airflow.providers.openlineage.utils.utils import (
32
36
  try_import_from_string,
33
37
  )
34
38
  from airflow.utils.log.logging_mixin import LoggingMixin
39
+ from airflow.utils.state import TaskInstanceState
35
40
 
36
41
  if TYPE_CHECKING:
37
42
  from openlineage.client.event_v2 import Dataset
@@ -87,7 +92,9 @@ class ExtractorManager(LoggingMixin):
87
92
  def add_extractor(self, operator_class: str, extractor: type[BaseExtractor]):
88
93
  self.extractors[operator_class] = extractor
89
94
 
90
- def extract_metadata(self, dagrun, task, complete: bool = False, task_instance=None) -> OperatorLineage:
95
+ def extract_metadata(
96
+ self, dagrun, task, task_instance_state: TaskInstanceState, task_instance=None
97
+ ) -> OperatorLineage:
91
98
  extractor = self._get_extractor(task)
92
99
  task_info = (
93
100
  f"task_type={task.task_type} "
@@ -104,10 +111,15 @@ class ExtractorManager(LoggingMixin):
104
111
  extractor.__class__.__name__,
105
112
  str(task_info),
106
113
  )
107
- if complete:
108
- task_metadata = extractor.extract_on_complete(task_instance)
109
- else:
114
+ if task_instance_state == TaskInstanceState.RUNNING:
110
115
  task_metadata = extractor.extract()
116
+ elif task_instance_state == TaskInstanceState.FAILED:
117
+ if callable(getattr(extractor, "extract_on_failure", None)):
118
+ task_metadata = extractor.extract_on_failure(task_instance)
119
+ else:
120
+ task_metadata = extractor.extract_on_complete(task_instance)
121
+ else:
122
+ task_metadata = extractor.extract_on_complete(task_instance)
111
123
 
112
124
  self.log.debug(
113
125
  "Found task metadata for operation %s: %s",
@@ -155,13 +167,9 @@ class ExtractorManager(LoggingMixin):
155
167
  return self.extractors[task.task_type]
156
168
 
157
169
  def method_exists(method_name):
158
- method = getattr(task, method_name, None)
159
- if method:
160
- return callable(method)
170
+ return callable(getattr(task, method_name, None))
161
171
 
162
- if method_exists("get_openlineage_facets_on_start") or method_exists(
163
- "get_openlineage_facets_on_complete"
164
- ):
172
+ if method_exists(OL_METHOD_NAME_START) or method_exists(OL_METHOD_NAME_COMPLETE):
165
173
  return self.default_extractor
166
174
  return None
167
175
 
@@ -191,7 +199,8 @@ class ExtractorManager(LoggingMixin):
191
199
  if d:
192
200
  task_metadata.outputs.append(d)
193
201
 
194
- def get_hook_lineage(self) -> tuple[list[Dataset], list[Dataset]] | None:
202
+ @staticmethod
203
+ def get_hook_lineage() -> tuple[list[Dataset], list[Dataset]] | None:
195
204
  try:
196
205
  from airflow.providers.common.compat.lineage.hook import (
197
206
  get_hook_lineage_collector,
@@ -27,8 +27,9 @@ def get_provider_info():
27
27
  "name": "OpenLineage Airflow",
28
28
  "description": "`OpenLineage <https://openlineage.io/>`__\n",
29
29
  "state": "ready",
30
- "source-date-epoch": 1741509355,
30
+ "source-date-epoch": 1742478177,
31
31
  "versions": [
32
+ "2.1.2",
32
33
  "2.1.1",
33
34
  "2.1.0",
34
35
  "2.0.0",
@@ -85,7 +85,7 @@ class OpenLineageAdapter(LoggingMixin):
85
85
  if config:
86
86
  self.log.debug(
87
87
  "OpenLineage configuration found. Transport type: `%s`",
88
- config.get("type", "no type provided"),
88
+ config.get("transport", {}).get("type", "no type provided"),
89
89
  )
90
90
  self._client = OpenLineageClient(config=config) # type: ignore[call-arg]
91
91
  else:
@@ -69,13 +69,15 @@ def _get_try_number_success(val):
69
69
 
70
70
  def _executor_initializer():
71
71
  """
72
- Initialize worker processes for the executor used for DagRun listener.
72
+ Initialize processes for the executor used with DAGRun listener's methods (on scheduler).
73
73
 
74
74
  This function must be picklable, so it cannot be defined as an inner method or local function.
75
75
 
76
76
  Reconfigures the ORM engine to prevent issues that arise when multiple processes interact with
77
77
  the Airflow database.
78
78
  """
79
+ # This initializer is used only on the scheduler
80
+ # We can configure_orm regardless of the Airflow version, as DB access is always allowed from scheduler.
79
81
  settings.configure_orm()
80
82
 
81
83
 
@@ -199,7 +201,9 @@ class OpenLineageListener:
199
201
  operator_name = task.task_type.lower()
200
202
 
201
203
  with Stats.timer(f"ol.extract.{event_type}.{operator_name}"):
202
- task_metadata = self.extractor_manager.extract_metadata(dagrun, task)
204
+ task_metadata = self.extractor_manager.extract_metadata(
205
+ dagrun=dagrun, task=task, task_instance_state=TaskInstanceState.RUNNING
206
+ )
203
207
 
204
208
  redacted_event = self.adapter.start_task(
205
209
  run_id=task_uuid,
@@ -302,7 +306,10 @@ class OpenLineageListener:
302
306
 
303
307
  with Stats.timer(f"ol.extract.{event_type}.{operator_name}"):
304
308
  task_metadata = self.extractor_manager.extract_metadata(
305
- dagrun, task, complete=True, task_instance=task_instance
309
+ dagrun=dagrun,
310
+ task=task,
311
+ task_instance_state=TaskInstanceState.SUCCESS,
312
+ task_instance=task_instance,
306
313
  )
307
314
 
308
315
  redacted_event = self.adapter.complete_task(
@@ -423,7 +430,10 @@ class OpenLineageListener:
423
430
 
424
431
  with Stats.timer(f"ol.extract.{event_type}.{operator_name}"):
425
432
  task_metadata = self.extractor_manager.extract_metadata(
426
- dagrun, task, complete=True, task_instance=task_instance
433
+ dagrun=dagrun,
434
+ task=task,
435
+ task_instance_state=TaskInstanceState.FAILED,
436
+ task_instance=task_instance,
427
437
  )
428
438
 
429
439
  redacted_event = self.adapter.fail_task(
@@ -481,7 +491,8 @@ class OpenLineageListener:
481
491
  self.log.debug("Process with pid %s finished - parent", pid)
482
492
  else:
483
493
  setproctitle(getproctitle() + " - OpenLineage - " + callable_name)
484
- configure_orm(disable_connection_pool=True)
494
+ if not AIRFLOW_V_3_0_PLUS:
495
+ configure_orm(disable_connection_pool=True)
485
496
  self.log.debug("Executing OpenLineage process - %s - pid %s", callable_name, os.getpid())
486
497
  callable()
487
498
  self.log.debug("Process with current pid finishes after %s", callable_name)
@@ -21,6 +21,7 @@ from typing import TYPE_CHECKING
21
21
  from airflow.providers.openlineage import conf
22
22
  from airflow.providers.openlineage.plugins.adapter import OpenLineageAdapter
23
23
  from airflow.providers.openlineage.utils.utils import get_job_name
24
+ from airflow.providers.openlineage.version_compat import AIRFLOW_V_3_0_PLUS
24
25
 
25
26
  if TYPE_CHECKING:
26
27
  from airflow.models import TaskInstance
@@ -58,15 +59,25 @@ def lineage_run_id(task_instance: TaskInstance):
58
59
  For more information take a look at the guide:
59
60
  :ref:`howto/macros:openlineage`
60
61
  """
61
- if hasattr(task_instance, "logical_date"):
62
- logical_date = task_instance.logical_date
62
+ if AIRFLOW_V_3_0_PLUS:
63
+ context = task_instance.get_template_context()
64
+ if hasattr(task_instance, "dag_run"):
65
+ dag_run = task_instance.dag_run
66
+ elif hasattr(context, "dag_run"):
67
+ dag_run = context["dag_run"]
68
+ if hasattr(dag_run, "logical_date") and dag_run.logical_date:
69
+ date = dag_run.logical_date
70
+ else:
71
+ date = dag_run.run_after
72
+ elif hasattr(task_instance, "logical_date"):
73
+ date = task_instance.logical_date
63
74
  else:
64
- logical_date = task_instance.execution_date
75
+ date = task_instance.execution_date
65
76
  return OpenLineageAdapter.build_task_instance_run_id(
66
77
  dag_id=task_instance.dag_id,
67
78
  task_id=task_instance.task_id,
68
79
  try_number=task_instance.try_number,
69
- logical_date=logical_date,
80
+ logical_date=date,
70
81
  map_index=task_instance.map_index,
71
82
  )
72
83
 
@@ -53,35 +53,73 @@ def _get_parent_job_information_as_spark_properties(context: Context) -> dict:
53
53
 
54
54
  def _get_transport_information_as_spark_properties() -> dict:
55
55
  """Retrieve transport information as Spark properties."""
56
- transport = get_openlineage_listener().adapter.get_or_create_openlineage_client().transport
57
- if transport.kind != "http":
58
- log.info(
59
- "OpenLineage transport type `%s` does not support automatic "
60
- "injection of OpenLineage transport information into Spark properties.",
61
- transport.kind,
62
- )
63
- return {}
64
-
65
- properties = {
66
- "spark.openlineage.transport.type": transport.kind,
67
- "spark.openlineage.transport.url": transport.url,
68
- "spark.openlineage.transport.endpoint": transport.endpoint,
69
- "spark.openlineage.transport.timeoutInMillis": str(
70
- int(transport.timeout * 1000) # convert to milliseconds, as required by Spark integration
71
- ),
72
- }
73
- if transport.compression:
74
- properties["spark.openlineage.transport.compression"] = str(transport.compression)
75
56
 
76
- if hasattr(transport.config.auth, "api_key") and transport.config.auth.get_bearer():
77
- properties["spark.openlineage.transport.auth.type"] = "api_key"
78
- properties["spark.openlineage.transport.auth.apiKey"] = transport.config.auth.get_bearer()
57
+ def _get_transport_information(tp) -> dict:
58
+ properties = {
59
+ "type": tp.kind,
60
+ "url": tp.url,
61
+ "endpoint": tp.endpoint,
62
+ "timeoutInMillis": str(
63
+ int(tp.timeout) * 1000 # convert to milliseconds, as required by Spark integration
64
+ ),
65
+ }
66
+ if hasattr(tp, "compression") and tp.compression:
67
+ properties["compression"] = str(tp.compression)
68
+
69
+ if hasattr(tp.config.auth, "api_key") and tp.config.auth.get_bearer():
70
+ properties["auth.type"] = "api_key"
71
+ properties["auth.apiKey"] = tp.config.auth.get_bearer()
72
+
73
+ if hasattr(tp.config, "custom_headers") and tp.config.custom_headers:
74
+ for key, value in tp.config.custom_headers.items():
75
+ properties[f"headers.{key}"] = value
76
+ return properties
77
+
78
+ def _format_transport(props: dict, transport: dict, name: str | None):
79
+ for key, value in transport.items():
80
+ if name:
81
+ props[f"spark.openlineage.transport.transports.{name}.{key}"] = value
82
+ else:
83
+ props[f"spark.openlineage.transport.{key}"] = value
84
+ return props
79
85
 
80
- if hasattr(transport.config, "custom_headers") and transport.config.custom_headers:
81
- for key, value in transport.config.custom_headers.items():
82
- properties[f"spark.openlineage.transport.headers.{key}"] = value
86
+ transport = get_openlineage_listener().adapter.get_or_create_openlineage_client().transport
83
87
 
84
- return properties
88
+ if transport.kind == "composite":
89
+ http_transports = {}
90
+ for nested_transport in transport.transports:
91
+ if nested_transport.kind == "http":
92
+ http_transports[nested_transport.name] = _get_transport_information(nested_transport)
93
+ else:
94
+ name = nested_transport.name if hasattr(nested_transport, "name") else "no-name"
95
+ log.info(
96
+ "OpenLineage transport type `%s` with name `%s` is not supported in composite transport.",
97
+ nested_transport.kind,
98
+ name,
99
+ )
100
+ if len(http_transports) == 0:
101
+ log.warning(
102
+ "OpenLineage transport type `composite` does not contain http transport. Skipping "
103
+ "injection of OpenLineage transport information into Spark properties.",
104
+ )
105
+ return {}
106
+ props = {
107
+ "spark.openlineage.transport.type": "composite",
108
+ "spark.openlineage.transport.continueOnFailure": str(transport.config.continue_on_failure),
109
+ }
110
+ for name, http_transport in http_transports.items():
111
+ props = _format_transport(props, http_transport, name)
112
+ return props
113
+
114
+ elif transport.kind == "http":
115
+ return _format_transport({}, _get_transport_information(transport), None)
116
+
117
+ log.info(
118
+ "OpenLineage transport type `%s` does not support automatic "
119
+ "injection of OpenLineage transport information into Spark properties.",
120
+ transport.kind,
121
+ )
122
+ return {}
85
123
 
86
124
 
87
125
  def _is_parent_job_information_present_in_spark_properties(properties: dict) -> bool:
@@ -213,13 +213,7 @@ def is_ti_rescheduled_already(ti: TaskInstance, session=NEW_SESSION):
213
213
 
214
214
  return (
215
215
  session.query(
216
- exists().where(
217
- TaskReschedule.dag_id == ti.dag_id,
218
- TaskReschedule.task_id == ti.task_id,
219
- TaskReschedule.run_id == ti.run_id,
220
- TaskReschedule.map_index == ti.map_index,
221
- TaskReschedule.try_number == ti.try_number,
222
- )
216
+ exists().where(TaskReschedule.ti_id == ti.id, TaskReschedule.try_number == ti.try_number)
223
217
  ).scalar()
224
218
  is True
225
219
  )
@@ -369,8 +363,19 @@ class DagRunInfo(InfoJsonEncodable):
369
363
  "run_id",
370
364
  "run_type",
371
365
  "start_date",
366
+ "end_date",
372
367
  ]
373
368
 
369
+ casts = {"duration": lambda dagrun: DagRunInfo.duration(dagrun)}
370
+
371
+ @classmethod
372
+ def duration(cls, dagrun: DagRun) -> float | None:
373
+ if not getattr(dagrun, "end_date", None) or not isinstance(dagrun.end_date, datetime.datetime):
374
+ return None
375
+ if not getattr(dagrun, "start_date", None) or not isinstance(dagrun.start_date, datetime.datetime):
376
+ return None
377
+ return (dagrun.end_date - dagrun.start_date).total_seconds()
378
+
374
379
 
375
380
  class TaskInstanceInfo(InfoJsonEncodable):
376
381
  """Defines encoding TaskInstance object to JSON."""