apache-airflow-providers-openlineage 1.3.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of apache-airflow-providers-openlineage might be problematic. Click here for more details.

@@ -0,0 +1,199 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+ from __future__ import annotations
18
+
19
+ import os
20
+ from contextlib import suppress
21
+ from typing import TYPE_CHECKING, Iterator
22
+
23
+ from airflow.configuration import conf
24
+ from airflow.providers.openlineage.extractors import BaseExtractor, OperatorLineage
25
+ from airflow.providers.openlineage.extractors.base import DefaultExtractor
26
+ from airflow.providers.openlineage.extractors.bash import BashExtractor
27
+ from airflow.providers.openlineage.extractors.python import PythonExtractor
28
+ from airflow.providers.openlineage.plugins.facets import (
29
+ UnknownOperatorAttributeRunFacet,
30
+ UnknownOperatorInstance,
31
+ )
32
+ from airflow.providers.openlineage.utils.utils import get_filtered_unknown_operator_keys
33
+ from airflow.utils.log.logging_mixin import LoggingMixin
34
+ from airflow.utils.module_loading import import_string
35
+
36
+ if TYPE_CHECKING:
37
+ from airflow.models import Operator
38
+
39
+
40
def try_import_from_string(string):
    """Import the object named by a dotted path, or return ``None`` on failure.

    :param string: dotted import path handed to ``import_string``.
    :return: whatever ``import_string`` returns, or ``None`` when it raises
        ``ImportError``. Any other exception propagates to the caller.
    """
    try:
        imported = import_string(string)
    except ImportError:
        # Best-effort import: callers receive None instead of an exception.
        return None
    return imported
43
+
44
+
45
def _iter_extractor_types() -> Iterator[type[BaseExtractor]]:
    """Yield the built-in extractor classes that are available.

    ``PythonExtractor`` is yielded before ``BashExtractor``; a name that is
    bound to ``None`` is skipped.
    """
    for candidate in (PythonExtractor, BashExtractor):
        if candidate is not None:
            yield candidate
50
+
51
+
52
class ExtractorManager(LoggingMixin):
    """Class abstracting management of custom extractors.

    The manager keeps a mapping from operator class name (``task.task_type``)
    to the extractor class responsible for it, instantiates the matching
    extractor for a task and turns its result into an ``OperatorLineage``.
    """

    def __init__(self):
        """Register the built-in extractors, then any user-configured ones."""
        super().__init__()
        self.extractors: dict[str, type[BaseExtractor]] = {}
        self.default_extractor = DefaultExtractor

        # Built-in extractors are registered first, so a custom extractor that
        # declares the same operator class name replaces the built-in entry.
        for extractor in _iter_extractor_types():
            for operator_class in extractor.get_operator_classnames():
                self.extractors[operator_class] = extractor

        # Semicolon-separated import paths of custom extractors, taken from the
        # ``[openlineage] extractors`` option with the OPENLINEAGE_EXTRACTORS
        # environment variable as fallback.
        # Extractors should implement BaseExtractor
        env_extractors = conf.get("openlineage", "extractors", fallback=os.getenv("OPENLINEAGE_EXTRACTORS"))
        # skip either when it's empty string or None
        if env_extractors:
            for extractor_path in env_extractors.split(";"):
                extractor_path = extractor_path.strip()
                if not extractor_path:
                    # Tolerate empty entries, e.g. produced by a trailing ";".
                    continue
                custom_extractor: type[BaseExtractor] | None = try_import_from_string(extractor_path)
                if custom_extractor is None:
                    # try_import_from_string swallows ImportError and returns None;
                    # without this guard the attribute access below would raise
                    # AttributeError and abort construction of the manager.
                    self.log.warning(
                        "Unable to import custom OpenLineage extractor %s; it will be ignored.",
                        extractor_path,
                    )
                    continue
                for operator_class in custom_extractor.get_operator_classnames():
                    self.extractors[operator_class] = custom_extractor

    def add_extractor(self, operator_class: str, extractor: type[BaseExtractor]):
        """Register ``extractor`` for ``operator_class``, replacing any existing entry."""
        self.extractors[operator_class] = extractor

    def extract_metadata(self, dagrun, task, complete: bool = False, task_instance=None) -> OperatorLineage:
        """Produce lineage metadata for ``task``.

        :param dagrun: DAG run the task belongs to; only ``run_id`` is read, for logging.
        :param task: operator to extract lineage from.
        :param complete: when true, ``extract_on_complete(task_instance)`` is called on
            the extractor instead of ``extract()``.
        :param task_instance: passed to ``extract_on_complete`` when ``complete`` is true.
        :return: the validated extractor result; or, when no extractor exists, an
            ``OperatorLineage`` carrying an ``unknownSourceAttribute`` run facet plus
            manually declared inlets/outlets; or an empty ``OperatorLineage`` when the
            extractor failed or returned invalid metadata.
        """
        extractor = self._get_extractor(task)
        task_info = (
            f"task_type={task.task_type} "
            f"airflow_dag_id={task.dag_id} "
            f"task_id={task.task_id} "
            f"airflow_run_id={dagrun.run_id} "
        )

        if extractor:
            # Extracting advanced metadata is only possible when extractor for particular operator
            # is defined. Without it, we can't extract any input or output data.
            try:
                self.log.debug("Using extractor %s %s", extractor.__class__.__name__, str(task_info))
                if complete:
                    task_metadata = extractor.extract_on_complete(task_instance)
                else:
                    task_metadata = extractor.extract()

                self.log.debug("Found task metadata for operation %s: %s", task.task_id, str(task_metadata))
                task_metadata = self.validate_task_metadata(task_metadata)
                if task_metadata:
                    # Fall back to manually declared inlets/outlets only when the
                    # extractor itself reported no datasets at all.
                    if (not task_metadata.inputs) and (not task_metadata.outputs):
                        self.extract_inlets_and_outlets(task_metadata, task.inlets, task.outlets)

                    return task_metadata

            except Exception as e:
                # Lineage extraction is best-effort: log and fall through to the
                # empty OperatorLineage returned at the end of the method.
                self.log.warning(
                    "Failed to extract metadata using found extractor %s - %s %s", extractor, e, task_info
                )
        else:
            self.log.debug("Unable to find an extractor %s", task_info)

            # Only include the unknownSourceAttribute facet if there is no extractor
            task_metadata = OperatorLineage(
                run_facets={
                    "unknownSourceAttribute": UnknownOperatorAttributeRunFacet(
                        unknownItems=[
                            UnknownOperatorInstance(
                                name=task.task_type,
                                properties=get_filtered_unknown_operator_keys(task),
                            )
                        ]
                    )
                },
            )
            inlets = task.get_inlet_defs()
            outlets = task.get_outlet_defs()
            self.extract_inlets_and_outlets(task_metadata, inlets, outlets)
            return task_metadata

        # Reached when an extractor exists but raised or returned invalid metadata.
        return OperatorLineage()

    def get_extractor_class(self, task: Operator) -> type[BaseExtractor] | None:
        """Return the extractor class to use for ``task``, or ``None`` if there is none.

        A class registered under ``task.task_type`` wins. Otherwise the default
        extractor is used when the task exposes a callable
        ``get_openlineage_facets_on_start`` or ``get_openlineage_facets_on_complete``.
        """
        if task.task_type in self.extractors:
            return self.extractors[task.task_type]

        def method_exists(method_name):
            # Truthy only when the attribute is present, truthy and callable.
            method = getattr(task, method_name, None)
            if method:
                return callable(method)

        if method_exists("get_openlineage_facets_on_start") or method_exists(
            "get_openlineage_facets_on_complete"
        ):
            return self.default_extractor
        return None

    def _get_extractor(self, task: Operator) -> BaseExtractor | None:
        """Instantiate the extractor for ``task``, or return ``None`` when none applies."""
        # TODO: Re-enable in Extractor PR
        # self.instantiate_abstract_extractors(task)
        extractor = self.get_extractor_class(task)
        self.log.debug("extractor for %s is %s", task.task_type, extractor)
        if extractor:
            return extractor(task)
        return None

    def extract_inlets_and_outlets(
        self,
        task_metadata: OperatorLineage,
        inlets: list,
        outlets: list,
    ):
        """Append convertible inlets/outlets to ``task_metadata`` in place.

        Objects that ``convert_to_ol_dataset`` cannot convert are skipped.
        ``None`` is accepted for either argument and treated as empty.
        """
        if inlets or outlets:
            self.log.debug("Manually extracting lineage metadata from inlets and outlets")
        # ``or ()`` guards against one side being None while the other is populated.
        for i in inlets or ():
            d = self.convert_to_ol_dataset(i)
            if d:
                task_metadata.inputs.append(d)
        for o in outlets or ():
            d = self.convert_to_ol_dataset(o)
            if d:
                task_metadata.outputs.append(d)

    @staticmethod
    def convert_to_ol_dataset(obj):
        """Convert ``obj`` to an OpenLineage ``Dataset``.

        :return: ``obj`` itself when it already is a ``Dataset``; a new ``Dataset``
            built from ``cluster``, ``database`` and ``name`` when it is an Airflow
            lineage ``Table``; ``None`` for anything else.
        """
        from openlineage.client.run import Dataset

        from airflow.lineage.entities import Table

        if isinstance(obj, Dataset):
            return obj
        elif isinstance(obj, Table):
            return Dataset(
                namespace=f"{obj.cluster}",
                name=f"{obj.database}.{obj.name}",
                facets={},
            )
        else:
            return None

    def validate_task_metadata(self, task_metadata) -> OperatorLineage | None:
        """Copy the extractor result into a fresh ``OperatorLineage``.

        :return: the new ``OperatorLineage``, or ``None`` (after logging a warning)
            when ``task_metadata`` lacks one of the ``inputs``, ``outputs``,
            ``run_facets`` or ``job_facets`` attributes.
        """
        try:
            return OperatorLineage(
                inputs=task_metadata.inputs,
                outputs=task_metadata.outputs,
                run_facets=task_metadata.run_facets,
                job_facets=task_metadata.job_facets,
            )
        except AttributeError:
            self.log.warning("Extractor returns non-valid metadata: %s", task_metadata)
            return None
@@ -0,0 +1,97 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ from __future__ import annotations
19
+
20
+ import inspect
21
+ from typing import Callable
22
+
23
+ from openlineage.client.facet import SourceCodeJobFacet
24
+
25
+ from airflow.providers.openlineage.extractors.base import BaseExtractor, OperatorLineage
26
+ from airflow.providers.openlineage.plugins.facets import (
27
+ UnknownOperatorAttributeRunFacet,
28
+ UnknownOperatorInstance,
29
+ )
30
+ from airflow.providers.openlineage.utils.utils import (
31
+ get_filtered_unknown_operator_keys,
32
+ is_source_enabled,
33
+ )
34
+
35
+ """
36
+ :meta private:
37
+ """
38
+
39
+
40
class PythonExtractor(BaseExtractor):
    """
    Extractor that reports the source code run by a ``PythonOperator``.

    The callable's source is placed into a ``SourceCodeJobFacet`` so the
    emitted event shows what the task executes. No datasets are extracted yet.

    :meta private:
    """

    @classmethod
    def get_operator_classnames(cls) -> list[str]:
        """Return the operator class names this extractor is registered for."""
        handled = ["PythonOperator"]
        return handled

    def _execute_extraction(self) -> OperatorLineage | None:
        """Build lineage holding the optional source-code facet and the unknown-operator facet."""
        code = self.get_source_code(self.operator.python_callable)

        # The source facet is attached only when source reporting is enabled
        # and some source text could actually be obtained.
        if is_source_enabled() and code:
            job_facet: dict = {
                "sourceCode": SourceCodeJobFacet(
                    language="python",
                    # We're on worker and should have access to DAG files
                    source=code,
                )
            }
        else:
            job_facet = {}

        # The PythonOperator is recorded as an "unknownSource" even though we have an
        # extractor, as the data lineage cannot be determined from the operator
        # directly.
        unknown_instance = UnknownOperatorInstance(
            name="PythonOperator",
            properties=get_filtered_unknown_operator_keys(self.operator),
        )
        unknown_facet = UnknownOperatorAttributeRunFacet(unknownItems=[unknown_instance])

        return OperatorLineage(
            job_facets=job_facet,
            run_facets={"unknownSourceAttribute": unknown_facet},
        )

    def get_source_code(self, callable: Callable) -> str | None:
        """Return the source text of ``callable``.

        Falls back to ``str(callable)`` when ``inspect.getsource`` raises
        ``TypeError``, and to ``None`` (after a warning) when it raises ``OSError``.
        """
        try:
            source = inspect.getsource(callable)
        except TypeError:
            # Trying to extract source code of builtin_function_or_method
            source = str(callable)
        except OSError:
            self.log.warning(
                "Can't get source code facet of PythonOperator %s",
                self.operator.task_id,
            )
            source = None
        return source

    def extract(self) -> OperatorLineage | None:
        """Delegate to the parent class's ``extract``."""
        lineage = super().extract()
        return lineage
@@ -0,0 +1,110 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ # NOTE! THIS FILE IS AUTOMATICALLY GENERATED AND WILL BE
19
+ # OVERWRITTEN WHEN PREPARING PACKAGES.
20
+ #
21
+ # IF YOU WANT TO MODIFY THIS FILE, YOU SHOULD MODIFY THE TEMPLATE
22
+ # `get_provider_info_TEMPLATE.py.jinja2` IN the `dev/breeze/src/airflow_breeze/templates` DIRECTORY
23
+
24
+
25
def get_provider_info():
    """Return the static metadata dictionary describing this provider package.

    The result holds the package identity, released versions, dependencies,
    integrations, plugins and the ``[openlineage]`` configuration section.
    A new dictionary is built on every call.
    """
    # Individual ``[openlineage]`` options, assembled in declaration order below.
    disabled = {
        "description": "Set this to true if you don't want OpenLineage to emit events.\n",
        "type": "boolean",
        "example": None,
        "default": "False",
        "version_added": None,
    }
    disabled_for_operators = {
        "description": "Semicolon separated string of Airflow Operator names to disable\n",
        "type": "string",
        "example": "airflow.operators.bash.BashOperator;airflow.operators.python.PythonOperator",
        "default": "",
        "version_added": "1.1.0",
    }
    namespace = {
        "description": "OpenLineage namespace\n",
        "version_added": None,
        "type": "string",
        "example": "food_delivery",
        "default": None,
    }
    extractors = {
        "description": "Semicolon separated paths to custom OpenLineage extractors.\n",
        "type": "string",
        "example": "full.path.to.ExtractorClass;full.path.to.AnotherExtractorClass",
        "default": None,
        "version_added": None,
    }
    config_path = {
        "description": "Path to YAML config. This provides backwards compatibility to pass config as\n`openlineage.yml` file.\n",
        "version_added": None,
        "type": "string",
        "example": None,
        "default": "",
    }
    transport = {
        "description": "OpenLineage Client transport configuration. It should contain type\nand additional options per each type.\n\nCurrently supported types are:\n\n * HTTP\n * Kafka\n * Console\n",
        "type": "string",
        "example": '{"type": "http", "url": "http://localhost:5000"}',
        "default": "",
        "version_added": None,
    }
    disable_source_code = {
        "description": "If disabled, OpenLineage events do not contain source code of particular\noperators, like PythonOperator.\n",
        "default": None,
        "example": None,
        "type": "boolean",
        "version_added": None,
    }

    openlineage_section = {
        "description": "This section applies settings for OpenLineage integration.\nFor backwards compatibility with `openlineage-python` one can still use\n`openlineage.yml` file or `OPENLINEAGE_` environment variables. However, below\nconfiguration takes precedence over those.\nMore in documentation - https://openlineage.io/docs/client/python#configuration.\n",
        "options": {
            "disabled": disabled,
            "disabled_for_operators": disabled_for_operators,
            "namespace": namespace,
            "extractors": extractors,
            "config_path": config_path,
            "transport": transport,
            "disable_source_code": disable_source_code,
        },
    }

    integration = {
        "integration-name": "OpenLineage",
        "external-doc-url": "https://openlineage.io",
        "logo": "/integration-logos/openlineage/openlineage.svg",
        "tags": ["protocol"],
    }
    plugin = {
        "name": "openlineage",
        "plugin-class": "airflow.providers.openlineage.plugins.openlineage.OpenLineageProviderPlugin",
    }

    provider_info = {
        "package-name": "apache-airflow-providers-openlineage",
        "name": "OpenLineage Airflow",
        "description": "`OpenLineage <https://openlineage.io/>`__\n",
        "suspended": False,
        "source-date-epoch": 1703288155,
        "versions": ["1.3.1", "1.3.0", "1.2.1", "1.2.0", "1.1.1", "1.1.0", "1.0.2", "1.0.1", "1.0.0"],
        "dependencies": [
            "apache-airflow>=2.7.0",
            "apache-airflow-providers-common-sql>=1.6.0",
            "attrs>=22.2",
            "openlineage-integration-common>=0.28.0",
            "openlineage-python>=0.28.0",
        ],
        "integrations": [integration],
        "plugins": [plugin],
        "config": {"openlineage": openlineage_section},
    }
    return provider_info
@@ -0,0 +1,16 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.