acryl_datahub_airflow_plugin-1.3.1.5-py3-none-any.whl → acryl_datahub_airflow_plugin-1.3.1.5rc1-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
- acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info/METADATA +91 -0
- acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info/RECORD +33 -0
- datahub_airflow_plugin/_airflow_shims.py +31 -64
- datahub_airflow_plugin/_config.py +19 -97
- datahub_airflow_plugin/_datahub_ol_adapter.py +2 -14
- datahub_airflow_plugin/_extractors.py +365 -0
- datahub_airflow_plugin/_version.py +1 -1
- datahub_airflow_plugin/client/airflow_generator.py +43 -147
- datahub_airflow_plugin/datahub_listener.py +790 -19
- datahub_airflow_plugin/example_dags/__init__.py +0 -32
- datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +4 -12
- datahub_airflow_plugin/hooks/datahub.py +2 -11
- datahub_airflow_plugin/operators/datahub.py +3 -20
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +0 -303
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +0 -65
- datahub_airflow_plugin/_airflow_compat.py +0 -32
- datahub_airflow_plugin/_airflow_version_specific.py +0 -184
- datahub_airflow_plugin/_constants.py +0 -16
- datahub_airflow_plugin/airflow2/__init__.py +0 -6
- datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +0 -402
- datahub_airflow_plugin/airflow2/_airflow_compat.py +0 -95
- datahub_airflow_plugin/airflow2/_extractors.py +0 -477
- datahub_airflow_plugin/airflow2/_legacy_shims.py +0 -20
- datahub_airflow_plugin/airflow2/_openlineage_compat.py +0 -123
- datahub_airflow_plugin/airflow2/_provider_shims.py +0 -29
- datahub_airflow_plugin/airflow2/_shims.py +0 -88
- datahub_airflow_plugin/airflow2/datahub_listener.py +0 -1072
- datahub_airflow_plugin/airflow3/__init__.py +0 -6
- datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +0 -408
- datahub_airflow_plugin/airflow3/_airflow_compat.py +0 -108
- datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +0 -153
- datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +0 -273
- datahub_airflow_plugin/airflow3/_shims.py +0 -82
- datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +0 -88
- datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +0 -308
- datahub_airflow_plugin/airflow3/datahub_listener.py +0 -1452
- datahub_airflow_plugin/example_dags/airflow2/__init__.py +0 -8
- datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +0 -54
- datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +0 -43
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +0 -69
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +0 -69
- datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +0 -81
- datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +0 -68
- datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +0 -99
- datahub_airflow_plugin/example_dags/airflow3/__init__.py +0 -8
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +0 -51
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +0 -51
- datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +0 -89
- {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info}/top_level.txt +0 -0
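A wheel is an ordinary zip archive, so a file-level listing like the one above can be reproduced locally once both wheels have been downloaded (for example with `pip download`). A minimal sketch, assuming the two wheel files sit in the current directory under their standard names:

```python
# Sketch (not part of the package): compare the file listings of two wheels.
# Wheels are plain zip archives, so the standard library is enough.
import zipfile


def wheel_members(path: str) -> set:
    """Return the set of file names contained in a wheel archive."""
    with zipfile.ZipFile(path) as wheel:
        return set(wheel.namelist())


old = wheel_members("acryl_datahub_airflow_plugin-1.3.1.5-py3-none-any.whl")
new = wheel_members("acryl_datahub_airflow_plugin-1.3.1.5rc1-py3-none-any.whl")

for name in sorted(old - new):
    print(f"removed: {name}")
for name in sorted(new - old):
    print(f"added:   {name}")
```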
--- a/datahub_airflow_plugin/airflow2/_airflow_compat.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Airflow 2.x compatibility module
-# This module must be imported before any Airflow imports in any of our files.
-
-from datahub.utilities._markupsafe_compat import MARKUPSAFE_PATCHED
-
-# Critical safety check: Ensure MarkupSafe compatibility patch is applied
-# This must happen before importing Airflow to prevent MarkupSafe version conflicts
-# Using explicit exception instead of assert to ensure it runs even with python -O
-if not MARKUPSAFE_PATCHED:
-    raise RuntimeError(
-        "MarkupSafe compatibility patch must be applied before importing Airflow modules. "
-        "This is a critical safety check that cannot be disabled. "
-        "The patch ensures compatibility between different MarkupSafe versions used by "
-        "Airflow and DataHub dependencies."
-    )
-
-# Apply SQLParser patch for Airflow 2.10+ with apache-airflow-providers-openlineage
-# When using the provider package, SQL operators call SQLParser.generate_openlineage_metadata_from_sql()
-# directly (similar to Airflow 3.x), so we need to patch that method to use DataHub's SQL parser.
-#
-# For legacy openlineage-airflow package (Airflow 2.5-2.9), we use the extractor-based approach
-# in _extractors.py instead.
-import importlib.util
-import logging
-
-logger = logging.getLogger(__name__)
-
-# Check if OpenLineage provider package is available
-# Use try-except because find_spec can raise ModuleNotFoundError if parent module doesn't exist
-try:
-    has_openlineage_provider = (
-        importlib.util.find_spec("airflow.providers.openlineage.sqlparser") is not None
-    )
-except (ModuleNotFoundError, ImportError, ValueError):
-    # Parent module doesn't exist or other import error
-    has_openlineage_provider = False
-
-if has_openlineage_provider:
-    # Provider package detected - apply SQL parser patch
-    from datahub_airflow_plugin.airflow2._airflow2_sql_parser_patch import (
-        patch_sqlparser,
-    )
-
-    patch_sqlparser()
-else:
-    # Provider package not available - using legacy openlineage-airflow package
-    # No patching needed, extractors will handle SQL parsing
-    pass
-
-# Apply operator-specific patches for provider mode
-# These patches work for both Airflow 2.x and 3.x when using OpenLineage provider
-try:
-    from datahub_airflow_plugin._config import get_lineage_config
-
-    config = get_lineage_config()
-    enable_extractors = config.enable_extractors
-    extract_teradata_operator = config.extract_teradata_operator
-except Exception:
-    # If config loading fails, apply patches by default (backward compatibility)
-    enable_extractors = True
-    extract_teradata_operator = True
-
-if enable_extractors and extract_teradata_operator:
-    # TeradataOperator patch - works for both Airflow 2.x provider mode and Airflow 3.x
-    # The patch checks for method existence, so it's safe to import from airflow3 module
-    # Note: We defer the import to avoid potential issues with Airflow 3.x specific imports
-    # in Airflow 2.x environments. The patch function itself handles version compatibility.
-    import logging
-
-    logger = logging.getLogger(__name__)
-    try:
-        logger.debug("Attempting to import and apply TeradataOperator patch")
-        # Use importlib to safely import the patch module
-        import importlib.util
-
-        patch_module_path = (
-            "datahub_airflow_plugin.airflow3._teradata_openlineage_patch"
-        )
-        patch_module = importlib.import_module(patch_module_path)
-        patch_teradata_operator = patch_module.patch_teradata_operator
-
-        patch_teradata_operator()
-        logger.debug("TeradataOperator patch import and call completed")
-    except ImportError as e:
-        # Teradata provider not installed or patch not available
-        logger.debug(f"Could not import TeradataOperator patch: {e}")
-    except Exception as e:
-        # Log error but don't fail - this is optional functionality
-        logger.warning(f"Error applying TeradataOperator patch: {e}", exc_info=True)
-
-AIRFLOW_PATCHED = True
-
-__all__ = [
-    "AIRFLOW_PATCHED",
-]
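The removed `_airflow_compat.py` above combines two defensive patterns: an import-time guard that raises an explicit `RuntimeError` instead of using `assert` (so it still fires under `python -O`), and feature detection via `importlib.util.find_spec` wrapped in try/except. A standalone sketch of the detection pattern; the `module_available` helper is illustrative, not part of the plugin:

```python
# Sketch of the feature-detection pattern used in the removed _airflow_compat.py:
# probe for an optional dependency without importing it, guarding against
# find_spec itself raising when a parent package is missing.
import importlib.util


def module_available(dotted_name: str) -> bool:
    """Return True if dotted_name resolves to an importable module."""
    try:
        return importlib.util.find_spec(dotted_name) is not None
    except (ModuleNotFoundError, ImportError, ValueError):
        # find_spec raises ModuleNotFoundError when a parent package
        # (e.g. "airflow.providers") is not installed.
        return False


# Mirrors the branch in the removed module: pick a patching strategy based on
# whether the OpenLineage provider package is present.
if module_available("airflow.providers.openlineage.sqlparser"):
    print("provider package detected: patch SQLParser directly")
else:
    print("legacy openlineage-airflow: rely on extractor-based SQL parsing")
```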
--- a/datahub_airflow_plugin/airflow2/_extractors.py
+++ /dev/null
@@ -1,477 +0,0 @@
-import contextlib
-import logging
-import sys
-import unittest.mock
-from typing import TYPE_CHECKING, Any, Dict, Optional
-
-from openlineage.client.facet import (
-    ExtractionError,
-    ExtractionErrorRunFacet,
-    SqlJobFacet,
-)
-
-import datahub.emitter.mce_builder as builder
-from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
-    get_platform_from_sqlalchemy_uri,
-)
-from datahub.sql_parsing.sqlglot_lineage import (
-    SqlParsingResult,
-    create_lineage_sql_parsed_result,
-)
-from datahub_airflow_plugin._constants import SQL_PARSING_RESULT_KEY
-from datahub_airflow_plugin._datahub_ol_adapter import OL_SCHEME_TWEAKS
-from datahub_airflow_plugin.airflow2._openlineage_compat import (
-    USE_OPENLINEAGE_PROVIDER,
-    BaseExtractor,
-    OLExtractorManager,
-    OperatorLineage,
-    SnowflakeExtractor,
-    SqlExtractor,
-    TaskMetadata,
-    get_operator_class,
-    try_import_from_string,
-)
-from datahub_airflow_plugin.airflow2._shims import Operator
-
-if TYPE_CHECKING:
-    from airflow.models import DagRun, TaskInstance
-
-    from datahub.ingestion.graph.client import DataHubGraph
-
-    # For type checking, define a union type that covers both versions
-    if sys.version_info >= (3, 10):
-        from typing import TypeAlias
-    else:
-        from typing_extensions import TypeAlias
-
-    # Define proper type aliases for the union type
-    # Note: BaseExtractor, OLExtractorManager, etc. are already imported above at runtime
-    from typing import Union
-
-    ExtractResult: TypeAlias = Union[
-        Any, Any
-    ]  # Will be TaskMetadata or OperatorLineage at runtime
-
-logger = logging.getLogger(__name__)
-_DATAHUB_GRAPH_CONTEXT_KEY = "datahub_graph"
-
-# Runtime type alias for the return type of extract() methods
-if not TYPE_CHECKING:
-    if USE_OPENLINEAGE_PROVIDER:
-        ExtractResult = OperatorLineage
-    else:
-        ExtractResult = TaskMetadata
-
-
-class ExtractorManager(OLExtractorManager):
-    # TODO: On Airflow 2.7, the OLExtractorManager is part of the built-in Airflow API.
-    # When available, we should use that instead. The same goe for most of the OL
-    # extractors.
-
-    def __init__(
-        self,
-        patch_sql_parser: bool = True,
-        patch_snowflake_schema: bool = True,
-        extract_athena_operator: bool = True,
-        extract_bigquery_insert_job_operator: bool = True,
-        extract_teradata_operator: bool = True,
-    ):
-        super().__init__()
-
-        # Store patch/extractor configuration
-        self._patch_sql_parser = patch_sql_parser
-        self._patch_snowflake_schema = patch_snowflake_schema
-        self._extract_athena_operator = extract_athena_operator
-        self._extract_bigquery_insert_job_operator = (
-            extract_bigquery_insert_job_operator
-        )
-        self._extract_teradata_operator = extract_teradata_operator
-
-        # Legacy OpenLineage has task_to_extractor attribute, OpenLineage Provider doesn't
-        # Register custom extractors only for Legacy OpenLineage (Provider has its own)
-        if not USE_OPENLINEAGE_PROVIDER:
-            _sql_operator_overrides = [
-                # The OL BigQuery extractor has some complex logic to fetch detect
-                # the BigQuery job_id and fetch lineage from there. However, it can't
-                # generate CLL, so we disable it and use our own extractor instead.
-                "BigQueryOperator",
-                "BigQueryExecuteQueryOperator",
-                # Athena also does something similar.
-                "AWSAthenaOperator",
-                # Additional types that OL doesn't support. This is only necessary because
-                # on older versions of Airflow, these operators don't inherit from SQLExecuteQueryOperator.
-                "SqliteOperator",
-            ]
-            for operator in _sql_operator_overrides:
-                self.task_to_extractor.extractors[operator] = GenericSqlExtractor  # type: ignore[attr-defined]
-
-            # Register custom extractors based on configuration
-            if self._extract_athena_operator:
-                self.task_to_extractor.extractors["AthenaOperator"] = (  # type: ignore[attr-defined]
-                    AthenaOperatorExtractor
-                )
-
-            if self._extract_bigquery_insert_job_operator:
-                self.task_to_extractor.extractors["BigQueryInsertJobOperator"] = (  # type: ignore[attr-defined]
-                    BigQueryInsertJobOperatorExtractor
-                )
-
-            if self._extract_teradata_operator:
-                self.task_to_extractor.extractors["TeradataOperator"] = (
-                    TeradataOperatorExtractor
-                )
-
-        self._graph: Optional["DataHubGraph"] = None
-
-    @contextlib.contextmanager
-    def _patch_extractors(self):
-        with contextlib.ExitStack() as stack:
-            # Patch the SqlExtractor.extract() method if configured and available
-            if self._patch_sql_parser and SqlExtractor is not None:
-                stack.enter_context(
-                    unittest.mock.patch.object(
-                        SqlExtractor,
-                        "extract",
-                        _sql_extractor_extract,
-                    )
-                )
-
-            # Patch the SnowflakeExtractor.default_schema property if configured and available
-            if self._patch_snowflake_schema and SnowflakeExtractor is not None:
-                stack.enter_context(
-                    unittest.mock.patch.object(
-                        SnowflakeExtractor,
-                        "default_schema",
-                        property(_snowflake_default_schema),
-                    )
-                )
-
-            yield
-
-    def extract_metadata(  # type: ignore[override]
-        self,
-        dagrun: "DagRun",
-        task: "Operator",
-        complete: bool = False,
-        task_instance: Optional["TaskInstance"] = None,
-        task_uuid: Optional[str] = None,
-        graph: Optional["DataHubGraph"] = None,
-    ) -> ExtractResult:
-        self._graph = graph
-        with self._patch_extractors():
-            if USE_OPENLINEAGE_PROVIDER:
-                # OpenLineage Provider: Does not have task_uuid parameter
-                # In Airflow 3.x, the 'complete' parameter type changed from bool to TaskInstanceState
-                return super().extract_metadata(dagrun, task, complete, task_instance)  # type: ignore[call-arg,arg-type]
-            else:
-                # Legacy OpenLineage: Has task_uuid parameter
-                return super().extract_metadata(  # type: ignore[call-arg,arg-type]
-                    dagrun,
-                    task,
-                    complete,  # type: ignore[arg-type]
-                    task_instance,
-                    task_uuid,
-                )
-
-    def _get_extractor(self, task: "Operator") -> Optional[BaseExtractor]:
-        # For Legacy OpenLineage: Register GenericSqlExtractor as fallback for
-        # any operator that inherits from SQLExecuteQueryOperator.
-        # For OpenLineage Provider: Rely on SQLParser patch approach instead.
-        if not USE_OPENLINEAGE_PROVIDER:
-            clazz = get_operator_class(task)  # type: ignore[arg-type]
-            SQLExecuteQueryOperator = try_import_from_string(
-                "airflow.providers.common.sql.operators.sql.SQLExecuteQueryOperator"
-            )
-            if SQLExecuteQueryOperator and issubclass(clazz, SQLExecuteQueryOperator):
-                # Legacy OpenLineage: Register GenericSqlExtractor in task_to_extractor.extractors
-                self.task_to_extractor.extractors.setdefault(  # type: ignore[attr-defined]
-                    clazz.__name__, GenericSqlExtractor
-                )
-
-        extractor = super()._get_extractor(task)
-
-        # For OpenLineage Provider: If no extractor was found, check if this is a SQL operator
-        # that should use GenericSqlExtractor (e.g., SqliteOperator which provider doesn't support)
-        if (
-            USE_OPENLINEAGE_PROVIDER
-            and extractor is None
-            and GenericSqlExtractor is not None
-        ):
-            clazz = get_operator_class(task)  # type: ignore[arg-type]
-            # Check if this is SqliteOperator (provider doesn't have an extractor for it)
-            if clazz.__name__ == "SqliteOperator":
-                # Create a GenericSqlExtractor instance for this operator
-                extractor = GenericSqlExtractor(task)  # type: ignore[call-arg]
-
-        if extractor and not USE_OPENLINEAGE_PROVIDER:
-            # set_context only exists in Legacy OpenLineage
-            extractor.set_context(_DATAHUB_GRAPH_CONTEXT_KEY, self._graph)  # type: ignore[attr-defined]
-        return extractor
-
-
-if SqlExtractor is not None:
-
-    class GenericSqlExtractor(SqlExtractor):  # type: ignore
-        # Note that the extract() method is patched elsewhere.
-
-        @property
-        def default_schema(self):
-            return super().default_schema
-
-        def _get_scheme(self) -> Optional[str]:
-            # Best effort conversion to DataHub platform names.
-
-            with contextlib.suppress(Exception):
-                if self.hook:
-                    if hasattr(self.hook, "get_uri"):
-                        uri = self.hook.get_uri()
-                        return get_platform_from_sqlalchemy_uri(uri)
-
-            return self.conn.conn_type or super().dialect
-
-        def _get_database(self) -> Optional[str]:
-            if self.conn:
-                # For BigQuery, the "database" is the project name.
-                if hasattr(self.conn, "project_id"):
-                    return self.conn.project_id
-
-                return self.conn.schema
-            return None
-
-else:
-    # SqlExtractor is not available (OpenLineage Provider package)
-    GenericSqlExtractor = None  # type: ignore
-
-
-def _sql_extractor_extract(self: "SqlExtractor") -> Optional[ExtractResult]:
-    # Why not override the OL sql_parse method directly, instead of overriding
-    # extract()? A few reasons:
-    #
-    # 1. We would want to pass the default_db and graph instance into our sql parser
-    # method. The OL code doesn't pass the default_db (despite having it available),
-    # and it's not clear how to get the graph instance into that method.
-    # 2. OL has some janky logic to fetch table schemas as part of the sql extractor.
-    # We don't want that behavior and this lets us disable it.
-    # 3. Our SqlParsingResult already has DataHub urns, whereas using SqlMeta would
-    # require us to convert those urns to OL uris, just for them to get converted
-    # back to urns later on in our processing.
-
-    task_name = f"{self.operator.dag_id}.{self.operator.task_id}"
-    sql = self.operator.sql
-
-    default_database = getattr(self.operator, "database", None)
-    if not default_database:
-        default_database = self.database
-    default_schema = self.default_schema
-
-    # TODO: Add better handling for sql being a list of statements.
-    if isinstance(sql, list):
-        logger.info(f"Got list of SQL statements for {task_name}. Using first one.")
-        sql = sql[0]
-
-    # Run the SQL parser.
-    scheme = self.scheme
-    platform = OL_SCHEME_TWEAKS.get(scheme, scheme)
-
-    return _parse_sql_into_task_metadata(
-        self,
-        sql,
-        platform=platform,
-        default_database=default_database,
-        default_schema=default_schema,
-    )
-
-
-def _normalize_sql(sql: str) -> str:
-    """Normalize SQL for logging (strip extra whitespace)"""
-    if SqlExtractor is not None and hasattr(SqlExtractor, "_normalize_sql"):
-        return SqlExtractor._normalize_sql(sql)
-    # Fallback normalization
-    return " ".join(sql.split())
-
-
-def _create_lineage_metadata(
-    task_name: str,
-    run_facets: Dict[str, Any],
-    job_facets: Dict[str, Any],
-) -> Optional[ExtractResult]:
-    """Create TaskMetadata (Legacy OpenLineage) or OperatorLineage (OpenLineage Provider)"""
-    if USE_OPENLINEAGE_PROVIDER:
-        # OpenLineage Provider: Return OperatorLineage (no name field)
-        return OperatorLineage(  # type: ignore
-            inputs=[],
-            outputs=[],
-            run_facets=run_facets,
-            job_facets=job_facets,
-        )
-    else:
-        # Legacy OpenLineage: Return TaskMetadata (with name field)
-        return TaskMetadata(  # type: ignore
-            name=task_name,
-            inputs=[],
-            outputs=[],
-            run_facets=run_facets,
-            job_facets=job_facets,
-        )
-
-
-def _parse_sql_into_task_metadata(
-    self: "BaseExtractor",
-    sql: str,
-    platform: str,
-    default_database: Optional[str],
-    default_schema: Optional[str],
-) -> Optional[ExtractResult]:
-    task_name = f"{self.operator.dag_id}.{self.operator.task_id}"
-
-    run_facets = {}
-    job_facets = {"sql": SqlJobFacet(query=_normalize_sql(sql))}
-
-    # Get graph from context (Legacy OpenLineage only)
-    graph = None
-    if hasattr(self, "context"):
-        graph = self.context.get(_DATAHUB_GRAPH_CONTEXT_KEY, None)  # type: ignore[attr-defined]
-
-    self.log.debug(
-        "Running the SQL parser %s (platform=%s, default db=%s, schema=%s): %s",
-        "with graph client" if graph else "in offline mode",
-        platform,
-        default_database,
-        default_schema,
-        sql,
-    )
-    sql_parsing_result: SqlParsingResult = create_lineage_sql_parsed_result(
-        query=sql,
-        graph=graph,
-        platform=platform,
-        platform_instance=None,
-        env=builder.DEFAULT_ENV,
-        default_db=default_database,
-        default_schema=default_schema,
-    )
-    self.log.debug(f"Got sql lineage {sql_parsing_result}")
-
-    if sql_parsing_result.debug_info.error:
-        error = sql_parsing_result.debug_info.error
-        run_facets["extractionError"] = ExtractionErrorRunFacet(
-            totalTasks=1,
-            failedTasks=1,
-            errors=[
-                ExtractionError(
-                    errorMessage=str(error),
-                    stackTrace=None,
-                    task="datahub_sql_parser",
-                    taskNumber=None,
-                )
-            ],
-        )
-
-    # Save sql_parsing_result to the facets dict. It is removed from the
-    # facet dict in the extractor's processing logic.
-    run_facets[SQL_PARSING_RESULT_KEY] = sql_parsing_result  # type: ignore
-
-    return _create_lineage_metadata(task_name, run_facets, job_facets)
-
-
-class BigQueryInsertJobOperatorExtractor(BaseExtractor):
-    def extract(self) -> Optional[ExtractResult]:
-        from airflow.providers.google.cloud.operators.bigquery import (
-            BigQueryInsertJobOperator,  # type: ignore
-        )
-
-        operator: "BigQueryInsertJobOperator" = self.operator
-        sql = operator.configuration.get("query", {}).get("query")
-        if not sql:
-            self.log.warning("No query found in BigQueryInsertJobOperator")
-            return None
-
-        destination_table = operator.configuration.get("query", {}).get(
-            "destinationTable"
-        )
-        destination_table_urn = None
-        if destination_table:
-            project_id = destination_table.get("projectId")
-            dataset_id = destination_table.get("datasetId")
-            table_id = destination_table.get("tableId")
-
-            if project_id and dataset_id and table_id:
-                destination_table_urn = builder.make_dataset_urn(
-                    platform="bigquery",
-                    name=f"{project_id}.{dataset_id}.{table_id}",
-                    env=builder.DEFAULT_ENV,
-                )
-
-        task_metadata = _parse_sql_into_task_metadata(
-            self,
-            sql,
-            platform="bigquery",
-            default_database=operator.project_id,
-            default_schema=None,
-        )
-
-        if destination_table_urn and task_metadata:
-            sql_parsing_result = task_metadata.run_facets.get(SQL_PARSING_RESULT_KEY)
-            if sql_parsing_result and isinstance(sql_parsing_result, SqlParsingResult):
-                sql_parsing_result.out_tables.append(destination_table_urn)
-
-        return task_metadata
-
-
-class AthenaOperatorExtractor(BaseExtractor):
-    def extract(self) -> Optional[ExtractResult]:
-        from airflow.providers.amazon.aws.operators.athena import (
-            AthenaOperator,  # type: ignore
-        )
-
-        operator: "AthenaOperator" = self.operator
-        sql = operator.query
-        if not sql:
-            self.log.warning("No query found in AthenaOperator")
-            return None
-
-        return _parse_sql_into_task_metadata(
-            self,
-            sql,
-            platform="athena",
-            default_database=None,
-            default_schema=self.operator.database,
-        )
-
-
-def _snowflake_default_schema(self: "SnowflakeExtractor") -> Optional[str]:
-    if hasattr(self.operator, "schema") and self.operator.schema is not None:
-        return self.operator.schema
-    return (
-        self.conn.extra_dejson.get("extra__snowflake__schema", "")
-        or self.conn.extra_dejson.get("schema", "")
-        or self.conn.schema
-    )
-    # TODO: Should we try a fallback of:
-    # execute_query_on_hook(self.hook, "SELECT current_schema();")[0][0]

-    # execute_query_on_hook(self.hook, "SELECT current_schema();")
-
-
-class TeradataOperatorExtractor(BaseExtractor):
-    """Extractor for Teradata SQL operations.
-
-    Extracts lineage from TeradataOperator tasks by parsing the SQL queries
-    and understanding Teradata's two-tier database.table naming convention.
-    """
-
-    def extract(self) -> Optional[ExtractResult]:
-        from airflow.providers.teradata.operators.teradata import TeradataOperator
-
-        operator: "TeradataOperator" = self.operator
-        sql = operator.sql
-        if not sql:
-            self.log.warning("No query found in TeradataOperator")
-            return None
-
-        return _parse_sql_into_task_metadata(
-            self,
-            sql,
-            platform="teradata",
-            default_database=None,
-            default_schema=None,
-        )
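The pivotal technique in the removed `_extractors.py` is `_patch_extractors()`: monkeypatches are applied with `unittest.mock.patch.object` via a `contextlib.ExitStack`, so they are active only for the duration of a single `extract_metadata()` call and are unwound automatically, even on error. A self-contained toy version of that scoping pattern (the `Parser` class and both function names are illustrative, not from the plugin):

```python
# Toy illustration of the scoped-monkeypatch technique used by
# ExtractorManager._patch_extractors: the patch is live only inside
# the `with` block and is reverted automatically afterwards.
import contextlib
import unittest.mock


class Parser:  # stand-in for an upstream class such as SqlExtractor
    def extract(self) -> str:
        return "upstream implementation"


def _patched_extract(self) -> str:
    return "datahub implementation"


@contextlib.contextmanager
def patched_parsers():
    with contextlib.ExitStack() as stack:
        # Each enter_context() registers one patch to be reverted on exit;
        # more patches can be added conditionally, as the real code does.
        stack.enter_context(
            unittest.mock.patch.object(Parser, "extract", _patched_extract)
        )
        yield


with patched_parsers():
    assert Parser().extract() == "datahub implementation"
assert Parser().extract() == "upstream implementation"
```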
--- a/datahub_airflow_plugin/airflow2/_legacy_shims.py
+++ /dev/null
@@ -1,20 +0,0 @@
-"""
-Shims for legacy openlineage-airflow package.
-This module is used when openlineage-airflow is installed (Airflow 2.x with legacy OpenLineage).
-"""
-
-from openlineage.airflow.listener import TaskHolder
-from openlineage.airflow.plugin import OpenLineagePlugin
-from openlineage.airflow.utils import (
-    get_operator_class,
-    redact_with_exclusions,
-    try_import_from_string,
-)
-
-__all__ = [
-    "TaskHolder",
-    "OpenLineagePlugin",
-    "get_operator_class",
-    "redact_with_exclusions",
-    "try_import_from_string",
-]
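The removed `_legacy_shims.py` is a pure re-export module: it pins the import locations of legacy `openlineage-airflow` symbols in one place so the rest of the plugin never imports that package directly. Its sibling `_openlineage_compat.py` (also deleted, contents not shown in this diff) evidently resolves the provider-versus-legacy choice behind the `USE_OPENLINEAGE_PROVIDER` flag imported by `_extractors.py`. A hedged sketch of how such a dual-source shim is commonly structured; the import paths and `None` fallbacks below are assumptions, not the actual module contents:

```python
# Hypothetical sketch of a provider-vs-legacy compatibility shim, in the
# spirit of the removed _openlineage_compat.py (not shown in this diff).
try:
    # Newer stack: apache-airflow-providers-openlineage
    from airflow.providers.openlineage.extractors import (
        BaseExtractor,
        OperatorLineage,
    )

    TaskMetadata = None  # legacy-only type, absent in provider mode
    USE_OPENLINEAGE_PROVIDER = True
except ImportError:
    # Older stack: standalone openlineage-airflow package
    from openlineage.airflow.extractors.base import BaseExtractor, TaskMetadata

    OperatorLineage = None  # provider-only type, absent in legacy mode
    USE_OPENLINEAGE_PROVIDER = False

__all__ = [
    "USE_OPENLINEAGE_PROVIDER",
    "BaseExtractor",
    "OperatorLineage",
    "TaskMetadata",
]
```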