acryl-datahub-airflow-plugin: 1.3.1.3rc2 → 1.3.1.5 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +303 -0
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +65 -0
- datahub_airflow_plugin/_airflow_compat.py +32 -0
- datahub_airflow_plugin/_airflow_shims.py +64 -31
- datahub_airflow_plugin/_airflow_version_specific.py +184 -0
- datahub_airflow_plugin/_config.py +97 -19
- datahub_airflow_plugin/_constants.py +16 -0
- datahub_airflow_plugin/_datahub_ol_adapter.py +14 -2
- datahub_airflow_plugin/_version.py +1 -1
- datahub_airflow_plugin/airflow2/__init__.py +6 -0
- datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +402 -0
- datahub_airflow_plugin/airflow2/_airflow_compat.py +95 -0
- datahub_airflow_plugin/airflow2/_extractors.py +477 -0
- datahub_airflow_plugin/airflow2/_legacy_shims.py +20 -0
- datahub_airflow_plugin/airflow2/_openlineage_compat.py +123 -0
- datahub_airflow_plugin/airflow2/_provider_shims.py +29 -0
- datahub_airflow_plugin/airflow2/_shims.py +88 -0
- datahub_airflow_plugin/airflow2/datahub_listener.py +1072 -0
- datahub_airflow_plugin/airflow3/__init__.py +6 -0
- datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +408 -0
- datahub_airflow_plugin/airflow3/_airflow_compat.py +108 -0
- datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +153 -0
- datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +273 -0
- datahub_airflow_plugin/airflow3/_shims.py +82 -0
- datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +88 -0
- datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +308 -0
- datahub_airflow_plugin/airflow3/datahub_listener.py +1452 -0
- datahub_airflow_plugin/client/airflow_generator.py +147 -43
- datahub_airflow_plugin/datahub_listener.py +19 -790
- datahub_airflow_plugin/example_dags/__init__.py +32 -0
- datahub_airflow_plugin/example_dags/airflow2/__init__.py +8 -0
- datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +54 -0
- datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +43 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +69 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +69 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +81 -0
- datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +68 -0
- datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +99 -0
- datahub_airflow_plugin/example_dags/airflow3/__init__.py +8 -0
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +51 -0
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +51 -0
- datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +89 -0
- datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +12 -4
- datahub_airflow_plugin/hooks/datahub.py +11 -2
- datahub_airflow_plugin/operators/datahub.py +20 -3
- acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info/METADATA +0 -90
- acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info/RECORD +0 -33
- datahub_airflow_plugin/_extractors.py +0 -336
- {acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/WHEEL +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/top_level.txt +0 -0
datahub_airflow_plugin/_airflow_version_specific.py (new file)

@@ -0,0 +1,184 @@
+"""
+Version-specific utilities for Airflow 2 vs 3 compatibility.
+This module provides clean abstractions for version-specific behavior.
+"""
+
+import logging
+from typing import TYPE_CHECKING, Dict
+
+import airflow
+import packaging.version
+
+if TYPE_CHECKING:
+    from airflow.models import TaskInstance
+
+logger = logging.getLogger(__name__)
+
+# Version detection
+AIRFLOW_VERSION = packaging.version.parse(airflow.__version__)
+IS_AIRFLOW_3_OR_HIGHER = AIRFLOW_VERSION >= packaging.version.parse("3.0.0")
+
+
+def _get_duration_attribute(ti: "TaskInstance") -> Dict[str, str]:
+    """
+    Extract duration attribute, calculating it if necessary.
+
+    Airflow 2.x has duration as a direct attribute.
+    Airflow 3.x requires calculation from end_date - start_date.
+    """
+    if hasattr(ti, "duration"):
+        return {"duration": str(ti.duration)}
+
+    if (
+        hasattr(ti, "end_date")
+        and ti.end_date
+        and hasattr(ti, "start_date")
+        and ti.start_date
+    ):
+        try:
+            duration_seconds = (ti.end_date - ti.start_date).total_seconds()
+            return {"duration": str(duration_seconds)}
+        except Exception as e:
+            logger.debug(f"Could not calculate duration: {e}")
+
+    return {}
+
+
+def _get_operator_attribute(ti: "TaskInstance") -> Dict[str, str]:
+    """
+    Extract operator name in a version-compatible way.
+
+    In Airflow 2.x: Available as database column attribute ti.operator
+    In Airflow 3.x (RuntimeTaskInstance): Must extract from ti.task.__class__.__name__
+    """
+    if hasattr(ti, "operator"):
+        operator_from_db = str(ti.operator)
+        logger.debug(
+            f"Operator from ti.operator (DB): {operator_from_db}, "
+            f"hasattr task: {hasattr(ti, 'task')}, "
+            f"task class: {ti.task.__class__.__name__ if hasattr(ti, 'task') and ti.task else 'N/A'}"
+        )
+        return {"operator": operator_from_db}
+
+    if hasattr(ti, "task") and ti.task is not None:
+        try:
+            return {"operator": ti.task.__class__.__name__}
+        except Exception as e:
+            logger.debug(f"Could not get operator name from task: {e}")
+
+    return {}
+
+
+def _get_date_attributes(ti: "TaskInstance") -> Dict[str, str]:
+    """
+    Extract date-related attributes.
+
+    Handles execution_date -> logical_date rename in Airflow 3.0.
+    """
+    attributes = {}
+
+    if hasattr(ti, "end_date"):
+        attributes["end_date"] = str(ti.end_date)
+
+    if hasattr(ti, "execution_date"):
+        attributes["execution_date"] = str(ti.execution_date)
+    elif hasattr(ti, "logical_date"):
+        attributes["logical_date"] = str(ti.logical_date)
+
+    return attributes
+
+
+def get_task_instance_attributes(ti: "TaskInstance") -> Dict[str, str]:
+    """
+    Extract attributes from a TaskInstance in a version-compatible way.
+
+    Airflow 3.0 introduced RuntimeTaskInstance which has different attributes
+    than Airflow 2.x TaskInstance.
+
+    Returns a dict of attribute name -> string value.
+    """
+    attributes = {}
+
+    # Common attributes (both Airflow 2 and 3)
+    if hasattr(ti, "run_id"):
+        attributes["run_id"] = str(ti.run_id)
+    if hasattr(ti, "start_date") and ti.start_date:
+        attributes["start_date"] = str(ti.start_date)
+    if hasattr(ti, "try_number"):
+        attributes["try_number"] = str(ti.try_number - 1)
+    if hasattr(ti, "state"):
+        attributes["state"] = str(ti.state)
+    if hasattr(ti, "task_id"):
+        attributes["task_id"] = str(ti.task_id)
+    if hasattr(ti, "dag_id"):
+        attributes["dag_id"] = str(ti.dag_id)
+
+    # Complex extractions via helper functions
+    attributes.update(_get_duration_attribute(ti))
+    attributes.update(_get_date_attributes(ti))
+    attributes.update(_get_operator_attribute(ti))
+
+    # Optional attributes
+    if hasattr(ti, "max_tries"):
+        attributes["max_tries"] = str(ti.max_tries)
+    if hasattr(ti, "external_executor_id"):
+        attributes["external_executor_id"] = str(ti.external_executor_id)
+    if hasattr(ti, "priority_weight"):
+        attributes["priority_weight"] = str(ti.priority_weight)
+    if hasattr(ti, "log_url"):
+        attributes["log_url"] = ti.log_url
+
+    return attributes
+
+
+def get_airflow_compatible_dag_kwargs(**kwargs):  # type: ignore[no-untyped-def]
+    """
+    Get DAG kwargs that are compatible with current Airflow version.
+
+    Handles differences between Airflow 2.x and 3.x:
+    - schedule_interval -> schedule in Airflow 3.0
+    - default_view removed in Airflow 3.0
+    - start_date handling
+    """
+    compatible_kwargs = kwargs.copy()
+
+    if IS_AIRFLOW_3_OR_HIGHER:
+        # Airflow 3.0 renamed schedule_interval to schedule
+        if "schedule_interval" in compatible_kwargs:
+            compatible_kwargs["schedule"] = compatible_kwargs.pop("schedule_interval")
+
+        # Airflow 3.0 removed default_view
+        if "default_view" in compatible_kwargs:
+            del compatible_kwargs["default_view"]
+
+    return compatible_kwargs  # type: ignore[no-any-return]
+
+
+def days_ago(n: int):  # type: ignore[no-untyped-def]
+    """
+    Compatibility helper for days_ago which was removed in Airflow 3.0.
+
+    In Airflow 2.x, use airflow.utils.dates.days_ago()
+    In Airflow 3.0, use datetime.datetime - datetime.timedelta
+    """
+    from datetime import datetime, timedelta, timezone
+
+    if IS_AIRFLOW_3_OR_HIGHER:
+        # Airflow 3.0: use datetime directly
+        return datetime.now(timezone.utc) - timedelta(days=n)
+    else:
+        # Airflow 2.x: use the official helper
+        from airflow.utils.dates import (  # type: ignore[attr-defined]
+            days_ago as airflow_days_ago,
+        )
+
+        return airflow_days_ago(n)  # type: ignore[no-any-return]
+
+
+__all__ = [
+    "AIRFLOW_VERSION",
+    "IS_AIRFLOW_3_OR_HIGHER",
+    "get_task_instance_attributes",
+    "get_airflow_compatible_dag_kwargs",
+    "days_ago",
+]
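
For orientation, here is a minimal sketch (not part of the package; the DAG id, schedule, and dates are made up) of how the two public helpers above are intended to be combined in a DAG file that loads on both Airflow 2.x and 3.x:

from airflow import DAG

from datahub_airflow_plugin._airflow_version_specific import (
    days_ago,
    get_airflow_compatible_dag_kwargs,
)

# On Airflow 2.x the kwargs pass through unchanged; on Airflow 3.x the helper
# renames schedule_interval to schedule and drops the removed default_view kwarg.
dag_kwargs = get_airflow_compatible_dag_kwargs(
    dag_id="example_version_agnostic_dag",  # hypothetical DAG id
    schedule_interval="@daily",
    default_view="tree",
    start_date=days_ago(2),
    catchup=False,
)

with DAG(**dag_kwargs):
    pass  # tasks would be defined here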
datahub_airflow_plugin/_config.py

@@ -1,12 +1,12 @@
 from enum import Enum
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, List, Optional, Union
 
 from airflow.configuration import conf
-from pydantic import
-from pydantic.fields import Field
+from pydantic import Field
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub_airflow_plugin._airflow_version_specific import IS_AIRFLOW_3_OR_HIGHER
 
 if TYPE_CHECKING:
     from datahub_airflow_plugin.hooks.datahub import (
@@ -18,16 +18,15 @@ if TYPE_CHECKING:
 class DatajobUrl(Enum):
     GRID = "grid"
     TASKINSTANCE = "taskinstance"
+    TASKS = "tasks"  # Airflow 3.x task URL format: /dags/{dag_id}/tasks/{task_id}
 
 
 class DatahubLineageConfig(ConfigModel):
     enabled: bool
 
-    # DataHub hook connection ID.
+    # DataHub hook connection ID (can be comma-separated for multiple connections).
     datahub_conn_id: str
 
-    _datahub_connection_ids: List[str]
-
     # Cluster to associate with the pipelines and tasks. Defaults to "prod".
     cluster: str
 
@@ -53,6 +52,32 @@ class DatahubLineageConfig(ConfigModel):
 
     enable_extractors: bool
 
+    # OpenLineage extractor patching/override controls (only apply when enable_extractors=True)
+    # These allow fine-grained control over DataHub's enhancements to OpenLineage extractors
+
+    # If true (default), patch SqlExtractor to use DataHub's SQL parser
+    # This enables column-level lineage extraction from SQL queries
+    # Works with both Legacy OpenLineage and OpenLineage Provider
+    patch_sql_parser: bool
+
+    # If true (default), patch SnowflakeExtractor's default_schema property
+    # Fixes schema detection issues in Snowflake operators
+    # Works with both Legacy OpenLineage and OpenLineage Provider
+    patch_snowflake_schema: bool
+
+    # If true (default), use DataHub's custom AthenaOperatorExtractor
+    # Provides better Athena lineage with DataHub's SQL parser
+    # Only applies to Legacy OpenLineage (OpenLineage Provider has its own)
+    extract_athena_operator: bool
+
+    # If true (default), use DataHub's custom BigQueryInsertJobOperatorExtractor
+    # Handles BigQuery job configuration and destination tables
+    # Only applies to Legacy OpenLineage (OpenLineage Provider has its own)
+    extract_bigquery_insert_job_operator: bool
+
+    # If true (default) use DataHub's custom TeradataOperator
+    extract_teradata_operator: bool
+
     # If true, ti.render_templates() will be called in the listener.
     # Makes extraction of jinja-templated fields more accurate.
     render_templates: bool
@@ -69,6 +94,17 @@ class DatahubLineageConfig(ConfigModel):
 
     disable_openlineage_plugin: bool
 
+    @property
+    def _datahub_connection_ids(self) -> List[str]:
+        """
+        Parse comma-separated connection IDs into a list.
+
+        This is implemented as a property to avoid the class variable pollution
+        bug that would occur with validators. Each instance computes its own
+        connection ID list from its datahub_conn_id field.
+        """
+        return [conn_id.strip() for conn_id in self.datahub_conn_id.split(",")]
+
     def make_emitter_hook(self) -> Union["DatahubGenericHook", "DatahubCompositeHook"]:
         # This is necessary to avoid issues with circular imports.
         from datahub_airflow_plugin.hooks.datahub import (
@@ -76,18 +112,11 @@ class DatahubLineageConfig(ConfigModel):
             DatahubGenericHook,
         )
 
-
-
+        connection_ids = self._datahub_connection_ids
+        if len(connection_ids) == 1:
+            return DatahubGenericHook(connection_ids[0])
         else:
-            return DatahubCompositeHook(
-
-    @root_validator(skip_on_failure=True)
-    def split_conn_ids(cls, values: Dict) -> Dict:
-        if not values.get("datahub_conn_id"):
-            raise ValueError("datahub_conn_id is required")
-        conn_ids = values.get("datahub_conn_id", "").split(",")
-        cls._datahub_connection_ids = [conn_id.strip() for conn_id in conn_ids]
-        return values
+            return DatahubCompositeHook(connection_ids)
 
 
 def get_lineage_config() -> DatahubLineageConfig:
@@ -107,14 +136,58 @@ def get_lineage_config() -> DatahubLineageConfig:
     capture_executions = conf.get("datahub", "capture_executions", fallback=True)
     materialize_iolets = conf.get("datahub", "materialize_iolets", fallback=True)
     enable_extractors = conf.get("datahub", "enable_extractors", fallback=True)
+
+    # OpenLineage extractor patching/override configuration
+    # These only apply when enable_extractors=True
+    patch_sql_parser = conf.get("datahub", "patch_sql_parser", fallback=True)
+    patch_snowflake_schema = conf.get(
+        "datahub", "patch_snowflake_schema", fallback=True
+    )
+    extract_athena_operator = conf.get(
+        "datahub", "extract_athena_operator", fallback=True
+    )
+    extract_bigquery_insert_job_operator = conf.get(
+        "datahub", "extract_bigquery_insert_job_operator", fallback=True
+    )
+    extract_teradata_operator = conf.get(
+        "datahub", "extract_teradata_operator", fallback=True
+    )
+
     log_level = conf.get("datahub", "log_level", fallback=None)
     debug_emitter = conf.get("datahub", "debug_emitter", fallback=False)
+
+    # Disable OpenLineage plugin by default (disable_openlineage_plugin=True) for all versions.
+    # This is the safest default since most DataHub users only want DataHub's lineage.
+    #
+    # When disable_openlineage_plugin=True (default):
+    # - Only DataHub plugin runs (OpenLineagePlugin.listeners are cleared if present)
+    # - In Airflow 3: SQLParser calls only DataHub's enhanced parser
+    # - In Airflow 2: DataHub uses its own extractors
+    # - DataHub gets enhanced parsing with column-level lineage
+    #
+    # When disable_openlineage_plugin=False (opt-in for dual plugin mode):
+    # - Both DataHub and OpenLineage plugins run side-by-side
+    # - In Airflow 3: SQLParser calls BOTH parsers
+    # - OpenLineage plugin uses its own parsing results (inputs/outputs)
+    # - DataHub extracts its enhanced parsing (with column-level lineage) from run_facets
+    # - Both plugins get their expected parsing without interference
+    # - In Airflow 2: Not recommended - may cause conflicts
+    default_disable_openlineage = True
+
     disable_openlineage_plugin = conf.get(
-        "datahub", "disable_openlineage_plugin", fallback=
+        "datahub", "disable_openlineage_plugin", fallback=default_disable_openlineage
     )
     render_templates = conf.get("datahub", "render_templates", fallback=True)
+
+    # Use new task URL format for Airflow 3.x, old taskinstance format for Airflow 2.x
+    # Airflow 3 changed URL structure: /dags/{dag_id}/tasks/{task_id} instead of /taskinstance/list/...
+    default_datajob_url = (
+        DatajobUrl.TASKS.value
+        if IS_AIRFLOW_3_OR_HIGHER
+        else DatajobUrl.TASKINSTANCE.value
+    )
     datajob_url_link = conf.get(
-        "datahub", "datajob_url_link", fallback=
+        "datahub", "datajob_url_link", fallback=default_datajob_url
     )
     dag_filter_pattern = AllowDenyPattern.model_validate_json(
        conf.get("datahub", "dag_filter_str", fallback='{"allow": [".*"]}')
@@ -132,6 +205,11 @@ def get_lineage_config() -> DatahubLineageConfig:
         capture_executions=capture_executions,
         materialize_iolets=materialize_iolets,
         enable_extractors=enable_extractors,
+        patch_sql_parser=patch_sql_parser,
+        patch_snowflake_schema=patch_snowflake_schema,
+        extract_athena_operator=extract_athena_operator,
+        extract_bigquery_insert_job_operator=extract_bigquery_insert_job_operator,
+        extract_teradata_operator=extract_teradata_operator,
         log_level=log_level,
         debug_emitter=debug_emitter,
         disable_openlineage_plugin=disable_openlineage_plugin,
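
Two points in the config changes are worth illustrating. First, every new toggle (patch_sql_parser, patch_snowflake_schema, extract_athena_operator, extract_bigquery_insert_job_operator, extract_teradata_operator) is read from the [datahub] section of airflow.cfg with fallback=True, so upgrades keep the previous behavior unless a deployment opts out. Second, the root_validator that mutated a class-level _datahub_connection_ids list is replaced by a per-instance property; a standalone sketch of its splitting behavior (with hypothetical connection IDs) is:

from typing import List


def split_conn_ids(datahub_conn_id: str) -> List[str]:
    # Mirrors DatahubLineageConfig._datahub_connection_ids: split on commas and
    # strip surrounding whitespace so "a, b" and "a,b" resolve identically.
    return [conn_id.strip() for conn_id in datahub_conn_id.split(",")]


assert split_conn_ids("datahub_rest_default") == ["datahub_rest_default"]
assert split_conn_ids("datahub_rest_default, datahub_kafka_backup") == [
    "datahub_rest_default",
    "datahub_kafka_backup",
]
# make_emitter_hook() then returns a DatahubGenericHook for a single ID and a
# DatahubCompositeHook wrapping all IDs when more than one is configured.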
datahub_airflow_plugin/_constants.py (new file)

@@ -0,0 +1,16 @@
+"""
+Shared constants for the DataHub Airflow plugin.
+
+This module centralizes constant values used across multiple modules
+to avoid duplication and ensure consistency.
+"""
+
+# SQL parsing result keys for storing SQL lineage in OpenLineage facets
+
+# Key for DataHub's enhanced SQL parsing result (with column-level lineage)
+# Used in Airflow 3.x to pass results from SQLParser patch to DataHub listener
+DATAHUB_SQL_PARSING_RESULT_KEY = "datahub_sql_parsing_result"
+
+# Key for DataHub's SQL parsing result in Airflow 2.x extractors
+# Used to pass results from extractors to DataHub listener
+SQL_PARSING_RESULT_KEY = "datahub_sql"
datahub_airflow_plugin/_datahub_ol_adapter.py

@@ -1,6 +1,18 @@
 import logging
+from typing import TYPE_CHECKING
 
-
+# Conditional import for OpenLineage (may not be installed)
+try:
+    from openlineage.client.run import Dataset as OpenLineageDataset
+
+    OPENLINEAGE_AVAILABLE = True
+except ImportError:
+    # Not available when openlineage packages aren't installed
+    OpenLineageDataset = None  # type: ignore[assignment,misc]
+    OPENLINEAGE_AVAILABLE = False
+
+if TYPE_CHECKING:
+    from openlineage.client.run import Dataset as OpenLineageDataset
 
 import datahub.emitter.mce_builder as builder
 
@@ -13,7 +25,7 @@ OL_SCHEME_TWEAKS = {
 }
 
 
-def translate_ol_to_datahub_urn(ol_uri: OpenLineageDataset) -> str:
+def translate_ol_to_datahub_urn(ol_uri: "OpenLineageDataset") -> str:
     namespace = ol_uri.namespace
     name = ol_uri.name
 
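
Because OpenLineageDataset becomes None when the openlineage client is not installed, code using the adapter presumably checks the new OPENLINEAGE_AVAILABLE flag first. A minimal, hypothetical caller (not part of the package) might look like:

from typing import List

from datahub_airflow_plugin._datahub_ol_adapter import (
    OPENLINEAGE_AVAILABLE,
    translate_ol_to_datahub_urn,
)


def to_datahub_urns(ol_datasets) -> List[str]:
    # Hypothetical helper: skip translation entirely when the openlineage client
    # is missing; otherwise map each OpenLineage dataset to a DataHub dataset URN.
    if not OPENLINEAGE_AVAILABLE:
        return []
    return [translate_ol_to_datahub_urn(ds) for ds in ol_datasets]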
|