acryl-datahub-airflow-plugin 1.3.1.4__py3-none-any.whl → 1.3.1.5__py3-none-any.whl
This diff shows the changes between two published versions of the package, as they appear in their public registries. It is provided for informational purposes only.
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +303 -0
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +65 -0
- datahub_airflow_plugin/_airflow_compat.py +32 -0
- datahub_airflow_plugin/_airflow_shims.py +64 -31
- datahub_airflow_plugin/_airflow_version_specific.py +184 -0
- datahub_airflow_plugin/_config.py +97 -19
- datahub_airflow_plugin/_constants.py +16 -0
- datahub_airflow_plugin/_datahub_ol_adapter.py +14 -2
- datahub_airflow_plugin/_version.py +1 -1
- datahub_airflow_plugin/airflow2/__init__.py +6 -0
- datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +402 -0
- datahub_airflow_plugin/airflow2/_airflow_compat.py +95 -0
- datahub_airflow_plugin/airflow2/_extractors.py +477 -0
- datahub_airflow_plugin/airflow2/_legacy_shims.py +20 -0
- datahub_airflow_plugin/airflow2/_openlineage_compat.py +123 -0
- datahub_airflow_plugin/airflow2/_provider_shims.py +29 -0
- datahub_airflow_plugin/airflow2/_shims.py +88 -0
- datahub_airflow_plugin/airflow2/datahub_listener.py +1072 -0
- datahub_airflow_plugin/airflow3/__init__.py +6 -0
- datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +408 -0
- datahub_airflow_plugin/airflow3/_airflow_compat.py +108 -0
- datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +153 -0
- datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +273 -0
- datahub_airflow_plugin/airflow3/_shims.py +82 -0
- datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +88 -0
- datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +308 -0
- datahub_airflow_plugin/airflow3/datahub_listener.py +1452 -0
- datahub_airflow_plugin/client/airflow_generator.py +147 -43
- datahub_airflow_plugin/datahub_listener.py +19 -790
- datahub_airflow_plugin/example_dags/__init__.py +32 -0
- datahub_airflow_plugin/example_dags/airflow2/__init__.py +8 -0
- datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +54 -0
- datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +43 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +69 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +69 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +81 -0
- datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +68 -0
- datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +99 -0
- datahub_airflow_plugin/example_dags/airflow3/__init__.py +8 -0
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +51 -0
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +51 -0
- datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +89 -0
- datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +12 -4
- datahub_airflow_plugin/hooks/datahub.py +11 -2
- datahub_airflow_plugin/operators/datahub.py +20 -3
- acryl_datahub_airflow_plugin-1.3.1.4.dist-info/METADATA +0 -90
- acryl_datahub_airflow_plugin-1.3.1.4.dist-info/RECORD +0 -33
- datahub_airflow_plugin/_extractors.py +0 -336
- {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/WHEEL +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/top_level.txt +0 -0
datahub_airflow_plugin/client/airflow_generator.py
@@ -1,5 +1,6 @@
-
-from
+import json
+from datetime import datetime, tzinfo
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union, cast
 
 from airflow.configuration import conf
 
@@ -12,12 +13,60 @@ from datahub.emitter.generic_emitter import Emitter
 from datahub.metadata.schema_classes import DataProcessTypeClass
 from datahub.utilities.urns.data_flow_urn import DataFlowUrn
 from datahub.utilities.urns.data_job_urn import DataJobUrn
+from datahub_airflow_plugin._airflow_version_specific import (
+    get_task_instance_attributes,
+)
 from datahub_airflow_plugin._config import DatahubLineageConfig, DatajobUrl
 
 if TYPE_CHECKING:
     from airflow import DAG
     from airflow.models import DagRun, TaskInstance
-
+
+    from datahub_airflow_plugin._airflow_shims import Operator
+
+    try:
+        from airflow.serialization.serialized_objects import (
+            SerializedBaseOperator,
+            SerializedDAG,
+        )
+
+        DagType = Union[DAG, SerializedDAG]
+        OperatorType = Union[Operator, SerializedBaseOperator]
+    except ImportError:
+        DagType = DAG  # type: ignore[misc]
+        OperatorType = Operator  # type: ignore[misc]
+
+    # Add type ignore for ti.task which can be MappedOperator from different modules
+    # airflow.models.mappedoperator.MappedOperator (2.x) vs airflow.sdk.definitions.mappedoperator.MappedOperator (3.x)
+    TaskType = Union[OperatorType, Any]  # type: ignore[misc]
+
+
+def _get_base_url() -> str:
+    """
+    Get the Airflow base URL for constructing web UI links.
+
+    Tries multiple configuration sources for backward compatibility:
+    1. webserver.base_url (Airflow 2.x and 3.x with computed default)
+    2. api.base_url (Airflow 3.x alternative configuration)
+    3. Fallback to http://localhost:8080 (safe default)
+
+    Returns:
+        str: The base URL for the Airflow web UI
+    """
+    # Try webserver.base_url first (works in both Airflow 2.x and 3.x)
+    # In Airflow 3.x, this is computed from web_server_host + web_server_port
+    base_url = conf.get("webserver", "base_url", fallback=None)
+    if base_url:
+        return base_url
+
+    # Fallback to api.base_url for environments that use it
+    # Some Airflow 3.x deployments may set this explicitly
+    api_base_url = conf.get("api", "base_url", fallback=None)
+    if api_base_url:
+        return api_base_url
+
+    # Final fallback to localhost (safe default for development/testing)
+    return "http://localhost:8080"
 
 
 def _task_downstream_task_ids(operator: "Operator") -> Set[str]:
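The new _get_base_url helper replaces a direct config lookup with a fallback chain, so link construction keeps working on Airflow 3.x deployments that only set api.base_url. A minimal sketch of the first lookup step, assuming Airflow is installed and using its standard environment-variable config override (the URL value is illustrative):

import os

# Airflow's standard env-var override for the [webserver] base_url option.
os.environ["AIRFLOW__WEBSERVER__BASE_URL"] = "https://airflow.example.com"

from airflow.configuration import conf

# The same call _get_base_url makes first; the env override wins.
assert conf.get("webserver", "base_url", fallback=None) == "https://airflow.example.com"

If neither webserver.base_url nor api.base_url yields a value, the helper falls through to the hard-coded http://localhost:8080 default.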
@@ -29,8 +78,8 @@ def _task_downstream_task_ids(operator: "Operator") -> Set[str]:
 class AirflowGenerator:
     @staticmethod
     def _get_dependencies(
-        task: "
-        dag: "
+        task: "OperatorType",
+        dag: "DagType",
         flow_urn: DataFlowUrn,
         config: Optional[DatahubLineageConfig] = None,
     ) -> List[DataJobUrn]:
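The signatures above widen task and dag from the plain Operator/DAG types to the OperatorType/DagType unions defined earlier, presumably because Airflow's listener hooks can hand the plugin serialized objects rather than the originally authored ones. A small sketch of how a DAG round-trips through Airflow's serialization layer (Airflow 2.x API; the dag_id is illustrative):

from datetime import datetime

from airflow import DAG
from airflow.serialization.serialized_objects import SerializedDAG

with DAG(dag_id="etl", start_date=datetime(2024, 1, 1)) as dag:
    pass  # tasks omitted for brevity

# The scheduler and webserver work with this form, not the authored object;
# round_tripped is a SerializedDAG, which the DagType union makes explicit.
round_tripped = SerializedDAG.from_dict(SerializedDAG.to_dict(dag))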
@@ -67,14 +116,18 @@
 
         # subdags are always named with 'parent.child' style or Airflow won't run them
         # add connection from subdag trigger(s) if subdag task has no upstreams
+        # Note: is_subdag was removed in Airflow 3.x (subdags deprecated in Airflow 2.0)
+        parent_dag = getattr(dag, "parent_dag", None)
         if (
-            dag
-            and
+            getattr(dag, "is_subdag", False)
+            and parent_dag is not None
             and len(task.upstream_task_ids) == 0
         ):
             # filter through the parent dag's tasks and find the subdag trigger(s)
             subdags = [
-                x
+                x
+                for x in parent_dag.task_dict.values()
+                if x.subdag is not None  # type: ignore[union-attr]
             ]
             matched_subdags = [
                 x for x in subdags if x.subdag and x.subdag.dag_id == dag.dag_id
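The subdag branch now reads is_subdag and parent_dag defensively, since Airflow 3.x removed both attributes along with subdag support. A minimal illustration of why getattr with a default short-circuits the branch instead of raising (FakeDag is a hypothetical stand-in, not an Airflow class):

class FakeDag:
    # Models an Airflow 3.x DAG object: no is_subdag or parent_dag attribute.
    dag_id = "example"

dag = FakeDag()
assert getattr(dag, "is_subdag", False) is False  # no AttributeError
assert getattr(dag, "parent_dag", None) is None   # branch is skipped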
@@ -84,14 +137,14 @@
             subdag_task_id = matched_subdags[0].task_id
 
             # iterate through the parent dag's tasks and find the ones that trigger the subdag
-            for upstream_task_id in
-                upstream_task =
+            for upstream_task_id in parent_dag.task_dict:  # type: ignore[union-attr]
+                upstream_task = parent_dag.task_dict[upstream_task_id]  # type: ignore[union-attr]
                 upstream_task_urn = DataJobUrn.create_from_ids(
                     data_flow_urn=str(flow_urn), job_id=upstream_task_id
                 )
 
                 # if the task triggers the subdag, link it to this node in the subdag
-                if subdag_task_id in sorted(_task_downstream_task_ids(upstream_task)):
+                if subdag_task_id in sorted(_task_downstream_task_ids(upstream_task)):  # type: ignore[arg-type]
                     upstream_subdag_triggers.append(upstream_task_urn)
 
         # If the operator is an ExternalTaskSensor then we set the remote task as upstream.
@@ -100,14 +153,16 @@
         external_task_upstreams = []
         if isinstance(task, ExternalTaskSensor):
             task = cast(ExternalTaskSensor, task)
-
+            external_task_id = getattr(task, "external_task_id", None)
+            external_dag_id = getattr(task, "external_dag_id", None)
+            if external_task_id is not None and external_dag_id is not None:
                 external_task_upstreams = [
                     DataJobUrn.create_from_ids(
-                        job_id=
+                        job_id=external_task_id,
                         data_flow_urn=str(
                             DataFlowUrn.create_from_ids(
                                 orchestrator=flow_urn.orchestrator,
-                                flow_id=
+                                flow_id=external_dag_id,
                                 env=flow_urn.cluster,
                                 platform_instance=config.platform_instance
                                 if config
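For the ExternalTaskSensor case, the same getattr pattern skips URN construction when either ID is missing. The upstream URN assembled from external_dag_id/external_task_id has the following shape (IDs are illustrative; env and platform_instance come from the flow URN and config):

from datahub.utilities.urns.data_flow_urn import DataFlowUrn
from datahub.utilities.urns.data_job_urn import DataJobUrn

# A sensor waiting on task "load" of DAG "upstream_etl" in the PROD env.
flow = DataFlowUrn.create_from_ids(
    orchestrator="airflow", flow_id="upstream_etl", env="PROD"
)
job = DataJobUrn.create_from_ids(data_flow_urn=str(flow), job_id="load")
assert str(job) == "urn:li:dataJob:(urn:li:dataFlow:(airflow,upstream_etl,PROD),load)"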
@@ -130,13 +185,13 @@
         return upstream_tasks
 
     @staticmethod
-    def _extract_owners(dag: "
+    def _extract_owners(dag: "DagType") -> List[str]:
         return [owner.strip() for owner in dag.owner.split(",")]
 
     @staticmethod
     def generate_dataflow(
         config: DatahubLineageConfig,
-        dag: "
+        dag: "DagType",
     ) -> DataFlow:
         """
         Generates a Dataflow object from an Airflow DAG
@@ -173,12 +228,34 @@
             "timezone",
         ]
 
+        def _serialize_dag_property(value: Any) -> str:
+            """Serialize DAG property values to string format (JSON-compatible when possible)."""
+            if value is None:
+                return ""
+            elif isinstance(value, bool):
+                return "true" if value else "false"
+            elif isinstance(value, datetime):
+                return value.isoformat()
+            elif isinstance(value, (set, frozenset)):
+                # Convert set to JSON array string
+                return json.dumps(sorted(list(value)))
+            elif isinstance(value, tzinfo):
+                return str(value.tzname(None))
+            elif isinstance(value, (int, float)):
+                return str(value)
+            elif isinstance(value, str):
+                return value
+            else:
+                # For other types, convert to string but avoid repr() format
+                return str(value)
+
         for key in allowed_flow_keys:
             if hasattr(dag, key):
-
+                value = getattr(dag, key)
+                flow_property_bag[key] = _serialize_dag_property(value)
 
         data_flow.properties = flow_property_bag
-        base_url =
+        base_url = _get_base_url()
         data_flow.url = f"{base_url}/tree?dag_id={dag.dag_id}"
 
         if config.capture_ownership_info and dag.owner:
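_serialize_dag_property turns DAG attributes into deterministic, JSON-friendly strings before they enter the flow property bag. Note that the bool branch must come before the int/float branch, because isinstance(True, int) is True in Python. Expected outputs for representative inputs (the helper is nested inside generate_dataflow, so standalone calls like these are purely illustrative):

from datetime import datetime, timezone

_serialize_dag_property(True)                # "true"
_serialize_dag_property(None)                # ""
_serialize_dag_property({"tag_b", "tag_a"})  # '["tag_a", "tag_b"]'
_serialize_dag_property(datetime(2024, 1, 1, tzinfo=timezone.utc))  # "2024-01-01T00:00:00+00:00"
_serialize_dag_property(timezone.utc)        # "UTC"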
@@ -194,8 +271,8 @@
         return data_flow
 
     @staticmethod
-    def _get_description(task: "
-        from
+    def _get_description(task: "OperatorType") -> Optional[str]:
+        from datahub_airflow_plugin._airflow_shims import BaseOperator
 
         if not isinstance(task, BaseOperator):
             # TODO: Get docs for mapped operators.
@@ -216,8 +293,8 @@
     @staticmethod
     def generate_datajob(
         cluster: str,
-        task: "
-        dag: "
+        task: "OperatorType",
+        dag: "DagType",
         set_dependencies: bool = True,
         capture_owner: bool = True,
         capture_tags: bool = True,
@@ -289,11 +366,15 @@
                 break
 
         datajob.properties = job_property_bag
-        base_url =
+        base_url = _get_base_url()
 
         if config and config.datajob_url_link == DatajobUrl.GRID:
             datajob.url = f"{base_url}/dags/{dag.dag_id}/grid?task_id={task.task_id}"
+        elif config and config.datajob_url_link == DatajobUrl.TASKS:
+            # Airflow 3.x task URL format
+            datajob.url = f"{base_url}/dags/{dag.dag_id}/tasks/{task.task_id}"
         else:
+            # Airflow 2.x taskinstance list URL format
             datajob.url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
 
         if capture_owner and dag.owner:
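The datajob URL now has three shapes, selected by config.datajob_url_link. With the illustrative values base_url = http://localhost:8080, dag_id = etl, and task_id = load, the branches produce:

DatajobUrl.GRID   ->  http://localhost:8080/dags/etl/grid?task_id=load
DatajobUrl.TASKS  ->  http://localhost:8080/dags/etl/tasks/load  (Airflow 3.x)
default           ->  http://localhost:8080/taskinstance/list/?flt1_dag_id_equals=etl&_flt_3_task_id=load  (Airflow 2.x)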
@@ -447,8 +528,12 @@
     ) -> DataProcessInstance:
         if datajob is None:
             assert ti.task is not None
+            # ti.task can be MappedOperator from different modules (airflow.models vs airflow.sdk.definitions)
            datajob = AirflowGenerator.generate_datajob(
-                config.cluster,
+                config.cluster,
+                ti.task,  # type: ignore[arg-type]
+                dag,
+                config=config,
             )
 
         assert dag_run.run_id
@@ -458,26 +543,23 @@
             clone_inlets=True,
             clone_outlets=True,
         )
-
-        job_property_bag
-
-
-        job_property_bag["end_date"] = str(ti.end_date)
-        job_property_bag["execution_date"] = str(ti.execution_date)
-        job_property_bag["try_number"] = str(ti.try_number - 1)
-        job_property_bag["max_tries"] = str(ti.max_tries)
-        # Not compatible with Airflow 1
-        if hasattr(ti, "external_executor_id"):
-            job_property_bag["external_executor_id"] = str(ti.external_executor_id)
-        job_property_bag["state"] = str(ti.state)
-        job_property_bag["operator"] = str(ti.operator)
-        job_property_bag["priority_weight"] = str(ti.priority_weight)
-        job_property_bag["log_url"] = ti.log_url
+
+        job_property_bag = get_task_instance_attributes(ti)
+
+        # Add orchestrator and DAG/task IDs
         job_property_bag["orchestrator"] = "airflow"
-
-
+        if "dag_id" not in job_property_bag:
+            job_property_bag["dag_id"] = str(dag.dag_id)
+        if "task_id" not in job_property_bag:
+            job_property_bag["task_id"] = str(ti.task_id)
+        if "run_id" not in job_property_bag:
+            job_property_bag["run_id"] = str(dag_run.run_id)
+
         dpi.properties.update(job_property_bag)
-
+
+        # Set URL if log_url is available
+        if "log_url" in job_property_bag:
+            dpi.url = job_property_bag["log_url"]
 
         # This property only exists in Airflow2
         if hasattr(ti, "dag_run") and hasattr(ti.dag_run, "run_type"):
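The per-attribute copying that the old code did inline (end_date, try_number, state, operator, priority_weight, log_url, and so on) moves into get_task_instance_attributes in the new _airflow_version_specific module, whose body is not part of this file's diff. A plausible sketch of the pattern, inferred from the attribute list the removed lines read directly (hypothetical, for illustration only):

from typing import Any, Dict

def get_task_instance_attributes(ti: Any) -> Dict[str, str]:
    # Hypothetical sketch: copy only attributes present on this Airflow
    # version's TaskInstance (e.g. execution_date is gone in Airflow 3.x).
    attrs: Dict[str, str] = {}
    for name in ("start_date", "end_date", "execution_date", "try_number",
                 "max_tries", "external_executor_id", "state", "operator",
                 "priority_weight", "log_url"):
        if hasattr(ti, name):
            attrs[name] = str(getattr(ti, name))
    return attrs

Either way, the caller then layers orchestrator, dag_id, task_id, and run_id on top only when the helper did not already supply them.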
@@ -538,8 +620,12 @@
         """
         if datajob is None:
             assert ti.task is not None
+            # ti.task can be MappedOperator from different modules (airflow.models vs airflow.sdk.definitions)
             datajob = AirflowGenerator.generate_datajob(
-                cluster,
+                cluster,
+                ti.task,  # type: ignore[arg-type]
+                dag,
+                config=config,
             )
 
         if end_timestamp_millis is None:
@@ -566,6 +652,24 @@
             clone_inlets=True,
             clone_outlets=True,
         )
+
+        job_property_bag = get_task_instance_attributes(ti)
+
+        # Add orchestrator and DAG/task IDs
+        job_property_bag["orchestrator"] = "airflow"
+        if "dag_id" not in job_property_bag:
+            job_property_bag["dag_id"] = str(dag.dag_id)
+        if "task_id" not in job_property_bag:
+            job_property_bag["task_id"] = str(ti.task_id)
+        if "run_id" not in job_property_bag:
+            job_property_bag["run_id"] = str(dag_run.run_id)
+
+        dpi.properties.update(job_property_bag)
+
+        # Set URL if log_url is available
+        if "log_url" in job_property_bag:
+            dpi.url = job_property_bag["log_url"]
+
         dpi.emit_process_end(
             emitter=emitter,
             end_timestamp_millis=end_timestamp_millis,