acryl-datahub-airflow-plugin 1.3.1.4__py3-none-any.whl → 1.3.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +303 -0
  2. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +65 -0
  3. datahub_airflow_plugin/_airflow_compat.py +32 -0
  4. datahub_airflow_plugin/_airflow_shims.py +64 -31
  5. datahub_airflow_plugin/_airflow_version_specific.py +184 -0
  6. datahub_airflow_plugin/_config.py +97 -19
  7. datahub_airflow_plugin/_constants.py +16 -0
  8. datahub_airflow_plugin/_datahub_ol_adapter.py +14 -2
  9. datahub_airflow_plugin/_version.py +1 -1
  10. datahub_airflow_plugin/airflow2/__init__.py +6 -0
  11. datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +402 -0
  12. datahub_airflow_plugin/airflow2/_airflow_compat.py +95 -0
  13. datahub_airflow_plugin/airflow2/_extractors.py +477 -0
  14. datahub_airflow_plugin/airflow2/_legacy_shims.py +20 -0
  15. datahub_airflow_plugin/airflow2/_openlineage_compat.py +123 -0
  16. datahub_airflow_plugin/airflow2/_provider_shims.py +29 -0
  17. datahub_airflow_plugin/airflow2/_shims.py +88 -0
  18. datahub_airflow_plugin/airflow2/datahub_listener.py +1072 -0
  19. datahub_airflow_plugin/airflow3/__init__.py +6 -0
  20. datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +408 -0
  21. datahub_airflow_plugin/airflow3/_airflow_compat.py +108 -0
  22. datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +153 -0
  23. datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +273 -0
  24. datahub_airflow_plugin/airflow3/_shims.py +82 -0
  25. datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +88 -0
  26. datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +308 -0
  27. datahub_airflow_plugin/airflow3/datahub_listener.py +1452 -0
  28. datahub_airflow_plugin/client/airflow_generator.py +147 -43
  29. datahub_airflow_plugin/datahub_listener.py +19 -790
  30. datahub_airflow_plugin/example_dags/__init__.py +32 -0
  31. datahub_airflow_plugin/example_dags/airflow2/__init__.py +8 -0
  32. datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +54 -0
  33. datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +43 -0
  34. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +69 -0
  35. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +69 -0
  36. datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +81 -0
  37. datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +68 -0
  38. datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +99 -0
  39. datahub_airflow_plugin/example_dags/airflow3/__init__.py +8 -0
  40. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +51 -0
  41. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +51 -0
  42. datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +89 -0
  43. datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +12 -4
  44. datahub_airflow_plugin/hooks/datahub.py +11 -2
  45. datahub_airflow_plugin/operators/datahub.py +20 -3
  46. acryl_datahub_airflow_plugin-1.3.1.4.dist-info/METADATA +0 -90
  47. acryl_datahub_airflow_plugin-1.3.1.4.dist-info/RECORD +0 -33
  48. datahub_airflow_plugin/_extractors.py +0 -336
  49. {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/WHEEL +0 -0
  50. {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/entry_points.txt +0 -0
  51. {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/top_level.txt +0 -0
datahub_airflow_plugin/client/airflow_generator.py
@@ -1,5 +1,6 @@
-from datetime import datetime
-from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union, cast
+import json
+from datetime import datetime, tzinfo
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union, cast
 
 from airflow.configuration import conf
 
@@ -12,12 +13,60 @@ from datahub.emitter.generic_emitter import Emitter
 from datahub.metadata.schema_classes import DataProcessTypeClass
 from datahub.utilities.urns.data_flow_urn import DataFlowUrn
 from datahub.utilities.urns.data_job_urn import DataJobUrn
+from datahub_airflow_plugin._airflow_version_specific import (
+    get_task_instance_attributes,
+)
 from datahub_airflow_plugin._config import DatahubLineageConfig, DatajobUrl
 
 if TYPE_CHECKING:
     from airflow import DAG
     from airflow.models import DagRun, TaskInstance
-    from airflow.models.operator import Operator
+
+    from datahub_airflow_plugin._airflow_shims import Operator
+
+    try:
+        from airflow.serialization.serialized_objects import (
+            SerializedBaseOperator,
+            SerializedDAG,
+        )
+
+        DagType = Union[DAG, SerializedDAG]
+        OperatorType = Union[Operator, SerializedBaseOperator]
+    except ImportError:
+        DagType = DAG  # type: ignore[misc]
+        OperatorType = Operator  # type: ignore[misc]
+
+    # Add type ignore for ti.task which can be MappedOperator from different modules
+    # airflow.models.mappedoperator.MappedOperator (2.x) vs airflow.sdk.definitions.mappedoperator.MappedOperator (3.x)
+    TaskType = Union[OperatorType, Any]  # type: ignore[misc]
+
+
+def _get_base_url() -> str:
+    """
+    Get the Airflow base URL for constructing web UI links.
+
+    Tries multiple configuration sources for backward compatibility:
+    1. webserver.base_url (Airflow 2.x and 3.x with computed default)
+    2. api.base_url (Airflow 3.x alternative configuration)
+    3. Fallback to http://localhost:8080 (safe default)
+
+    Returns:
+        str: The base URL for the Airflow web UI
+    """
+    # Try webserver.base_url first (works in both Airflow 2.x and 3.x)
+    # In Airflow 3.x, this is computed from web_server_host + web_server_port
+    base_url = conf.get("webserver", "base_url", fallback=None)
+    if base_url:
+        return base_url
+
+    # Fallback to api.base_url for environments that use it
+    # Some Airflow 3.x deployments may set this explicitly
+    api_base_url = conf.get("api", "base_url", fallback=None)
+    if api_base_url:
+        return api_base_url
+
+    # Final fallback to localhost (safe default for development/testing)
+    return "http://localhost:8080"
 
 
 def _task_downstream_task_ids(operator: "Operator") -> Set[str]:
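The new `_get_base_url()` helper replaces the direct `conf.get("webserver", "base_url")` calls at the two URL-construction sites further down. A minimal sketch of exercising the fallback chain in isolation, assuming only that `conf.get` follows the standard `(section, key, fallback=...)` signature; the `fake_conf` stub is hypothetical test scaffolding, not part of this package:

from unittest import mock

from datahub_airflow_plugin.client import airflow_generator


def fake_conf(values):
    # Mimics AirflowConfigParser.get(section, key, fallback=...) over a fixed mapping.
    def _get(section, key, fallback=None, **kwargs):
        return values.get((section, key), fallback)

    return _get


# Neither webserver.base_url nor api.base_url resolves -> localhost default.
with mock.patch.object(airflow_generator.conf, "get", fake_conf({})):
    assert airflow_generator._get_base_url() == "http://localhost:8080"

# api.base_url is only consulted when webserver.base_url resolves to nothing.
with mock.patch.object(
    airflow_generator.conf,
    "get",
    fake_conf({("api", "base_url"): "https://airflow.internal.example"}),
):
    assert airflow_generator._get_base_url() == "https://airflow.internal.example"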
@@ -29,8 +78,8 @@ def _task_downstream_task_ids(operator: "Operator") -> Set[str]:
 class AirflowGenerator:
     @staticmethod
     def _get_dependencies(
-        task: "Operator",
-        dag: "DAG",
+        task: "OperatorType",
+        dag: "DagType",
         flow_urn: DataFlowUrn,
         config: Optional[DatahubLineageConfig] = None,
     ) -> List[DataJobUrn]:
@@ -67,14 +116,18 @@ class AirflowGenerator:
 
         # subdags are always named with 'parent.child' style or Airflow won't run them
         # add connection from subdag trigger(s) if subdag task has no upstreams
+        # Note: is_subdag was removed in Airflow 3.x (subdags deprecated in Airflow 2.0)
+        parent_dag = getattr(dag, "parent_dag", None)
         if (
-            dag.is_subdag
-            and dag.parent_dag is not None
+            getattr(dag, "is_subdag", False)
+            and parent_dag is not None
             and len(task.upstream_task_ids) == 0
         ):
             # filter through the parent dag's tasks and find the subdag trigger(s)
             subdags = [
-                x for x in dag.parent_dag.task_dict.values() if x.subdag is not None
+                x
+                for x in parent_dag.task_dict.values()
+                if x.subdag is not None  # type: ignore[union-attr]
             ]
             matched_subdags = [
                 x for x in subdags if x.subdag and x.subdag.dag_id == dag.dag_id
@@ -84,14 +137,14 @@ class AirflowGenerator:
             subdag_task_id = matched_subdags[0].task_id
 
             # iterate through the parent dag's tasks and find the ones that trigger the subdag
-            for upstream_task_id in dag.parent_dag.task_dict:
-                upstream_task = dag.parent_dag.task_dict[upstream_task_id]
+            for upstream_task_id in parent_dag.task_dict:  # type: ignore[union-attr]
+                upstream_task = parent_dag.task_dict[upstream_task_id]  # type: ignore[union-attr]
                 upstream_task_urn = DataJobUrn.create_from_ids(
                     data_flow_urn=str(flow_urn), job_id=upstream_task_id
                 )
 
                 # if the task triggers the subdag, link it to this node in the subdag
-                if subdag_task_id in sorted(_task_downstream_task_ids(upstream_task)):
+                if subdag_task_id in sorted(_task_downstream_task_ids(upstream_task)):  # type: ignore[arg-type]
                     upstream_subdag_triggers.append(upstream_task_urn)
 
         # If the operator is an ExternalTaskSensor then we set the remote task as upstream.
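The `getattr`-based guards above make the subdag wiring degrade to a no-op on Airflow 3.x, where `DAG` exposes neither `is_subdag` nor `parent_dag`. A small illustration with a hypothetical stand-in object:

class Dag3Standin:
    # Hypothetical stand-in for an Airflow 3.x DAG: no is_subdag / parent_dag.
    dag_id = "child_dag"


dag = Dag3Standin()
parent_dag = getattr(dag, "parent_dag", None)
# Old code: dag.is_subdag raised AttributeError on Airflow 3.x.
# New code: the whole subdag branch is simply skipped.
assert not (getattr(dag, "is_subdag", False) and parent_dag is not None)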
@@ -100,14 +153,16 @@ class AirflowGenerator:
         external_task_upstreams = []
         if isinstance(task, ExternalTaskSensor):
             task = cast(ExternalTaskSensor, task)
-            if hasattr(task, "external_task_id") and task.external_task_id is not None:
+            external_task_id = getattr(task, "external_task_id", None)
+            external_dag_id = getattr(task, "external_dag_id", None)
+            if external_task_id is not None and external_dag_id is not None:
                 external_task_upstreams = [
                     DataJobUrn.create_from_ids(
-                        job_id=task.external_task_id,
+                        job_id=external_task_id,
                         data_flow_urn=str(
                             DataFlowUrn.create_from_ids(
                                 orchestrator=flow_urn.orchestrator,
-                                flow_id=task.external_dag_id,
+                                flow_id=external_dag_id,
                                 env=flow_urn.cluster,
                                 platform_instance=config.platform_instance
                                 if config
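Net effect: when a DAG contains an `ExternalTaskSensor`, the generated upstream points at the remote DAG's task rather than anything local. Roughly, with hypothetical IDs and no `platform_instance` set:

# DataFlowUrn.create_from_ids(orchestrator="airflow", flow_id="upstream_dag", env="PROD")
#   -> urn:li:dataFlow:(airflow,upstream_dag,PROD)
# DataJobUrn.create_from_ids(data_flow_urn=..., job_id="final_task")
#   -> urn:li:dataJob:(urn:li:dataFlow:(airflow,upstream_dag,PROD),final_task)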
@@ -130,13 +185,13 @@ class AirflowGenerator:
         return upstream_tasks
 
     @staticmethod
-    def _extract_owners(dag: "DAG") -> List[str]:
+    def _extract_owners(dag: "DagType") -> List[str]:
         return [owner.strip() for owner in dag.owner.split(",")]
 
     @staticmethod
     def generate_dataflow(
         config: DatahubLineageConfig,
-        dag: "DAG",
+        dag: "DagType",
     ) -> DataFlow:
         """
         Generates a Dataflow object from an Airflow DAG
@@ -173,12 +228,34 @@ class AirflowGenerator:
             "timezone",
         ]
 
+        def _serialize_dag_property(value: Any) -> str:
+            """Serialize DAG property values to string format (JSON-compatible when possible)."""
+            if value is None:
+                return ""
+            elif isinstance(value, bool):
+                return "true" if value else "false"
+            elif isinstance(value, datetime):
+                return value.isoformat()
+            elif isinstance(value, (set, frozenset)):
+                # Convert set to JSON array string
+                return json.dumps(sorted(list(value)))
+            elif isinstance(value, tzinfo):
+                return str(value.tzname(None))
+            elif isinstance(value, (int, float)):
+                return str(value)
+            elif isinstance(value, str):
+                return value
+            else:
+                # For other types, convert to string but avoid repr() format
+                return str(value)
+
         for key in allowed_flow_keys:
             if hasattr(dag, key):
-                flow_property_bag[key] = repr(getattr(dag, key))
+                value = getattr(dag, key)
+                flow_property_bag[key] = _serialize_dag_property(value)
 
         data_flow.properties = flow_property_bag
-        base_url = conf.get("webserver", "base_url")
+        base_url = _get_base_url()
         data_flow.url = f"{base_url}/tree?dag_id={dag.dag_id}"
 
         if config.capture_ownership_info and dag.owner:
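The switch away from `repr()` is the substantive change here: `repr()` produced Python-literal strings that are unfriendly to downstream consumers of the flow property bag. A quick before/after comparison using plain stdlib values:

import json
from datetime import datetime, timezone

tags = {"beta", "alpha"}
repr(tags)                 # "{'beta', 'alpha'}" -- order unstable, Python-specific
json.dumps(sorted(tags))   # '["alpha", "beta"]' -- stable JSON array

start = datetime(2024, 1, 1, tzinfo=timezone.utc)
repr(start)                # 'datetime.datetime(2024, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)'
start.isoformat()          # '2024-01-01T00:00:00+00:00'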
@@ -194,8 +271,8 @@ class AirflowGenerator:
         return data_flow
 
     @staticmethod
-    def _get_description(task: "Operator") -> Optional[str]:
-        from airflow.models.baseoperator import BaseOperator
+    def _get_description(task: "OperatorType") -> Optional[str]:
+        from datahub_airflow_plugin._airflow_shims import BaseOperator
 
         if not isinstance(task, BaseOperator):
             # TODO: Get docs for mapped operators.
@@ -216,8 +293,8 @@ class AirflowGenerator:
     @staticmethod
     def generate_datajob(
         cluster: str,
-        task: "Operator",
-        dag: "DAG",
+        task: "OperatorType",
+        dag: "DagType",
         set_dependencies: bool = True,
         capture_owner: bool = True,
         capture_tags: bool = True,
@@ -289,11 +366,15 @@ class AirflowGenerator:
                 break
 
         datajob.properties = job_property_bag
-        base_url = conf.get("webserver", "base_url")
+        base_url = _get_base_url()
 
         if config and config.datajob_url_link == DatajobUrl.GRID:
             datajob.url = f"{base_url}/dags/{dag.dag_id}/grid?task_id={task.task_id}"
+        elif config and config.datajob_url_link == DatajobUrl.TASKS:
+            # Airflow 3.x task URL format
+            datajob.url = f"{base_url}/dags/{dag.dag_id}/tasks/{task.task_id}"
         else:
+            # Airflow 2.x taskinstance list URL format
             datajob.url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
 
         if capture_owner and dag.owner:
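For reference, the three branches yield the following URL shapes for a hypothetical dag_id/task_id pair (values illustrative; `DatajobUrl.TASKS` is presumably a new enum member from the `_config.py` changes in this release):

base_url, dag_id, task_id = "http://localhost:8080", "etl_daily", "load_orders"

grid_url = f"{base_url}/dags/{dag_id}/grid?task_id={task_id}"
tasks_url = f"{base_url}/dags/{dag_id}/tasks/{task_id}"  # Airflow 3.x
legacy_url = (
    f"{base_url}/taskinstance/list/"
    f"?flt1_dag_id_equals={dag_id}&_flt_3_task_id={task_id}"  # Airflow 2.x default
)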
@@ -447,8 +528,12 @@ class AirflowGenerator:
     ) -> DataProcessInstance:
         if datajob is None:
             assert ti.task is not None
+            # ti.task can be MappedOperator from different modules (airflow.models vs airflow.sdk.definitions)
             datajob = AirflowGenerator.generate_datajob(
-                config.cluster, ti.task, dag, config=config
+                config.cluster,
+                ti.task,  # type: ignore[arg-type]
+                dag,
+                config=config,
             )
 
         assert dag_run.run_id
@@ -458,26 +543,23 @@ class AirflowGenerator:
             clone_inlets=True,
             clone_outlets=True,
         )
-        job_property_bag: Dict[str, str] = {}
-        job_property_bag["run_id"] = str(dag_run.run_id)
-        job_property_bag["duration"] = str(ti.duration)
-        job_property_bag["start_date"] = str(ti.start_date)
-        job_property_bag["end_date"] = str(ti.end_date)
-        job_property_bag["execution_date"] = str(ti.execution_date)
-        job_property_bag["try_number"] = str(ti.try_number - 1)
-        job_property_bag["max_tries"] = str(ti.max_tries)
-        # Not compatible with Airflow 1
-        if hasattr(ti, "external_executor_id"):
-            job_property_bag["external_executor_id"] = str(ti.external_executor_id)
-        job_property_bag["state"] = str(ti.state)
-        job_property_bag["operator"] = str(ti.operator)
-        job_property_bag["priority_weight"] = str(ti.priority_weight)
-        job_property_bag["log_url"] = ti.log_url
+
+        job_property_bag = get_task_instance_attributes(ti)
+
+        # Add orchestrator and DAG/task IDs
         job_property_bag["orchestrator"] = "airflow"
-        job_property_bag["dag_id"] = str(dag.dag_id)
-        job_property_bag["task_id"] = str(ti.task_id)
+        if "dag_id" not in job_property_bag:
+            job_property_bag["dag_id"] = str(dag.dag_id)
+        if "task_id" not in job_property_bag:
+            job_property_bag["task_id"] = str(ti.task_id)
+        if "run_id" not in job_property_bag:
+            job_property_bag["run_id"] = str(dag_run.run_id)
+
         dpi.properties.update(job_property_bag)
-        dpi.url = ti.log_url
+
+        # Set URL if log_url is available
+        if "log_url" in job_property_bag:
+            dpi.url = job_property_bag["log_url"]
 
         # This property only exists in Airflow2
         if hasattr(ti, "dag_run") and hasattr(ti.dag_run, "run_type"):
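The inline property bag is replaced by `get_task_instance_attributes(ti)` from the new `_airflow_version_specific.py` module (+184 lines, not shown in this diff). Its exact behavior isn't visible here; judging from the deleted lines and the `not in` guards that follow, it plausibly does something like the sketch below (hypothetical reconstruction, not the actual implementation):

from typing import Any, Dict


def get_task_instance_attributes_sketch(ti: Any) -> Dict[str, str]:
    # Collect whichever of the familiar TaskInstance attributes exist on this
    # Airflow version; e.g. execution_date is gone in Airflow 3.x.
    bag: Dict[str, str] = {}
    for attr in (
        "run_id", "duration", "start_date", "end_date", "execution_date",
        "try_number", "max_tries", "external_executor_id", "state",
        "operator", "priority_weight", "log_url",
    ):
        if hasattr(ti, attr) and getattr(ti, attr) is not None:
            bag[attr] = str(getattr(ti, attr))
    return bag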
@@ -538,8 +620,12 @@ class AirflowGenerator:
         """
         if datajob is None:
             assert ti.task is not None
+            # ti.task can be MappedOperator from different modules (airflow.models vs airflow.sdk.definitions)
             datajob = AirflowGenerator.generate_datajob(
-                cluster, ti.task, dag, config=config
+                cluster,
+                ti.task,  # type: ignore[arg-type]
+                dag,
+                config=config,
             )
 
         if end_timestamp_millis is None:
@@ -566,6 +652,24 @@ class AirflowGenerator:
             clone_inlets=True,
             clone_outlets=True,
         )
+
+        job_property_bag = get_task_instance_attributes(ti)
+
+        # Add orchestrator and DAG/task IDs
+        job_property_bag["orchestrator"] = "airflow"
+        if "dag_id" not in job_property_bag:
+            job_property_bag["dag_id"] = str(dag.dag_id)
+        if "task_id" not in job_property_bag:
+            job_property_bag["task_id"] = str(ti.task_id)
+        if "run_id" not in job_property_bag:
+            job_property_bag["run_id"] = str(dag_run.run_id)
+
+        dpi.properties.update(job_property_bag)
+
+        # Set URL if log_url is available
+        if "log_url" in job_property_bag:
+            dpi.url = job_property_bag["log_url"]
+
         dpi.emit_process_end(
             emitter=emitter,
             end_timestamp_millis=end_timestamp_millis,