acryl-datahub-airflow-plugin 1.3.1.5__py3-none-any.whl → 1.3.1.5rc2__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
- acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info/METADATA +91 -0
- acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info/RECORD +33 -0
- datahub_airflow_plugin/_airflow_shims.py +31 -64
- datahub_airflow_plugin/_config.py +19 -97
- datahub_airflow_plugin/_datahub_ol_adapter.py +2 -14
- datahub_airflow_plugin/_extractors.py +365 -0
- datahub_airflow_plugin/_version.py +1 -1
- datahub_airflow_plugin/client/airflow_generator.py +43 -147
- datahub_airflow_plugin/datahub_listener.py +790 -19
- datahub_airflow_plugin/example_dags/__init__.py +0 -32
- datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +4 -12
- datahub_airflow_plugin/hooks/datahub.py +2 -11
- datahub_airflow_plugin/operators/datahub.py +3 -20
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +0 -303
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +0 -65
- datahub_airflow_plugin/_airflow_compat.py +0 -32
- datahub_airflow_plugin/_airflow_version_specific.py +0 -184
- datahub_airflow_plugin/_constants.py +0 -16
- datahub_airflow_plugin/airflow2/__init__.py +0 -6
- datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +0 -402
- datahub_airflow_plugin/airflow2/_airflow_compat.py +0 -95
- datahub_airflow_plugin/airflow2/_extractors.py +0 -477
- datahub_airflow_plugin/airflow2/_legacy_shims.py +0 -20
- datahub_airflow_plugin/airflow2/_openlineage_compat.py +0 -123
- datahub_airflow_plugin/airflow2/_provider_shims.py +0 -29
- datahub_airflow_plugin/airflow2/_shims.py +0 -88
- datahub_airflow_plugin/airflow2/datahub_listener.py +0 -1072
- datahub_airflow_plugin/airflow3/__init__.py +0 -6
- datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +0 -408
- datahub_airflow_plugin/airflow3/_airflow_compat.py +0 -108
- datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +0 -153
- datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +0 -273
- datahub_airflow_plugin/airflow3/_shims.py +0 -82
- datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +0 -88
- datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +0 -308
- datahub_airflow_plugin/airflow3/datahub_listener.py +0 -1452
- datahub_airflow_plugin/example_dags/airflow2/__init__.py +0 -8
- datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +0 -54
- datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +0 -43
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +0 -69
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +0 -69
- datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +0 -81
- datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +0 -68
- datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +0 -99
- datahub_airflow_plugin/example_dags/airflow3/__init__.py +0 -8
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +0 -51
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +0 -51
- datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +0 -89
- {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info}/top_level.txt +0 -0
acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info/METADATA

```diff
@@ -0,0 +1,91 @@
+Metadata-Version: 2.4
+Name: acryl-datahub-airflow-plugin
+Version: 1.3.1.5rc2
+Summary: Datahub Airflow plugin to capture executions and send to Datahub
+Home-page: https://docs.datahub.com/
+License: Apache-2.0
+Project-URL: Documentation, https://docs.datahub.com/docs/
+Project-URL: Source, https://github.com/datahub-project/datahub
+Project-URL: Changelog, https://github.com/datahub-project/datahub/releases
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Information Technology
+Classifier: Intended Audience :: System Administrators
+Classifier: Operating System :: Unix
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Environment :: Console
+Classifier: Environment :: MacOS X
+Classifier: Topic :: Software Development
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+Requires-Dist: acryl-datahub[datahub-rest,sql-parser]==1.3.1.5rc2
+Requires-Dist: openlineage-airflow<=1.30.1,>=1.2.0
+Requires-Dist: apache-airflow<3,>=2.7.0
+Requires-Dist: pydantic>=2.4.0
+Requires-Dist: acryl-datahub[datahub-rest]==1.3.1.5rc2
+Provides-Extra: ignore
+Provides-Extra: datahub-rest
+Requires-Dist: acryl-datahub[datahub-rest]==1.3.1.5rc2; extra == "datahub-rest"
+Provides-Extra: datahub-kafka
+Requires-Dist: acryl-datahub[datahub-kafka]==1.3.1.5rc2; extra == "datahub-kafka"
+Provides-Extra: datahub-file
+Requires-Dist: acryl-datahub[sync-file-emitter]==1.3.1.5rc2; extra == "datahub-file"
+Provides-Extra: dev
+Requires-Dist: twine; extra == "dev"
+Requires-Dist: openlineage-airflow<=1.30.1,>=1.2.0; extra == "dev"
+Requires-Dist: deepdiff!=8.0.0; extra == "dev"
+Requires-Dist: pydantic>=2.4.0; extra == "dev"
+Requires-Dist: acryl-datahub[datahub-rest]==1.3.1.5rc2; extra == "dev"
+Requires-Dist: tox; extra == "dev"
+Requires-Dist: types-tabulate; extra == "dev"
+Requires-Dist: types-dataclasses; extra == "dev"
+Requires-Dist: coverage>=5.1; extra == "dev"
+Requires-Dist: acryl-datahub[datahub-rest,sql-parser]==1.3.1.5rc2; extra == "dev"
+Requires-Dist: tenacity; extra == "dev"
+Requires-Dist: types-python-dateutil; extra == "dev"
+Requires-Dist: apache-airflow<3,>=2.7.0; extra == "dev"
+Requires-Dist: types-click==0.1.12; extra == "dev"
+Requires-Dist: types-requests; extra == "dev"
+Requires-Dist: ruff==0.11.7; extra == "dev"
+Requires-Dist: types-toml; extra == "dev"
+Requires-Dist: types-PyYAML; extra == "dev"
+Requires-Dist: build; extra == "dev"
+Requires-Dist: tox-uv; extra == "dev"
+Requires-Dist: pytest>=6.2.2; extra == "dev"
+Requires-Dist: types-cachetools; extra == "dev"
+Requires-Dist: mypy==1.17.1; extra == "dev"
+Requires-Dist: pytest-cov>=2.8.1; extra == "dev"
+Requires-Dist: types-setuptools; extra == "dev"
+Requires-Dist: packaging; extra == "dev"
+Requires-Dist: types-six; extra == "dev"
+Requires-Dist: sqlalchemy-stubs; extra == "dev"
+Provides-Extra: integration-tests
+Requires-Dist: acryl-datahub[testing-utils]==1.3.1.5rc2; extra == "integration-tests"
+Requires-Dist: acryl-datahub[datahub-kafka]==1.3.1.5rc2; extra == "integration-tests"
+Requires-Dist: snowflake-connector-python>=2.7.10; extra == "integration-tests"
+Requires-Dist: apache-airflow-providers-sqlite; extra == "integration-tests"
+Requires-Dist: apache-airflow[amazon,google,snowflake]>=2.0.2; extra == "integration-tests"
+Requires-Dist: virtualenv; extra == "integration-tests"
+Requires-Dist: acryl-datahub[sync-file-emitter]==1.3.1.5rc2; extra == "integration-tests"
+Requires-Dist: apache-airflow-providers-teradata; extra == "integration-tests"
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: project-url
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
+# Datahub Airflow Plugin
+
+See [the DataHub Airflow docs](https://docs.datahub.com/docs/lineage/airflow) for details.
+
+## Developing
+
+See the [developing docs](../../metadata-ingestion/developing.md).
```
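For reference, these metadata fields can be read back from an installed copy using only the standard library; a minimal sketch, assuming the rc2 wheel is installed in the current environment:

```python
from importlib.metadata import metadata, requires

md = metadata("acryl-datahub-airflow-plugin")
print(md["Version"])  # expect 1.3.1.5rc2 for this wheel
# Requirement strings, including the extras-gated entries shown above.
for req in requires("acryl-datahub-airflow-plugin") or []:
    print(req)
```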
acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info/RECORD

```diff
@@ -0,0 +1,33 @@
+datahub_airflow_plugin/__init__.py,sha256=NScUtA8N-m66Pyg0DO--YbPkrl48PK3UevpdQVW_y6E,1009
+datahub_airflow_plugin/_airflow_shims.py,sha256=hLMTkANJzmH9sEcaiwNb0EZgD11vh-XnDBqFQ9yqjr4,1613
+datahub_airflow_plugin/_config.py,sha256=qNbNC6YUGHf06RfOMsU7jpeNeE2ttxjUQIMEpxIhvyM,5221
+datahub_airflow_plugin/_datahub_ol_adapter.py,sha256=RuzMyWZo7MeJzAFoBfkT4cdDw5g1iWshB_nXG7jLnR0,545
+datahub_airflow_plugin/_extractors.py,sha256=o2amnv1ram58zYNiWpleQcpSREthin33LQ4yRLwoGxA,12759
+datahub_airflow_plugin/_version.py,sha256=Yj5MDuQ4EH0AveHREjNn2naYv2hwT2yZu8PBY2qBR7k,148
+datahub_airflow_plugin/datahub_listener.py,sha256=plAU-DZ7EQmC1Ec4gzOaksqHnJLxaCYHkm2jEPTkgvc,31100
+datahub_airflow_plugin/datahub_plugin.py,sha256=rbZhs7s5O3_MlkQw5aZToC2W5mMic_EpI3oybHB0ofw,1224
+datahub_airflow_plugin/entities.py,sha256=xDZ-mZH7hjUkZbatWYUwI43_9B40wGiotlyQhiO8rEM,1987
+datahub_airflow_plugin/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datahub_airflow_plugin/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datahub_airflow_plugin/client/airflow_generator.py,sha256=zWGX6M7hqbNvOPu11VlutkJ-g149Xv2m5_IC3GqfRJk,22120
+datahub_airflow_plugin/example_dags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datahub_airflow_plugin/example_dags/generic_recipe_sample_dag.py,sha256=BbrOErFboKMDFn712RHEKI9T4Vh0q6kYSVet56gPqVk,1319
+datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py,sha256=xYnuXhWXL5b9Tij0BFvFLckjKCobjzPU3xlxLg2_NXc,1015
+datahub_airflow_plugin/example_dags/lineage_backend_demo.py,sha256=Dy6MxwtX7G0mQeALqpLRu4F03IyU9fqIkr-CcKpo2JE,1625
+datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py,sha256=kW2rLFtOnoiMxBJ315GzlmR0Sz1cqQ_wwLbG9UC-u7Y,1499
+datahub_airflow_plugin/example_dags/lineage_emission_dag.py,sha256=LE29DzW51a4ZAl_zrcLrqSyzmy8qElcZagXsIMjaZLU,1946
+datahub_airflow_plugin/example_dags/mysql_sample_dag.py,sha256=Unx9Ger3R9ptEutfV-4NjjEaTIEYJ-tLrZr7OsK608k,1922
+datahub_airflow_plugin/example_dags/snowflake_sample_dag.py,sha256=b9iaE7zChQha9u57F84U6uqavGl7WrUnMNOzXEiZxjE,3234
+datahub_airflow_plugin/hooks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datahub_airflow_plugin/hooks/datahub.py,sha256=xa-gsJUza3jSAUP1QLJSNBn4bUHxjXo_FdCAf08IWFo,11155
+datahub_airflow_plugin/operators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datahub_airflow_plugin/operators/datahub.py,sha256=3fT_qg1tUpBEne1XBk8zDIUNBMcox7mxoEoN9O4XIPA,3814
+datahub_airflow_plugin/operators/datahub_assertion_operator.py,sha256=j_P9M1a5qME55pKHAfTqZsVVtIslFBO59r8UQOOBvsk,2914
+datahub_airflow_plugin/operators/datahub_assertion_sensor.py,sha256=QJIZZYQhqscj3bhBN5Sei-ABMRRAl2KiQxXTXcZQ51Q,2917
+datahub_airflow_plugin/operators/datahub_operation_operator.py,sha256=KJ8M8jJ7UWW6kNbiS-rELc-kqCPkZ3ck7z51oAXGPSI,3351
+datahub_airflow_plugin/operators/datahub_operation_sensor.py,sha256=U19fi5DpjBRWm_1ljXcjnspUzfa3mqYfOQZHjLk-ufI,3618
+acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info/METADATA,sha256=xwLFF-XsoGpGb60_fL1km0lEyKbqa9crCtyphoMMRtc,4088
+acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info/entry_points.txt,sha256=HqmajDHtrsz0b5Lswe1-eeuObxdtucd9YoxH77jJBA8,179
+acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info/top_level.txt,sha256=VBzisOQfzqL1WRbNyItaruf3kTigXltjzgqzbheaFp0,23
+acryl_datahub_airflow_plugin-1.3.1.5rc2.dist-info/RECORD,,
```
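Each RECORD row has the form `path,sha256=<digest>,size`, where the digest is the unpadded urlsafe-base64 SHA-256 of the file contents per the wheel spec (the repeated `47DEQpj...` digest above is the hash of an empty file). A small sketch for verifying one entry, with a placeholder path:

```python
import base64
import hashlib

def record_hash(path: str) -> str:
    # RECORD digests are urlsafe base64 with trailing '=' padding stripped.
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

# Hypothetical check against the _version.py row above:
# record_hash("datahub_airflow_plugin/_version.py")
# -> "sha256=Yj5MDuQ4EH0AveHREjNn2naYv2hwT2yZu8PBY2qBR7k" for a matching file
```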
datahub_airflow_plugin/_airflow_shims.py

```diff
@@ -1,75 +1,42 @@
-"""
-Pure dispatcher for version-specific Airflow shims.
-
-This module automatically imports the correct shims based on the installed
-Airflow version, dispatching to either airflow2 or airflow3 implementations.
-
-No logic lives here - just clean version detection and re-export.
-"""
+from typing import List
 
+import airflow.version
 import packaging.version
+from airflow.models.operator import Operator
+
+try:
+    from airflow.sensors.external_task import ExternalTaskSensor
+except ImportError:
+    from airflow.sensors.external_task_sensor import ExternalTaskSensor  # type: ignore
 
-
-
-
+# Approach suggested by https://stackoverflow.com/a/11887885/5004662.
+AIRFLOW_VERSION = packaging.version.parse(airflow.version.version)
+HAS_AIRFLOW_DAG_LISTENER_API = True  # this is in Airflow 2.5+
+HAS_AIRFLOW_DATASET_LISTENER_API = AIRFLOW_VERSION >= packaging.version.parse(
+    "2.8.0.dev0"
 )
 
-# Version feature flags - hardcode based on Airflow version
-# These were previously in the old _airflow_shims but are better kept simple
-HAS_AIRFLOW_STANDALONE_CMD = AIRFLOW_VERSION >= packaging.version.parse("2.2")
-HAS_AIRFLOW_LISTENER_API = AIRFLOW_VERSION >= packaging.version.parse("2.3")
-HAS_AIRFLOW_DAG_LISTENER_API = AIRFLOW_VERSION >= packaging.version.parse("2.5")
-HAS_AIRFLOW_DATASET_LISTENER_API = AIRFLOW_VERSION >= packaging.version.parse("2.5")
 
-
-# Airflow
-
-
-
-
-
-
-
-
-
-
-
-
-)
-
-
-
-    BaseOperator,
-    ExternalTaskSensor,
-    MappedOperator,
-    OpenLineagePlugin,
-    Operator,
-    TaskHolder,
-    get_operator_class,
-    get_task_inlets,
-    get_task_outlets,
-    redact_with_exclusions,
-    try_import_from_string,
-)
+def get_task_inlets(operator: "Operator") -> List:
+    # From Airflow 2.4 _inlets is dropped and inlets used consistently. Earlier it was not the case, so we have to stick there to _inlets
+    if hasattr(operator, "_inlets"):
+        return operator._inlets  # type: ignore[attr-defined, union-attr]
+    if hasattr(operator, "get_inlet_defs"):
+        return operator.get_inlet_defs()  # type: ignore[attr-defined]
+    return operator.inlets or []
+
+
+def get_task_outlets(operator: "Operator") -> List:
+    # From Airflow 2.4 _outlets is dropped and inlets used consistently. Earlier it was not the case, so we have to stick there to _outlets
+    # We have to use _outlets because outlets is empty in Airflow < 2.4.0
+    if hasattr(operator, "_outlets"):
+        return operator._outlets  # type: ignore[attr-defined, union-attr]
+    if hasattr(operator, "get_outlet_defs"):
+        return operator.get_outlet_defs()
+    return operator.outlets or []
+
 
 __all__ = [
-    # Airflow version and feature flags
     "AIRFLOW_VERSION",
-    "IS_AIRFLOW_3_OR_HIGHER",
-    "HAS_AIRFLOW_STANDALONE_CMD",
-    "HAS_AIRFLOW_LISTENER_API",
-    "HAS_AIRFLOW_DAG_LISTENER_API",
-    "HAS_AIRFLOW_DATASET_LISTENER_API",
-    # Airflow objects
-    "BaseOperator",
-    "Operator",
-    "MappedOperator",
     "ExternalTaskSensor",
-    "TaskHolder",
-    "OpenLineagePlugin",
-    "get_operator_class",
-    "try_import_from_string",
-    "redact_with_exclusions",
-    "get_task_inlets",
-    "get_task_outlets",
 ]
```
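One detail worth flagging in the new shim: the dataset-listener gate compares against `"2.8.0.dev0"`, so 2.8 pre-releases also pass it. A standalone sketch of the same comparison, with hardcoded example versions in place of the installed `airflow.version.version`:

```python
import packaging.version

# Any 2.8+ build, including alphas/betas/RCs, satisfies the ">= 2.8.0.dev0" gate,
# because PEP 440 orders 2.8.0.dev0 before every other 2.8.0 pre-release.
for version in ["2.7.3", "2.8.0b1", "2.9.3"]:
    parsed = packaging.version.parse(version)
    has_dataset_listener_api = parsed >= packaging.version.parse("2.8.0.dev0")
    print(version, has_dataset_listener_api)  # 2.7.3 False, 2.8.0b1 True, 2.9.3 True
```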
datahub_airflow_plugin/_config.py

```diff
@@ -1,12 +1,12 @@
 from enum import Enum
-from typing import TYPE_CHECKING, List, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Union
 
 from airflow.configuration import conf
-from pydantic import
+from pydantic import root_validator
+from pydantic.fields import Field
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
-from datahub_airflow_plugin._airflow_version_specific import IS_AIRFLOW_3_OR_HIGHER
 
 if TYPE_CHECKING:
     from datahub_airflow_plugin.hooks.datahub import (
```
```diff
@@ -18,15 +18,16 @@ if TYPE_CHECKING:
 class DatajobUrl(Enum):
     GRID = "grid"
     TASKINSTANCE = "taskinstance"
-    TASKS = "tasks"  # Airflow 3.x task URL format: /dags/{dag_id}/tasks/{task_id}
 
 
 class DatahubLineageConfig(ConfigModel):
     enabled: bool
 
-    # DataHub hook connection ID
+    # DataHub hook connection ID.
     datahub_conn_id: str
 
+    _datahub_connection_ids: List[str]
+
     # Cluster to associate with the pipelines and tasks. Defaults to "prod".
     cluster: str
 
```
```diff
@@ -52,32 +53,6 @@ class DatahubLineageConfig(ConfigModel):
 
     enable_extractors: bool
 
-    # OpenLineage extractor patching/override controls (only apply when enable_extractors=True)
-    # These allow fine-grained control over DataHub's enhancements to OpenLineage extractors
-
-    # If true (default), patch SqlExtractor to use DataHub's SQL parser
-    # This enables column-level lineage extraction from SQL queries
-    # Works with both Legacy OpenLineage and OpenLineage Provider
-    patch_sql_parser: bool
-
-    # If true (default), patch SnowflakeExtractor's default_schema property
-    # Fixes schema detection issues in Snowflake operators
-    # Works with both Legacy OpenLineage and OpenLineage Provider
-    patch_snowflake_schema: bool
-
-    # If true (default), use DataHub's custom AthenaOperatorExtractor
-    # Provides better Athena lineage with DataHub's SQL parser
-    # Only applies to Legacy OpenLineage (OpenLineage Provider has its own)
-    extract_athena_operator: bool
-
-    # If true (default), use DataHub's custom BigQueryInsertJobOperatorExtractor
-    # Handles BigQuery job configuration and destination tables
-    # Only applies to Legacy OpenLineage (OpenLineage Provider has its own)
-    extract_bigquery_insert_job_operator: bool
-
-    # If true (default) use DataHub's custom TeradataOperator
-    extract_teradata_operator: bool
-
     # If true, ti.render_templates() will be called in the listener.
     # Makes extraction of jinja-templated fields more accurate.
     render_templates: bool
```
```diff
@@ -94,17 +69,6 @@ class DatahubLineageConfig(ConfigModel):
 
     disable_openlineage_plugin: bool
 
-    @property
-    def _datahub_connection_ids(self) -> List[str]:
-        """
-        Parse comma-separated connection IDs into a list.
-
-        This is implemented as a property to avoid the class variable pollution
-        bug that would occur with validators. Each instance computes its own
-        connection ID list from its datahub_conn_id field.
-        """
-        return [conn_id.strip() for conn_id in self.datahub_conn_id.split(",")]
-
     def make_emitter_hook(self) -> Union["DatahubGenericHook", "DatahubCompositeHook"]:
         # This is necessary to avoid issues with circular imports.
         from datahub_airflow_plugin.hooks.datahub import (
```
```diff
@@ -112,11 +76,18 @@ class DatahubLineageConfig(ConfigModel):
             DatahubGenericHook,
         )
 
-
-
-        return DatahubGenericHook(connection_ids[0])
+        if len(self._datahub_connection_ids) == 1:
+            return DatahubGenericHook(self._datahub_connection_ids[0])
         else:
-            return DatahubCompositeHook(
+            return DatahubCompositeHook(self._datahub_connection_ids)
+
+    @root_validator(skip_on_failure=True)
+    def split_conn_ids(cls, values: Dict) -> Dict:
+        if not values.get("datahub_conn_id"):
+            raise ValueError("datahub_conn_id is required")
+        conn_ids = values.get("datahub_conn_id", "").split(",")
+        cls._datahub_connection_ids = [conn_id.strip() for conn_id in conn_ids]
+        return values
 
 
 def get_lineage_config() -> DatahubLineageConfig:
```
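Taken together, `split_conn_ids` and `make_emitter_hook` mean a comma-separated `datahub_conn_id` fans out to a composite hook. A standalone sketch of that behavior, using plain functions and placeholder strings rather than the plugin's actual hook classes:

```python
from typing import List


def split_conn_ids(datahub_conn_id: str) -> List[str]:
    # Mirrors the validator above: require a value, split on commas, strip whitespace.
    if not datahub_conn_id:
        raise ValueError("datahub_conn_id is required")
    return [conn_id.strip() for conn_id in datahub_conn_id.split(",")]


def make_emitter_hook(datahub_conn_id: str) -> str:
    # One connection id -> a single generic hook; several -> a composite hook.
    conn_ids = split_conn_ids(datahub_conn_id)
    if len(conn_ids) == 1:
        return f"DatahubGenericHook({conn_ids[0]!r})"
    return f"DatahubCompositeHook({conn_ids!r})"


print(make_emitter_hook("datahub_rest_default"))
print(make_emitter_hook("datahub_rest_default, datahub_kafka_default"))
```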
```diff
@@ -136,58 +107,14 @@ def get_lineage_config() -> DatahubLineageConfig:
     capture_executions = conf.get("datahub", "capture_executions", fallback=True)
     materialize_iolets = conf.get("datahub", "materialize_iolets", fallback=True)
     enable_extractors = conf.get("datahub", "enable_extractors", fallback=True)
-
-    # OpenLineage extractor patching/override configuration
-    # These only apply when enable_extractors=True
-    patch_sql_parser = conf.get("datahub", "patch_sql_parser", fallback=True)
-    patch_snowflake_schema = conf.get(
-        "datahub", "patch_snowflake_schema", fallback=True
-    )
-    extract_athena_operator = conf.get(
-        "datahub", "extract_athena_operator", fallback=True
-    )
-    extract_bigquery_insert_job_operator = conf.get(
-        "datahub", "extract_bigquery_insert_job_operator", fallback=True
-    )
-    extract_teradata_operator = conf.get(
-        "datahub", "extract_teradata_operator", fallback=True
-    )
-
     log_level = conf.get("datahub", "log_level", fallback=None)
     debug_emitter = conf.get("datahub", "debug_emitter", fallback=False)
-
-    # Disable OpenLineage plugin by default (disable_openlineage_plugin=True) for all versions.
-    # This is the safest default since most DataHub users only want DataHub's lineage.
-    #
-    # When disable_openlineage_plugin=True (default):
-    # - Only DataHub plugin runs (OpenLineagePlugin.listeners are cleared if present)
-    # - In Airflow 3: SQLParser calls only DataHub's enhanced parser
-    # - In Airflow 2: DataHub uses its own extractors
-    # - DataHub gets enhanced parsing with column-level lineage
-    #
-    # When disable_openlineage_plugin=False (opt-in for dual plugin mode):
-    # - Both DataHub and OpenLineage plugins run side-by-side
-    # - In Airflow 3: SQLParser calls BOTH parsers
-    # - OpenLineage plugin uses its own parsing results (inputs/outputs)
-    # - DataHub extracts its enhanced parsing (with column-level lineage) from run_facets
-    # - Both plugins get their expected parsing without interference
-    # - In Airflow 2: Not recommended - may cause conflicts
-    default_disable_openlineage = True
-
     disable_openlineage_plugin = conf.get(
-        "datahub", "disable_openlineage_plugin", fallback=
+        "datahub", "disable_openlineage_plugin", fallback=True
     )
     render_templates = conf.get("datahub", "render_templates", fallback=True)
-
-    # Use new task URL format for Airflow 3.x, old taskinstance format for Airflow 2.x
-    # Airflow 3 changed URL structure: /dags/{dag_id}/tasks/{task_id} instead of /taskinstance/list/...
-    default_datajob_url = (
-        DatajobUrl.TASKS.value
-        if IS_AIRFLOW_3_OR_HIGHER
-        else DatajobUrl.TASKINSTANCE.value
-    )
     datajob_url_link = conf.get(
-        "datahub", "datajob_url_link", fallback=
+        "datahub", "datajob_url_link", fallback=DatajobUrl.TASKINSTANCE.value
    )
     dag_filter_pattern = AllowDenyPattern.model_validate_json(
         conf.get("datahub", "dag_filter_str", fallback='{"allow": [".*"]}')
```
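The `dag_filter_str` option parsed at the end of this hunk is a JSON allow/deny regex pattern; the default `{"allow": [".*"]}` admits every DAG. A short sketch of how such a pattern behaves, with illustrative regexes in place of the default (`AllowDenyPattern.allowed` is datahub's matcher method):

```python
from datahub.configuration.common import AllowDenyPattern

# Same parsing call as in the hunk above, with illustrative allow/deny regexes.
pattern = AllowDenyPattern.model_validate_json(
    '{"allow": ["sales_.*"], "deny": ["sales_tmp"]}'
)
print(pattern.allowed("sales_daily"))  # True: matches an allow regex, no deny match
print(pattern.allowed("sales_tmp"))   # False: deny patterns take precedence
print(pattern.allowed("marketing"))   # False: matches no allow regex
```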
```diff
@@ -205,11 +132,6 @@ def get_lineage_config() -> DatahubLineageConfig:
         capture_executions=capture_executions,
         materialize_iolets=materialize_iolets,
         enable_extractors=enable_extractors,
-        patch_sql_parser=patch_sql_parser,
-        patch_snowflake_schema=patch_snowflake_schema,
-        extract_athena_operator=extract_athena_operator,
-        extract_bigquery_insert_job_operator=extract_bigquery_insert_job_operator,
-        extract_teradata_operator=extract_teradata_operator,
         log_level=log_level,
         debug_emitter=debug_emitter,
         disable_openlineage_plugin=disable_openlineage_plugin,
```
datahub_airflow_plugin/_datahub_ol_adapter.py

```diff
@@ -1,18 +1,6 @@
 import logging
-from typing import TYPE_CHECKING
 
-
-try:
-    from openlineage.client.run import Dataset as OpenLineageDataset
-
-    OPENLINEAGE_AVAILABLE = True
-except ImportError:
-    # Not available when openlineage packages aren't installed
-    OpenLineageDataset = None  # type: ignore[assignment,misc]
-    OPENLINEAGE_AVAILABLE = False
-
-if TYPE_CHECKING:
-    from openlineage.client.run import Dataset as OpenLineageDataset
+from openlineage.client.run import Dataset as OpenLineageDataset
 
 import datahub.emitter.mce_builder as builder
 
```
```diff
@@ -25,7 +13,7 @@ OL_SCHEME_TWEAKS = {
 }
 
 
-def translate_ol_to_datahub_urn(ol_uri:
+def translate_ol_to_datahub_urn(ol_uri: OpenLineageDataset) -> str:
     namespace = ol_uri.namespace
     name = ol_uri.name
 
```
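With the unconditional import above, the adapter's signature can reference `OpenLineageDataset` directly. A rough sketch of the namespace-to-URN idea the function implements; the naive scheme split and example values here are assumptions, not the adapter's full logic (which also applies the `OL_SCHEME_TWEAKS` remapping):

```python
import datahub.emitter.mce_builder as builder
from openlineage.client.run import Dataset as OpenLineageDataset

# An OpenLineage dataset coordinate: the namespace carries the platform scheme,
# the name carries the table path.
ol_dataset = OpenLineageDataset(namespace="snowflake://my_account", name="db.schema.table")

platform = ol_dataset.namespace.split("://")[0]  # naive scheme extraction
urn = builder.make_dataset_urn(platform=platform, name=ol_dataset.name)
print(urn)  # urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)
```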