acryl-datahub-airflow-plugin 1.3.1.5__py3-none-any.whl → 1.3.1.5rc1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry.
Files changed (51)
  1. acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info/METADATA +91 -0
  2. acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info/RECORD +33 -0
  3. datahub_airflow_plugin/_airflow_shims.py +31 -64
  4. datahub_airflow_plugin/_config.py +19 -97
  5. datahub_airflow_plugin/_datahub_ol_adapter.py +2 -14
  6. datahub_airflow_plugin/_extractors.py +365 -0
  7. datahub_airflow_plugin/_version.py +1 -1
  8. datahub_airflow_plugin/client/airflow_generator.py +43 -147
  9. datahub_airflow_plugin/datahub_listener.py +790 -19
  10. datahub_airflow_plugin/example_dags/__init__.py +0 -32
  11. datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +4 -12
  12. datahub_airflow_plugin/hooks/datahub.py +2 -11
  13. datahub_airflow_plugin/operators/datahub.py +3 -20
  14. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +0 -303
  15. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +0 -65
  16. datahub_airflow_plugin/_airflow_compat.py +0 -32
  17. datahub_airflow_plugin/_airflow_version_specific.py +0 -184
  18. datahub_airflow_plugin/_constants.py +0 -16
  19. datahub_airflow_plugin/airflow2/__init__.py +0 -6
  20. datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +0 -402
  21. datahub_airflow_plugin/airflow2/_airflow_compat.py +0 -95
  22. datahub_airflow_plugin/airflow2/_extractors.py +0 -477
  23. datahub_airflow_plugin/airflow2/_legacy_shims.py +0 -20
  24. datahub_airflow_plugin/airflow2/_openlineage_compat.py +0 -123
  25. datahub_airflow_plugin/airflow2/_provider_shims.py +0 -29
  26. datahub_airflow_plugin/airflow2/_shims.py +0 -88
  27. datahub_airflow_plugin/airflow2/datahub_listener.py +0 -1072
  28. datahub_airflow_plugin/airflow3/__init__.py +0 -6
  29. datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +0 -408
  30. datahub_airflow_plugin/airflow3/_airflow_compat.py +0 -108
  31. datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +0 -153
  32. datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +0 -273
  33. datahub_airflow_plugin/airflow3/_shims.py +0 -82
  34. datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +0 -88
  35. datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +0 -308
  36. datahub_airflow_plugin/airflow3/datahub_listener.py +0 -1452
  37. datahub_airflow_plugin/example_dags/airflow2/__init__.py +0 -8
  38. datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +0 -54
  39. datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +0 -43
  40. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +0 -69
  41. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +0 -69
  42. datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +0 -81
  43. datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +0 -68
  44. datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +0 -99
  45. datahub_airflow_plugin/example_dags/airflow3/__init__.py +0 -8
  46. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +0 -51
  47. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +0 -51
  48. datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +0 -89
  49. {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info}/WHEEL +0 -0
  50. {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info}/entry_points.txt +0 -0
  51. {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info}/top_level.txt +0 -0
acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info/METADATA
@@ -0,0 +1,91 @@
+ Metadata-Version: 2.4
+ Name: acryl-datahub-airflow-plugin
+ Version: 1.3.1.5rc1
+ Summary: Datahub Airflow plugin to capture executions and send to Datahub
+ Home-page: https://docs.datahub.com/
+ License: Apache-2.0
+ Project-URL: Documentation, https://docs.datahub.com/docs/
+ Project-URL: Source, https://github.com/datahub-project/datahub
+ Project-URL: Changelog, https://github.com/datahub-project/datahub/releases
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Programming Language :: Python
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Information Technology
+ Classifier: Intended Audience :: System Administrators
+ Classifier: Operating System :: Unix
+ Classifier: Operating System :: POSIX :: Linux
+ Classifier: Environment :: Console
+ Classifier: Environment :: MacOS X
+ Classifier: Topic :: Software Development
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ Requires-Dist: openlineage-airflow<=1.30.1,>=1.2.0
+ Requires-Dist: acryl-datahub[datahub-rest]==1.3.1.5rc1
+ Requires-Dist: pydantic>=2.4.0
+ Requires-Dist: apache-airflow<3,>=2.7.0
+ Requires-Dist: acryl-datahub[datahub-rest,sql-parser]==1.3.1.5rc1
+ Provides-Extra: ignore
+ Provides-Extra: datahub-rest
+ Requires-Dist: acryl-datahub[datahub-rest]==1.3.1.5rc1; extra == "datahub-rest"
+ Provides-Extra: datahub-kafka
+ Requires-Dist: acryl-datahub[datahub-kafka]==1.3.1.5rc1; extra == "datahub-kafka"
+ Provides-Extra: datahub-file
+ Requires-Dist: acryl-datahub[sync-file-emitter]==1.3.1.5rc1; extra == "datahub-file"
+ Provides-Extra: dev
+ Requires-Dist: openlineage-airflow<=1.30.1,>=1.2.0; extra == "dev"
+ Requires-Dist: types-PyYAML; extra == "dev"
+ Requires-Dist: pydantic>=2.4.0; extra == "dev"
+ Requires-Dist: mypy==1.17.1; extra == "dev"
+ Requires-Dist: tox-uv; extra == "dev"
+ Requires-Dist: packaging; extra == "dev"
+ Requires-Dist: sqlalchemy-stubs; extra == "dev"
+ Requires-Dist: acryl-datahub[datahub-rest]==1.3.1.5rc1; extra == "dev"
+ Requires-Dist: types-cachetools; extra == "dev"
+ Requires-Dist: coverage>=5.1; extra == "dev"
+ Requires-Dist: types-six; extra == "dev"
+ Requires-Dist: types-requests; extra == "dev"
+ Requires-Dist: types-toml; extra == "dev"
+ Requires-Dist: pytest-cov>=2.8.1; extra == "dev"
+ Requires-Dist: pytest>=6.2.2; extra == "dev"
+ Requires-Dist: tenacity; extra == "dev"
+ Requires-Dist: types-setuptools; extra == "dev"
+ Requires-Dist: build; extra == "dev"
+ Requires-Dist: deepdiff!=8.0.0; extra == "dev"
+ Requires-Dist: types-click==0.1.12; extra == "dev"
+ Requires-Dist: types-dataclasses; extra == "dev"
+ Requires-Dist: ruff==0.11.7; extra == "dev"
+ Requires-Dist: tox; extra == "dev"
+ Requires-Dist: apache-airflow<3,>=2.7.0; extra == "dev"
+ Requires-Dist: types-tabulate; extra == "dev"
+ Requires-Dist: types-python-dateutil; extra == "dev"
+ Requires-Dist: twine; extra == "dev"
+ Requires-Dist: acryl-datahub[datahub-rest,sql-parser]==1.3.1.5rc1; extra == "dev"
+ Provides-Extra: integration-tests
+ Requires-Dist: apache-airflow-providers-teradata; extra == "integration-tests"
+ Requires-Dist: snowflake-connector-python>=2.7.10; extra == "integration-tests"
+ Requires-Dist: acryl-datahub[testing-utils]==1.3.1.5rc1; extra == "integration-tests"
+ Requires-Dist: apache-airflow[amazon,google,snowflake]>=2.0.2; extra == "integration-tests"
+ Requires-Dist: acryl-datahub[datahub-kafka]==1.3.1.5rc1; extra == "integration-tests"
+ Requires-Dist: acryl-datahub[sync-file-emitter]==1.3.1.5rc1; extra == "integration-tests"
+ Requires-Dist: apache-airflow-providers-sqlite; extra == "integration-tests"
+ Requires-Dist: virtualenv; extra == "integration-tests"
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: license
+ Dynamic: project-url
+ Dynamic: provides-extra
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # Datahub Airflow Plugin
+
+ See [the DataHub Airflow docs](https://docs.datahub.com/docs/lineage/airflow) for details.
+
+ ## Developing
+
+ See the [developing docs](../../metadata-ingestion/developing.md).
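Most of the dependency changes above are mechanical pin updates from 1.3.1.5 to 1.3.1.5rc1. To verify which of these Requires-Dist entries are active in a given environment, the installed metadata can be read back with the standard library; a minimal sketch, assuming the rc1 wheel is installed:

    # Sketch: read the pins above back out of an installed environment.
    from importlib.metadata import requires, version

    pkg = "acryl-datahub-airflow-plugin"
    print(version(pkg))  # expect "1.3.1.5rc1"
    for req in requires(pkg) or []:
        # Requirements gated on an extra carry an `extra == "..."` marker.
        if "extra ==" not in req:
            print(req)  # unconditional core dependencies only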
acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info/RECORD
@@ -0,0 +1,33 @@
+ datahub_airflow_plugin/__init__.py,sha256=NScUtA8N-m66Pyg0DO--YbPkrl48PK3UevpdQVW_y6E,1009
+ datahub_airflow_plugin/_airflow_shims.py,sha256=hLMTkANJzmH9sEcaiwNb0EZgD11vh-XnDBqFQ9yqjr4,1613
+ datahub_airflow_plugin/_config.py,sha256=qNbNC6YUGHf06RfOMsU7jpeNeE2ttxjUQIMEpxIhvyM,5221
+ datahub_airflow_plugin/_datahub_ol_adapter.py,sha256=RuzMyWZo7MeJzAFoBfkT4cdDw5g1iWshB_nXG7jLnR0,545
+ datahub_airflow_plugin/_extractors.py,sha256=o2amnv1ram58zYNiWpleQcpSREthin33LQ4yRLwoGxA,12759
+ datahub_airflow_plugin/_version.py,sha256=mFUiAN31A5PUxNLnvOGfHnDXajphGmegPa0MM9whufg,148
+ datahub_airflow_plugin/datahub_listener.py,sha256=plAU-DZ7EQmC1Ec4gzOaksqHnJLxaCYHkm2jEPTkgvc,31100
+ datahub_airflow_plugin/datahub_plugin.py,sha256=rbZhs7s5O3_MlkQw5aZToC2W5mMic_EpI3oybHB0ofw,1224
+ datahub_airflow_plugin/entities.py,sha256=xDZ-mZH7hjUkZbatWYUwI43_9B40wGiotlyQhiO8rEM,1987
+ datahub_airflow_plugin/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datahub_airflow_plugin/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datahub_airflow_plugin/client/airflow_generator.py,sha256=zWGX6M7hqbNvOPu11VlutkJ-g149Xv2m5_IC3GqfRJk,22120
+ datahub_airflow_plugin/example_dags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datahub_airflow_plugin/example_dags/generic_recipe_sample_dag.py,sha256=BbrOErFboKMDFn712RHEKI9T4Vh0q6kYSVet56gPqVk,1319
+ datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py,sha256=xYnuXhWXL5b9Tij0BFvFLckjKCobjzPU3xlxLg2_NXc,1015
+ datahub_airflow_plugin/example_dags/lineage_backend_demo.py,sha256=Dy6MxwtX7G0mQeALqpLRu4F03IyU9fqIkr-CcKpo2JE,1625
+ datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py,sha256=kW2rLFtOnoiMxBJ315GzlmR0Sz1cqQ_wwLbG9UC-u7Y,1499
+ datahub_airflow_plugin/example_dags/lineage_emission_dag.py,sha256=LE29DzW51a4ZAl_zrcLrqSyzmy8qElcZagXsIMjaZLU,1946
+ datahub_airflow_plugin/example_dags/mysql_sample_dag.py,sha256=Unx9Ger3R9ptEutfV-4NjjEaTIEYJ-tLrZr7OsK608k,1922
+ datahub_airflow_plugin/example_dags/snowflake_sample_dag.py,sha256=b9iaE7zChQha9u57F84U6uqavGl7WrUnMNOzXEiZxjE,3234
+ datahub_airflow_plugin/hooks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datahub_airflow_plugin/hooks/datahub.py,sha256=xa-gsJUza3jSAUP1QLJSNBn4bUHxjXo_FdCAf08IWFo,11155
+ datahub_airflow_plugin/operators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datahub_airflow_plugin/operators/datahub.py,sha256=3fT_qg1tUpBEne1XBk8zDIUNBMcox7mxoEoN9O4XIPA,3814
+ datahub_airflow_plugin/operators/datahub_assertion_operator.py,sha256=j_P9M1a5qME55pKHAfTqZsVVtIslFBO59r8UQOOBvsk,2914
+ datahub_airflow_plugin/operators/datahub_assertion_sensor.py,sha256=QJIZZYQhqscj3bhBN5Sei-ABMRRAl2KiQxXTXcZQ51Q,2917
+ datahub_airflow_plugin/operators/datahub_operation_operator.py,sha256=KJ8M8jJ7UWW6kNbiS-rELc-kqCPkZ3ck7z51oAXGPSI,3351
+ datahub_airflow_plugin/operators/datahub_operation_sensor.py,sha256=U19fi5DpjBRWm_1ljXcjnspUzfa3mqYfOQZHjLk-ufI,3618
+ acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info/METADATA,sha256=qSybyzWbxjUDvL88MWC5HVumK3DTVtsT4Ezl4fTnyWw,4088
+ acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info/entry_points.txt,sha256=HqmajDHtrsz0b5Lswe1-eeuObxdtucd9YoxH77jJBA8,179
+ acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info/top_level.txt,sha256=VBzisOQfzqL1WRbNyItaruf3kTigXltjzgqzbheaFp0,23
+ acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info/RECORD,,
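The second field of each RECORD row is the standard wheel hash (PEP 376/427): the urlsafe base64 of the file's SHA-256 digest with "=" padding stripped. A short sketch that reproduces the format; record_hash is an illustrative helper, not part of the package:

    # Sketch: how the sha256=... values in RECORD are derived (wheel convention).
    import base64
    import hashlib

    def record_hash(data: bytes) -> str:
        digest = hashlib.sha256(data).digest()
        return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

    # The empty-file value matches py.typed and the empty __init__.py entries above.
    print(record_hash(b""))  # sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU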
datahub_airflow_plugin/_airflow_shims.py
@@ -1,75 +1,42 @@
- """
- Pure dispatcher for version-specific Airflow shims.
-
- This module automatically imports the correct shims based on the installed
- Airflow version, dispatching to either airflow2 or airflow3 implementations.
-
- No logic lives here - just clean version detection and re-export.
- """
+ from typing import List
 
+ import airflow.version
  import packaging.version
+ from airflow.models.operator import Operator
+
+ try:
+     from airflow.sensors.external_task import ExternalTaskSensor
+ except ImportError:
+     from airflow.sensors.external_task_sensor import ExternalTaskSensor  # type: ignore
 
- from datahub_airflow_plugin._airflow_version_specific import (
-     AIRFLOW_VERSION,
-     IS_AIRFLOW_3_OR_HIGHER,
+ # Approach suggested by https://stackoverflow.com/a/11887885/5004662.
+ AIRFLOW_VERSION = packaging.version.parse(airflow.version.version)
+ HAS_AIRFLOW_DAG_LISTENER_API = True  # this is in Airflow 2.5+
+ HAS_AIRFLOW_DATASET_LISTENER_API = AIRFLOW_VERSION >= packaging.version.parse(
+     "2.8.0.dev0"
  )
 
- # Version feature flags - hardcoded based on Airflow version
- # These were previously in the old _airflow_shims but are better kept simple
- HAS_AIRFLOW_STANDALONE_CMD = AIRFLOW_VERSION >= packaging.version.parse("2.2")
- HAS_AIRFLOW_LISTENER_API = AIRFLOW_VERSION >= packaging.version.parse("2.3")
- HAS_AIRFLOW_DAG_LISTENER_API = AIRFLOW_VERSION >= packaging.version.parse("2.5")
- HAS_AIRFLOW_DATASET_LISTENER_API = AIRFLOW_VERSION >= packaging.version.parse("2.5")
 
- if IS_AIRFLOW_3_OR_HIGHER:
-     # Airflow 3.x - use airflow3 shims
-     from datahub_airflow_plugin.airflow3._shims import (
-         BaseOperator,
-         ExternalTaskSensor,
-         MappedOperator,
-         OpenLineagePlugin,
-         Operator,
-         TaskHolder,
-         get_operator_class,
-         get_task_inlets,
-         get_task_outlets,
-         redact_with_exclusions,
-         try_import_from_string,
-     )
- else:
-     # Airflow 2.x - use airflow2 shims
-     from datahub_airflow_plugin.airflow2._shims import (  # type: ignore[assignment]
-         BaseOperator,
-         ExternalTaskSensor,
-         MappedOperator,
-         OpenLineagePlugin,
-         Operator,
-         TaskHolder,
-         get_operator_class,
-         get_task_inlets,
-         get_task_outlets,
-         redact_with_exclusions,
-         try_import_from_string,
-     )
+ def get_task_inlets(operator: "Operator") -> List:
+     # As of Airflow 2.4, _inlets is dropped and inlets is used consistently; earlier versions only populate _inlets.
+     if hasattr(operator, "_inlets"):
+         return operator._inlets  # type: ignore[attr-defined, union-attr]
+     if hasattr(operator, "get_inlet_defs"):
+         return operator.get_inlet_defs()  # type: ignore[attr-defined]
+     return operator.inlets or []
+
+
+ def get_task_outlets(operator: "Operator") -> List:
+     # As of Airflow 2.4, _outlets is dropped and outlets is used consistently.
+     # We have to use _outlets because outlets is empty in Airflow < 2.4.0.
+     if hasattr(operator, "_outlets"):
+         return operator._outlets  # type: ignore[attr-defined, union-attr]
+     if hasattr(operator, "get_outlet_defs"):
+         return operator.get_outlet_defs()
+     return operator.outlets or []
+
 
  __all__ = [
-     # Airflow version and feature flags
      "AIRFLOW_VERSION",
-     "IS_AIRFLOW_3_OR_HIGHER",
-     "HAS_AIRFLOW_STANDALONE_CMD",
-     "HAS_AIRFLOW_LISTENER_API",
-     "HAS_AIRFLOW_DAG_LISTENER_API",
-     "HAS_AIRFLOW_DATASET_LISTENER_API",
-     # Airflow objects
-     "BaseOperator",
-     "Operator",
-     "MappedOperator",
      "ExternalTaskSensor",
-     "TaskHolder",
-     "OpenLineagePlugin",
-     "get_operator_class",
-     "try_import_from_string",
-     "redact_with_exclusions",
-     "get_task_inlets",
-     "get_task_outlets",
  ]
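The rc1 module replaces the version-dispatching shims with one flat implementation: get_task_inlets and get_task_outlets probe the legacy _inlets/_outlets attributes first, then get_inlet_defs()/get_outlet_defs(), and finally fall back to the public inlets/outlets fields. A minimal usage sketch, assuming an Airflow 2.x environment (per the wheel's apache-airflow<3,>=2.7.0 pin); the iolet values are hypothetical:

    # Sketch: resolving iolets with the rc1 helpers.
    from airflow.operators.empty import EmptyOperator

    from datahub_airflow_plugin._airflow_shims import get_task_inlets, get_task_outlets

    task = EmptyOperator(
        task_id="example",
        inlets=["s3://bucket/input"],    # hypothetical iolets
        outlets=["s3://bucket/output"],
    )

    # The helpers try the legacy _inlets/_outlets attributes first, then
    # get_inlet_defs()/get_outlet_defs(), then fall back to .inlets/.outlets.
    print(get_task_inlets(task))   # expected: ['s3://bucket/input']
    print(get_task_outlets(task))  # expected: ['s3://bucket/output']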
datahub_airflow_plugin/_config.py
@@ -1,12 +1,12 @@
  from enum import Enum
- from typing import TYPE_CHECKING, List, Optional, Union
+ from typing import TYPE_CHECKING, Dict, List, Optional, Union
 
  from airflow.configuration import conf
- from pydantic import Field
+ from pydantic import root_validator
+ from pydantic.fields import Field
 
  import datahub.emitter.mce_builder as builder
  from datahub.configuration.common import AllowDenyPattern, ConfigModel
- from datahub_airflow_plugin._airflow_version_specific import IS_AIRFLOW_3_OR_HIGHER
 
  if TYPE_CHECKING:
      from datahub_airflow_plugin.hooks.datahub import (
@@ -18,15 +18,16 @@ if TYPE_CHECKING:
  class DatajobUrl(Enum):
      GRID = "grid"
      TASKINSTANCE = "taskinstance"
-     TASKS = "tasks"  # Airflow 3.x task URL format: /dags/{dag_id}/tasks/{task_id}
 
 
  class DatahubLineageConfig(ConfigModel):
      enabled: bool
 
-     # DataHub hook connection ID (can be comma-separated for multiple connections).
+     # DataHub hook connection ID.
      datahub_conn_id: str
 
+     _datahub_connection_ids: List[str]
+
      # Cluster to associate with the pipelines and tasks. Defaults to "prod".
      cluster: str
 
@@ -52,32 +53,6 @@
 
      enable_extractors: bool
 
-     # OpenLineage extractor patching/override controls (only apply when enable_extractors=True).
-     # These allow fine-grained control over DataHub's enhancements to OpenLineage extractors.
-
-     # If true (default), patch SqlExtractor to use DataHub's SQL parser.
-     # This enables column-level lineage extraction from SQL queries.
-     # Works with both Legacy OpenLineage and the OpenLineage Provider.
-     patch_sql_parser: bool
-
-     # If true (default), patch SnowflakeExtractor's default_schema property.
-     # Fixes schema detection issues in Snowflake operators.
-     # Works with both Legacy OpenLineage and the OpenLineage Provider.
-     patch_snowflake_schema: bool
-
-     # If true (default), use DataHub's custom AthenaOperatorExtractor.
-     # Provides better Athena lineage with DataHub's SQL parser.
-     # Only applies to Legacy OpenLineage (the OpenLineage Provider has its own).
-     extract_athena_operator: bool
-
-     # If true (default), use DataHub's custom BigQueryInsertJobOperatorExtractor.
-     # Handles BigQuery job configuration and destination tables.
-     # Only applies to Legacy OpenLineage (the OpenLineage Provider has its own).
-     extract_bigquery_insert_job_operator: bool
-
-     # If true (default), use DataHub's custom TeradataOperator extractor.
-     extract_teradata_operator: bool
-
      # If true, ti.render_templates() will be called in the listener.
      # Makes extraction of jinja-templated fields more accurate.
      render_templates: bool
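These [datahub] options are read through Airflow's configuration API (see the conf.get calls in the get_lineage_config hunk below), so in the 1.3.1.5 build they can also be supplied as AIRFLOW__DATAHUB__* environment variables, following standard Airflow config resolution. A sketch for the patch_sql_parser flag, which rc1 no longer reads; the value shown is a hypothetical override:

    # Sketch: overriding a [datahub] option via Airflow's env-var convention.
    import os

    os.environ["AIRFLOW__DATAHUB__PATCH_SQL_PARSER"] = "false"

    from airflow.configuration import conf

    # conf.get returns the raw string from the highest-priority source.
    print(conf.get("datahub", "patch_sql_parser", fallback=True))  # "false"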
@@ -94,17 +69,6 @@ def get_lineage_config() -> DatahubLineageConfig:
 
      disable_openlineage_plugin: bool
 
-     @property
-     def _datahub_connection_ids(self) -> List[str]:
-         """
-         Parse comma-separated connection IDs into a list.
-
-         This is implemented as a property to avoid the class variable pollution
-         bug that would occur with validators. Each instance computes its own
-         connection ID list from its datahub_conn_id field.
-         """
-         return [conn_id.strip() for conn_id in self.datahub_conn_id.split(",")]
-
      def make_emitter_hook(self) -> Union["DatahubGenericHook", "DatahubCompositeHook"]:
          # This is necessary to avoid issues with circular imports.
          from datahub_airflow_plugin.hooks.datahub import (
@@ -112,11 +76,18 @@ class DatahubLineageConfig(ConfigModel):
              DatahubGenericHook,
          )
 
-         connection_ids = self._datahub_connection_ids
-         if len(connection_ids) == 1:
-             return DatahubGenericHook(connection_ids[0])
+         if len(self._datahub_connection_ids) == 1:
+             return DatahubGenericHook(self._datahub_connection_ids[0])
          else:
-             return DatahubCompositeHook(connection_ids)
+             return DatahubCompositeHook(self._datahub_connection_ids)
+
+     @root_validator(skip_on_failure=True)
+     def split_conn_ids(cls, values: Dict) -> Dict:
+         if not values.get("datahub_conn_id"):
+             raise ValueError("datahub_conn_id is required")
+         conn_ids = values.get("datahub_conn_id", "").split(",")
+         cls._datahub_connection_ids = [conn_id.strip() for conn_id in conn_ids]
+         return values
 
 
  def get_lineage_config() -> DatahubLineageConfig:
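The rc1 code moves connection-ID splitting from the 1.3.1.5 property into a pydantic root_validator. Note that the validator writes to cls rather than the instance, which is the class-variable pollution that the removed 1.3.1.5 docstring (previous hunk) says the property was introduced to avoid. The splitting itself is simple; a standalone restatement of its behavior, with hypothetical connection names:

    # Sketch: the conn-id splitting performed by split_conn_ids, in isolation.
    # A single ID selects DatahubGenericHook in make_emitter_hook; a
    # comma-separated list selects DatahubCompositeHook.
    def split_conn_ids(datahub_conn_id: str) -> list:
        return [conn_id.strip() for conn_id in datahub_conn_id.split(",")]

    assert split_conn_ids("datahub_rest_default") == ["datahub_rest_default"]
    assert split_conn_ids("datahub_rest_default, datahub_kafka_default") == [
        "datahub_rest_default",
        "datahub_kafka_default",
    ]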
@@ -136,58 +107,14 @@ def get_lineage_config() -> DatahubLineageConfig:
      capture_executions = conf.get("datahub", "capture_executions", fallback=True)
      materialize_iolets = conf.get("datahub", "materialize_iolets", fallback=True)
      enable_extractors = conf.get("datahub", "enable_extractors", fallback=True)
-
-     # OpenLineage extractor patching/override configuration
-     # These only apply when enable_extractors=True
-     patch_sql_parser = conf.get("datahub", "patch_sql_parser", fallback=True)
-     patch_snowflake_schema = conf.get(
-         "datahub", "patch_snowflake_schema", fallback=True
-     )
-     extract_athena_operator = conf.get(
-         "datahub", "extract_athena_operator", fallback=True
-     )
-     extract_bigquery_insert_job_operator = conf.get(
-         "datahub", "extract_bigquery_insert_job_operator", fallback=True
-     )
-     extract_teradata_operator = conf.get(
-         "datahub", "extract_teradata_operator", fallback=True
-     )
-
      log_level = conf.get("datahub", "log_level", fallback=None)
      debug_emitter = conf.get("datahub", "debug_emitter", fallback=False)
-
-     # Disable OpenLineage plugin by default (disable_openlineage_plugin=True) for all versions.
-     # This is the safest default since most DataHub users only want DataHub's lineage.
-     #
-     # When disable_openlineage_plugin=True (default):
-     #   - Only DataHub plugin runs (OpenLineagePlugin.listeners are cleared if present)
-     #   - In Airflow 3: SQLParser calls only DataHub's enhanced parser
-     #   - In Airflow 2: DataHub uses its own extractors
-     #   - DataHub gets enhanced parsing with column-level lineage
-     #
-     # When disable_openlineage_plugin=False (opt-in for dual plugin mode):
-     #   - Both DataHub and OpenLineage plugins run side-by-side
-     #   - In Airflow 3: SQLParser calls BOTH parsers
-     #   - OpenLineage plugin uses its own parsing results (inputs/outputs)
-     #   - DataHub extracts its enhanced parsing (with column-level lineage) from run_facets
-     #   - Both plugins get their expected parsing without interference
-     #   - In Airflow 2: Not recommended - may cause conflicts
-     default_disable_openlineage = True
-
      disable_openlineage_plugin = conf.get(
-         "datahub", "disable_openlineage_plugin", fallback=default_disable_openlineage
+         "datahub", "disable_openlineage_plugin", fallback=True
      )
      render_templates = conf.get("datahub", "render_templates", fallback=True)
-
-     # Use new task URL format for Airflow 3.x, old taskinstance format for Airflow 2.x.
-     # Airflow 3 changed URL structure: /dags/{dag_id}/tasks/{task_id} instead of /taskinstance/list/...
-     default_datajob_url = (
-         DatajobUrl.TASKS.value
-         if IS_AIRFLOW_3_OR_HIGHER
-         else DatajobUrl.TASKINSTANCE.value
-     )
      datajob_url_link = conf.get(
-         "datahub", "datajob_url_link", fallback=default_datajob_url
+         "datahub", "datajob_url_link", fallback=DatajobUrl.TASKINSTANCE.value
      )
      dag_filter_pattern = AllowDenyPattern.model_validate_json(
          conf.get("datahub", "dag_filter_str", fallback='{"allow": [".*"]}')
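dag_filter_str is the one structured setting here: a JSON document parsed into an AllowDenyPattern that gates which DAGs are emitted. A minimal sketch of how a filter behaves; the deny entry is a hypothetical example:

    # Sketch: parsing dag_filter_str, as the code above does via conf.get(...).
    from datahub.configuration.common import AllowDenyPattern

    pattern = AllowDenyPattern.model_validate_json(
        '{"allow": [".*"], "deny": ["^temp_.*"]}'
    )
    print(pattern.allowed("etl_daily"))   # True: matches allow, not denied
    print(pattern.allowed("temp_debug"))  # False: matches the deny pattern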
@@ -205,11 +132,6 @@ def get_lineage_config() -> DatahubLineageConfig:
          capture_executions=capture_executions,
          materialize_iolets=materialize_iolets,
          enable_extractors=enable_extractors,
-         patch_sql_parser=patch_sql_parser,
-         patch_snowflake_schema=patch_snowflake_schema,
-         extract_athena_operator=extract_athena_operator,
-         extract_bigquery_insert_job_operator=extract_bigquery_insert_job_operator,
-         extract_teradata_operator=extract_teradata_operator,
          log_level=log_level,
          debug_emitter=debug_emitter,
          disable_openlineage_plugin=disable_openlineage_plugin,
datahub_airflow_plugin/_datahub_ol_adapter.py
@@ -1,18 +1,6 @@
  import logging
- from typing import TYPE_CHECKING
 
- # Conditional import for OpenLineage (may not be installed)
- try:
-     from openlineage.client.run import Dataset as OpenLineageDataset
-
-     OPENLINEAGE_AVAILABLE = True
- except ImportError:
-     # Not available when openlineage packages aren't installed
-     OpenLineageDataset = None  # type: ignore[assignment,misc]
-     OPENLINEAGE_AVAILABLE = False
-
- if TYPE_CHECKING:
-     from openlineage.client.run import Dataset as OpenLineageDataset
+ from openlineage.client.run import Dataset as OpenLineageDataset
 
  import datahub.emitter.mce_builder as builder
 
@@ -25,7 +13,7 @@ OL_SCHEME_TWEAKS = {
  }
 
 
- def translate_ol_to_datahub_urn(ol_uri: "OpenLineageDataset") -> str:
+ def translate_ol_to_datahub_urn(ol_uri: OpenLineageDataset) -> str:
      namespace = ol_uri.namespace
      name = ol_uri.name
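translate_ol_to_datahub_urn maps an OpenLineage Dataset (namespace plus name) to a DataHub dataset URN, with OL_SCHEME_TWEAKS above normalizing scheme names between the two systems. A usage sketch with a hypothetical Snowflake dataset:

    # Sketch: adapting an OpenLineage dataset to a DataHub URN. The account and
    # table names are hypothetical.
    from openlineage.client.run import Dataset as OpenLineageDataset

    from datahub_airflow_plugin._datahub_ol_adapter import translate_ol_to_datahub_urn

    ol_dataset = OpenLineageDataset(
        namespace="snowflake://my-account",
        name="analytics.public.events",
    )
    print(translate_ol_to_datahub_urn(ol_dataset))  # a urn:li:dataset:(...) URN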