acryl-datahub-airflow-plugin 1.3.1.4__py3-none-any.whl → 1.3.1.5__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
Files changed (51)
  1. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +303 -0
  2. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +65 -0
  3. datahub_airflow_plugin/_airflow_compat.py +32 -0
  4. datahub_airflow_plugin/_airflow_shims.py +64 -31
  5. datahub_airflow_plugin/_airflow_version_specific.py +184 -0
  6. datahub_airflow_plugin/_config.py +97 -19
  7. datahub_airflow_plugin/_constants.py +16 -0
  8. datahub_airflow_plugin/_datahub_ol_adapter.py +14 -2
  9. datahub_airflow_plugin/_version.py +1 -1
  10. datahub_airflow_plugin/airflow2/__init__.py +6 -0
  11. datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +402 -0
  12. datahub_airflow_plugin/airflow2/_airflow_compat.py +95 -0
  13. datahub_airflow_plugin/airflow2/_extractors.py +477 -0
  14. datahub_airflow_plugin/airflow2/_legacy_shims.py +20 -0
  15. datahub_airflow_plugin/airflow2/_openlineage_compat.py +123 -0
  16. datahub_airflow_plugin/airflow2/_provider_shims.py +29 -0
  17. datahub_airflow_plugin/airflow2/_shims.py +88 -0
  18. datahub_airflow_plugin/airflow2/datahub_listener.py +1072 -0
  19. datahub_airflow_plugin/airflow3/__init__.py +6 -0
  20. datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +408 -0
  21. datahub_airflow_plugin/airflow3/_airflow_compat.py +108 -0
  22. datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +153 -0
  23. datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +273 -0
  24. datahub_airflow_plugin/airflow3/_shims.py +82 -0
  25. datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +88 -0
  26. datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +308 -0
  27. datahub_airflow_plugin/airflow3/datahub_listener.py +1452 -0
  28. datahub_airflow_plugin/client/airflow_generator.py +147 -43
  29. datahub_airflow_plugin/datahub_listener.py +19 -790
  30. datahub_airflow_plugin/example_dags/__init__.py +32 -0
  31. datahub_airflow_plugin/example_dags/airflow2/__init__.py +8 -0
  32. datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +54 -0
  33. datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +43 -0
  34. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +69 -0
  35. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +69 -0
  36. datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +81 -0
  37. datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +68 -0
  38. datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +99 -0
  39. datahub_airflow_plugin/example_dags/airflow3/__init__.py +8 -0
  40. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +51 -0
  41. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +51 -0
  42. datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +89 -0
  43. datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +12 -4
  44. datahub_airflow_plugin/hooks/datahub.py +11 -2
  45. datahub_airflow_plugin/operators/datahub.py +20 -3
  46. acryl_datahub_airflow_plugin-1.3.1.4.dist-info/METADATA +0 -90
  47. acryl_datahub_airflow_plugin-1.3.1.4.dist-info/RECORD +0 -33
  48. datahub_airflow_plugin/_extractors.py +0 -336
  49. {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/WHEEL +0 -0
  50. {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/entry_points.txt +0 -0
  51. {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/top_level.txt +0 -0
datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py
@@ -0,0 +1,89 @@
+"""Snowflake DataHub Ingest DAG
+
+This example demonstrates how to ingest metadata from Snowflake into DataHub
+from within an Airflow DAG. In contrast to the MySQL example, this DAG
+pulls the DB connection configuration from Airflow's connection store.
+
+This is the Airflow 3.0+ version.
+"""
+
+from datetime import datetime, timedelta
+
+from airflow import DAG
+from airflow.hooks.base_hook import BaseHook
+from airflow.operators.python import PythonVirtualenvOperator
+
+
+def ingest_from_snowflake(snowflake_credentials, datahub_gms_server):
+    from datahub.ingestion.run.pipeline import Pipeline
+
+    pipeline = Pipeline.create(
+        # This configuration is analogous to a recipe configuration.
+        {
+            "source": {
+                "type": "snowflake",
+                "config": {
+                    **snowflake_credentials,
+                    # Other Snowflake config can be added here.
+                    "profiling": {"enabled": False},
+                },
+            },
+            # Other ingestion features, like transformers, are also supported.
+            # "transformers": [
+            #     {
+            #         "type": "simple_add_dataset_ownership",
+            #         "config": {
+            #             "owner_urns": [
+            #                 "urn:li:corpuser:example",
+            #             ]
+            #         },
+            #     }
+            # ],
+            "sink": {
+                "type": "datahub-rest",
+                "config": {"server": datahub_gms_server},
+            },
+        }
+    )
+    pipeline.run()
+    pipeline.pretty_print_summary()
+    pipeline.raise_from_status()
+
+
+with DAG(
+    "datahub_snowflake_ingest",
+    default_args={
+        "owner": "airflow",
+    },
+    description="An example DAG which ingests metadata from Snowflake to DataHub",
+    start_date=datetime(2022, 1, 1),
+    schedule=timedelta(days=1),
+    catchup=False,
+) as dag:
+    # This example pulls credentials from Airflow's connection store.
+    # For this to work, you must have previously configured these connections in Airflow.
+    # See the Airflow docs for details: https://airflow.apache.org/docs/apache-airflow/stable/howto/connection.html
+    snowflake_conn = BaseHook.get_connection("snowflake_admin_default")
+    datahub_conn = BaseHook.get_connection("datahub_rest_default")
+
+    # While it is also possible to use the PythonOperator, we recommend using
+    # the PythonVirtualenvOperator to ensure that there are no dependency
+    # conflicts between DataHub and the rest of your Airflow environment.
+    ingest_task = PythonVirtualenvOperator(
+        task_id="ingest_from_snowflake",
+        requirements=[
+            "acryl-datahub[snowflake]",
+        ],
+        system_site_packages=False,
+        python_callable=ingest_from_snowflake,
+        op_kwargs={
+            "snowflake_credentials": {
+                "username": snowflake_conn.login,
+                "password": snowflake_conn.password,
+                "account_id": snowflake_conn.extra_dejson["account"],
+                "warehouse": snowflake_conn.extra_dejson.get("warehouse"),
+                "role": snowflake_conn.extra_dejson.get("role"),
+            },
+            "datahub_gms_server": datahub_conn.host,
+        },
+    )
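
The DAG above wraps a standard DataHub ingestion recipe in a PythonVirtualenvOperator. A minimal standalone sketch of the same recipe, using only the Pipeline API already shown above, can help verify credentials outside Airflow before scheduling it. All credential values below are placeholders and the GMS server address is an assumption.

# Standalone sketch (not part of the package): same recipe as the DAG above,
# runnable outside Airflow.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "snowflake",
            "config": {
                "username": "<username>",        # placeholder
                "password": "<password>",        # placeholder
                "account_id": "<account_id>",    # placeholder
                "warehouse": "<warehouse>",      # placeholder
                "role": "<role>",                # placeholder
                "profiling": {"enabled": False},
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},  # assumed local GMS
        },
    }
)
pipeline.run()
pipeline.pretty_print_summary()
pipeline.raise_from_status()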
datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py
@@ -3,17 +3,25 @@
 from datetime import timedelta
 
 import pendulum
-from airflow.decorators import dag, task
+from airflow.decorators import (  # type: ignore[attr-defined] # Decorators not available in all Airflow versions
+    dag,
+    task,
+)
 
 from datahub.ingestion.graph.client import DataHubGraph, RemovedStatusFilter
+from datahub_airflow_plugin._airflow_version_specific import (
+    get_airflow_compatible_dag_kwargs,
+)
 from datahub_airflow_plugin.hooks.datahub import DatahubRestHook
 
-
-@dag(
-    schedule_interval=timedelta(days=1),
+dag_decorator_kwargs = get_airflow_compatible_dag_kwargs(
     start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
+    schedule_interval=timedelta(days=1),
     catchup=False,
 )
+
+
+@dag(**dag_decorator_kwargs)
 def datahub_graph_usage_sample_dag():
     @task()
     def use_the_graph():
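
The helper imported above, get_airflow_compatible_dag_kwargs, is defined in the new _airflow_version_specific.py module, whose body is not included in this diff. Below is a hypothetical sketch of the dispatch it appears to perform: Airflow 3 removed the schedule_interval argument in favor of schedule. The names IS_AIRFLOW_3_OR_HIGHER and get_airflow_compatible_dag_kwargs come from the diff; the implementation is an assumption.

# Hypothetical sketch only; the real helper lives in
# datahub_airflow_plugin/_airflow_version_specific.py and is not shown here.
from typing import Any, Dict

import airflow
from packaging.version import Version

IS_AIRFLOW_3_OR_HIGHER = Version(airflow.__version__) >= Version("3.0.0")


def get_airflow_compatible_dag_kwargs(**kwargs: Any) -> Dict[str, Any]:
    # Airflow 3 dropped schedule_interval in favor of schedule.
    if IS_AIRFLOW_3_OR_HIGHER and "schedule_interval" in kwargs:
        kwargs["schedule"] = kwargs.pop("schedule_interval")
    return kwargs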
datahub_airflow_plugin/hooks/datahub.py
@@ -1,7 +1,12 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple, Union
 
 from airflow.exceptions import AirflowException
-from airflow.hooks.base import BaseHook
+
+# BaseHook import - prefer new location in Airflow 3.x
+try:
+    from airflow.sdk.bases.hook import BaseHook
+except (ModuleNotFoundError, ImportError):
+    from airflow.hooks.base import BaseHook  # type: ignore
 
 from datahub.emitter.composite_emitter import CompositeEmitter
 from datahub.emitter.generic_emitter import Emitter
@@ -10,9 +15,13 @@ from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
     MetadataChangeEvent,
     MetadataChangeProposal,
 )
+from datahub_airflow_plugin._airflow_version_specific import IS_AIRFLOW_3_OR_HIGHER
 
 if TYPE_CHECKING:
-    from airflow.models.connection import Connection
+    if IS_AIRFLOW_3_OR_HIGHER:
+        from airflow.sdk.definitions.connection import Connection
+    else:
+        from airflow.models.connection import Connection  # type: ignore[assignment]
 
     from datahub.emitter.kafka_emitter import DatahubKafkaEmitter
     from datahub.emitter.rest_emitter import DataHubRestEmitter
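
With the fallback import above, call sites keep working unchanged on both major Airflow versions. A minimal sketch, assuming the datahub_rest_default connection id used by the example DAGs elsewhere in this diff exists in the Airflow connection store:

# The same call site runs on Airflow 2.x and 3.x thanks to the fallback import.
try:
    from airflow.sdk.bases.hook import BaseHook  # Airflow 3.x location
except (ModuleNotFoundError, ImportError):
    from airflow.hooks.base import BaseHook  # type: ignore  # Airflow 2.x location

conn = BaseHook.get_connection("datahub_rest_default")
print(conn.host, conn.extra_dejson)  # Connection attributes are the same on both versions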
datahub_airflow_plugin/operators/datahub.py
@@ -1,4 +1,4 @@
-from typing import Any, List, Union
+from typing import TYPE_CHECKING, Any, List, Union
 
 from airflow.models import BaseOperator
 from avrogen.dict_wrapper import DictWrapper
@@ -11,6 +11,21 @@ from datahub_airflow_plugin.hooks.datahub import (
     DatahubRestHook,
 )
 
+if TYPE_CHECKING:
+    from jinja2 import Environment
+
+    # Import Context with version compatibility for type checking
+    # Import to different names to avoid redefinition errors, then assign to Context
+    Context: Any
+    try:
+        from airflow.utils.context import Context as _AirflowContext
+
+        Context = _AirflowContext
+    except ImportError:
+        from airflow.sdk.definitions.context import Context as _Airflow3Context
+
+        Context = _Airflow3Context  # type: ignore[no-redef]
+
 
 class DatahubBaseOperator(BaseOperator):
     """
@@ -56,7 +71,9 @@ class DatahubEmitterOperator(DatahubBaseOperator):
         )
         self.metadata = mces
 
-    def _render_template_fields(self, field_value, context, jinja_env):
+    def _render_template_fields(
+        self, field_value: Any, context: "Context", jinja_env: "Environment"
+    ) -> Any:
         if isinstance(field_value, DictWrapper):
             for key, value in field_value.items():
                 setattr(
@@ -73,7 +90,7 @@ class DatahubEmitterOperator(DatahubBaseOperator):
             return super().render_template(field_value, context, jinja_env)
         return field_value
 
-    def execute(self, context):
+    def execute(self, context: "Context") -> None:
         if context:
             jinja_env = self.get_template_env()
 
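
The annotated operator is used the same way as before. A hedged usage sketch modeled on the plugin's lineage_emission_dag example; dataset names and the connection id are placeholders, and argument names may differ slightly between plugin versions:

# Hedged sketch; place inside a `with DAG(...):` block in a real DAG file.
import datahub.emitter.mce_builder as builder

from datahub_airflow_plugin.operators.datahub import DatahubEmitterOperator

emit_lineage_task = DatahubEmitterOperator(
    task_id="emit_lineage",
    datahub_conn_id="datahub_rest_default",
    mces=[
        builder.make_lineage_mce(
            upstream_urns=[
                builder.make_dataset_urn(platform="snowflake", name="mydb.schema.upstream_table"),
            ],
            downstream_urn=builder.make_dataset_urn(
                platform="snowflake", name="mydb.schema.downstream_table"
            ),
        )
    ],
)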
acryl_datahub_airflow_plugin-1.3.1.4.dist-info/METADATA
@@ -1,90 +0,0 @@
-Metadata-Version: 2.4
-Name: acryl-datahub-airflow-plugin
-Version: 1.3.1.4
-Summary: Datahub Airflow plugin to capture executions and send to Datahub
-Home-page: https://docs.datahub.com/
-License: Apache-2.0
-Project-URL: Documentation, https://docs.datahub.com/docs/
-Project-URL: Source, https://github.com/datahub-project/datahub
-Project-URL: Changelog, https://github.com/datahub-project/datahub/releases
-Classifier: Development Status :: 5 - Production/Stable
-Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Intended Audience :: Developers
-Classifier: Intended Audience :: Information Technology
-Classifier: Intended Audience :: System Administrators
-Classifier: Operating System :: Unix
-Classifier: Operating System :: POSIX :: Linux
-Classifier: Environment :: Console
-Classifier: Environment :: MacOS X
-Classifier: Topic :: Software Development
-Requires-Python: >=3.9
-Description-Content-Type: text/markdown
-Requires-Dist: apache-airflow<3,>=2.7.0
-Requires-Dist: openlineage-airflow<=1.30.1,>=1.2.0
-Requires-Dist: acryl-datahub[datahub-rest]==1.3.1.4
-Requires-Dist: acryl-datahub[datahub-rest,sql-parser]==1.3.1.4
-Requires-Dist: pydantic>=2.4.0
-Provides-Extra: ignore
-Provides-Extra: datahub-rest
-Requires-Dist: acryl-datahub[datahub-rest]==1.3.1.4; extra == "datahub-rest"
-Provides-Extra: datahub-kafka
-Requires-Dist: acryl-datahub[datahub-kafka]==1.3.1.4; extra == "datahub-kafka"
-Provides-Extra: datahub-file
-Requires-Dist: acryl-datahub[sync-file-emitter]==1.3.1.4; extra == "datahub-file"
-Provides-Extra: dev
-Requires-Dist: ruff==0.11.7; extra == "dev"
-Requires-Dist: apache-airflow<3,>=2.7.0; extra == "dev"
-Requires-Dist: packaging; extra == "dev"
-Requires-Dist: openlineage-airflow<=1.30.1,>=1.2.0; extra == "dev"
-Requires-Dist: tenacity; extra == "dev"
-Requires-Dist: tox; extra == "dev"
-Requires-Dist: types-PyYAML; extra == "dev"
-Requires-Dist: mypy==1.17.1; extra == "dev"
-Requires-Dist: tox-uv; extra == "dev"
-Requires-Dist: twine; extra == "dev"
-Requires-Dist: types-click==0.1.12; extra == "dev"
-Requires-Dist: deepdiff!=8.0.0; extra == "dev"
-Requires-Dist: coverage>=5.1; extra == "dev"
-Requires-Dist: pytest-cov>=2.8.1; extra == "dev"
-Requires-Dist: types-setuptools; extra == "dev"
-Requires-Dist: pydantic>=2.4.0; extra == "dev"
-Requires-Dist: types-dataclasses; extra == "dev"
-Requires-Dist: build; extra == "dev"
-Requires-Dist: acryl-datahub[datahub-rest]==1.3.1.4; extra == "dev"
-Requires-Dist: types-tabulate; extra == "dev"
-Requires-Dist: types-six; extra == "dev"
-Requires-Dist: types-requests; extra == "dev"
-Requires-Dist: types-cachetools; extra == "dev"
-Requires-Dist: acryl-datahub[datahub-rest,sql-parser]==1.3.1.4; extra == "dev"
-Requires-Dist: sqlalchemy-stubs; extra == "dev"
-Requires-Dist: pytest>=6.2.2; extra == "dev"
-Requires-Dist: types-toml; extra == "dev"
-Requires-Dist: types-python-dateutil; extra == "dev"
-Provides-Extra: integration-tests
-Requires-Dist: acryl-datahub[sync-file-emitter]==1.3.1.4; extra == "integration-tests"
-Requires-Dist: apache-airflow[amazon,google,snowflake]>=2.0.2; extra == "integration-tests"
-Requires-Dist: acryl-datahub[testing-utils]==1.3.1.4; extra == "integration-tests"
-Requires-Dist: acryl-datahub[datahub-kafka]==1.3.1.4; extra == "integration-tests"
-Requires-Dist: snowflake-connector-python>=2.7.10; extra == "integration-tests"
-Requires-Dist: apache-airflow-providers-sqlite; extra == "integration-tests"
-Requires-Dist: virtualenv; extra == "integration-tests"
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: license
-Dynamic: project-url
-Dynamic: provides-extra
-Dynamic: requires-dist
-Dynamic: requires-python
-Dynamic: summary
-
-# Datahub Airflow Plugin
-
-See [the DataHub Airflow docs](https://docs.datahub.com/docs/lineage/airflow) for details.
-
-## Developing
-
-See the [developing docs](../../metadata-ingestion/developing.md).
acryl_datahub_airflow_plugin-1.3.1.4.dist-info/RECORD
@@ -1,33 +0,0 @@
-datahub_airflow_plugin/__init__.py,sha256=NScUtA8N-m66Pyg0DO--YbPkrl48PK3UevpdQVW_y6E,1009
-datahub_airflow_plugin/_airflow_shims.py,sha256=hLMTkANJzmH9sEcaiwNb0EZgD11vh-XnDBqFQ9yqjr4,1613
-datahub_airflow_plugin/_config.py,sha256=qNbNC6YUGHf06RfOMsU7jpeNeE2ttxjUQIMEpxIhvyM,5221
-datahub_airflow_plugin/_datahub_ol_adapter.py,sha256=RuzMyWZo7MeJzAFoBfkT4cdDw5g1iWshB_nXG7jLnR0,545
-datahub_airflow_plugin/_extractors.py,sha256=HzYf6Nm8HhRfXGVTQvVgFVRUe54RSOq0CDv8oxtor1A,11875
-datahub_airflow_plugin/_version.py,sha256=RJL3ep1RXk_49LNT5HC_UqXUDu65_KSs6TJ87R1_CA0,145
-datahub_airflow_plugin/datahub_listener.py,sha256=plAU-DZ7EQmC1Ec4gzOaksqHnJLxaCYHkm2jEPTkgvc,31100
-datahub_airflow_plugin/datahub_plugin.py,sha256=rbZhs7s5O3_MlkQw5aZToC2W5mMic_EpI3oybHB0ofw,1224
-datahub_airflow_plugin/entities.py,sha256=xDZ-mZH7hjUkZbatWYUwI43_9B40wGiotlyQhiO8rEM,1987
-datahub_airflow_plugin/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub_airflow_plugin/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub_airflow_plugin/client/airflow_generator.py,sha256=zWGX6M7hqbNvOPu11VlutkJ-g149Xv2m5_IC3GqfRJk,22120
-datahub_airflow_plugin/example_dags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub_airflow_plugin/example_dags/generic_recipe_sample_dag.py,sha256=BbrOErFboKMDFn712RHEKI9T4Vh0q6kYSVet56gPqVk,1319
-datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py,sha256=xYnuXhWXL5b9Tij0BFvFLckjKCobjzPU3xlxLg2_NXc,1015
-datahub_airflow_plugin/example_dags/lineage_backend_demo.py,sha256=Dy6MxwtX7G0mQeALqpLRu4F03IyU9fqIkr-CcKpo2JE,1625
-datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py,sha256=kW2rLFtOnoiMxBJ315GzlmR0Sz1cqQ_wwLbG9UC-u7Y,1499
-datahub_airflow_plugin/example_dags/lineage_emission_dag.py,sha256=LE29DzW51a4ZAl_zrcLrqSyzmy8qElcZagXsIMjaZLU,1946
-datahub_airflow_plugin/example_dags/mysql_sample_dag.py,sha256=Unx9Ger3R9ptEutfV-4NjjEaTIEYJ-tLrZr7OsK608k,1922
-datahub_airflow_plugin/example_dags/snowflake_sample_dag.py,sha256=b9iaE7zChQha9u57F84U6uqavGl7WrUnMNOzXEiZxjE,3234
-datahub_airflow_plugin/hooks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub_airflow_plugin/hooks/datahub.py,sha256=xa-gsJUza3jSAUP1QLJSNBn4bUHxjXo_FdCAf08IWFo,11155
-datahub_airflow_plugin/operators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub_airflow_plugin/operators/datahub.py,sha256=3fT_qg1tUpBEne1XBk8zDIUNBMcox7mxoEoN9O4XIPA,3814
-datahub_airflow_plugin/operators/datahub_assertion_operator.py,sha256=j_P9M1a5qME55pKHAfTqZsVVtIslFBO59r8UQOOBvsk,2914
-datahub_airflow_plugin/operators/datahub_assertion_sensor.py,sha256=QJIZZYQhqscj3bhBN5Sei-ABMRRAl2KiQxXTXcZQ51Q,2917
-datahub_airflow_plugin/operators/datahub_operation_operator.py,sha256=KJ8M8jJ7UWW6kNbiS-rELc-kqCPkZ3ck7z51oAXGPSI,3351
-datahub_airflow_plugin/operators/datahub_operation_sensor.py,sha256=U19fi5DpjBRWm_1ljXcjnspUzfa3mqYfOQZHjLk-ufI,3618
-acryl_datahub_airflow_plugin-1.3.1.4.dist-info/METADATA,sha256=USLZazGCYGGkU-KVH1etSzsIvnLyhMuQANkHGhKdzf0,3976
-acryl_datahub_airflow_plugin-1.3.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-acryl_datahub_airflow_plugin-1.3.1.4.dist-info/entry_points.txt,sha256=HqmajDHtrsz0b5Lswe1-eeuObxdtucd9YoxH77jJBA8,179
-acryl_datahub_airflow_plugin-1.3.1.4.dist-info/top_level.txt,sha256=VBzisOQfzqL1WRbNyItaruf3kTigXltjzgqzbheaFp0,23
-acryl_datahub_airflow_plugin-1.3.1.4.dist-info/RECORD,,
datahub_airflow_plugin/_extractors.py
@@ -1,336 +0,0 @@
-import contextlib
-import logging
-import unittest.mock
-from typing import TYPE_CHECKING, Optional
-
-from airflow.models.operator import Operator
-from openlineage.airflow.extractors import (
-    BaseExtractor,
-    ExtractorManager as OLExtractorManager,
-    TaskMetadata,
-)
-from openlineage.airflow.extractors.snowflake_extractor import SnowflakeExtractor
-from openlineage.airflow.extractors.sql_extractor import SqlExtractor
-from openlineage.airflow.utils import get_operator_class, try_import_from_string
-from openlineage.client.facet import (
-    ExtractionError,
-    ExtractionErrorRunFacet,
-    SqlJobFacet,
-)
-
-import datahub.emitter.mce_builder as builder
-from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
-    get_platform_from_sqlalchemy_uri,
-)
-from datahub.sql_parsing.sqlglot_lineage import (
-    SqlParsingResult,
-    create_lineage_sql_parsed_result,
-)
-from datahub_airflow_plugin._datahub_ol_adapter import OL_SCHEME_TWEAKS
-
-if TYPE_CHECKING:
-    from airflow.models import DagRun, TaskInstance
-
-    from datahub.ingestion.graph.client import DataHubGraph
-
-logger = logging.getLogger(__name__)
-_DATAHUB_GRAPH_CONTEXT_KEY = "datahub_graph"
-SQL_PARSING_RESULT_KEY = "datahub_sql"
-
-
-class ExtractorManager(OLExtractorManager):
-    # TODO: On Airflow 2.7, the OLExtractorManager is part of the built-in Airflow API.
-    # When available, we should use that instead. The same goe for most of the OL
-    # extractors.
-
-    def __init__(self):
-        super().__init__()
-
-        _sql_operator_overrides = [
-            # The OL BigQuery extractor has some complex logic to fetch detect
-            # the BigQuery job_id and fetch lineage from there. However, it can't
-            # generate CLL, so we disable it and use our own extractor instead.
-            "BigQueryOperator",
-            "BigQueryExecuteQueryOperator",
-            # Athena also does something similar.
-            "AWSAthenaOperator",
-            # Additional types that OL doesn't support. This is only necessary because
-            # on older versions of Airflow, these operators don't inherit from SQLExecuteQueryOperator.
-            "SqliteOperator",
-        ]
-        for operator in _sql_operator_overrides:
-            self.task_to_extractor.extractors[operator] = GenericSqlExtractor
-
-        self.task_to_extractor.extractors["AthenaOperator"] = AthenaOperatorExtractor
-
-        self.task_to_extractor.extractors["BigQueryInsertJobOperator"] = (
-            BigQueryInsertJobOperatorExtractor
-        )
-
-        self._graph: Optional["DataHubGraph"] = None
-
-    @contextlib.contextmanager
-    def _patch_extractors(self):
-        with contextlib.ExitStack() as stack:
-            # Patch the SqlExtractor.extract() method.
-            stack.enter_context(
-                unittest.mock.patch.object(
-                    SqlExtractor,
-                    "extract",
-                    _sql_extractor_extract,
-                )
-            )
-
-            # Patch the SnowflakeExtractor.default_schema property.
-            stack.enter_context(
-                unittest.mock.patch.object(
-                    SnowflakeExtractor,
-                    "default_schema",
-                    property(_snowflake_default_schema),
-                )
-            )
-
-            # TODO: Override the BigQuery extractor to use the DataHub SQL parser.
-            # self.extractor_manager.add_extractor()
-
-            # TODO: Override the Athena extractor to use the DataHub SQL parser.
-
-            yield
-
-    def extract_metadata(
-        self,
-        dagrun: "DagRun",
-        task: "Operator",
-        complete: bool = False,
-        task_instance: Optional["TaskInstance"] = None,
-        task_uuid: Optional[str] = None,
-        graph: Optional["DataHubGraph"] = None,
-    ) -> TaskMetadata:
-        self._graph = graph
-        with self._patch_extractors():
-            return super().extract_metadata(
-                dagrun, task, complete, task_instance, task_uuid
-            )
-
-    def _get_extractor(self, task: "Operator") -> Optional[BaseExtractor]:
-        # By adding this, we can use the generic extractor as a fallback for
-        # any operator that inherits from SQLExecuteQueryOperator.
-        clazz = get_operator_class(task)
-        SQLExecuteQueryOperator = try_import_from_string(
-            "airflow.providers.common.sql.operators.sql.SQLExecuteQueryOperator"
-        )
-        if SQLExecuteQueryOperator and issubclass(clazz, SQLExecuteQueryOperator):
-            self.task_to_extractor.extractors.setdefault(
-                clazz.__name__, GenericSqlExtractor
-            )
-
-        extractor = super()._get_extractor(task)
-        if extractor:
-            extractor.set_context(_DATAHUB_GRAPH_CONTEXT_KEY, self._graph)
-        return extractor
-
-
-class GenericSqlExtractor(SqlExtractor):
-    # Note that the extract() method is patched elsewhere.
-
-    @property
-    def default_schema(self):
-        return super().default_schema
-
-    def _get_scheme(self) -> Optional[str]:
-        # Best effort conversion to DataHub platform names.
-
-        with contextlib.suppress(Exception):
-            if self.hook:
-                if hasattr(self.hook, "get_uri"):
-                    uri = self.hook.get_uri()
-                    return get_platform_from_sqlalchemy_uri(uri)
-
-        return self.conn.conn_type or super().dialect
-
-    def _get_database(self) -> Optional[str]:
-        if self.conn:
-            # For BigQuery, the "database" is the project name.
-            if hasattr(self.conn, "project_id"):
-                return self.conn.project_id
-
-            return self.conn.schema
-        return None
-
-
-def _sql_extractor_extract(self: "SqlExtractor") -> TaskMetadata:
-    # Why not override the OL sql_parse method directly, instead of overriding
-    # extract()? A few reasons:
-    #
-    # 1. We would want to pass the default_db and graph instance into our sql parser
-    # method. The OL code doesn't pass the default_db (despite having it available),
-    # and it's not clear how to get the graph instance into that method.
-    # 2. OL has some janky logic to fetch table schemas as part of the sql extractor.
-    # We don't want that behavior and this lets us disable it.
-    # 3. Our SqlParsingResult already has DataHub urns, whereas using SqlMeta would
-    # require us to convert those urns to OL uris, just for them to get converted
-    # back to urns later on in our processing.
-
-    task_name = f"{self.operator.dag_id}.{self.operator.task_id}"
-    sql = self.operator.sql
-
-    default_database = getattr(self.operator, "database", None)
-    if not default_database:
-        default_database = self.database
-    default_schema = self.default_schema
-
-    # TODO: Add better handling for sql being a list of statements.
-    if isinstance(sql, list):
-        logger.info(f"Got list of SQL statements for {task_name}. Using first one.")
-        sql = sql[0]
-
-    # Run the SQL parser.
-    scheme = self.scheme
-    platform = OL_SCHEME_TWEAKS.get(scheme, scheme)
-
-    return _parse_sql_into_task_metadata(
-        self,
-        sql,
-        platform=platform,
-        default_database=default_database,
-        default_schema=default_schema,
-    )
-
-
-def _parse_sql_into_task_metadata(
-    self: "BaseExtractor",
-    sql: str,
-    platform: str,
-    default_database: Optional[str],
-    default_schema: Optional[str],
-) -> TaskMetadata:
-    task_name = f"{self.operator.dag_id}.{self.operator.task_id}"
-
-    run_facets = {}
-    job_facets = {"sql": SqlJobFacet(query=SqlExtractor._normalize_sql(sql))}
-
-    # Prepare to run the SQL parser.
-    graph = self.context.get(_DATAHUB_GRAPH_CONTEXT_KEY, None)
-
-    self.log.debug(
-        "Running the SQL parser %s (platform=%s, default db=%s, schema=%s): %s",
-        "with graph client" if graph else "in offline mode",
-        platform,
-        default_database,
-        default_schema,
-        sql,
-    )
-    sql_parsing_result: SqlParsingResult = create_lineage_sql_parsed_result(
-        query=sql,
-        graph=graph,
-        platform=platform,
-        platform_instance=None,
-        env=builder.DEFAULT_ENV,
-        default_db=default_database,
-        default_schema=default_schema,
-    )
-    self.log.debug(f"Got sql lineage {sql_parsing_result}")
-
-    if sql_parsing_result.debug_info.error:
-        error = sql_parsing_result.debug_info.error
-        run_facets["extractionError"] = ExtractionErrorRunFacet(
-            totalTasks=1,
-            failedTasks=1,
-            errors=[
-                ExtractionError(
-                    errorMessage=str(error),
-                    stackTrace=None,
-                    task="datahub_sql_parser",
-                    taskNumber=None,
-                )
-            ],
-        )
-
-    # Save sql_parsing_result to the facets dict. It is removed from the
-    # facet dict in the extractor's processing logic.
-    run_facets[SQL_PARSING_RESULT_KEY] = sql_parsing_result  # type: ignore
-
-    return TaskMetadata(
-        name=task_name,
-        inputs=[],
-        outputs=[],
-        run_facets=run_facets,
-        job_facets=job_facets,
-    )
-
-
-class BigQueryInsertJobOperatorExtractor(BaseExtractor):
-    def extract(self) -> Optional[TaskMetadata]:
-        from airflow.providers.google.cloud.operators.bigquery import (
-            BigQueryInsertJobOperator,  # type: ignore
-        )
-
-        operator: "BigQueryInsertJobOperator" = self.operator
-        sql = operator.configuration.get("query", {}).get("query")
-        if not sql:
-            self.log.warning("No query found in BigQueryInsertJobOperator")
-            return None
-
-        destination_table = operator.configuration.get("query", {}).get(
-            "destinationTable"
-        )
-        destination_table_urn = None
-        if destination_table:
-            project_id = destination_table.get("projectId")
-            dataset_id = destination_table.get("datasetId")
-            table_id = destination_table.get("tableId")
-
-            if project_id and dataset_id and table_id:
-                destination_table_urn = builder.make_dataset_urn(
-                    platform="bigquery",
-                    name=f"{project_id}.{dataset_id}.{table_id}",
-                    env=builder.DEFAULT_ENV,
-                )
-
-        task_metadata = _parse_sql_into_task_metadata(
-            self,
-            sql,
-            platform="bigquery",
-            default_database=operator.project_id,
-            default_schema=None,
-        )
-
-        if destination_table_urn and task_metadata:
-            sql_parsing_result = task_metadata.run_facets.get(SQL_PARSING_RESULT_KEY)
-            if sql_parsing_result and isinstance(sql_parsing_result, SqlParsingResult):
-                sql_parsing_result.out_tables.append(destination_table_urn)
-
-        return task_metadata
-
-
-class AthenaOperatorExtractor(BaseExtractor):
-    def extract(self) -> Optional[TaskMetadata]:
-        from airflow.providers.amazon.aws.operators.athena import (
-            AthenaOperator,  # type: ignore
-        )
-
-        operator: "AthenaOperator" = self.operator
-        sql = operator.query
-        if not sql:
-            self.log.warning("No query found in AthenaOperator")
-            return None
-
-        return _parse_sql_into_task_metadata(
-            self,
-            sql,
-            platform="athena",
-            default_database=None,
-            default_schema=self.operator.database,
-        )
-
-
-def _snowflake_default_schema(self: "SnowflakeExtractor") -> Optional[str]:
-    if hasattr(self.operator, "schema") and self.operator.schema is not None:
-        return self.operator.schema
-    return (
-        self.conn.extra_dejson.get("extra__snowflake__schema", "")
-        or self.conn.extra_dejson.get("schema", "")
-        or self.conn.schema
-    )
-    # TODO: Should we try a fallback of:
-    # execute_query_on_hook(self.hook, "SELECT current_schema();")[0][0]