acryl-datahub-airflow-plugin 1.3.1.5rc6__py3-none-any.whl → 1.3.1.5rc8__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
Files changed (51)
  1. acryl_datahub_airflow_plugin-1.3.1.5rc8.dist-info/METADATA +303 -0
  2. acryl_datahub_airflow_plugin-1.3.1.5rc8.dist-info/RECORD +65 -0
  3. datahub_airflow_plugin/_airflow_compat.py +32 -0
  4. datahub_airflow_plugin/_airflow_shims.py +64 -31
  5. datahub_airflow_plugin/_airflow_version_specific.py +184 -0
  6. datahub_airflow_plugin/_config.py +97 -19
  7. datahub_airflow_plugin/_constants.py +16 -0
  8. datahub_airflow_plugin/_datahub_ol_adapter.py +14 -2
  9. datahub_airflow_plugin/_version.py +1 -1
  10. datahub_airflow_plugin/airflow2/__init__.py +6 -0
  11. datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +402 -0
  12. datahub_airflow_plugin/airflow2/_airflow_compat.py +95 -0
  13. datahub_airflow_plugin/airflow2/_extractors.py +477 -0
  14. datahub_airflow_plugin/airflow2/_legacy_shims.py +20 -0
  15. datahub_airflow_plugin/airflow2/_openlineage_compat.py +123 -0
  16. datahub_airflow_plugin/airflow2/_provider_shims.py +29 -0
  17. datahub_airflow_plugin/airflow2/_shims.py +88 -0
  18. datahub_airflow_plugin/airflow2/datahub_listener.py +1072 -0
  19. datahub_airflow_plugin/airflow3/__init__.py +6 -0
  20. datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +408 -0
  21. datahub_airflow_plugin/airflow3/_airflow_compat.py +108 -0
  22. datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +153 -0
  23. datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +273 -0
  24. datahub_airflow_plugin/airflow3/_shims.py +82 -0
  25. datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +88 -0
  26. datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +308 -0
  27. datahub_airflow_plugin/airflow3/datahub_listener.py +1452 -0
  28. datahub_airflow_plugin/client/airflow_generator.py +147 -43
  29. datahub_airflow_plugin/datahub_listener.py +19 -790
  30. datahub_airflow_plugin/example_dags/__init__.py +32 -0
  31. datahub_airflow_plugin/example_dags/airflow2/__init__.py +8 -0
  32. datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +54 -0
  33. datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +43 -0
  34. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +69 -0
  35. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +69 -0
  36. datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +81 -0
  37. datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +68 -0
  38. datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +99 -0
  39. datahub_airflow_plugin/example_dags/airflow3/__init__.py +8 -0
  40. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +51 -0
  41. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +51 -0
  42. datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +89 -0
  43. datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +12 -4
  44. datahub_airflow_plugin/hooks/datahub.py +11 -2
  45. datahub_airflow_plugin/operators/datahub.py +20 -3
  46. acryl_datahub_airflow_plugin-1.3.1.5rc6.dist-info/METADATA +0 -91
  47. acryl_datahub_airflow_plugin-1.3.1.5rc6.dist-info/RECORD +0 -33
  48. datahub_airflow_plugin/_extractors.py +0 -365
  49. {acryl_datahub_airflow_plugin-1.3.1.5rc6.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc8.dist-info}/WHEEL +0 -0
  50. {acryl_datahub_airflow_plugin-1.3.1.5rc6.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc8.dist-info}/entry_points.txt +0 -0
  51. {acryl_datahub_airflow_plugin-1.3.1.5rc6.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc8.dist-info}/top_level.txt +0 -0
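The headline change in this release is the split into version-specific airflow2/ and airflow3/ subpackages (new listeners, extractors, and OpenLineage patches), with the top-level datahub_listener.py reduced to a thin shim (+19 -790). The sketch below only illustrates the kind of version dispatch such a layout suggests; it is an assumption for illustration, not the actual contents of the shipped datahub_listener.py.

# Hypothetical dispatch shim, for illustration only; the real module may differ.
import airflow

IS_AIRFLOW_3_OR_HIGHER = int(airflow.__version__.split(".")[0]) >= 3

if IS_AIRFLOW_3_OR_HIGHER:
    from datahub_airflow_plugin.airflow3.datahub_listener import *  # noqa: F401,F403
else:
    from datahub_airflow_plugin.airflow2.datahub_listener import *  # noqa: F401,F403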
datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py
@@ -0,0 +1,89 @@
+"""Snowflake DataHub Ingest DAG
+
+This example demonstrates how to ingest metadata from Snowflake into DataHub
+from within an Airflow DAG. In contrast to the MySQL example, this DAG
+pulls the DB connection configuration from Airflow's connection store.
+
+This is the Airflow 3.0+ version.
+"""
+
+from datetime import datetime, timedelta
+
+from airflow import DAG
+from airflow.hooks.base_hook import BaseHook
+from airflow.operators.python import PythonVirtualenvOperator
+
+
+def ingest_from_snowflake(snowflake_credentials, datahub_gms_server):
+    from datahub.ingestion.run.pipeline import Pipeline
+
+    pipeline = Pipeline.create(
+        # This configuration is analogous to a recipe configuration.
+        {
+            "source": {
+                "type": "snowflake",
+                "config": {
+                    **snowflake_credentials,
+                    # Other Snowflake config can be added here.
+                    "profiling": {"enabled": False},
+                },
+            },
+            # Other ingestion features, like transformers, are also supported.
+            # "transformers": [
+            #     {
+            #         "type": "simple_add_dataset_ownership",
+            #         "config": {
+            #             "owner_urns": [
+            #                 "urn:li:corpuser:example",
+            #             ]
+            #         },
+            #     }
+            # ],
+            "sink": {
+                "type": "datahub-rest",
+                "config": {"server": datahub_gms_server},
+            },
+        }
+    )
+    pipeline.run()
+    pipeline.pretty_print_summary()
+    pipeline.raise_from_status()
+
+
+with DAG(
+    "datahub_snowflake_ingest",
+    default_args={
+        "owner": "airflow",
+    },
+    description="An example DAG which ingests metadata from Snowflake to DataHub",
+    start_date=datetime(2022, 1, 1),
+    schedule=timedelta(days=1),
+    catchup=False,
+) as dag:
+    # This example pulls credentials from Airflow's connection store.
+    # For this to work, you must have previously configured these connections in Airflow.
+    # See the Airflow docs for details: https://airflow.apache.org/docs/apache-airflow/stable/howto/connection.html
+    snowflake_conn = BaseHook.get_connection("snowflake_admin_default")
+    datahub_conn = BaseHook.get_connection("datahub_rest_default")
+
+    # While it is also possible to use the PythonOperator, we recommend using
+    # the PythonVirtualenvOperator to ensure that there are no dependency
+    # conflicts between DataHub and the rest of your Airflow environment.
+    ingest_task = PythonVirtualenvOperator(
+        task_id="ingest_from_snowflake",
+        requirements=[
+            "acryl-datahub[snowflake]",
+        ],
+        system_site_packages=False,
+        python_callable=ingest_from_snowflake,
+        op_kwargs={
+            "snowflake_credentials": {
+                "username": snowflake_conn.login,
+                "password": snowflake_conn.password,
+                "account_id": snowflake_conn.extra_dejson["account"],
+                "warehouse": snowflake_conn.extra_dejson.get("warehouse"),
+                "role": snowflake_conn.extra_dejson.get("role"),
+            },
+            "datahub_gms_server": datahub_conn.host,
+        },
+    )
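The inline comment notes that the dict passed to Pipeline.create is analogous to a recipe. For readers who want to try the same recipe outside Airflow, here is a standalone sketch; the credentials, account id, and GMS URL below are placeholders, and it assumes acryl-datahub[snowflake] is installed.

# Standalone sketch of the recipe used in the DAG above; all values are placeholders.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "snowflake",
            "config": {
                "username": "INGEST_USER",
                "password": "********",
                "account_id": "abc12345.us-east-1",
                "warehouse": "COMPUTE_WH",
                "role": "DATAHUB_READER",
                "profiling": {"enabled": False},
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.pretty_print_summary()
pipeline.raise_from_status()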
datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py
@@ -3,17 +3,25 @@
 from datetime import timedelta
 
 import pendulum
-from airflow.decorators import dag, task
+from airflow.decorators import (  # type: ignore[attr-defined] # Decorators not available in all Airflow versions
+    dag,
+    task,
+)
 
 from datahub.ingestion.graph.client import DataHubGraph, RemovedStatusFilter
+from datahub_airflow_plugin._airflow_version_specific import (
+    get_airflow_compatible_dag_kwargs,
+)
 from datahub_airflow_plugin.hooks.datahub import DatahubRestHook
 
-
-@dag(
-    schedule_interval=timedelta(days=1),
+dag_decorator_kwargs = get_airflow_compatible_dag_kwargs(
     start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
+    schedule_interval=timedelta(days=1),
     catchup=False,
 )
+
+
+@dag(**dag_decorator_kwargs)
 def datahub_graph_usage_sample_dag():
     @task()
     def use_the_graph():
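The helper keeps this example working across Airflow major versions, where the DAG-level scheduling argument changed name. What follows is an illustrative sketch only, assuming the helper simply translates keyword spellings; the real implementation lives in datahub_airflow_plugin/_airflow_version_specific.py (added in this release) and may do more.

# Illustrative sketch, not the shipped helper: translate DAG kwargs between
# Airflow 2.x and 3.x spellings based on the installed Airflow version.
from typing import Any, Dict

import airflow

AIRFLOW_MAJOR_VERSION = int(airflow.__version__.split(".")[0])


def get_airflow_compatible_dag_kwargs(**kwargs: Any) -> Dict[str, Any]:
    if AIRFLOW_MAJOR_VERSION >= 3 and "schedule_interval" in kwargs:
        # Airflow 3 dropped `schedule_interval` in favor of `schedule`.
        kwargs["schedule"] = kwargs.pop("schedule_interval")
    return kwargs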
datahub_airflow_plugin/hooks/datahub.py
@@ -1,7 +1,12 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple, Union
 
 from airflow.exceptions import AirflowException
-from airflow.hooks.base import BaseHook
+
+# BaseHook import - prefer new location in Airflow 3.x
+try:
+    from airflow.sdk.bases.hook import BaseHook
+except (ModuleNotFoundError, ImportError):
+    from airflow.hooks.base import BaseHook  # type: ignore
 
 from datahub.emitter.composite_emitter import CompositeEmitter
 from datahub.emitter.generic_emitter import Emitter
@@ -10,9 +15,13 @@ from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
     MetadataChangeEvent,
     MetadataChangeProposal,
 )
+from datahub_airflow_plugin._airflow_version_specific import IS_AIRFLOW_3_OR_HIGHER
 
 if TYPE_CHECKING:
-    from airflow.models.connection import Connection
+    if IS_AIRFLOW_3_OR_HIGHER:
+        from airflow.sdk.definitions.connection import Connection
+    else:
+        from airflow.models.connection import Connection  # type: ignore[assignment]
 
     from datahub.emitter.kafka_emitter import DatahubKafkaEmitter
     from datahub.emitter.rest_emitter import DataHubRestEmitter
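This file mixes two compatibility techniques: a runtime try/except fallback for BaseHook, which is actually instantiated when the hook runs, and a TYPE_CHECKING-gated import for Connection, which only a static type checker ever evaluates. The generic sketch below illustrates why the split matters; the module paths are examples, not a statement about specific Airflow versions.

# Generic illustration of the two import-compatibility patterns used above.
from typing import TYPE_CHECKING

try:
    # Runtime import with a fallback: this symbol is used when the code executes.
    from airflow.sdk.bases.hook import BaseHook
except ImportError:
    from airflow.hooks.base import BaseHook  # type: ignore

if TYPE_CHECKING:
    # Type-checking-only import: never executed at runtime, so a module that is
    # missing from the installed Airflow cannot break the plugin.
    from airflow.models.connection import Connection


def conn_summary(conn: "Connection") -> str:
    # The annotation stays a string at runtime; only mypy/pyright resolve it.
    return f"{conn.conn_id}: {conn.host}"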
datahub_airflow_plugin/operators/datahub.py
@@ -1,4 +1,4 @@
-from typing import Any, List, Union
+from typing import TYPE_CHECKING, Any, List, Union
 
 from airflow.models import BaseOperator
 from avrogen.dict_wrapper import DictWrapper
@@ -11,6 +11,21 @@ from datahub_airflow_plugin.hooks.datahub import (
     DatahubRestHook,
 )
 
+if TYPE_CHECKING:
+    from jinja2 import Environment
+
+    # Import Context with version compatibility for type checking
+    # Import to different names to avoid redefinition errors, then assign to Context
+    Context: Any
+    try:
+        from airflow.utils.context import Context as _AirflowContext
+
+        Context = _AirflowContext
+    except ImportError:
+        from airflow.sdk.definitions.context import Context as _Airflow3Context
+
+        Context = _Airflow3Context  # type: ignore[no-redef]
+
 
 class DatahubBaseOperator(BaseOperator):
     """
@@ -56,7 +71,9 @@ class DatahubEmitterOperator(DatahubBaseOperator):
         )
         self.metadata = mces
 
-    def _render_template_fields(self, field_value, context, jinja_env):
+    def _render_template_fields(
+        self, field_value: Any, context: "Context", jinja_env: "Environment"
+    ) -> Any:
         if isinstance(field_value, DictWrapper):
             for key, value in field_value.items():
                 setattr(
@@ -73,7 +90,7 @@ class DatahubEmitterOperator(DatahubBaseOperator):
                 return super().render_template(field_value, context, jinja_env)
         return field_value
 
-    def execute(self, context):
+    def execute(self, context: "Context") -> None:
        if context:
            jinja_env = self.get_template_env()
 
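The newly typed _render_template_fields and execute signatures belong to DatahubEmitterOperator, which is normally used to push pre-built metadata from a task. Below is a hedged usage sketch, modelled on the lineage_emission_dag examples shipped in this wheel; the dataset URNs and connection id are placeholders.

# Sketch of wiring DatahubEmitterOperator into a DAG; dataset names are made up.
import datahub.emitter.mce_builder as builder
from datahub_airflow_plugin.operators.datahub import DatahubEmitterOperator

emit_lineage_task = DatahubEmitterOperator(
    task_id="emit_lineage",
    datahub_conn_id="datahub_rest_default",
    mces=[
        builder.make_lineage_mce(
            upstream_urns=[builder.make_dataset_urn("snowflake", "mydb.schema.table_a")],
            downstream_urn=builder.make_dataset_urn("snowflake", "mydb.schema.table_c"),
        )
    ],
)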
acryl_datahub_airflow_plugin-1.3.1.5rc6.dist-info/METADATA
@@ -1,91 +0,0 @@
-Metadata-Version: 2.4
-Name: acryl-datahub-airflow-plugin
-Version: 1.3.1.5rc6
-Summary: Datahub Airflow plugin to capture executions and send to Datahub
-Home-page: https://docs.datahub.com/
-License: Apache-2.0
-Project-URL: Documentation, https://docs.datahub.com/docs/
-Project-URL: Source, https://github.com/datahub-project/datahub
-Project-URL: Changelog, https://github.com/datahub-project/datahub/releases
-Classifier: Development Status :: 5 - Production/Stable
-Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Intended Audience :: Developers
-Classifier: Intended Audience :: Information Technology
-Classifier: Intended Audience :: System Administrators
-Classifier: Operating System :: Unix
-Classifier: Operating System :: POSIX :: Linux
-Classifier: Environment :: Console
-Classifier: Environment :: MacOS X
-Classifier: Topic :: Software Development
-Requires-Python: >=3.9
-Description-Content-Type: text/markdown
-Requires-Dist: apache-airflow<3,>=2.7.0
-Requires-Dist: openlineage-airflow<=1.30.1,>=1.2.0
-Requires-Dist: acryl-datahub[datahub-rest]==1.3.1.5rc6
-Requires-Dist: acryl-datahub[datahub-rest,sql-parser]==1.3.1.5rc6
-Requires-Dist: pydantic>=2.4.0
-Provides-Extra: ignore
-Provides-Extra: datahub-rest
-Requires-Dist: acryl-datahub[datahub-rest]==1.3.1.5rc6; extra == "datahub-rest"
-Provides-Extra: datahub-kafka
-Requires-Dist: acryl-datahub[datahub-kafka]==1.3.1.5rc6; extra == "datahub-kafka"
-Provides-Extra: datahub-file
-Requires-Dist: acryl-datahub[sync-file-emitter]==1.3.1.5rc6; extra == "datahub-file"
-Provides-Extra: dev
-Requires-Dist: build; extra == "dev"
-Requires-Dist: types-requests; extra == "dev"
-Requires-Dist: pytest-cov>=2.8.1; extra == "dev"
-Requires-Dist: acryl-datahub[datahub-rest,sql-parser]==1.3.1.5rc6; extra == "dev"
-Requires-Dist: packaging; extra == "dev"
-Requires-Dist: types-dataclasses; extra == "dev"
-Requires-Dist: types-setuptools; extra == "dev"
-Requires-Dist: types-click==0.1.12; extra == "dev"
-Requires-Dist: tox; extra == "dev"
-Requires-Dist: tenacity; extra == "dev"
-Requires-Dist: types-python-dateutil; extra == "dev"
-Requires-Dist: types-six; extra == "dev"
-Requires-Dist: coverage>=5.1; extra == "dev"
-Requires-Dist: pydantic>=2.4.0; extra == "dev"
-Requires-Dist: acryl-datahub[datahub-rest]==1.3.1.5rc6; extra == "dev"
-Requires-Dist: mypy==1.17.1; extra == "dev"
-Requires-Dist: types-cachetools; extra == "dev"
-Requires-Dist: types-toml; extra == "dev"
-Requires-Dist: types-PyYAML; extra == "dev"
-Requires-Dist: twine; extra == "dev"
-Requires-Dist: apache-airflow<3,>=2.7.0; extra == "dev"
-Requires-Dist: pytest>=6.2.2; extra == "dev"
-Requires-Dist: deepdiff!=8.0.0; extra == "dev"
-Requires-Dist: openlineage-airflow<=1.30.1,>=1.2.0; extra == "dev"
-Requires-Dist: sqlalchemy-stubs; extra == "dev"
-Requires-Dist: types-tabulate; extra == "dev"
-Requires-Dist: tox-uv; extra == "dev"
-Requires-Dist: ruff==0.11.7; extra == "dev"
-Provides-Extra: integration-tests
-Requires-Dist: acryl-datahub[datahub-kafka]==1.3.1.5rc6; extra == "integration-tests"
-Requires-Dist: apache-airflow-providers-teradata; extra == "integration-tests"
-Requires-Dist: acryl-datahub[sync-file-emitter]==1.3.1.5rc6; extra == "integration-tests"
-Requires-Dist: apache-airflow[amazon,google,snowflake]>=2.0.2; extra == "integration-tests"
-Requires-Dist: snowflake-connector-python>=2.7.10; extra == "integration-tests"
-Requires-Dist: virtualenv; extra == "integration-tests"
-Requires-Dist: acryl-datahub[testing-utils]==1.3.1.5rc6; extra == "integration-tests"
-Requires-Dist: apache-airflow-providers-sqlite; extra == "integration-tests"
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: license
-Dynamic: project-url
-Dynamic: provides-extra
-Dynamic: requires-dist
-Dynamic: requires-python
-Dynamic: summary
-
-# Datahub Airflow Plugin
-
-See [the DataHub Airflow docs](https://docs.datahub.com/docs/lineage/airflow) for details.
-
-## Developing
-
-See the [developing docs](../../metadata-ingestion/developing.md).
acryl_datahub_airflow_plugin-1.3.1.5rc6.dist-info/RECORD
@@ -1,33 +0,0 @@
-datahub_airflow_plugin/__init__.py,sha256=NScUtA8N-m66Pyg0DO--YbPkrl48PK3UevpdQVW_y6E,1009
-datahub_airflow_plugin/_airflow_shims.py,sha256=hLMTkANJzmH9sEcaiwNb0EZgD11vh-XnDBqFQ9yqjr4,1613
-datahub_airflow_plugin/_config.py,sha256=qNbNC6YUGHf06RfOMsU7jpeNeE2ttxjUQIMEpxIhvyM,5221
-datahub_airflow_plugin/_datahub_ol_adapter.py,sha256=RuzMyWZo7MeJzAFoBfkT4cdDw5g1iWshB_nXG7jLnR0,545
-datahub_airflow_plugin/_extractors.py,sha256=o2amnv1ram58zYNiWpleQcpSREthin33LQ4yRLwoGxA,12759
-datahub_airflow_plugin/_version.py,sha256=8yBBlejaA5Ap4YAMySiD81_p3qoga05axFNp6GxmOa8,148
-datahub_airflow_plugin/datahub_listener.py,sha256=plAU-DZ7EQmC1Ec4gzOaksqHnJLxaCYHkm2jEPTkgvc,31100
-datahub_airflow_plugin/datahub_plugin.py,sha256=rbZhs7s5O3_MlkQw5aZToC2W5mMic_EpI3oybHB0ofw,1224
-datahub_airflow_plugin/entities.py,sha256=xDZ-mZH7hjUkZbatWYUwI43_9B40wGiotlyQhiO8rEM,1987
-datahub_airflow_plugin/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub_airflow_plugin/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub_airflow_plugin/client/airflow_generator.py,sha256=zWGX6M7hqbNvOPu11VlutkJ-g149Xv2m5_IC3GqfRJk,22120
-datahub_airflow_plugin/example_dags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub_airflow_plugin/example_dags/generic_recipe_sample_dag.py,sha256=BbrOErFboKMDFn712RHEKI9T4Vh0q6kYSVet56gPqVk,1319
-datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py,sha256=xYnuXhWXL5b9Tij0BFvFLckjKCobjzPU3xlxLg2_NXc,1015
-datahub_airflow_plugin/example_dags/lineage_backend_demo.py,sha256=Dy6MxwtX7G0mQeALqpLRu4F03IyU9fqIkr-CcKpo2JE,1625
-datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py,sha256=kW2rLFtOnoiMxBJ315GzlmR0Sz1cqQ_wwLbG9UC-u7Y,1499
-datahub_airflow_plugin/example_dags/lineage_emission_dag.py,sha256=LE29DzW51a4ZAl_zrcLrqSyzmy8qElcZagXsIMjaZLU,1946
-datahub_airflow_plugin/example_dags/mysql_sample_dag.py,sha256=Unx9Ger3R9ptEutfV-4NjjEaTIEYJ-tLrZr7OsK608k,1922
-datahub_airflow_plugin/example_dags/snowflake_sample_dag.py,sha256=b9iaE7zChQha9u57F84U6uqavGl7WrUnMNOzXEiZxjE,3234
-datahub_airflow_plugin/hooks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub_airflow_plugin/hooks/datahub.py,sha256=xa-gsJUza3jSAUP1QLJSNBn4bUHxjXo_FdCAf08IWFo,11155
-datahub_airflow_plugin/operators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub_airflow_plugin/operators/datahub.py,sha256=3fT_qg1tUpBEne1XBk8zDIUNBMcox7mxoEoN9O4XIPA,3814
-datahub_airflow_plugin/operators/datahub_assertion_operator.py,sha256=j_P9M1a5qME55pKHAfTqZsVVtIslFBO59r8UQOOBvsk,2914
-datahub_airflow_plugin/operators/datahub_assertion_sensor.py,sha256=QJIZZYQhqscj3bhBN5Sei-ABMRRAl2KiQxXTXcZQ51Q,2917
-datahub_airflow_plugin/operators/datahub_operation_operator.py,sha256=KJ8M8jJ7UWW6kNbiS-rELc-kqCPkZ3ck7z51oAXGPSI,3351
-datahub_airflow_plugin/operators/datahub_operation_sensor.py,sha256=U19fi5DpjBRWm_1ljXcjnspUzfa3mqYfOQZHjLk-ufI,3618
-acryl_datahub_airflow_plugin-1.3.1.5rc6.dist-info/METADATA,sha256=WzzHlVK7D8MhVhdd-Q9XxFzFAEPsWCejwo1TAEf_og8,4088
-acryl_datahub_airflow_plugin-1.3.1.5rc6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-acryl_datahub_airflow_plugin-1.3.1.5rc6.dist-info/entry_points.txt,sha256=HqmajDHtrsz0b5Lswe1-eeuObxdtucd9YoxH77jJBA8,179
-acryl_datahub_airflow_plugin-1.3.1.5rc6.dist-info/top_level.txt,sha256=VBzisOQfzqL1WRbNyItaruf3kTigXltjzgqzbheaFp0,23
-acryl_datahub_airflow_plugin-1.3.1.5rc6.dist-info/RECORD,,
datahub_airflow_plugin/_extractors.py
@@ -1,365 +0,0 @@
-import contextlib
-import logging
-import unittest.mock
-from typing import TYPE_CHECKING, Optional
-
-from airflow.models.operator import Operator
-from openlineage.airflow.extractors import (
-    BaseExtractor,
-    ExtractorManager as OLExtractorManager,
-    TaskMetadata,
-)
-from openlineage.airflow.extractors.snowflake_extractor import SnowflakeExtractor
-from openlineage.airflow.extractors.sql_extractor import SqlExtractor
-from openlineage.airflow.utils import get_operator_class, try_import_from_string
-from openlineage.client.facet import (
-    ExtractionError,
-    ExtractionErrorRunFacet,
-    SqlJobFacet,
-)
-
-import datahub.emitter.mce_builder as builder
-from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
-    get_platform_from_sqlalchemy_uri,
-)
-from datahub.sql_parsing.sqlglot_lineage import (
-    SqlParsingResult,
-    create_lineage_sql_parsed_result,
-)
-from datahub_airflow_plugin._datahub_ol_adapter import OL_SCHEME_TWEAKS
-
-if TYPE_CHECKING:
-    from airflow.models import DagRun, TaskInstance
-
-    from datahub.ingestion.graph.client import DataHubGraph
-
-logger = logging.getLogger(__name__)
-_DATAHUB_GRAPH_CONTEXT_KEY = "datahub_graph"
-SQL_PARSING_RESULT_KEY = "datahub_sql"
-
-
-class ExtractorManager(OLExtractorManager):
-    # TODO: On Airflow 2.7, the OLExtractorManager is part of the built-in Airflow API.
-    # When available, we should use that instead. The same goes for most of the OL
-    # extractors.
-
-    def __init__(self):
-        super().__init__()
-
-        _sql_operator_overrides = [
-            # The OL BigQuery extractor has some complex logic to fetch/detect
-            # the BigQuery job_id and fetch lineage from there. However, it can't
-            # generate CLL, so we disable it and use our own extractor instead.
-            "BigQueryOperator",
-            "BigQueryExecuteQueryOperator",
-            # Athena also does something similar.
-            "AWSAthenaOperator",
-            # Additional types that OL doesn't support. This is only necessary because
-            # on older versions of Airflow, these operators don't inherit from SQLExecuteQueryOperator.
-            "SqliteOperator",
-        ]
-        for operator in _sql_operator_overrides:
-            self.task_to_extractor.extractors[operator] = GenericSqlExtractor
-
-        self.task_to_extractor.extractors["AthenaOperator"] = AthenaOperatorExtractor
-
-        self.task_to_extractor.extractors["BigQueryInsertJobOperator"] = (
-            BigQueryInsertJobOperatorExtractor
-        )
-
-        self.task_to_extractor.extractors["TeradataOperator"] = (
-            TeradataOperatorExtractor
-        )
-
-        self._graph: Optional["DataHubGraph"] = None
-
-    @contextlib.contextmanager
-    def _patch_extractors(self):
-        with contextlib.ExitStack() as stack:
-            # Patch the SqlExtractor.extract() method.
-            stack.enter_context(
-                unittest.mock.patch.object(
-                    SqlExtractor,
-                    "extract",
-                    _sql_extractor_extract,
-                )
-            )
-
-            # Patch the SnowflakeExtractor.default_schema property.
-            stack.enter_context(
-                unittest.mock.patch.object(
-                    SnowflakeExtractor,
-                    "default_schema",
-                    property(_snowflake_default_schema),
-                )
-            )
-
-            # TODO: Override the BigQuery extractor to use the DataHub SQL parser.
-            # self.extractor_manager.add_extractor()
-
-            # TODO: Override the Athena extractor to use the DataHub SQL parser.
-
-            yield
-
-    def extract_metadata(
-        self,
-        dagrun: "DagRun",
-        task: "Operator",
-        complete: bool = False,
-        task_instance: Optional["TaskInstance"] = None,
-        task_uuid: Optional[str] = None,
-        graph: Optional["DataHubGraph"] = None,
-    ) -> TaskMetadata:
-        self._graph = graph
-        with self._patch_extractors():
-            return super().extract_metadata(
-                dagrun, task, complete, task_instance, task_uuid
-            )
-
-    def _get_extractor(self, task: "Operator") -> Optional[BaseExtractor]:
-        # By adding this, we can use the generic extractor as a fallback for
-        # any operator that inherits from SQLExecuteQueryOperator.
-        clazz = get_operator_class(task)
-        SQLExecuteQueryOperator = try_import_from_string(
-            "airflow.providers.common.sql.operators.sql.SQLExecuteQueryOperator"
-        )
-        if SQLExecuteQueryOperator and issubclass(clazz, SQLExecuteQueryOperator):
-            self.task_to_extractor.extractors.setdefault(
-                clazz.__name__, GenericSqlExtractor
-            )
-
-        extractor = super()._get_extractor(task)
-        if extractor:
-            extractor.set_context(_DATAHUB_GRAPH_CONTEXT_KEY, self._graph)
-        return extractor
-
-
-class GenericSqlExtractor(SqlExtractor):
-    # Note that the extract() method is patched elsewhere.
-
-    @property
-    def default_schema(self):
-        return super().default_schema
-
-    def _get_scheme(self) -> Optional[str]:
-        # Best effort conversion to DataHub platform names.
-
-        with contextlib.suppress(Exception):
-            if self.hook:
-                if hasattr(self.hook, "get_uri"):
-                    uri = self.hook.get_uri()
-                    return get_platform_from_sqlalchemy_uri(uri)
-
-        return self.conn.conn_type or super().dialect
-
-    def _get_database(self) -> Optional[str]:
-        if self.conn:
-            # For BigQuery, the "database" is the project name.
-            if hasattr(self.conn, "project_id"):
-                return self.conn.project_id
-
-            return self.conn.schema
-        return None
-
-
-def _sql_extractor_extract(self: "SqlExtractor") -> TaskMetadata:
-    # Why not override the OL sql_parse method directly, instead of overriding
-    # extract()? A few reasons:
-    #
-    # 1. We would want to pass the default_db and graph instance into our sql parser
-    #    method. The OL code doesn't pass the default_db (despite having it available),
-    #    and it's not clear how to get the graph instance into that method.
-    # 2. OL has some janky logic to fetch table schemas as part of the sql extractor.
-    #    We don't want that behavior and this lets us disable it.
-    # 3. Our SqlParsingResult already has DataHub urns, whereas using SqlMeta would
-    #    require us to convert those urns to OL uris, just for them to get converted
-    #    back to urns later on in our processing.
-
-    task_name = f"{self.operator.dag_id}.{self.operator.task_id}"
-    sql = self.operator.sql
-
-    default_database = getattr(self.operator, "database", None)
-    if not default_database:
-        default_database = self.database
-    default_schema = self.default_schema
-
-    # TODO: Add better handling for sql being a list of statements.
-    if isinstance(sql, list):
-        logger.info(f"Got list of SQL statements for {task_name}. Using first one.")
-        sql = sql[0]
-
-    # Run the SQL parser.
-    scheme = self.scheme
-    platform = OL_SCHEME_TWEAKS.get(scheme, scheme)
-
-    return _parse_sql_into_task_metadata(
-        self,
-        sql,
-        platform=platform,
-        default_database=default_database,
-        default_schema=default_schema,
-    )
-
-
-def _parse_sql_into_task_metadata(
-    self: "BaseExtractor",
-    sql: str,
-    platform: str,
-    default_database: Optional[str],
-    default_schema: Optional[str],
-) -> TaskMetadata:
-    task_name = f"{self.operator.dag_id}.{self.operator.task_id}"
-
-    run_facets = {}
-    job_facets = {"sql": SqlJobFacet(query=SqlExtractor._normalize_sql(sql))}
-
-    # Prepare to run the SQL parser.
-    graph = self.context.get(_DATAHUB_GRAPH_CONTEXT_KEY, None)
-
-    self.log.debug(
-        "Running the SQL parser %s (platform=%s, default db=%s, schema=%s): %s",
-        "with graph client" if graph else "in offline mode",
-        platform,
-        default_database,
-        default_schema,
-        sql,
-    )
-    sql_parsing_result: SqlParsingResult = create_lineage_sql_parsed_result(
-        query=sql,
-        graph=graph,
-        platform=platform,
-        platform_instance=None,
-        env=builder.DEFAULT_ENV,
-        default_db=default_database,
-        default_schema=default_schema,
-    )
-    self.log.debug(f"Got sql lineage {sql_parsing_result}")
-
-    if sql_parsing_result.debug_info.error:
-        error = sql_parsing_result.debug_info.error
-        run_facets["extractionError"] = ExtractionErrorRunFacet(
-            totalTasks=1,
-            failedTasks=1,
-            errors=[
-                ExtractionError(
-                    errorMessage=str(error),
-                    stackTrace=None,
-                    task="datahub_sql_parser",
-                    taskNumber=None,
-                )
-            ],
-        )
-
-    # Save sql_parsing_result to the facets dict. It is removed from the
-    # facet dict in the extractor's processing logic.
-    run_facets[SQL_PARSING_RESULT_KEY] = sql_parsing_result  # type: ignore
-
-    return TaskMetadata(
-        name=task_name,
-        inputs=[],
-        outputs=[],
-        run_facets=run_facets,
-        job_facets=job_facets,
-    )
-
-
-class BigQueryInsertJobOperatorExtractor(BaseExtractor):
-    def extract(self) -> Optional[TaskMetadata]:
-        from airflow.providers.google.cloud.operators.bigquery import (
-            BigQueryInsertJobOperator,  # type: ignore
-        )
-
-        operator: "BigQueryInsertJobOperator" = self.operator
-        sql = operator.configuration.get("query", {}).get("query")
-        if not sql:
-            self.log.warning("No query found in BigQueryInsertJobOperator")
-            return None
-
-        destination_table = operator.configuration.get("query", {}).get(
-            "destinationTable"
-        )
-        destination_table_urn = None
-        if destination_table:
-            project_id = destination_table.get("projectId")
-            dataset_id = destination_table.get("datasetId")
-            table_id = destination_table.get("tableId")
-
-            if project_id and dataset_id and table_id:
-                destination_table_urn = builder.make_dataset_urn(
-                    platform="bigquery",
-                    name=f"{project_id}.{dataset_id}.{table_id}",
-                    env=builder.DEFAULT_ENV,
-                )
-
-        task_metadata = _parse_sql_into_task_metadata(
-            self,
-            sql,
-            platform="bigquery",
-            default_database=operator.project_id,
-            default_schema=None,
-        )
-
-        if destination_table_urn and task_metadata:
-            sql_parsing_result = task_metadata.run_facets.get(SQL_PARSING_RESULT_KEY)
-            if sql_parsing_result and isinstance(sql_parsing_result, SqlParsingResult):
-                sql_parsing_result.out_tables.append(destination_table_urn)
-
-        return task_metadata
-
-
-class AthenaOperatorExtractor(BaseExtractor):
-    def extract(self) -> Optional[TaskMetadata]:
-        from airflow.providers.amazon.aws.operators.athena import (
-            AthenaOperator,  # type: ignore
-        )
-
-        operator: "AthenaOperator" = self.operator
-        sql = operator.query
-        if not sql:
-            self.log.warning("No query found in AthenaOperator")
-            return None
-
-        return _parse_sql_into_task_metadata(
-            self,
-            sql,
-            platform="athena",
-            default_database=None,
-            default_schema=self.operator.database,
-        )
-
-
-def _snowflake_default_schema(self: "SnowflakeExtractor") -> Optional[str]:
-    if hasattr(self.operator, "schema") and self.operator.schema is not None:
-        return self.operator.schema
-    return (
-        self.conn.extra_dejson.get("extra__snowflake__schema", "")
-        or self.conn.extra_dejson.get("schema", "")
-        or self.conn.schema
-    )
-    # TODO: Should we try a fallback of:
-    # execute_query_on_hook(self.hook, "SELECT current_schema();")
-
-
-class TeradataOperatorExtractor(BaseExtractor):
-    """Extractor for Teradata SQL operations.
-
-    Extracts lineage from TeradataOperator tasks by parsing the SQL queries
-    and understanding Teradata's two-tier database.table naming convention.
-    """
-
-    def extract(self) -> Optional[TaskMetadata]:
-        from airflow.providers.teradata.operators.teradata import TeradataOperator
-
-        operator: "TeradataOperator" = self.operator
-        sql = operator.sql
-        if not sql:
-            self.log.warning("No query found in TeradataOperator")
-            return None
-
-        return _parse_sql_into_task_metadata(
-            self,
-            sql,
-            platform="teradata",
-            default_database=None,
-            default_schema=None,
-        )
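The removed module is superseded by the version-specific extractor code listed above (datahub_airflow_plugin/airflow2/_extractors.py). Its central call, create_lineage_sql_parsed_result, is worth seeing in isolation; the sketch below runs it in offline mode, and the query, platform, and default db/schema are made up for illustration.

# Offline sketch of the SQL parsing call used by the removed extractor code.
import datahub.emitter.mce_builder as builder
from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

result = create_lineage_sql_parsed_result(
    query="INSERT INTO analytics.daily_orders SELECT order_id, amount FROM raw.orders",
    graph=None,  # offline mode: no DataHub graph client for schema-aware resolution
    platform="snowflake",
    platform_instance=None,
    env=builder.DEFAULT_ENV,
    default_db="DEMO_DB",
    default_schema="PUBLIC",
)
print(result.in_tables, result.out_tables)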