acryl-datahub-airflow-plugin 1.3.1.4__py3-none-any.whl → 1.3.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +303 -0
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +65 -0
- datahub_airflow_plugin/_airflow_compat.py +32 -0
- datahub_airflow_plugin/_airflow_shims.py +64 -31
- datahub_airflow_plugin/_airflow_version_specific.py +184 -0
- datahub_airflow_plugin/_config.py +97 -19
- datahub_airflow_plugin/_constants.py +16 -0
- datahub_airflow_plugin/_datahub_ol_adapter.py +14 -2
- datahub_airflow_plugin/_version.py +1 -1
- datahub_airflow_plugin/airflow2/__init__.py +6 -0
- datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +402 -0
- datahub_airflow_plugin/airflow2/_airflow_compat.py +95 -0
- datahub_airflow_plugin/airflow2/_extractors.py +477 -0
- datahub_airflow_plugin/airflow2/_legacy_shims.py +20 -0
- datahub_airflow_plugin/airflow2/_openlineage_compat.py +123 -0
- datahub_airflow_plugin/airflow2/_provider_shims.py +29 -0
- datahub_airflow_plugin/airflow2/_shims.py +88 -0
- datahub_airflow_plugin/airflow2/datahub_listener.py +1072 -0
- datahub_airflow_plugin/airflow3/__init__.py +6 -0
- datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +408 -0
- datahub_airflow_plugin/airflow3/_airflow_compat.py +108 -0
- datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +153 -0
- datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +273 -0
- datahub_airflow_plugin/airflow3/_shims.py +82 -0
- datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +88 -0
- datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +308 -0
- datahub_airflow_plugin/airflow3/datahub_listener.py +1452 -0
- datahub_airflow_plugin/client/airflow_generator.py +147 -43
- datahub_airflow_plugin/datahub_listener.py +19 -790
- datahub_airflow_plugin/example_dags/__init__.py +32 -0
- datahub_airflow_plugin/example_dags/airflow2/__init__.py +8 -0
- datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +54 -0
- datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +43 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +69 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +69 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +81 -0
- datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +68 -0
- datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +99 -0
- datahub_airflow_plugin/example_dags/airflow3/__init__.py +8 -0
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +51 -0
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +51 -0
- datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +89 -0
- datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +12 -4
- datahub_airflow_plugin/hooks/datahub.py +11 -2
- datahub_airflow_plugin/operators/datahub.py +20 -3
- acryl_datahub_airflow_plugin-1.3.1.4.dist-info/METADATA +0 -90
- acryl_datahub_airflow_plugin-1.3.1.4.dist-info/RECORD +0 -33
- datahub_airflow_plugin/_extractors.py +0 -336
- {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/WHEEL +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/top_level.txt +0 -0
datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py (new file)
@@ -0,0 +1,89 @@
+"""Snowflake DataHub Ingest DAG
+
+This example demonstrates how to ingest metadata from Snowflake into DataHub
+from within an Airflow DAG. In contrast to the MySQL example, this DAG
+pulls the DB connection configuration from Airflow's connection store.
+
+This is the Airflow 3.0+ version.
+"""
+
+from datetime import datetime, timedelta
+
+from airflow import DAG
+from airflow.hooks.base_hook import BaseHook
+from airflow.operators.python import PythonVirtualenvOperator
+
+
+def ingest_from_snowflake(snowflake_credentials, datahub_gms_server):
+    from datahub.ingestion.run.pipeline import Pipeline
+
+    pipeline = Pipeline.create(
+        # This configuration is analogous to a recipe configuration.
+        {
+            "source": {
+                "type": "snowflake",
+                "config": {
+                    **snowflake_credentials,
+                    # Other Snowflake config can be added here.
+                    "profiling": {"enabled": False},
+                },
+            },
+            # Other ingestion features, like transformers, are also supported.
+            # "transformers": [
+            #     {
+            #         "type": "simple_add_dataset_ownership",
+            #         "config": {
+            #             "owner_urns": [
+            #                 "urn:li:corpuser:example",
+            #             ]
+            #         },
+            #     }
+            # ],
+            "sink": {
+                "type": "datahub-rest",
+                "config": {"server": datahub_gms_server},
+            },
+        }
+    )
+    pipeline.run()
+    pipeline.pretty_print_summary()
+    pipeline.raise_from_status()
+
+
+with DAG(
+    "datahub_snowflake_ingest",
+    default_args={
+        "owner": "airflow",
+    },
+    description="An example DAG which ingests metadata from Snowflake to DataHub",
+    start_date=datetime(2022, 1, 1),
+    schedule=timedelta(days=1),
+    catchup=False,
+) as dag:
+    # This example pulls credentials from Airflow's connection store.
+    # For this to work, you must have previously configured these connections in Airflow.
+    # See the Airflow docs for details: https://airflow.apache.org/docs/apache-airflow/stable/howto/connection.html
+    snowflake_conn = BaseHook.get_connection("snowflake_admin_default")
+    datahub_conn = BaseHook.get_connection("datahub_rest_default")
+
+    # While it is also possible to use the PythonOperator, we recommend using
+    # the PythonVirtualenvOperator to ensure that there are no dependency
+    # conflicts between DataHub and the rest of your Airflow environment.
+    ingest_task = PythonVirtualenvOperator(
+        task_id="ingest_from_snowflake",
+        requirements=[
+            "acryl-datahub[snowflake]",
+        ],
+        system_site_packages=False,
+        python_callable=ingest_from_snowflake,
+        op_kwargs={
+            "snowflake_credentials": {
+                "username": snowflake_conn.login,
+                "password": snowflake_conn.password,
+                "account_id": snowflake_conn.extra_dejson["account"],
+                "warehouse": snowflake_conn.extra_dejson.get("warehouse"),
+                "role": snowflake_conn.extra_dejson.get("role"),
+            },
+            "datahub_gms_server": datahub_conn.host,
+        },
+    )
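The new Airflow 3 sample above wraps a standard DataHub ingestion recipe inside a PythonVirtualenvOperator. For orientation, the same recipe can also be run outside Airflow; the following is a minimal standalone sketch of that recipe, where all credential and server values are placeholders for illustration and are not taken from this diff.

# Standalone sketch of the recipe the DAG above builds at runtime.
# All credential and server values below are placeholders.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "snowflake",
            "config": {
                "username": "EXAMPLE_USER",  # placeholder
                "password": "EXAMPLE_PASSWORD",  # placeholder
                "account_id": "example-account",  # placeholder
                "warehouse": "EXAMPLE_WH",  # placeholder
                "role": "EXAMPLE_ROLE",  # placeholder
                "profiling": {"enabled": False},
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},  # placeholder GMS address
        },
    }
)
pipeline.run()
pipeline.pretty_print_summary()
pipeline.raise_from_status()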
datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py
@@ -3,17 +3,25 @@
 from datetime import timedelta
 
 import pendulum
-from airflow.decorators import dag, task
+from airflow.decorators import (  # type: ignore[attr-defined] # Decorators not available in all Airflow versions
+    dag,
+    task,
+)
 
 from datahub.ingestion.graph.client import DataHubGraph, RemovedStatusFilter
+from datahub_airflow_plugin._airflow_version_specific import (
+    get_airflow_compatible_dag_kwargs,
+)
 from datahub_airflow_plugin.hooks.datahub import DatahubRestHook
 
-
-@dag(
-    schedule_interval=timedelta(days=1),
+dag_decorator_kwargs = get_airflow_compatible_dag_kwargs(
     start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
+    schedule_interval=timedelta(days=1),
     catchup=False,
 )
+
+
+@dag(**dag_decorator_kwargs)
 def datahub_graph_usage_sample_dag():
     @task()
     def use_the_graph():
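get_airflow_compatible_dag_kwargs is imported from the new datahub_airflow_plugin/_airflow_version_specific.py module, whose body is not part of this diff. Since the Airflow 3 sample DAG above passes schedule= while this example still passes schedule_interval=, the helper presumably translates DAG kwargs for the running Airflow version. A minimal sketch under that assumption, illustrative only and not the packaged implementation:

# Hypothetical sketch of a version-aware kwargs shim; the real helper in
# _airflow_version_specific.py is not shown in this diff.
from typing import Any, Dict

import airflow
from packaging.version import Version


def get_airflow_compatible_dag_kwargs(**kwargs: Any) -> Dict[str, Any]:
    """Return DAG kwargs adjusted for the installed Airflow major version."""
    if Version(airflow.__version__) >= Version("3.0.0") and "schedule_interval" in kwargs:
        # Airflow 3 removed schedule_interval in favor of schedule.
        kwargs["schedule"] = kwargs.pop("schedule_interval")
    return kwargs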
datahub_airflow_plugin/hooks/datahub.py
@@ -1,7 +1,12 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple, Union
 
 from airflow.exceptions import AirflowException
-from airflow.hooks.base import BaseHook
+
+# BaseHook import - prefer new location in Airflow 3.x
+try:
+    from airflow.sdk.bases.hook import BaseHook
+except (ModuleNotFoundError, ImportError):
+    from airflow.hooks.base import BaseHook  # type: ignore
 
 from datahub.emitter.composite_emitter import CompositeEmitter
 from datahub.emitter.generic_emitter import Emitter
@@ -10,9 +15,13 @@ from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
     MetadataChangeEvent,
     MetadataChangeProposal,
 )
+from datahub_airflow_plugin._airflow_version_specific import IS_AIRFLOW_3_OR_HIGHER
 
 if TYPE_CHECKING:
-    from airflow.models.connection import Connection
+    if IS_AIRFLOW_3_OR_HIGHER:
+        from airflow.sdk.definitions.connection import Connection
+    else:
+        from airflow.models.connection import Connection  # type: ignore[assignment]
 
     from datahub.emitter.kafka_emitter import DatahubKafkaEmitter
     from datahub.emitter.rest_emitter import DataHubRestEmitter
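The TYPE_CHECKING guard above lets the hook annotate with whichever Connection class matches the installed Airflow version without importing it at runtime. A small self-contained illustration of that pattern, using only the Airflow 2 import path; the function name here is for demonstration and is not part of the plugin:

# Minimal illustration of the TYPE_CHECKING import pattern used above.
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by type checkers, never at runtime, so a version-specific
    # import here cannot break the plugin on a different Airflow version.
    from airflow.models.connection import Connection


def connection_summary(conn: "Connection") -> str:
    # The quoted annotation is resolved lazily, so this works even though the
    # import above is skipped at runtime.
    return f"{conn.conn_type} connection to {conn.host}"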
datahub_airflow_plugin/operators/datahub.py
@@ -1,4 +1,4 @@
-from typing import Any, List, Union
+from typing import TYPE_CHECKING, Any, List, Union
 
 from airflow.models import BaseOperator
 from avrogen.dict_wrapper import DictWrapper
@@ -11,6 +11,21 @@ from datahub_airflow_plugin.hooks.datahub import (
     DatahubRestHook,
 )
 
+if TYPE_CHECKING:
+    from jinja2 import Environment
+
+    # Import Context with version compatibility for type checking
+    # Import to different names to avoid redefinition errors, then assign to Context
+    Context: Any
+    try:
+        from airflow.utils.context import Context as _AirflowContext
+
+        Context = _AirflowContext
+    except ImportError:
+        from airflow.sdk.definitions.context import Context as _Airflow3Context
+
+        Context = _Airflow3Context  # type: ignore[no-redef]
+
 
 class DatahubBaseOperator(BaseOperator):
     """
@@ -56,7 +71,9 @@ class DatahubEmitterOperator(DatahubBaseOperator):
         )
         self.metadata = mces
 
-    def _render_template_fields(self, field_value, context, jinja_env):
+    def _render_template_fields(
+        self, field_value: Any, context: "Context", jinja_env: "Environment"
+    ) -> Any:
         if isinstance(field_value, DictWrapper):
             for key, value in field_value.items():
                 setattr(
@@ -73,7 +90,7 @@ class DatahubEmitterOperator(DatahubBaseOperator):
             return super().render_template(field_value, context, jinja_env)
         return field_value
 
-    def execute(self, context):
+    def execute(self, context: "Context") -> None:
         if context:
             jinja_env = self.get_template_env()
 
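For context, DatahubEmitterOperator, whose template rendering and execute signatures are annotated above, is the operator used by the packaged lineage_emission_dag examples to emit metadata change events directly. A hedged usage sketch; the datahub_conn_id parameter name and the make_lineage_mce helper follow the upstream examples and are not confirmed by the hunks in this diff:

# Hedged usage sketch of DatahubEmitterOperator; parameter names follow the
# packaged lineage_emission_dag example rather than anything shown in this diff.
import datahub.emitter.mce_builder as builder
from datahub_airflow_plugin.operators.datahub import DatahubEmitterOperator

emit_lineage_task = DatahubEmitterOperator(
    task_id="emit_lineage",
    datahub_conn_id="datahub_rest_default",  # assumed connection id, as in the Snowflake sample above
    mces=[
        builder.make_lineage_mce(
            upstream_urns=[builder.make_dataset_urn("snowflake", "mydb.schema.upstream_table")],
            downstream_urn=builder.make_dataset_urn("snowflake", "mydb.schema.downstream_table"),
        )
    ],
)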
acryl_datahub_airflow_plugin-1.3.1.4.dist-info/METADATA (removed)
@@ -1,90 +0,0 @@
-Metadata-Version: 2.4
-Name: acryl-datahub-airflow-plugin
-Version: 1.3.1.4
-Summary: Datahub Airflow plugin to capture executions and send to Datahub
-Home-page: https://docs.datahub.com/
-License: Apache-2.0
-Project-URL: Documentation, https://docs.datahub.com/docs/
-Project-URL: Source, https://github.com/datahub-project/datahub
-Project-URL: Changelog, https://github.com/datahub-project/datahub/releases
-Classifier: Development Status :: 5 - Production/Stable
-Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Intended Audience :: Developers
-Classifier: Intended Audience :: Information Technology
-Classifier: Intended Audience :: System Administrators
-Classifier: Operating System :: Unix
-Classifier: Operating System :: POSIX :: Linux
-Classifier: Environment :: Console
-Classifier: Environment :: MacOS X
-Classifier: Topic :: Software Development
-Requires-Python: >=3.9
-Description-Content-Type: text/markdown
-Requires-Dist: apache-airflow<3,>=2.7.0
-Requires-Dist: openlineage-airflow<=1.30.1,>=1.2.0
-Requires-Dist: acryl-datahub[datahub-rest]==1.3.1.4
-Requires-Dist: acryl-datahub[datahub-rest,sql-parser]==1.3.1.4
-Requires-Dist: pydantic>=2.4.0
-Provides-Extra: ignore
-Provides-Extra: datahub-rest
-Requires-Dist: acryl-datahub[datahub-rest]==1.3.1.4; extra == "datahub-rest"
-Provides-Extra: datahub-kafka
-Requires-Dist: acryl-datahub[datahub-kafka]==1.3.1.4; extra == "datahub-kafka"
-Provides-Extra: datahub-file
-Requires-Dist: acryl-datahub[sync-file-emitter]==1.3.1.4; extra == "datahub-file"
-Provides-Extra: dev
-Requires-Dist: ruff==0.11.7; extra == "dev"
-Requires-Dist: apache-airflow<3,>=2.7.0; extra == "dev"
-Requires-Dist: packaging; extra == "dev"
-Requires-Dist: openlineage-airflow<=1.30.1,>=1.2.0; extra == "dev"
-Requires-Dist: tenacity; extra == "dev"
-Requires-Dist: tox; extra == "dev"
-Requires-Dist: types-PyYAML; extra == "dev"
-Requires-Dist: mypy==1.17.1; extra == "dev"
-Requires-Dist: tox-uv; extra == "dev"
-Requires-Dist: twine; extra == "dev"
-Requires-Dist: types-click==0.1.12; extra == "dev"
-Requires-Dist: deepdiff!=8.0.0; extra == "dev"
-Requires-Dist: coverage>=5.1; extra == "dev"
-Requires-Dist: pytest-cov>=2.8.1; extra == "dev"
-Requires-Dist: types-setuptools; extra == "dev"
-Requires-Dist: pydantic>=2.4.0; extra == "dev"
-Requires-Dist: types-dataclasses; extra == "dev"
-Requires-Dist: build; extra == "dev"
-Requires-Dist: acryl-datahub[datahub-rest]==1.3.1.4; extra == "dev"
-Requires-Dist: types-tabulate; extra == "dev"
-Requires-Dist: types-six; extra == "dev"
-Requires-Dist: types-requests; extra == "dev"
-Requires-Dist: types-cachetools; extra == "dev"
-Requires-Dist: acryl-datahub[datahub-rest,sql-parser]==1.3.1.4; extra == "dev"
-Requires-Dist: sqlalchemy-stubs; extra == "dev"
-Requires-Dist: pytest>=6.2.2; extra == "dev"
-Requires-Dist: types-toml; extra == "dev"
-Requires-Dist: types-python-dateutil; extra == "dev"
-Provides-Extra: integration-tests
-Requires-Dist: acryl-datahub[sync-file-emitter]==1.3.1.4; extra == "integration-tests"
-Requires-Dist: apache-airflow[amazon,google,snowflake]>=2.0.2; extra == "integration-tests"
-Requires-Dist: acryl-datahub[testing-utils]==1.3.1.4; extra == "integration-tests"
-Requires-Dist: acryl-datahub[datahub-kafka]==1.3.1.4; extra == "integration-tests"
-Requires-Dist: snowflake-connector-python>=2.7.10; extra == "integration-tests"
-Requires-Dist: apache-airflow-providers-sqlite; extra == "integration-tests"
-Requires-Dist: virtualenv; extra == "integration-tests"
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: license
-Dynamic: project-url
-Dynamic: provides-extra
-Dynamic: requires-dist
-Dynamic: requires-python
-Dynamic: summary
-
-# Datahub Airflow Plugin
-
-See [the DataHub Airflow docs](https://docs.datahub.com/docs/lineage/airflow) for details.
-
-## Developing
-
-See the [developing docs](../../metadata-ingestion/developing.md).
acryl_datahub_airflow_plugin-1.3.1.4.dist-info/RECORD (removed)
@@ -1,33 +0,0 @@
-datahub_airflow_plugin/__init__.py,sha256=NScUtA8N-m66Pyg0DO--YbPkrl48PK3UevpdQVW_y6E,1009
-datahub_airflow_plugin/_airflow_shims.py,sha256=hLMTkANJzmH9sEcaiwNb0EZgD11vh-XnDBqFQ9yqjr4,1613
-datahub_airflow_plugin/_config.py,sha256=qNbNC6YUGHf06RfOMsU7jpeNeE2ttxjUQIMEpxIhvyM,5221
-datahub_airflow_plugin/_datahub_ol_adapter.py,sha256=RuzMyWZo7MeJzAFoBfkT4cdDw5g1iWshB_nXG7jLnR0,545
-datahub_airflow_plugin/_extractors.py,sha256=HzYf6Nm8HhRfXGVTQvVgFVRUe54RSOq0CDv8oxtor1A,11875
-datahub_airflow_plugin/_version.py,sha256=RJL3ep1RXk_49LNT5HC_UqXUDu65_KSs6TJ87R1_CA0,145
-datahub_airflow_plugin/datahub_listener.py,sha256=plAU-DZ7EQmC1Ec4gzOaksqHnJLxaCYHkm2jEPTkgvc,31100
-datahub_airflow_plugin/datahub_plugin.py,sha256=rbZhs7s5O3_MlkQw5aZToC2W5mMic_EpI3oybHB0ofw,1224
-datahub_airflow_plugin/entities.py,sha256=xDZ-mZH7hjUkZbatWYUwI43_9B40wGiotlyQhiO8rEM,1987
-datahub_airflow_plugin/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub_airflow_plugin/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub_airflow_plugin/client/airflow_generator.py,sha256=zWGX6M7hqbNvOPu11VlutkJ-g149Xv2m5_IC3GqfRJk,22120
-datahub_airflow_plugin/example_dags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub_airflow_plugin/example_dags/generic_recipe_sample_dag.py,sha256=BbrOErFboKMDFn712RHEKI9T4Vh0q6kYSVet56gPqVk,1319
-datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py,sha256=xYnuXhWXL5b9Tij0BFvFLckjKCobjzPU3xlxLg2_NXc,1015
-datahub_airflow_plugin/example_dags/lineage_backend_demo.py,sha256=Dy6MxwtX7G0mQeALqpLRu4F03IyU9fqIkr-CcKpo2JE,1625
-datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py,sha256=kW2rLFtOnoiMxBJ315GzlmR0Sz1cqQ_wwLbG9UC-u7Y,1499
-datahub_airflow_plugin/example_dags/lineage_emission_dag.py,sha256=LE29DzW51a4ZAl_zrcLrqSyzmy8qElcZagXsIMjaZLU,1946
-datahub_airflow_plugin/example_dags/mysql_sample_dag.py,sha256=Unx9Ger3R9ptEutfV-4NjjEaTIEYJ-tLrZr7OsK608k,1922
-datahub_airflow_plugin/example_dags/snowflake_sample_dag.py,sha256=b9iaE7zChQha9u57F84U6uqavGl7WrUnMNOzXEiZxjE,3234
-datahub_airflow_plugin/hooks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub_airflow_plugin/hooks/datahub.py,sha256=xa-gsJUza3jSAUP1QLJSNBn4bUHxjXo_FdCAf08IWFo,11155
-datahub_airflow_plugin/operators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub_airflow_plugin/operators/datahub.py,sha256=3fT_qg1tUpBEne1XBk8zDIUNBMcox7mxoEoN9O4XIPA,3814
-datahub_airflow_plugin/operators/datahub_assertion_operator.py,sha256=j_P9M1a5qME55pKHAfTqZsVVtIslFBO59r8UQOOBvsk,2914
-datahub_airflow_plugin/operators/datahub_assertion_sensor.py,sha256=QJIZZYQhqscj3bhBN5Sei-ABMRRAl2KiQxXTXcZQ51Q,2917
-datahub_airflow_plugin/operators/datahub_operation_operator.py,sha256=KJ8M8jJ7UWW6kNbiS-rELc-kqCPkZ3ck7z51oAXGPSI,3351
-datahub_airflow_plugin/operators/datahub_operation_sensor.py,sha256=U19fi5DpjBRWm_1ljXcjnspUzfa3mqYfOQZHjLk-ufI,3618
-acryl_datahub_airflow_plugin-1.3.1.4.dist-info/METADATA,sha256=USLZazGCYGGkU-KVH1etSzsIvnLyhMuQANkHGhKdzf0,3976
-acryl_datahub_airflow_plugin-1.3.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-acryl_datahub_airflow_plugin-1.3.1.4.dist-info/entry_points.txt,sha256=HqmajDHtrsz0b5Lswe1-eeuObxdtucd9YoxH77jJBA8,179
-acryl_datahub_airflow_plugin-1.3.1.4.dist-info/top_level.txt,sha256=VBzisOQfzqL1WRbNyItaruf3kTigXltjzgqzbheaFp0,23
-acryl_datahub_airflow_plugin-1.3.1.4.dist-info/RECORD,,
@@ -1,336 +0,0 @@
|
|
|
1
|
-
import contextlib
|
|
2
|
-
import logging
|
|
3
|
-
import unittest.mock
|
|
4
|
-
from typing import TYPE_CHECKING, Optional
|
|
5
|
-
|
|
6
|
-
from airflow.models.operator import Operator
|
|
7
|
-
from openlineage.airflow.extractors import (
|
|
8
|
-
BaseExtractor,
|
|
9
|
-
ExtractorManager as OLExtractorManager,
|
|
10
|
-
TaskMetadata,
|
|
11
|
-
)
|
|
12
|
-
from openlineage.airflow.extractors.snowflake_extractor import SnowflakeExtractor
|
|
13
|
-
from openlineage.airflow.extractors.sql_extractor import SqlExtractor
|
|
14
|
-
from openlineage.airflow.utils import get_operator_class, try_import_from_string
|
|
15
|
-
from openlineage.client.facet import (
|
|
16
|
-
ExtractionError,
|
|
17
|
-
ExtractionErrorRunFacet,
|
|
18
|
-
SqlJobFacet,
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
import datahub.emitter.mce_builder as builder
|
|
22
|
-
from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
|
|
23
|
-
get_platform_from_sqlalchemy_uri,
|
|
24
|
-
)
|
|
25
|
-
from datahub.sql_parsing.sqlglot_lineage import (
|
|
26
|
-
SqlParsingResult,
|
|
27
|
-
create_lineage_sql_parsed_result,
|
|
28
|
-
)
|
|
29
|
-
from datahub_airflow_plugin._datahub_ol_adapter import OL_SCHEME_TWEAKS
|
|
30
|
-
|
|
31
|
-
if TYPE_CHECKING:
|
|
32
|
-
from airflow.models import DagRun, TaskInstance
|
|
33
|
-
|
|
34
|
-
from datahub.ingestion.graph.client import DataHubGraph
|
|
35
|
-
|
|
36
|
-
logger = logging.getLogger(__name__)
|
|
37
|
-
_DATAHUB_GRAPH_CONTEXT_KEY = "datahub_graph"
|
|
38
|
-
SQL_PARSING_RESULT_KEY = "datahub_sql"
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
class ExtractorManager(OLExtractorManager):
|
|
42
|
-
# TODO: On Airflow 2.7, the OLExtractorManager is part of the built-in Airflow API.
|
|
43
|
-
# When available, we should use that instead. The same goe for most of the OL
|
|
44
|
-
# extractors.
|
|
45
|
-
|
|
46
|
-
def __init__(self):
|
|
47
|
-
super().__init__()
|
|
48
|
-
|
|
49
|
-
_sql_operator_overrides = [
|
|
50
|
-
# The OL BigQuery extractor has some complex logic to fetch detect
|
|
51
|
-
# the BigQuery job_id and fetch lineage from there. However, it can't
|
|
52
|
-
# generate CLL, so we disable it and use our own extractor instead.
|
|
53
|
-
"BigQueryOperator",
|
|
54
|
-
"BigQueryExecuteQueryOperator",
|
|
55
|
-
# Athena also does something similar.
|
|
56
|
-
"AWSAthenaOperator",
|
|
57
|
-
# Additional types that OL doesn't support. This is only necessary because
|
|
58
|
-
# on older versions of Airflow, these operators don't inherit from SQLExecuteQueryOperator.
|
|
59
|
-
"SqliteOperator",
|
|
60
|
-
]
|
|
61
|
-
for operator in _sql_operator_overrides:
|
|
62
|
-
self.task_to_extractor.extractors[operator] = GenericSqlExtractor
|
|
63
|
-
|
|
64
|
-
self.task_to_extractor.extractors["AthenaOperator"] = AthenaOperatorExtractor
|
|
65
|
-
|
|
66
|
-
self.task_to_extractor.extractors["BigQueryInsertJobOperator"] = (
|
|
67
|
-
BigQueryInsertJobOperatorExtractor
|
|
68
|
-
)
|
|
69
|
-
|
|
70
|
-
self._graph: Optional["DataHubGraph"] = None
|
|
71
|
-
|
|
72
|
-
@contextlib.contextmanager
|
|
73
|
-
def _patch_extractors(self):
|
|
74
|
-
with contextlib.ExitStack() as stack:
|
|
75
|
-
# Patch the SqlExtractor.extract() method.
|
|
76
|
-
stack.enter_context(
|
|
77
|
-
unittest.mock.patch.object(
|
|
78
|
-
SqlExtractor,
|
|
79
|
-
"extract",
|
|
80
|
-
_sql_extractor_extract,
|
|
81
|
-
)
|
|
82
|
-
)
|
|
83
|
-
|
|
84
|
-
# Patch the SnowflakeExtractor.default_schema property.
|
|
85
|
-
stack.enter_context(
|
|
86
|
-
unittest.mock.patch.object(
|
|
87
|
-
SnowflakeExtractor,
|
|
88
|
-
"default_schema",
|
|
89
|
-
property(_snowflake_default_schema),
|
|
90
|
-
)
|
|
91
|
-
)
|
|
92
|
-
|
|
93
|
-
# TODO: Override the BigQuery extractor to use the DataHub SQL parser.
|
|
94
|
-
# self.extractor_manager.add_extractor()
|
|
95
|
-
|
|
96
|
-
# TODO: Override the Athena extractor to use the DataHub SQL parser.
|
|
97
|
-
|
|
98
|
-
yield
|
|
99
|
-
|
|
100
|
-
def extract_metadata(
|
|
101
|
-
self,
|
|
102
|
-
dagrun: "DagRun",
|
|
103
|
-
task: "Operator",
|
|
104
|
-
complete: bool = False,
|
|
105
|
-
task_instance: Optional["TaskInstance"] = None,
|
|
106
|
-
task_uuid: Optional[str] = None,
|
|
107
|
-
graph: Optional["DataHubGraph"] = None,
|
|
108
|
-
) -> TaskMetadata:
|
|
109
|
-
self._graph = graph
|
|
110
|
-
with self._patch_extractors():
|
|
111
|
-
return super().extract_metadata(
|
|
112
|
-
dagrun, task, complete, task_instance, task_uuid
|
|
113
|
-
)
|
|
114
|
-
|
|
115
|
-
def _get_extractor(self, task: "Operator") -> Optional[BaseExtractor]:
|
|
116
|
-
# By adding this, we can use the generic extractor as a fallback for
|
|
117
|
-
# any operator that inherits from SQLExecuteQueryOperator.
|
|
118
|
-
clazz = get_operator_class(task)
|
|
119
|
-
SQLExecuteQueryOperator = try_import_from_string(
|
|
120
|
-
"airflow.providers.common.sql.operators.sql.SQLExecuteQueryOperator"
|
|
121
|
-
)
|
|
122
|
-
if SQLExecuteQueryOperator and issubclass(clazz, SQLExecuteQueryOperator):
|
|
123
|
-
self.task_to_extractor.extractors.setdefault(
|
|
124
|
-
clazz.__name__, GenericSqlExtractor
|
|
125
|
-
)
|
|
126
|
-
|
|
127
|
-
extractor = super()._get_extractor(task)
|
|
128
|
-
if extractor:
|
|
129
|
-
extractor.set_context(_DATAHUB_GRAPH_CONTEXT_KEY, self._graph)
|
|
130
|
-
return extractor
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
class GenericSqlExtractor(SqlExtractor):
|
|
134
|
-
# Note that the extract() method is patched elsewhere.
|
|
135
|
-
|
|
136
|
-
@property
|
|
137
|
-
def default_schema(self):
|
|
138
|
-
return super().default_schema
|
|
139
|
-
|
|
140
|
-
def _get_scheme(self) -> Optional[str]:
|
|
141
|
-
# Best effort conversion to DataHub platform names.
|
|
142
|
-
|
|
143
|
-
with contextlib.suppress(Exception):
|
|
144
|
-
if self.hook:
|
|
145
|
-
if hasattr(self.hook, "get_uri"):
|
|
146
|
-
uri = self.hook.get_uri()
|
|
147
|
-
return get_platform_from_sqlalchemy_uri(uri)
|
|
148
|
-
|
|
149
|
-
return self.conn.conn_type or super().dialect
|
|
150
|
-
|
|
151
|
-
def _get_database(self) -> Optional[str]:
|
|
152
|
-
if self.conn:
|
|
153
|
-
# For BigQuery, the "database" is the project name.
|
|
154
|
-
if hasattr(self.conn, "project_id"):
|
|
155
|
-
return self.conn.project_id
|
|
156
|
-
|
|
157
|
-
return self.conn.schema
|
|
158
|
-
return None
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
def _sql_extractor_extract(self: "SqlExtractor") -> TaskMetadata:
|
|
162
|
-
# Why not override the OL sql_parse method directly, instead of overriding
|
|
163
|
-
# extract()? A few reasons:
|
|
164
|
-
#
|
|
165
|
-
# 1. We would want to pass the default_db and graph instance into our sql parser
|
|
166
|
-
# method. The OL code doesn't pass the default_db (despite having it available),
|
|
167
|
-
# and it's not clear how to get the graph instance into that method.
|
|
168
|
-
# 2. OL has some janky logic to fetch table schemas as part of the sql extractor.
|
|
169
|
-
# We don't want that behavior and this lets us disable it.
|
|
170
|
-
# 3. Our SqlParsingResult already has DataHub urns, whereas using SqlMeta would
|
|
171
|
-
# require us to convert those urns to OL uris, just for them to get converted
|
|
172
|
-
# back to urns later on in our processing.
|
|
173
|
-
|
|
174
|
-
task_name = f"{self.operator.dag_id}.{self.operator.task_id}"
|
|
175
|
-
sql = self.operator.sql
|
|
176
|
-
|
|
177
|
-
default_database = getattr(self.operator, "database", None)
|
|
178
|
-
if not default_database:
|
|
179
|
-
default_database = self.database
|
|
180
|
-
default_schema = self.default_schema
|
|
181
|
-
|
|
182
|
-
# TODO: Add better handling for sql being a list of statements.
|
|
183
|
-
if isinstance(sql, list):
|
|
184
|
-
logger.info(f"Got list of SQL statements for {task_name}. Using first one.")
|
|
185
|
-
sql = sql[0]
|
|
186
|
-
|
|
187
|
-
# Run the SQL parser.
|
|
188
|
-
scheme = self.scheme
|
|
189
|
-
platform = OL_SCHEME_TWEAKS.get(scheme, scheme)
|
|
190
|
-
|
|
191
|
-
return _parse_sql_into_task_metadata(
|
|
192
|
-
self,
|
|
193
|
-
sql,
|
|
194
|
-
platform=platform,
|
|
195
|
-
default_database=default_database,
|
|
196
|
-
default_schema=default_schema,
|
|
197
|
-
)
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
def _parse_sql_into_task_metadata(
|
|
201
|
-
self: "BaseExtractor",
|
|
202
|
-
sql: str,
|
|
203
|
-
platform: str,
|
|
204
|
-
default_database: Optional[str],
|
|
205
|
-
default_schema: Optional[str],
|
|
206
|
-
) -> TaskMetadata:
|
|
207
|
-
task_name = f"{self.operator.dag_id}.{self.operator.task_id}"
|
|
208
|
-
|
|
209
|
-
run_facets = {}
|
|
210
|
-
job_facets = {"sql": SqlJobFacet(query=SqlExtractor._normalize_sql(sql))}
|
|
211
|
-
|
|
212
|
-
# Prepare to run the SQL parser.
|
|
213
|
-
graph = self.context.get(_DATAHUB_GRAPH_CONTEXT_KEY, None)
|
|
214
|
-
|
|
215
|
-
self.log.debug(
|
|
216
|
-
"Running the SQL parser %s (platform=%s, default db=%s, schema=%s): %s",
|
|
217
|
-
"with graph client" if graph else "in offline mode",
|
|
218
|
-
platform,
|
|
219
|
-
default_database,
|
|
220
|
-
default_schema,
|
|
221
|
-
sql,
|
|
222
|
-
)
|
|
223
|
-
sql_parsing_result: SqlParsingResult = create_lineage_sql_parsed_result(
|
|
224
|
-
query=sql,
|
|
225
|
-
graph=graph,
|
|
226
|
-
platform=platform,
|
|
227
|
-
platform_instance=None,
|
|
228
|
-
env=builder.DEFAULT_ENV,
|
|
229
|
-
default_db=default_database,
|
|
230
|
-
default_schema=default_schema,
|
|
231
|
-
)
|
|
232
|
-
self.log.debug(f"Got sql lineage {sql_parsing_result}")
|
|
233
|
-
|
|
234
|
-
if sql_parsing_result.debug_info.error:
|
|
235
|
-
error = sql_parsing_result.debug_info.error
|
|
236
|
-
run_facets["extractionError"] = ExtractionErrorRunFacet(
|
|
237
|
-
totalTasks=1,
|
|
238
|
-
failedTasks=1,
|
|
239
|
-
errors=[
|
|
240
|
-
ExtractionError(
|
|
241
|
-
errorMessage=str(error),
|
|
242
|
-
stackTrace=None,
|
|
243
|
-
task="datahub_sql_parser",
|
|
244
|
-
taskNumber=None,
|
|
245
|
-
)
|
|
246
|
-
],
|
|
247
|
-
)
|
|
248
|
-
|
|
249
|
-
# Save sql_parsing_result to the facets dict. It is removed from the
|
|
250
|
-
# facet dict in the extractor's processing logic.
|
|
251
|
-
run_facets[SQL_PARSING_RESULT_KEY] = sql_parsing_result # type: ignore
|
|
252
|
-
|
|
253
|
-
return TaskMetadata(
|
|
254
|
-
name=task_name,
|
|
255
|
-
inputs=[],
|
|
256
|
-
outputs=[],
|
|
257
|
-
run_facets=run_facets,
|
|
258
|
-
job_facets=job_facets,
|
|
259
|
-
)
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
class BigQueryInsertJobOperatorExtractor(BaseExtractor):
|
|
263
|
-
def extract(self) -> Optional[TaskMetadata]:
|
|
264
|
-
from airflow.providers.google.cloud.operators.bigquery import (
|
|
265
|
-
BigQueryInsertJobOperator, # type: ignore
|
|
266
|
-
)
|
|
267
|
-
|
|
268
|
-
operator: "BigQueryInsertJobOperator" = self.operator
|
|
269
|
-
sql = operator.configuration.get("query", {}).get("query")
|
|
270
|
-
if not sql:
|
|
271
|
-
self.log.warning("No query found in BigQueryInsertJobOperator")
|
|
272
|
-
return None
|
|
273
|
-
|
|
274
|
-
destination_table = operator.configuration.get("query", {}).get(
|
|
275
|
-
"destinationTable"
|
|
276
|
-
)
|
|
277
|
-
destination_table_urn = None
|
|
278
|
-
if destination_table:
|
|
279
|
-
project_id = destination_table.get("projectId")
|
|
280
|
-
dataset_id = destination_table.get("datasetId")
|
|
281
|
-
table_id = destination_table.get("tableId")
|
|
282
|
-
|
|
283
|
-
if project_id and dataset_id and table_id:
|
|
284
|
-
destination_table_urn = builder.make_dataset_urn(
|
|
285
|
-
platform="bigquery",
|
|
286
|
-
name=f"{project_id}.{dataset_id}.{table_id}",
|
|
287
|
-
env=builder.DEFAULT_ENV,
|
|
288
|
-
)
|
|
289
|
-
|
|
290
|
-
task_metadata = _parse_sql_into_task_metadata(
|
|
291
|
-
self,
|
|
292
|
-
sql,
|
|
293
|
-
platform="bigquery",
|
|
294
|
-
default_database=operator.project_id,
|
|
295
|
-
default_schema=None,
|
|
296
|
-
)
|
|
297
|
-
|
|
298
|
-
if destination_table_urn and task_metadata:
|
|
299
|
-
sql_parsing_result = task_metadata.run_facets.get(SQL_PARSING_RESULT_KEY)
|
|
300
|
-
if sql_parsing_result and isinstance(sql_parsing_result, SqlParsingResult):
|
|
301
|
-
sql_parsing_result.out_tables.append(destination_table_urn)
|
|
302
|
-
|
|
303
|
-
return task_metadata
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
class AthenaOperatorExtractor(BaseExtractor):
|
|
307
|
-
def extract(self) -> Optional[TaskMetadata]:
|
|
308
|
-
from airflow.providers.amazon.aws.operators.athena import (
|
|
309
|
-
AthenaOperator, # type: ignore
|
|
310
|
-
)
|
|
311
|
-
|
|
312
|
-
operator: "AthenaOperator" = self.operator
|
|
313
|
-
sql = operator.query
|
|
314
|
-
if not sql:
|
|
315
|
-
self.log.warning("No query found in AthenaOperator")
|
|
316
|
-
return None
|
|
317
|
-
|
|
318
|
-
return _parse_sql_into_task_metadata(
|
|
319
|
-
self,
|
|
320
|
-
sql,
|
|
321
|
-
platform="athena",
|
|
322
|
-
default_database=None,
|
|
323
|
-
default_schema=self.operator.database,
|
|
324
|
-
)
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
def _snowflake_default_schema(self: "SnowflakeExtractor") -> Optional[str]:
|
|
328
|
-
if hasattr(self.operator, "schema") and self.operator.schema is not None:
|
|
329
|
-
return self.operator.schema
|
|
330
|
-
return (
|
|
331
|
-
self.conn.extra_dejson.get("extra__snowflake__schema", "")
|
|
332
|
-
or self.conn.extra_dejson.get("schema", "")
|
|
333
|
-
or self.conn.schema
|
|
334
|
-
)
|
|
335
|
-
# TODO: Should we try a fallback of:
|
|
336
|
-
# execute_query_on_hook(self.hook, "SELECT current_schema();")[0][0]
|
|
{acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/WHEEL: File without changes
{acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/entry_points.txt: File without changes
{acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/top_level.txt: File without changes