acryl-datahub-airflow-plugin 1.3.1.4 → 1.3.1.5 (py3-none-any.whl)
This diff compares two publicly released versions of the package as they appear in the public registry. It is provided for informational purposes only and reflects the changes between those package versions.
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +303 -0
- acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +65 -0
- datahub_airflow_plugin/_airflow_compat.py +32 -0
- datahub_airflow_plugin/_airflow_shims.py +64 -31
- datahub_airflow_plugin/_airflow_version_specific.py +184 -0
- datahub_airflow_plugin/_config.py +97 -19
- datahub_airflow_plugin/_constants.py +16 -0
- datahub_airflow_plugin/_datahub_ol_adapter.py +14 -2
- datahub_airflow_plugin/_version.py +1 -1
- datahub_airflow_plugin/airflow2/__init__.py +6 -0
- datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +402 -0
- datahub_airflow_plugin/airflow2/_airflow_compat.py +95 -0
- datahub_airflow_plugin/airflow2/_extractors.py +477 -0
- datahub_airflow_plugin/airflow2/_legacy_shims.py +20 -0
- datahub_airflow_plugin/airflow2/_openlineage_compat.py +123 -0
- datahub_airflow_plugin/airflow2/_provider_shims.py +29 -0
- datahub_airflow_plugin/airflow2/_shims.py +88 -0
- datahub_airflow_plugin/airflow2/datahub_listener.py +1072 -0
- datahub_airflow_plugin/airflow3/__init__.py +6 -0
- datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +408 -0
- datahub_airflow_plugin/airflow3/_airflow_compat.py +108 -0
- datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +153 -0
- datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +273 -0
- datahub_airflow_plugin/airflow3/_shims.py +82 -0
- datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +88 -0
- datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +308 -0
- datahub_airflow_plugin/airflow3/datahub_listener.py +1452 -0
- datahub_airflow_plugin/client/airflow_generator.py +147 -43
- datahub_airflow_plugin/datahub_listener.py +19 -790
- datahub_airflow_plugin/example_dags/__init__.py +32 -0
- datahub_airflow_plugin/example_dags/airflow2/__init__.py +8 -0
- datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +54 -0
- datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +43 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +69 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +69 -0
- datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +81 -0
- datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +68 -0
- datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +99 -0
- datahub_airflow_plugin/example_dags/airflow3/__init__.py +8 -0
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +51 -0
- datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +51 -0
- datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +89 -0
- datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +12 -4
- datahub_airflow_plugin/hooks/datahub.py +11 -2
- datahub_airflow_plugin/operators/datahub.py +20 -3
- acryl_datahub_airflow_plugin-1.3.1.4.dist-info/METADATA +0 -90
- acryl_datahub_airflow_plugin-1.3.1.4.dist-info/RECORD +0 -33
- datahub_airflow_plugin/_extractors.py +0 -336
- {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/WHEEL +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/top_level.txt +0 -0
datahub_airflow_plugin/example_dags/__init__.py
@@ -0,0 +1,32 @@
+"""
+DataHub Airflow Plugin Example DAGs
+
+This directory contains example DAGs demonstrating various features of the DataHub Airflow plugin.
+
+## Directory Structure
+
+- **airflow2/**: Example DAGs for Airflow 2.x with compatibility layers
+- **airflow3/**: Example DAGs for Airflow 3.0+ using native syntax (no compatibility layers)
+- Root directory: Legacy example DAGs with compatibility layers (deprecated, use airflow2/ or airflow3/)
+
+## Choosing the Right Examples
+
+- If you're using **Airflow 3.0+**, refer to examples in `airflow3/`
+- If you're using **Airflow 2.x**, refer to examples in `airflow2/`
+- For production DAGs that need to work across multiple Airflow versions, see `airflow2/` for compatibility patterns
+
+## Available Examples
+
+### Lineage Collection
+- `lineage_backend_demo.py`: Basic lineage collection using inlets/outlets
+- `lineage_backend_taskflow_demo.py`: Lineage collection with TaskFlow API
+
+### Data Ingestion
+- `snowflake_sample_dag.py`: Ingest Snowflake metadata into DataHub
+- `mysql_sample_dag.py`: Ingest MySQL metadata into DataHub
+- `generic_recipe_sample_dag.py`: Run any DataHub recipe from Airflow
+
+### Advanced Features
+- `lineage_emission_dag.py`: Custom lineage emission with DatahubEmitterOperator
+- `graph_usage_sample_dag.py`: Complex DAG graph with multiple dependencies
+"""
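The compatibility pattern referenced in the docstring above, and used throughout the airflow2/ examples that follow, builds the DAG keyword arguments through get_airflow_compatible_dag_kwargs from this package. A minimal sketch, assuming the helper translates arguments such as schedule_interval and default_view for the running Airflow version (the DAG id and commands below are illustrative only, not part of the package):

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash import BashOperator

from datahub_airflow_plugin._airflow_version_specific import (
    get_airflow_compatible_dag_kwargs,
)

with DAG(
    "my_cross_version_dag",  # hypothetical DAG id, for illustration only
    **get_airflow_compatible_dag_kwargs(
        description="Runs on both Airflow 2.x and 3.x",
        start_date=datetime(2022, 1, 1),
        schedule_interval=timedelta(days=1),  # presumably normalized to `schedule` on Airflow 3+
        catchup=False,
    ),
) as dag:
    # Placeholder task; any operator works here.
    BashOperator(task_id="noop", bash_command="echo 'hello'")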
datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py
@@ -0,0 +1,54 @@
+"""Generic DataHub Ingest via Recipe
+
+This example demonstrates how to load any configuration file and run a
+DataHub ingestion pipeline within an Airflow DAG.
+"""
+
+from datetime import timedelta
+
+from airflow import DAG
+from airflow.operators.python import PythonOperator
+
+from datahub.configuration.config_loader import load_config_file
+from datahub.ingestion.run.pipeline import Pipeline
+from datahub_airflow_plugin._airflow_version_specific import (
+    days_ago,
+    get_airflow_compatible_dag_kwargs,
+)
+
+default_args = {
+    "owner": "airflow",
+    "depends_on_past": False,
+    "email": ["jdoe@example.com"],
+    "email_on_failure": False,
+    "email_on_retry": False,
+    "retries": 1,
+    "retry_delay": timedelta(minutes=5),
+    "execution_timeout": timedelta(minutes=120),
+}
+
+
+def datahub_recipe():
+    # Note that this will also resolve environment variables in the recipe.
+    config = load_config_file("path/to/recipe.yml")
+
+    pipeline = Pipeline.create(config)
+    pipeline.run()
+    pipeline.raise_from_status()
+
+
+with DAG(
+    "datahub_ingest_using_recipe",
+    **get_airflow_compatible_dag_kwargs(
+        default_args=default_args,
+        description="An example DAG which runs a DataHub ingestion recipe",
+        start_date=days_ago(2),
+        schedule_interval=timedelta(days=1),
+        catchup=False,
+        default_view="tree",
+    ),
+) as dag:
+    ingest_task = PythonOperator(
+        task_id="ingest_using_recipe",
+        python_callable=datahub_recipe,
+    )
datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py
@@ -0,0 +1,43 @@
+"""This example DAG demonstrates how to create and use a DataHubGraph client."""
+
+from datetime import timedelta
+
+import pendulum
+from airflow.decorators import (  # type: ignore[attr-defined] # Decorators not available in all Airflow versions
+    dag,
+    task,
+)
+
+from datahub.ingestion.graph.client import DataHubGraph, RemovedStatusFilter
+from datahub_airflow_plugin._airflow_version_specific import (
+    get_airflow_compatible_dag_kwargs,
+)
+from datahub_airflow_plugin.hooks.datahub import DatahubRestHook
+
+dag_decorator_kwargs = get_airflow_compatible_dag_kwargs(
+    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
+    schedule_interval=timedelta(days=1),
+    catchup=False,
+)
+
+
+@dag(**dag_decorator_kwargs)
+def datahub_graph_usage_sample_dag():
+    @task()
+    def use_the_graph():
+        graph: DataHubGraph = DatahubRestHook("my_datahub_rest_conn_id").make_graph()
+        graph.test_connection()
+
+        # Example usage: Find all soft-deleted BigQuery DEV entities
+        # in DataHub, and hard delete them.
+        for urn in graph.get_urns_by_filter(
+            platform="bigquery",
+            env="DEV",
+            status=RemovedStatusFilter.ONLY_SOFT_DELETED,
+        ):
+            graph.hard_delete_entity(urn)
+
+    use_the_graph()
+
+
+datahub_graph_usage_sample_dag()
datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py
@@ -0,0 +1,69 @@
+"""Lineage Backend
+
+An example DAG demonstrating the usage of DataHub's Airflow lineage backend.
+"""
+
+from datetime import timedelta
+
+from airflow import DAG
+from airflow.operators.bash import BashOperator
+
+from datahub_airflow_plugin._airflow_version_specific import days_ago
+from datahub_airflow_plugin.entities import Dataset, Urn
+
+default_args = {
+    "owner": "airflow",
+    "depends_on_past": False,
+    "email": ["jdoe@example.com"],
+    "email_on_failure": False,
+    "execution_timeout": timedelta(minutes=5),
+}
+
+
+# Create DAG arguments conditionally for Airflow version compatibility
+import airflow  # noqa: E402
+
+dag_kwargs = {
+    "dag_id": "datahub_lineage_backend_demo",
+    "default_args": default_args,
+    "description": "An example DAG demonstrating the usage of DataHub's Airflow lineage backend.",
+    "start_date": days_ago(2),
+    "tags": ["example_tag"],
+    "catchup": False,
+}
+
+# Handle schedule parameter change in Airflow 3.0
+if hasattr(airflow, "__version__") and airflow.__version__.startswith(
+    ("3.", "2.10", "2.9", "2.8", "2.7")
+):
+    # Use schedule for newer Airflow versions (2.7+)
+    dag_kwargs["schedule"] = timedelta(days=1)
+else:
+    # Use schedule_interval for older versions
+    dag_kwargs["schedule_interval"] = timedelta(days=1)
+
+# Add default_view only for older Airflow versions that support it
+if hasattr(airflow, "__version__") and not airflow.__version__.startswith("3."):
+    dag_kwargs["default_view"] = "tree"
+
+with DAG(**dag_kwargs) as dag:
+    task1 = BashOperator(
+        task_id="run_data_task",
+        dag=dag,
+        bash_command="echo 'This is where you might run your data tooling.'",
+        inlets=[
+            Dataset(platform="snowflake", name="mydb.schema.tableA"),
+            Dataset(platform="snowflake", name="mydb.schema.tableB", env="DEV"),
+            Dataset(
+                platform="snowflake",
+                name="mydb.schema.tableC",
+                platform_instance="cloud",
+            ),
+            # You can also put dataset URNs in the inlets/outlets lists.
+            Urn(
+                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)"
+            ),
+            Urn("urn:li:dataJob:(urn:li:dataFlow:(airflow,dag1,prod),task1)"),
+        ],
+        outlets=[Dataset("snowflake", "mydb.schema.tableD")],
+    )
datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py
@@ -0,0 +1,69 @@
+"""Lineage Backend
+
+An example DAG demonstrating the usage of DataHub's Airflow lineage backend using the TaskFlow API.
+"""
+
+from datetime import timedelta
+
+from airflow.decorators import dag, task  # type: ignore[attr-defined]
+
+from datahub_airflow_plugin._airflow_version_specific import days_ago
+from datahub_airflow_plugin.entities import Dataset, Urn
+
+default_args = {
+    "owner": "airflow",
+    "depends_on_past": False,
+    "email": ["jdoe@example.com"],
+    "email_on_failure": False,
+    "execution_timeout": timedelta(minutes=5),
+}
+
+
+# Create DAG decorator arguments conditionally for Airflow version compatibility
+import airflow  # noqa: E402
+
+dag_decorator_kwargs = {
+    "default_args": default_args,
+    "description": "An example DAG demonstrating the usage of DataHub's Airflow lineage backend using the TaskFlow API.",
+    "start_date": days_ago(2),
+    "tags": ["example_tag"],
+    "catchup": False,
+}
+
+# Handle schedule parameter change in Airflow 3.0
+if hasattr(airflow, "__version__") and airflow.__version__.startswith(
+    ("3.", "2.10", "2.9", "2.8", "2.7")
+):
+    # Use schedule for newer Airflow versions (2.7+)
+    dag_decorator_kwargs["schedule"] = timedelta(days=1)
+else:
+    # Use schedule_interval for older versions
+    dag_decorator_kwargs["schedule_interval"] = timedelta(days=1)
+
+# Add default_view only for older Airflow versions that support it
+if hasattr(airflow, "__version__") and not airflow.__version__.startswith("3."):
+    dag_decorator_kwargs["default_view"] = "tree"
+
+
+@dag(**dag_decorator_kwargs)
+def datahub_lineage_backend_taskflow_demo():
+    @task(
+        inlets=[
+            Dataset("snowflake", "mydb.schema.tableA"),
+            Dataset("snowflake", "mydb.schema.tableB", "DEV"),
+            # You can also put dataset URNs in the inlets/outlets lists.
+            Urn(
+                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)"
+            ),
+            Urn("urn:li:dataJob:(urn:li:dataFlow:(airflow,dag1,prod),task1)"),
+        ],
+        outlets=[Dataset("snowflake", "mydb.schema.tableD")],
+    )
+    def run_data_task():
+        # This is where you might run your data tooling.
+        pass
+
+    run_data_task()
+
+
+datahub_lineage_backend_taskflow_dag = datahub_lineage_backend_taskflow_demo()
datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py
@@ -0,0 +1,81 @@
+"""Lineage Emission
+
+This example demonstrates how to emit lineage to DataHub within an Airflow DAG.
+"""
+
+from datetime import timedelta
+
+from airflow import DAG
+from airflow.operators.bash import BashOperator
+
+import datahub.emitter.mce_builder as builder
+from datahub_airflow_plugin._airflow_version_specific import days_ago
+from datahub_airflow_plugin.operators.datahub import DatahubEmitterOperator
+
+default_args = {
+    "owner": "airflow",
+    "depends_on_past": False,
+    "email": ["jdoe@example.com"],
+    "email_on_failure": False,
+    "email_on_retry": False,
+    "retries": 1,
+    "retry_delay": timedelta(minutes=5),
+    "execution_timeout": timedelta(minutes=120),
+}
+
+
+# Create DAG arguments conditionally for Airflow version compatibility
+import airflow  # noqa: E402
+
+dag_kwargs = {
+    "dag_id": "datahub_lineage_emission_example",
+    "default_args": default_args,
+    "description": "An example DAG demonstrating lineage emission within an Airflow DAG.",
+    "start_date": days_ago(2),
+    "catchup": False,
+}
+
+# Handle schedule parameter change in Airflow 3.0
+if hasattr(airflow, "__version__") and airflow.__version__.startswith(
+    ("3.", "2.10", "2.9", "2.8", "2.7")
+):
+    # Use schedule for newer Airflow versions (2.7+)
+    dag_kwargs["schedule"] = timedelta(days=1)
+else:
+    # Use schedule_interval for older versions
+    dag_kwargs["schedule_interval"] = timedelta(days=1)
+
+# Add default_view only for older Airflow versions that support it
+if hasattr(airflow, "__version__") and not airflow.__version__.startswith("3."):
+    dag_kwargs["default_view"] = "tree"
+
+with DAG(**dag_kwargs) as dag:
+    transformation_task = BashOperator(
+        task_id="transformation_task",
+        dag=dag,
+        bash_command="echo 'This is where you might run your data tooling.'",
+    )
+
+    emit_lineage_task = DatahubEmitterOperator(
+        task_id="emit_lineage",
+        datahub_conn_id="datahub_rest_default",
+        mces=[
+            builder.make_lineage_mce(
+                upstream_urns=[
+                    builder.make_dataset_urn(
+                        platform="snowflake", name="mydb.schema.tableA"
+                    ),
+                    builder.make_dataset_urn_with_platform_instance(
+                        platform="snowflake",
+                        name="mydb.schema.tableB",
+                        platform_instance="cloud",
+                    ),
+                ],
+                downstream_urn=builder.make_dataset_urn(
+                    platform="snowflake", name="mydb.schema.tableC", env="DEV"
+                ),
+            )
+        ],
+    )
+
+    transformation_task >> emit_lineage_task
datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py
@@ -0,0 +1,68 @@
+"""MySQL DataHub Ingest DAG
+
+This example demonstrates how to ingest metadata from MySQL into DataHub
+from within an Airflow DAG. Note that the DB connection configuration is
+embedded within the code.
+"""
+
+from datetime import datetime, timedelta
+
+from airflow import DAG
+from airflow.operators.python import PythonVirtualenvOperator
+
+from datahub_airflow_plugin._airflow_version_specific import (
+    get_airflow_compatible_dag_kwargs,
+)
+
+
+def ingest_from_mysql():
+    from datahub.ingestion.run.pipeline import Pipeline
+
+    pipeline = Pipeline.create(
+        # This configuration is analogous to a recipe configuration.
+        {
+            "source": {
+                "type": "mysql",
+                "config": {
+                    # If you want to use Airflow connections, take a look at the snowflake_sample_dag.py example.
+                    "username": "user",
+                    "password": "pass",
+                    "database": "db_name",
+                    "host_port": "localhost:3306",
+                },
+            },
+            "sink": {
+                "type": "datahub-rest",
+                "config": {"server": "http://localhost:8080"},
+            },
+        }
+    )
+    pipeline.run()
+    pipeline.pretty_print_summary()
+    pipeline.raise_from_status()
+
+
+with DAG(
+    "datahub_mysql_ingest",
+    **get_airflow_compatible_dag_kwargs(
+        default_args={
+            "owner": "airflow",
+        },
+        description="An example DAG which ingests metadata from MySQL to DataHub",
+        start_date=datetime(2022, 1, 1),
+        schedule_interval=timedelta(days=1),
+        catchup=False,
+        default_view="tree",
+    ),
+) as dag:
+    # While it is also possible to use the PythonOperator, we recommend using
+    # the PythonVirtualenvOperator to ensure that there are no dependency
+    # conflicts between DataHub and the rest of your Airflow environment.
+    ingest_task = PythonVirtualenvOperator(
+        task_id="ingest_from_mysql",
+        requirements=[
+            "acryl-datahub[mysql]",
+        ],
+        system_site_packages=False,
+        python_callable=ingest_from_mysql,
+    )
datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py
@@ -0,0 +1,99 @@
+"""Snowflake DataHub Ingest DAG
+
+This example demonstrates how to ingest metadata from Snowflake into DataHub
+from within an Airflow DAG. In contrast to the MySQL example, this DAG
+pulls the DB connection configuration from Airflow's connection store.
+"""
+
+from datetime import datetime, timedelta
+
+from airflow import DAG
+
+from datahub_airflow_plugin._airflow_version_specific import (
+    IS_AIRFLOW_3_OR_HIGHER,
+    get_airflow_compatible_dag_kwargs,
+)
+
+if IS_AIRFLOW_3_OR_HIGHER:
+    from airflow.hooks.base_hook import BaseHook  # type: ignore[attr-defined]
+else:
+    from airflow.hooks.base import BaseHook  # type: ignore[attr-defined]
+from airflow.operators.python import PythonVirtualenvOperator
+
+
+def ingest_from_snowflake(snowflake_credentials, datahub_gms_server):
+    from datahub.ingestion.run.pipeline import Pipeline
+
+    pipeline = Pipeline.create(
+        # This configuration is analogous to a recipe configuration.
+        {
+            "source": {
+                "type": "snowflake",
+                "config": {
+                    **snowflake_credentials,
+                    # Other Snowflake config can be added here.
+                    "profiling": {"enabled": False},
+                },
+            },
+            # Other ingestion features, like transformers, are also supported.
+            # "transformers": [
+            #     {
+            #         "type": "simple_add_dataset_ownership",
+            #         "config": {
+            #             "owner_urns": [
+            #                 "urn:li:corpuser:example",
+            #             ]
+            #         },
+            #     }
+            # ],
+            "sink": {
+                "type": "datahub-rest",
+                "config": {"server": datahub_gms_server},
+            },
+        }
+    )
+    pipeline.run()
+    pipeline.pretty_print_summary()
+    pipeline.raise_from_status()
+
+
+with DAG(
+    "datahub_snowflake_ingest",
+    **get_airflow_compatible_dag_kwargs(
+        default_args={
+            "owner": "airflow",
+        },
+        description="An example DAG which ingests metadata from Snowflake to DataHub",
+        start_date=datetime(2022, 1, 1),
+        schedule_interval=timedelta(days=1),
+        catchup=False,
+        default_view="tree",
+    ),
+) as dag:
+    # This example pulls credentials from Airflow's connection store.
+    # For this to work, you must have previously configured these connections in Airflow.
+    # See the Airflow docs for details: https://airflow.apache.org/docs/apache-airflow/stable/howto/connection.html
+    snowflake_conn = BaseHook.get_connection("snowflake_admin_default")
+    datahub_conn = BaseHook.get_connection("datahub_rest_default")
+
+    # While it is also possible to use the PythonOperator, we recommend using
+    # the PythonVirtualenvOperator to ensure that there are no dependency
+    # conflicts between DataHub and the rest of your Airflow environment.
+    ingest_task = PythonVirtualenvOperator(
+        task_id="ingest_from_snowflake",
+        requirements=[
+            "acryl-datahub[snowflake]",
+        ],
+        system_site_packages=False,
+        python_callable=ingest_from_snowflake,
+        op_kwargs={
+            "snowflake_credentials": {
+                "username": snowflake_conn.login,
+                "password": snowflake_conn.password,
+                "account_id": snowflake_conn.extra_dejson["account"],
+                "warehouse": snowflake_conn.extra_dejson.get("warehouse"),
+                "role": snowflake_conn.extra_dejson.get("role"),
+            },
+            "datahub_gms_server": datahub_conn.host,
+        },
+    )
datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py
@@ -0,0 +1,51 @@
+"""Lineage Backend
+
+An example DAG demonstrating the usage of DataHub's Airflow lineage backend.
+
+This is the Airflow 3.0+ version.
+"""
+
+from datetime import datetime, timedelta
+
+from airflow import DAG
+from airflow.operators.bash import BashOperator
+
+from datahub_airflow_plugin.entities import Dataset, Urn
+
+default_args = {
+    "owner": "airflow",
+    "depends_on_past": False,
+    "email": ["jdoe@example.com"],
+    "email_on_failure": False,
+    "execution_timeout": timedelta(minutes=5),
+}
+
+with DAG(
+    dag_id="datahub_lineage_backend_demo",
+    default_args=default_args,
+    description="An example DAG demonstrating the usage of DataHub's Airflow lineage backend.",
+    start_date=datetime(2023, 1, 1),
+    schedule=timedelta(days=1),
+    tags=["example_tag"],
+    catchup=False,
+) as dag:
+    task1 = BashOperator(
+        task_id="run_data_task",
+        dag=dag,
+        bash_command="echo 'This is where you might run your data tooling.'",
+        inlets=[
+            Dataset(platform="snowflake", name="mydb.schema.tableA"),
+            Dataset(platform="snowflake", name="mydb.schema.tableB", env="DEV"),
+            Dataset(
+                platform="snowflake",
+                name="mydb.schema.tableC",
+                platform_instance="cloud",
+            ),
+            # You can also put dataset URNs in the inlets/outlets lists.
+            Urn(
+                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)"
+            ),
+            Urn("urn:li:dataJob:(urn:li:dataFlow:(airflow,dag1,prod),task1)"),
+        ],
+        outlets=[Dataset("snowflake", "mydb.schema.tableD")],
+    )
datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py
@@ -0,0 +1,51 @@
+"""Lineage Backend
+
+An example DAG demonstrating the usage of DataHub's Airflow lineage backend using the TaskFlow API.
+
+This is the Airflow 3.0+ version.
+"""
+
+from datetime import datetime, timedelta
+
+from airflow.decorators import dag, task  # type: ignore[attr-defined]
+
+from datahub_airflow_plugin.entities import Dataset, Urn
+
+default_args = {
+    "owner": "airflow",
+    "depends_on_past": False,
+    "email": ["jdoe@example.com"],
+    "email_on_failure": False,
+    "execution_timeout": timedelta(minutes=5),
+}
+
+
+@dag(
+    default_args=default_args,
+    description="An example DAG demonstrating the usage of DataHub's Airflow lineage backend using the TaskFlow API.",
+    start_date=datetime(2023, 1, 1),
+    schedule=timedelta(days=1),
+    tags=["example_tag"],
+    catchup=False,
+)
+def datahub_lineage_backend_taskflow_demo():
+    @task(
+        inlets=[
+            Dataset("snowflake", "mydb.schema.tableA"),
+            Dataset("snowflake", "mydb.schema.tableB", "DEV"),
+            # You can also put dataset URNs in the inlets/outlets lists.
+            Urn(
+                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)"
+            ),
+            Urn("urn:li:dataJob:(urn:li:dataFlow:(airflow,dag1,prod),task1)"),
+        ],
+        outlets=[Dataset("snowflake", "mydb.schema.tableD")],
+    )
+    def run_data_task():
+        # This is where you might run your data tooling.
+        pass
+
+    run_data_task()
+
+
+datahub_lineage_backend_taskflow_dag = datahub_lineage_backend_taskflow_demo()