acryl-datahub-airflow-plugin 1.3.1.4__py3-none-any.whl → 1.3.1.5__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (51)
  1. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +303 -0
  2. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +65 -0
  3. datahub_airflow_plugin/_airflow_compat.py +32 -0
  4. datahub_airflow_plugin/_airflow_shims.py +64 -31
  5. datahub_airflow_plugin/_airflow_version_specific.py +184 -0
  6. datahub_airflow_plugin/_config.py +97 -19
  7. datahub_airflow_plugin/_constants.py +16 -0
  8. datahub_airflow_plugin/_datahub_ol_adapter.py +14 -2
  9. datahub_airflow_plugin/_version.py +1 -1
  10. datahub_airflow_plugin/airflow2/__init__.py +6 -0
  11. datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +402 -0
  12. datahub_airflow_plugin/airflow2/_airflow_compat.py +95 -0
  13. datahub_airflow_plugin/airflow2/_extractors.py +477 -0
  14. datahub_airflow_plugin/airflow2/_legacy_shims.py +20 -0
  15. datahub_airflow_plugin/airflow2/_openlineage_compat.py +123 -0
  16. datahub_airflow_plugin/airflow2/_provider_shims.py +29 -0
  17. datahub_airflow_plugin/airflow2/_shims.py +88 -0
  18. datahub_airflow_plugin/airflow2/datahub_listener.py +1072 -0
  19. datahub_airflow_plugin/airflow3/__init__.py +6 -0
  20. datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +408 -0
  21. datahub_airflow_plugin/airflow3/_airflow_compat.py +108 -0
  22. datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +153 -0
  23. datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +273 -0
  24. datahub_airflow_plugin/airflow3/_shims.py +82 -0
  25. datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +88 -0
  26. datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +308 -0
  27. datahub_airflow_plugin/airflow3/datahub_listener.py +1452 -0
  28. datahub_airflow_plugin/client/airflow_generator.py +147 -43
  29. datahub_airflow_plugin/datahub_listener.py +19 -790
  30. datahub_airflow_plugin/example_dags/__init__.py +32 -0
  31. datahub_airflow_plugin/example_dags/airflow2/__init__.py +8 -0
  32. datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +54 -0
  33. datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +43 -0
  34. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +69 -0
  35. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +69 -0
  36. datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +81 -0
  37. datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +68 -0
  38. datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +99 -0
  39. datahub_airflow_plugin/example_dags/airflow3/__init__.py +8 -0
  40. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +51 -0
  41. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +51 -0
  42. datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +89 -0
  43. datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +12 -4
  44. datahub_airflow_plugin/hooks/datahub.py +11 -2
  45. datahub_airflow_plugin/operators/datahub.py +20 -3
  46. acryl_datahub_airflow_plugin-1.3.1.4.dist-info/METADATA +0 -90
  47. acryl_datahub_airflow_plugin-1.3.1.4.dist-info/RECORD +0 -33
  48. datahub_airflow_plugin/_extractors.py +0 -336
  49. {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/WHEEL +0 -0
  50. {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/entry_points.txt +0 -0
  51. {acryl_datahub_airflow_plugin-1.3.1.4.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/top_level.txt +0 -0
datahub_airflow_plugin/example_dags/__init__.py
@@ -0,0 +1,32 @@
+ """
+ DataHub Airflow Plugin Example DAGs
+
+ This directory contains example DAGs demonstrating various features of the DataHub Airflow plugin.
+
+ ## Directory Structure
+
+ - **airflow2/**: Example DAGs for Airflow 2.x with compatibility layers
+ - **airflow3/**: Example DAGs for Airflow 3.0+ using native syntax (no compatibility layers)
+ - Root directory: Legacy example DAGs with compatibility layers (deprecated, use airflow2/ or airflow3/)
+
+ ## Choosing the Right Examples
+
+ - If you're using **Airflow 3.0+**, refer to examples in `airflow3/`
+ - If you're using **Airflow 2.x**, refer to examples in `airflow2/`
+ - For production DAGs that need to work across multiple Airflow versions, see `airflow2/` for compatibility patterns
+
+ ## Available Examples
+
+ ### Lineage Collection
+ - `lineage_backend_demo.py`: Basic lineage collection using inlets/outlets
+ - `lineage_backend_taskflow_demo.py`: Lineage collection with TaskFlow API
+
+ ### Data Ingestion
+ - `snowflake_sample_dag.py`: Ingest Snowflake metadata into DataHub
+ - `mysql_sample_dag.py`: Ingest MySQL metadata into DataHub
+ - `generic_recipe_sample_dag.py`: Run any DataHub recipe from Airflow
+
+ ### Advanced Features
+ - `lineage_emission_dag.py`: Custom lineage emission with DatahubEmitterOperator
+ - `graph_usage_sample_dag.py`: Complex DAG graph with multiple dependencies
+ """
datahub_airflow_plugin/example_dags/airflow2/__init__.py
@@ -0,0 +1,8 @@
+ """
+ Airflow 2.x Example DAGs
+
+ This directory contains example DAGs for Airflow 2.x.
+ These DAGs use compatibility layers to work across multiple Airflow 2.x versions.
+
+ For Airflow 3.0+ examples, see the ../airflow3/ directory.
+ """
datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py
@@ -0,0 +1,54 @@
+ """Generic DataHub Ingest via Recipe
+
+ This example demonstrates how to load any configuration file and run a
+ DataHub ingestion pipeline within an Airflow DAG.
+ """
+
+ from datetime import timedelta
+
+ from airflow import DAG
+ from airflow.operators.python import PythonOperator
+
+ from datahub.configuration.config_loader import load_config_file
+ from datahub.ingestion.run.pipeline import Pipeline
+ from datahub_airflow_plugin._airflow_version_specific import (
+     days_ago,
+     get_airflow_compatible_dag_kwargs,
+ )
+
+ default_args = {
+     "owner": "airflow",
+     "depends_on_past": False,
+     "email": ["jdoe@example.com"],
+     "email_on_failure": False,
+     "email_on_retry": False,
+     "retries": 1,
+     "retry_delay": timedelta(minutes=5),
+     "execution_timeout": timedelta(minutes=120),
+ }
+
+
+ def datahub_recipe():
+     # Note that this will also resolve environment variables in the recipe.
+     config = load_config_file("path/to/recipe.yml")
+
+     pipeline = Pipeline.create(config)
+     pipeline.run()
+     pipeline.raise_from_status()
+
+
+ with DAG(
+     "datahub_ingest_using_recipe",
+     **get_airflow_compatible_dag_kwargs(
+         default_args=default_args,
+         description="An example DAG which runs a DataHub ingestion recipe",
+         start_date=days_ago(2),
+         schedule_interval=timedelta(days=1),
+         catchup=False,
+         default_view="tree",
+     ),
+ ) as dag:
+     ingest_task = PythonOperator(
+         task_id="ingest_using_recipe",
+         python_callable=datahub_recipe,
+     )
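
Note: the `get_airflow_compatible_dag_kwargs` and `days_ago` helpers used above come from `datahub_airflow_plugin/_airflow_version_specific.py`, which is added in this release but not shown in this excerpt. As a rough, non-authoritative sketch of what the kwargs helper presumably does, inferred from the inline compatibility blocks in the `lineage_backend_demo` examples further down (map `schedule_interval` to `schedule` on newer Airflow, drop `default_view` on Airflow 3):

```python
# Sketch only; not part of this diff. The real implementation in
# _airflow_version_specific.py may differ.
from typing import Any, Dict

import airflow
from packaging.version import Version

IS_AIRFLOW_3_OR_HIGHER = Version(airflow.__version__) >= Version("3.0.0")


def get_airflow_compatible_dag_kwargs_sketch(**kwargs: Any) -> Dict[str, Any]:
    dag_kwargs = dict(kwargs)
    if Version(airflow.__version__) >= Version("2.7.0"):
        # Newer Airflow accepts `schedule`; pass the interval through under that name.
        if "schedule_interval" in dag_kwargs:
            dag_kwargs["schedule"] = dag_kwargs.pop("schedule_interval")
    if IS_AIRFLOW_3_OR_HIGHER:
        # The examples below only pass default_view on pre-3.0 Airflow.
        dag_kwargs.pop("default_view", None)
    return dag_kwargs
```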
datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py
@@ -0,0 +1,43 @@
+ """This example DAG demonstrates how to create and use a DataHubGraph client."""
+
+ from datetime import timedelta
+
+ import pendulum
+ from airflow.decorators import ( # type: ignore[attr-defined] # Decorators not available in all Airflow versions
+     dag,
+     task,
+ )
+
+ from datahub.ingestion.graph.client import DataHubGraph, RemovedStatusFilter
+ from datahub_airflow_plugin._airflow_version_specific import (
+     get_airflow_compatible_dag_kwargs,
+ )
+ from datahub_airflow_plugin.hooks.datahub import DatahubRestHook
+
+ dag_decorator_kwargs = get_airflow_compatible_dag_kwargs(
+     start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
+     schedule_interval=timedelta(days=1),
+     catchup=False,
+ )
+
+
+ @dag(**dag_decorator_kwargs)
+ def datahub_graph_usage_sample_dag():
+     @task()
+     def use_the_graph():
+         graph: DataHubGraph = DatahubRestHook("my_datahub_rest_conn_id").make_graph()
+         graph.test_connection()
+
+         # Example usage: Find all soft-deleted BigQuery DEV entities
+         # in DataHub, and hard delete them.
+         for urn in graph.get_urns_by_filter(
+             platform="bigquery",
+             env="DEV",
+             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
+         ):
+             graph.hard_delete_entity(urn)
+
+     use_the_graph()
+
+
+ datahub_graph_usage_sample_dag()
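
Note: since `hard_delete_entity` is irreversible, a more cautious variant of the task above might list and log the matching URNs before deleting anything. A minimal sketch, reusing only the calls shown in this DAG (the connection id is illustrative):

```python
# Dry-run-first variant of use_the_graph(); sketch only, not part of this diff.
from datahub.ingestion.graph.client import DataHubGraph, RemovedStatusFilter
from datahub_airflow_plugin.hooks.datahub import DatahubRestHook


def cleanup_soft_deleted_bigquery_dev(dry_run: bool = True) -> None:
    graph: DataHubGraph = DatahubRestHook("my_datahub_rest_conn_id").make_graph()
    # Materialize the URN list first so it can be reviewed before deletion.
    urns = list(
        graph.get_urns_by_filter(
            platform="bigquery",
            env="DEV",
            status=RemovedStatusFilter.ONLY_SOFT_DELETED,
        )
    )
    print(f"Found {len(urns)} soft-deleted URNs")
    if not dry_run:
        for urn in urns:
            graph.hard_delete_entity(urn)
```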
datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py
@@ -0,0 +1,69 @@
+ """Lineage Backend
+
+ An example DAG demonstrating the usage of DataHub's Airflow lineage backend.
+ """
+
+ from datetime import timedelta
+
+ from airflow import DAG
+ from airflow.operators.bash import BashOperator
+
+ from datahub_airflow_plugin._airflow_version_specific import days_ago
+ from datahub_airflow_plugin.entities import Dataset, Urn
+
+ default_args = {
+     "owner": "airflow",
+     "depends_on_past": False,
+     "email": ["jdoe@example.com"],
+     "email_on_failure": False,
+     "execution_timeout": timedelta(minutes=5),
+ }
+
+
+ # Create DAG arguments conditionally for Airflow version compatibility
+ import airflow # noqa: E402
+
+ dag_kwargs = {
+     "dag_id": "datahub_lineage_backend_demo",
+     "default_args": default_args,
+     "description": "An example DAG demonstrating the usage of DataHub's Airflow lineage backend.",
+     "start_date": days_ago(2),
+     "tags": ["example_tag"],
+     "catchup": False,
+ }
+
+ # Handle schedule parameter change in Airflow 3.0
+ if hasattr(airflow, "__version__") and airflow.__version__.startswith(
+     ("3.", "2.10", "2.9", "2.8", "2.7")
+ ):
+     # Use schedule for newer Airflow versions (2.7+)
+     dag_kwargs["schedule"] = timedelta(days=1)
+ else:
+     # Use schedule_interval for older versions
+     dag_kwargs["schedule_interval"] = timedelta(days=1)
+
+ # Add default_view only for older Airflow versions that support it
+ if hasattr(airflow, "__version__") and not airflow.__version__.startswith("3."):
+     dag_kwargs["default_view"] = "tree"
+
+ with DAG(**dag_kwargs) as dag:
+     task1 = BashOperator(
+         task_id="run_data_task",
+         dag=dag,
+         bash_command="echo 'This is where you might run your data tooling.'",
+         inlets=[
+             Dataset(platform="snowflake", name="mydb.schema.tableA"),
+             Dataset(platform="snowflake", name="mydb.schema.tableB", env="DEV"),
+             Dataset(
+                 platform="snowflake",
+                 name="mydb.schema.tableC",
+                 platform_instance="cloud",
+             ),
+             # You can also put dataset URNs in the inlets/outlets lists.
+             Urn(
+                 "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)"
+             ),
+             Urn("urn:li:dataJob:(urn:li:dataFlow:(airflow,dag1,prod),task1)"),
+         ],
+         outlets=[Dataset("snowflake", "mydb.schema.tableD")],
+     )
datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py
@@ -0,0 +1,69 @@
+ """Lineage Backend
+
+ An example DAG demonstrating the usage of DataHub's Airflow lineage backend using the TaskFlow API.
+ """
+
+ from datetime import timedelta
+
+ from airflow.decorators import dag, task # type: ignore[attr-defined]
+
+ from datahub_airflow_plugin._airflow_version_specific import days_ago
+ from datahub_airflow_plugin.entities import Dataset, Urn
+
+ default_args = {
+     "owner": "airflow",
+     "depends_on_past": False,
+     "email": ["jdoe@example.com"],
+     "email_on_failure": False,
+     "execution_timeout": timedelta(minutes=5),
+ }
+
+
+ # Create DAG decorator arguments conditionally for Airflow version compatibility
+ import airflow # noqa: E402
+
+ dag_decorator_kwargs = {
+     "default_args": default_args,
+     "description": "An example DAG demonstrating the usage of DataHub's Airflow lineage backend using the TaskFlow API.",
+     "start_date": days_ago(2),
+     "tags": ["example_tag"],
+     "catchup": False,
+ }
+
+ # Handle schedule parameter change in Airflow 3.0
+ if hasattr(airflow, "__version__") and airflow.__version__.startswith(
+     ("3.", "2.10", "2.9", "2.8", "2.7")
+ ):
+     # Use schedule for newer Airflow versions (2.7+)
+     dag_decorator_kwargs["schedule"] = timedelta(days=1)
+ else:
+     # Use schedule_interval for older versions
+     dag_decorator_kwargs["schedule_interval"] = timedelta(days=1)
+
+ # Add default_view only for older Airflow versions that support it
+ if hasattr(airflow, "__version__") and not airflow.__version__.startswith("3."):
+     dag_decorator_kwargs["default_view"] = "tree"
+
+
+ @dag(**dag_decorator_kwargs)
+ def datahub_lineage_backend_taskflow_demo():
+     @task(
+         inlets=[
+             Dataset("snowflake", "mydb.schema.tableA"),
+             Dataset("snowflake", "mydb.schema.tableB", "DEV"),
+             # You can also put dataset URNs in the inlets/outlets lists.
+             Urn(
+                 "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)"
+             ),
+             Urn("urn:li:dataJob:(urn:li:dataFlow:(airflow,dag1,prod),task1)"),
+         ],
+         outlets=[Dataset("snowflake", "mydb.schema.tableD")],
+     )
+     def run_data_task():
+         # This is where you might run your data tooling.
+         pass
+
+     run_data_task()
+
+
+ datahub_lineage_backend_taskflow_dag = datahub_lineage_backend_taskflow_demo()
datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py
@@ -0,0 +1,81 @@
+ """Lineage Emission
+
+ This example demonstrates how to emit lineage to DataHub within an Airflow DAG.
+ """
+
+ from datetime import timedelta
+
+ from airflow import DAG
+ from airflow.operators.bash import BashOperator
+
+ import datahub.emitter.mce_builder as builder
+ from datahub_airflow_plugin._airflow_version_specific import days_ago
+ from datahub_airflow_plugin.operators.datahub import DatahubEmitterOperator
+
+ default_args = {
+     "owner": "airflow",
+     "depends_on_past": False,
+     "email": ["jdoe@example.com"],
+     "email_on_failure": False,
+     "email_on_retry": False,
+     "retries": 1,
+     "retry_delay": timedelta(minutes=5),
+     "execution_timeout": timedelta(minutes=120),
+ }
+
+
+ # Create DAG arguments conditionally for Airflow version compatibility
+ import airflow # noqa: E402
+
+ dag_kwargs = {
+     "dag_id": "datahub_lineage_emission_example",
+     "default_args": default_args,
+     "description": "An example DAG demonstrating lineage emission within an Airflow DAG.",
+     "start_date": days_ago(2),
+     "catchup": False,
+ }
+
+ # Handle schedule parameter change in Airflow 3.0
+ if hasattr(airflow, "__version__") and airflow.__version__.startswith(
+     ("3.", "2.10", "2.9", "2.8", "2.7")
+ ):
+     # Use schedule for newer Airflow versions (2.7+)
+     dag_kwargs["schedule"] = timedelta(days=1)
+ else:
+     # Use schedule_interval for older versions
+     dag_kwargs["schedule_interval"] = timedelta(days=1)
+
+ # Add default_view only for older Airflow versions that support it
+ if hasattr(airflow, "__version__") and not airflow.__version__.startswith("3."):
+     dag_kwargs["default_view"] = "tree"
+
+ with DAG(**dag_kwargs) as dag:
+     transformation_task = BashOperator(
+         task_id="transformation_task",
+         dag=dag,
+         bash_command="echo 'This is where you might run your data tooling.'",
+     )
+
+     emit_lineage_task = DatahubEmitterOperator(
+         task_id="emit_lineage",
+         datahub_conn_id="datahub_rest_default",
+         mces=[
+             builder.make_lineage_mce(
+                 upstream_urns=[
+                     builder.make_dataset_urn(
+                         platform="snowflake", name="mydb.schema.tableA"
+                     ),
+                     builder.make_dataset_urn_with_platform_instance(
+                         platform="snowflake",
+                         name="mydb.schema.tableB",
+                         platform_instance="cloud",
+                     ),
+                 ],
+                 downstream_urn=builder.make_dataset_urn(
+                     platform="snowflake", name="mydb.schema.tableC", env="DEV"
+                 ),
+             )
+         ],
+     )
+
+     transformation_task >> emit_lineage_task
datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py
@@ -0,0 +1,68 @@
+ """MySQL DataHub Ingest DAG
+
+ This example demonstrates how to ingest metadata from MySQL into DataHub
+ from within an Airflow DAG. Note that the DB connection configuration is
+ embedded within the code.
+ """
+
+ from datetime import datetime, timedelta
+
+ from airflow import DAG
+ from airflow.operators.python import PythonVirtualenvOperator
+
+ from datahub_airflow_plugin._airflow_version_specific import (
+     get_airflow_compatible_dag_kwargs,
+ )
+
+
+ def ingest_from_mysql():
+     from datahub.ingestion.run.pipeline import Pipeline
+
+     pipeline = Pipeline.create(
+         # This configuration is analogous to a recipe configuration.
+         {
+             "source": {
+                 "type": "mysql",
+                 "config": {
+                     # If you want to use Airflow connections, take a look at the snowflake_sample_dag.py example.
+                     "username": "user",
+                     "password": "pass",
+                     "database": "db_name",
+                     "host_port": "localhost:3306",
+                 },
+             },
+             "sink": {
+                 "type": "datahub-rest",
+                 "config": {"server": "http://localhost:8080"},
+             },
+         }
+     )
+     pipeline.run()
+     pipeline.pretty_print_summary()
+     pipeline.raise_from_status()
+
+
+ with DAG(
+     "datahub_mysql_ingest",
+     **get_airflow_compatible_dag_kwargs(
+         default_args={
+             "owner": "airflow",
+         },
+         description="An example DAG which ingests metadata from MySQL to DataHub",
+         start_date=datetime(2022, 1, 1),
+         schedule_interval=timedelta(days=1),
+         catchup=False,
+         default_view="tree",
+     ),
+ ) as dag:
+     # While it is also possible to use the PythonOperator, we recommend using
+     # the PythonVirtualenvOperator to ensure that there are no dependency
+     # conflicts between DataHub and the rest of your Airflow environment.
+     ingest_task = PythonVirtualenvOperator(
+         task_id="ingest_from_mysql",
+         requirements=[
+             "acryl-datahub[mysql]",
+         ],
+         system_site_packages=False,
+         python_callable=ingest_from_mysql,
+     )
datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py
@@ -0,0 +1,99 @@
+ """Snowflake DataHub Ingest DAG
+
+ This example demonstrates how to ingest metadata from Snowflake into DataHub
+ from within an Airflow DAG. In contrast to the MySQL example, this DAG
+ pulls the DB connection configuration from Airflow's connection store.
+ """
+
+ from datetime import datetime, timedelta
+
+ from airflow import DAG
+
+ from datahub_airflow_plugin._airflow_version_specific import (
+     IS_AIRFLOW_3_OR_HIGHER,
+     get_airflow_compatible_dag_kwargs,
+ )
+
+ if IS_AIRFLOW_3_OR_HIGHER:
+     from airflow.hooks.base import BaseHook # type: ignore[attr-defined]
+ else:
+     from airflow.hooks.base_hook import BaseHook # type: ignore[attr-defined]
+ from airflow.operators.python import PythonVirtualenvOperator
+
+
+ def ingest_from_snowflake(snowflake_credentials, datahub_gms_server):
+     from datahub.ingestion.run.pipeline import Pipeline
+
+     pipeline = Pipeline.create(
+         # This configuration is analogous to a recipe configuration.
+         {
+             "source": {
+                 "type": "snowflake",
+                 "config": {
+                     **snowflake_credentials,
+                     # Other Snowflake config can be added here.
+                     "profiling": {"enabled": False},
+                 },
+             },
+             # Other ingestion features, like transformers, are also supported.
+             # "transformers": [
+             #     {
+             #         "type": "simple_add_dataset_ownership",
+             #         "config": {
+             #             "owner_urns": [
+             #                 "urn:li:corpuser:example",
+             #             ]
+             #         },
+             #     }
+             # ],
+             "sink": {
+                 "type": "datahub-rest",
+                 "config": {"server": datahub_gms_server},
+             },
+         }
+     )
+     pipeline.run()
+     pipeline.pretty_print_summary()
+     pipeline.raise_from_status()
+
+
+ with DAG(
+     "datahub_snowflake_ingest",
+     **get_airflow_compatible_dag_kwargs(
+         default_args={
+             "owner": "airflow",
+         },
+         description="An example DAG which ingests metadata from Snowflake to DataHub",
+         start_date=datetime(2022, 1, 1),
+         schedule_interval=timedelta(days=1),
+         catchup=False,
+         default_view="tree",
+     ),
+ ) as dag:
+     # This example pulls credentials from Airflow's connection store.
+     # For this to work, you must have previously configured these connections in Airflow.
+     # See the Airflow docs for details: https://airflow.apache.org/docs/apache-airflow/stable/howto/connection.html
+     snowflake_conn = BaseHook.get_connection("snowflake_admin_default")
+     datahub_conn = BaseHook.get_connection("datahub_rest_default")
+
+     # While it is also possible to use the PythonOperator, we recommend using
+     # the PythonVirtualenvOperator to ensure that there are no dependency
+     # conflicts between DataHub and the rest of your Airflow environment.
+     ingest_task = PythonVirtualenvOperator(
+         task_id="ingest_from_snowflake",
+         requirements=[
+             "acryl-datahub[snowflake]",
+         ],
+         system_site_packages=False,
+         python_callable=ingest_from_snowflake,
+         op_kwargs={
+             "snowflake_credentials": {
+                 "username": snowflake_conn.login,
+                 "password": snowflake_conn.password,
+                 "account_id": snowflake_conn.extra_dejson["account"],
+                 "warehouse": snowflake_conn.extra_dejson.get("warehouse"),
+                 "role": snowflake_conn.extra_dejson.get("role"),
+             },
+             "datahub_gms_server": datahub_conn.host,
+         },
+     )
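
Note: the `op_kwargs` above read the Snowflake credentials from the connection's login, password, and three extras keys. A hypothetical shape of the `snowflake_admin_default` connection that would satisfy those lookups (field names mirror what the DAG reads; all values are placeholders, not part of this diff):

```python
# Illustrative connection contents only; configure the real connection via the Airflow UI or CLI.
import json

snowflake_admin_default = {
    "conn_type": "snowflake",
    "login": "INGEST_USER",          # read via snowflake_conn.login
    "password": "********",          # read via snowflake_conn.password
    "extra": json.dumps(
        {
            "account": "my_account_identifier",  # required: extra_dejson["account"]
            "warehouse": "COMPUTE_WH",           # optional: extra_dejson.get("warehouse")
            "role": "DATAHUB_READER",            # optional: extra_dejson.get("role")
        }
    ),
}
```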
datahub_airflow_plugin/example_dags/airflow3/__init__.py
@@ -0,0 +1,8 @@
+ """
+ Airflow 3.0+ Example DAGs
+
+ This directory contains example DAGs specifically for Airflow 3.0+.
+ These DAGs use native Airflow 3.0 syntax without compatibility layers.
+
+ For Airflow 2.x examples, see the ../airflow2/ directory.
+ """
datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py
@@ -0,0 +1,51 @@
+ """Lineage Backend
+
+ An example DAG demonstrating the usage of DataHub's Airflow lineage backend.
+
+ This is the Airflow 3.0+ version.
+ """
+
+ from datetime import datetime, timedelta
+
+ from airflow import DAG
+ from airflow.operators.bash import BashOperator
+
+ from datahub_airflow_plugin.entities import Dataset, Urn
+
+ default_args = {
+     "owner": "airflow",
+     "depends_on_past": False,
+     "email": ["jdoe@example.com"],
+     "email_on_failure": False,
+     "execution_timeout": timedelta(minutes=5),
+ }
+
+ with DAG(
+     dag_id="datahub_lineage_backend_demo",
+     default_args=default_args,
+     description="An example DAG demonstrating the usage of DataHub's Airflow lineage backend.",
+     start_date=datetime(2023, 1, 1),
+     schedule=timedelta(days=1),
+     tags=["example_tag"],
+     catchup=False,
+ ) as dag:
+     task1 = BashOperator(
+         task_id="run_data_task",
+         dag=dag,
+         bash_command="echo 'This is where you might run your data tooling.'",
+         inlets=[
+             Dataset(platform="snowflake", name="mydb.schema.tableA"),
+             Dataset(platform="snowflake", name="mydb.schema.tableB", env="DEV"),
+             Dataset(
+                 platform="snowflake",
+                 name="mydb.schema.tableC",
+                 platform_instance="cloud",
+             ),
+             # You can also put dataset URNs in the inlets/outlets lists.
+             Urn(
+                 "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)"
+             ),
+             Urn("urn:li:dataJob:(urn:li:dataFlow:(airflow,dag1,prod),task1)"),
+         ],
+         outlets=[Dataset("snowflake", "mydb.schema.tableD")],
+     )
datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py
@@ -0,0 +1,51 @@
+ """Lineage Backend
+
+ An example DAG demonstrating the usage of DataHub's Airflow lineage backend using the TaskFlow API.
+
+ This is the Airflow 3.0+ version.
+ """
+
+ from datetime import datetime, timedelta
+
+ from airflow.decorators import dag, task # type: ignore[attr-defined]
+
+ from datahub_airflow_plugin.entities import Dataset, Urn
+
+ default_args = {
+     "owner": "airflow",
+     "depends_on_past": False,
+     "email": ["jdoe@example.com"],
+     "email_on_failure": False,
+     "execution_timeout": timedelta(minutes=5),
+ }
+
+
+ @dag(
+     default_args=default_args,
+     description="An example DAG demonstrating the usage of DataHub's Airflow lineage backend using the TaskFlow API.",
+     start_date=datetime(2023, 1, 1),
+     schedule=timedelta(days=1),
+     tags=["example_tag"],
+     catchup=False,
+ )
+ def datahub_lineage_backend_taskflow_demo():
+     @task(
+         inlets=[
+             Dataset("snowflake", "mydb.schema.tableA"),
+             Dataset("snowflake", "mydb.schema.tableB", "DEV"),
+             # You can also put dataset URNs in the inlets/outlets lists.
+             Urn(
+                 "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)"
+             ),
+             Urn("urn:li:dataJob:(urn:li:dataFlow:(airflow,dag1,prod),task1)"),
+         ],
+         outlets=[Dataset("snowflake", "mydb.schema.tableD")],
+     )
+     def run_data_task():
+         # This is where you might run your data tooling.
+         pass
+
+     run_data_task()
+
+
+ datahub_lineage_backend_taskflow_dag = datahub_lineage_backend_taskflow_demo()