acryl-datahub-dagster-plugin 0.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- acryl_datahub_dagster_plugin-0.0.0.dev0.dist-info/METADATA +115 -0
- acryl_datahub_dagster_plugin-0.0.0.dev0.dist-info/RECORD +11 -0
- acryl_datahub_dagster_plugin-0.0.0.dev0.dist-info/WHEEL +5 -0
- acryl_datahub_dagster_plugin-0.0.0.dev0.dist-info/entry_points.txt +2 -0
- acryl_datahub_dagster_plugin-0.0.0.dev0.dist-info/top_level.txt +1 -0
- datahub_dagster_plugin/__init__.py +21 -0
- datahub_dagster_plugin/client/__init__.py +0 -0
- datahub_dagster_plugin/client/dagster_generator.py +504 -0
- datahub_dagster_plugin/datahub_dagster_plugin.py +2 -0
- datahub_dagster_plugin/sensors/__init__.py +0 -0
- datahub_dagster_plugin/sensors/datahub_sensors.py +439 -0
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: acryl-datahub-dagster-plugin
|
|
3
|
+
Version: 0.0.0.dev0
|
|
4
|
+
Summary: Datahub Dagster plugin to capture executions and send to Datahub
|
|
5
|
+
Home-page: https://datahubproject.io/
|
|
6
|
+
License: Apache License 2.0
|
|
7
|
+
Project-URL: Documentation, https://datahubproject.io/docs/
|
|
8
|
+
Project-URL: Source, https://github.com/datahub-project/datahub
|
|
9
|
+
Project-URL: Changelog, https://github.com/datahub-project/datahub/releases
|
|
10
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
11
|
+
Classifier: Programming Language :: Python
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Intended Audience :: Developers
|
|
18
|
+
Classifier: Intended Audience :: Information Technology
|
|
19
|
+
Classifier: Intended Audience :: System Administrators
|
|
20
|
+
Classifier: License :: OSI Approved
|
|
21
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
22
|
+
Classifier: Operating System :: Unix
|
|
23
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
24
|
+
Classifier: Environment :: Console
|
|
25
|
+
Classifier: Environment :: MacOS X
|
|
26
|
+
Classifier: Topic :: Software Development
|
|
27
|
+
Requires-Python: >=3.8
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
Requires-Dist: acryl-datahub[datahub-rest]
|
|
30
|
+
Requires-Dist: dagster >=1.3.3
|
|
31
|
+
Requires-Dist: requests-file
|
|
32
|
+
Requires-Dist: requests
|
|
33
|
+
Requires-Dist: dagit >=1.3.3
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: dagster >=1.3.3 ; extra == 'dev'
|
|
36
|
+
Requires-Dist: types-freezegun ; extra == 'dev'
|
|
37
|
+
Requires-Dist: types-cachetools ; extra == 'dev'
|
|
38
|
+
Requires-Dist: tox ; extra == 'dev'
|
|
39
|
+
Requires-Dist: requests-file ; extra == 'dev'
|
|
40
|
+
Requires-Dist: types-click ==0.1.12 ; extra == 'dev'
|
|
41
|
+
Requires-Dist: flake8 >=6.0.0 ; extra == 'dev'
|
|
42
|
+
Requires-Dist: packaging ; extra == 'dev'
|
|
43
|
+
Requires-Dist: jsonpickle ; extra == 'dev'
|
|
44
|
+
Requires-Dist: types-dataclasses ; extra == 'dev'
|
|
45
|
+
Requires-Dist: black ==22.12.0 ; extra == 'dev'
|
|
46
|
+
Requires-Dist: build ; extra == 'dev'
|
|
47
|
+
Requires-Dist: flake8-tidy-imports >=4.3.0 ; extra == 'dev'
|
|
48
|
+
Requires-Dist: types-requests ; extra == 'dev'
|
|
49
|
+
Requires-Dist: mypy >=1.4.0 ; extra == 'dev'
|
|
50
|
+
Requires-Dist: pytest-cov >=2.8.1 ; extra == 'dev'
|
|
51
|
+
Requires-Dist: types-toml ; extra == 'dev'
|
|
52
|
+
Requires-Dist: acryl-datahub[datahub-rest] ; extra == 'dev'
|
|
53
|
+
Requires-Dist: types-pytz ; extra == 'dev'
|
|
54
|
+
Requires-Dist: types-PyYAML ; extra == 'dev'
|
|
55
|
+
Requires-Dist: requests-mock ; extra == 'dev'
|
|
56
|
+
Requires-Dist: pytest >=6.2.2 ; extra == 'dev'
|
|
57
|
+
Requires-Dist: requests ; extra == 'dev'
|
|
58
|
+
Requires-Dist: pytest-asyncio >=0.16.0 ; extra == 'dev'
|
|
59
|
+
Requires-Dist: types-tabulate ; extra == 'dev'
|
|
60
|
+
Requires-Dist: deepdiff ; extra == 'dev'
|
|
61
|
+
Requires-Dist: dagit >=1.3.3 ; extra == 'dev'
|
|
62
|
+
Requires-Dist: freezegun ; extra == 'dev'
|
|
63
|
+
Requires-Dist: flake8-bugbear ==23.3.12 ; extra == 'dev'
|
|
64
|
+
Requires-Dist: types-pkg-resources ; extra == 'dev'
|
|
65
|
+
Requires-Dist: isort >=5.7.0 ; extra == 'dev'
|
|
66
|
+
Requires-Dist: twine ; extra == 'dev'
|
|
67
|
+
Requires-Dist: types-python-dateutil ; extra == 'dev'
|
|
68
|
+
Requires-Dist: types-six ; extra == 'dev'
|
|
69
|
+
Requires-Dist: sqlalchemy-stubs ; extra == 'dev'
|
|
70
|
+
Requires-Dist: coverage >=5.1 ; extra == 'dev'
|
|
71
|
+
Requires-Dist: pydantic !=1.10.3,>=1.10.0 ; extra == 'dev'
|
|
72
|
+
Provides-Extra: ignore
|
|
73
|
+
Provides-Extra: integration-tests
|
|
74
|
+
Requires-Dist: dagster >=1.3.3 ; extra == 'integration-tests'
|
|
75
|
+
Requires-Dist: types-freezegun ; extra == 'integration-tests'
|
|
76
|
+
Requires-Dist: types-cachetools ; extra == 'integration-tests'
|
|
77
|
+
Requires-Dist: tox ; extra == 'integration-tests'
|
|
78
|
+
Requires-Dist: requests-file ; extra == 'integration-tests'
|
|
79
|
+
Requires-Dist: types-click ==0.1.12 ; extra == 'integration-tests'
|
|
80
|
+
Requires-Dist: flake8 >=6.0.0 ; extra == 'integration-tests'
|
|
81
|
+
Requires-Dist: packaging ; extra == 'integration-tests'
|
|
82
|
+
Requires-Dist: jsonpickle ; extra == 'integration-tests'
|
|
83
|
+
Requires-Dist: types-dataclasses ; extra == 'integration-tests'
|
|
84
|
+
Requires-Dist: black ==22.12.0 ; extra == 'integration-tests'
|
|
85
|
+
Requires-Dist: build ; extra == 'integration-tests'
|
|
86
|
+
Requires-Dist: flake8-tidy-imports >=4.3.0 ; extra == 'integration-tests'
|
|
87
|
+
Requires-Dist: types-requests ; extra == 'integration-tests'
|
|
88
|
+
Requires-Dist: mypy >=1.4.0 ; extra == 'integration-tests'
|
|
89
|
+
Requires-Dist: pytest-cov >=2.8.1 ; extra == 'integration-tests'
|
|
90
|
+
Requires-Dist: types-toml ; extra == 'integration-tests'
|
|
91
|
+
Requires-Dist: acryl-datahub[datahub-rest] ; extra == 'integration-tests'
|
|
92
|
+
Requires-Dist: types-pytz ; extra == 'integration-tests'
|
|
93
|
+
Requires-Dist: types-PyYAML ; extra == 'integration-tests'
|
|
94
|
+
Requires-Dist: requests-mock ; extra == 'integration-tests'
|
|
95
|
+
Requires-Dist: pytest >=6.2.2 ; extra == 'integration-tests'
|
|
96
|
+
Requires-Dist: requests ; extra == 'integration-tests'
|
|
97
|
+
Requires-Dist: pytest-asyncio >=0.16.0 ; extra == 'integration-tests'
|
|
98
|
+
Requires-Dist: types-tabulate ; extra == 'integration-tests'
|
|
99
|
+
Requires-Dist: deepdiff ; extra == 'integration-tests'
|
|
100
|
+
Requires-Dist: dagit >=1.3.3 ; extra == 'integration-tests'
|
|
101
|
+
Requires-Dist: freezegun ; extra == 'integration-tests'
|
|
102
|
+
Requires-Dist: flake8-bugbear ==23.3.12 ; extra == 'integration-tests'
|
|
103
|
+
Requires-Dist: types-pkg-resources ; extra == 'integration-tests'
|
|
104
|
+
Requires-Dist: isort >=5.7.0 ; extra == 'integration-tests'
|
|
105
|
+
Requires-Dist: twine ; extra == 'integration-tests'
|
|
106
|
+
Requires-Dist: types-python-dateutil ; extra == 'integration-tests'
|
|
107
|
+
Requires-Dist: types-six ; extra == 'integration-tests'
|
|
108
|
+
Requires-Dist: sqlalchemy-stubs ; extra == 'integration-tests'
|
|
109
|
+
Requires-Dist: coverage >=5.1 ; extra == 'integration-tests'
|
|
110
|
+
Requires-Dist: pydantic !=1.10.3,>=1.10.0 ; extra == 'integration-tests'
|
|
111
|
+
|
|
112
|
+
# Datahub Dagster Plugin
|
|
113
|
+
|
|
114
|
+
See the DataHub Dagster docs for details.
|
|
115
|
+
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
datahub_dagster_plugin/__init__.py,sha256=KN5nSxBdJqXJ-oGx6T8Hp0ta2pGtYDF1FD5UWFmCMTs,530
|
|
2
|
+
datahub_dagster_plugin/datahub_dagster_plugin.py,sha256=szsT7ddxs16eE56mhCdNXDtlakbNLmuhd2wUbkba5Xc,64
|
|
3
|
+
datahub_dagster_plugin/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
datahub_dagster_plugin/client/dagster_generator.py,sha256=yi40Fs4kY-Ba9g5eeH6kxI9VGRB1uiRHum1SMGIvAdU,18594
|
|
5
|
+
datahub_dagster_plugin/sensors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
datahub_dagster_plugin/sensors/datahub_sensors.py,sha256=oWAgOMlLeiJs37WzijfTK8W-vwnmGE0pf4IArFgt4YM,16652
|
|
7
|
+
acryl_datahub_dagster_plugin-0.0.0.dev0.dist-info/METADATA,sha256=PXm9UvKsGUNH8lNgMvKArW94GfHZihPs_ghq78iV9P8,5609
|
|
8
|
+
acryl_datahub_dagster_plugin-0.0.0.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
9
|
+
acryl_datahub_dagster_plugin-0.0.0.dev0.dist-info/entry_points.txt,sha256=-CtPxtYb1u-zR36QnUQrvJJ6qbf1eDw9SruA22XBPZw,116
|
|
10
|
+
acryl_datahub_dagster_plugin-0.0.0.dev0.dist-info/top_level.txt,sha256=JS8QiBAj3eMTcMNcstm_EXGAcziiXVNT2nzOcfhdEMc,23
|
|
11
|
+
acryl_datahub_dagster_plugin-0.0.0.dev0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
datahub_dagster_plugin
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Published on PyPI under the package name below
# (https://pypi.org/project/acryl-datahub-dagster-plugin/).
__package_name__ = "acryl-datahub-dagster-plugin"
__version__ = "0.0.0.dev0"


def is_dev_mode() -> bool:
    """Return True when the version string carries the ``dev0`` placeholder suffix."""
    dev_suffix = "dev0"
    return __version__.endswith(dev_suffix)


def nice_version_name() -> str:
    """Return the version for display, or a fixed notice when in dev mode."""
    if not is_dev_mode():
        return __version__
    return "unavailable (installed in develop mode)"


def get_provider_info():
    """Return a dict describing this plugin: package name, name and description."""
    provider_info = {
        "package-name": __package_name__,
        "name": __package_name__,
        "description": "Datahub metadata collector plugin",
    }
    return provider_info
|
|
File without changes
|
|
@@ -0,0 +1,504 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from logging import Logger
|
|
3
|
+
from typing import Any, Callable, Dict, List, NamedTuple, Optional, Sequence, Set
|
|
4
|
+
from urllib.parse import urlsplit
|
|
5
|
+
|
|
6
|
+
import pydantic
|
|
7
|
+
from dagster import DagsterRunStatus, PathMetadataValue, RunStatusSensorContext
|
|
8
|
+
from dagster._core.execution.stats import RunStepKeyStatsSnapshot, StepEventStatus
|
|
9
|
+
from dagster._core.snap import JobSnapshot
|
|
10
|
+
from dagster._core.snap.node import OpDefSnap
|
|
11
|
+
from dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatsSnapshot
|
|
12
|
+
from datahub.api.entities.datajob import DataFlow, DataJob
|
|
13
|
+
from datahub.api.entities.dataprocess.dataprocess_instance import (
|
|
14
|
+
DataProcessInstance,
|
|
15
|
+
InstanceRunResult,
|
|
16
|
+
)
|
|
17
|
+
from datahub.api.entities.dataset.dataset import Dataset
|
|
18
|
+
from datahub.configuration.source_common import DatasetSourceConfigMixin
|
|
19
|
+
from datahub.emitter.mce_builder import (
|
|
20
|
+
make_data_platform_urn,
|
|
21
|
+
make_dataplatform_instance_urn,
|
|
22
|
+
)
|
|
23
|
+
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
24
|
+
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
|
|
25
|
+
from datahub.metadata.schema_classes import DataPlatformInstanceClass, SubTypesClass
|
|
26
|
+
from datahub.utilities.urns.data_flow_urn import DataFlowUrn
|
|
27
|
+
from datahub.utilities.urns.data_job_urn import DataJobUrn
|
|
28
|
+
from datahub.utilities.urns.dataset_urn import DatasetUrn
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class Constant:
    """
    String keys shared across the Dagster plugin.
    """

    # Orchestrator name passed to DataHub when building flows and run results.
    ORCHESTRATOR = "dagster"

    # Default config constants
    DEFAULT_DATAHUB_REST_URL = "http://localhost:8080"

    # Environment variable constants
    DATAHUB_REST_URL = "DATAHUB_REST_URL"
    DATAHUB_ENV = "DATAHUB_ENV"
    DATAHUB_PLATFORM_INSTANCE = "DATAHUB_PLATFORM_INSTANCE"
    DAGSTER_UI_URL = "DAGSTER_UI_URL"

    # Metadata keys under which an op input/output may declare DataHub dataset urns
    DATAHUB_INPUTS = "datahub.inputs"
    DATAHUB_OUTPUTS = "datahub.outputs"

    # Job run constants: attribute names looked up on the run / run stats objects
    JOB_SNAPSHOT_ID = "job_snapshot_id"
    EXECUTION_PLAN_SNAPSHOT_ID = "execution_plan_snapshot_id"
    ROOT_RUN_ID = "root_run_id"
    PARENT_RUN_ID = "parent_run_id"
    HAS_REPOSITORY_LOAD_DATA = "has_repository_load_data"
    TAGS = "tags"
    STEPS_SUCCEEDED = "steps_succeeded"
    STEPS_FAILED = "steps_failed"
    MATERIALIZATIONS = "materializations"
    EXPECTATIONS = "expectations"
    ENQUEUED_TIME = "enqueued_time"
    LAUNCH_TIME = "launch_time"
    START_TIME = "start_time"
    END_TIME = "end_time"

    # Op run constants: attribute names looked up on the step stats object
    STEP_KEY = "step_key"
    ATTEMPTS = "attempts"
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class DatasetLineage(NamedTuple):
    """Pair of string sets describing the datasets read and written."""

    # Identifiers of the datasets consumed (presumably dataset urns; confirm with callers).
    inputs: Set[str]
    # Identifiers of the datasets produced (presumably dataset urns; confirm with callers).
    outputs: Set[str]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class DatahubDagsterSourceConfig(DatasetSourceConfigMixin):
    """
    Configuration of the DataHub Dagster plugin.

    Extends DatasetSourceConfigMixin with the DataHub client settings, the
    Dagster UI location and switches controlling what lineage is captured.
    """

    # Settings of the DataHub client; defaults to a default-constructed config.
    datahub_client_config: DatahubClientConfig = pydantic.Field(
        default=DatahubClientConfig(),
        description="Datahub client config",
    )

    # Base URL of the Dagster UI; when unset, no links are attached to emitted entities.
    dagster_url: Optional[str] = pydantic.Field(
        default=None,
        description="Dagster UI URL. Like: https://myDagsterCloudEnvironment.dagster.cloud/prod",
    )

    capture_asset_materialization: bool = pydantic.Field(
        default=True,
        description="Whether to capture asset keys as Dataset on AssetMaterialization event",
    )

    # The description previously had an unbalanced "(" which leaked into generated docs.
    capture_input_output: bool = pydantic.Field(
        default=False,
        description="Whether to capture and try to parse input and output from HANDLED_OUTPUT, LOADED_INPUT event. (currently only filepathvalue metadata supported)",
    )

    # Optional user hook: receives the sensor context, the generator and the graph
    # client, and returns a DatasetLineage per string key.
    asset_lineage_extractor: Optional[
        Callable[
            [RunStatusSensorContext, "DagsterGenerator", DataHubGraph],
            Dict[str, DatasetLineage],
        ]
    ] = pydantic.Field(
        default=None,
        description="Custom asset lineage extractor function. See details at [https://datahubproject.io/docs/lineage/dagster/#define-your-custom-logic-to-capture-asset-lineage-information]",
    )
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _str_urn_to_dataset_urn(urns: List[str]) -> List[DatasetUrn]:
    """Parse every string urn into a DatasetUrn, keeping the input order."""
    return list(map(DatasetUrn.create_from_string, urns))
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@dataclass
class DagsterEnvironment:
    """Describes the Dagster deployment that ids and UI links are built for."""

    # Repository name; required, may be None. Its use is not visible here.
    repository: Optional[str]
    # When True, the branch is inserted into generated ids and UI URLs.
    is_cloud: bool = True
    # NOTE(review): presumably marks a Dagster Cloud branch deployment - confirm with callers.
    is_branch_deployment: bool = False
    # Branch segment used for cloud deployments.
    branch: Optional[str] = "prod"
    # Module name; becomes the "locations/<module>" URL segment and part of the ids.
    module: Optional[str] = None
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def job_url_generator(dagster_url: str, dagster_environment: DagsterEnvironment) -> str:
    """
    Build the base Dagster UI URL for the given environment.

    :param dagster_url: str - root URL of the Dagster UI
    :param dagster_environment: DagsterEnvironment - deployment description
    :return: str - the root URL, followed by the branch for cloud deployments
        and by ``locations/<module>`` when a module is set
    """
    environment = dagster_environment
    root = (
        f"{dagster_url}/{environment.branch}" if environment.is_cloud else dagster_url
    )

    # Without a module there is no code-location segment to append.
    if not environment.module:
        return root

    return f"{root}/locations/{environment.module}"
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class DagsterGenerator:
    """
    Translates Dagster jobs, ops, runs and assets into DataHub entities.

    Holds the logger, the plugin config and the Dagster environment description
    that the ``generate_*`` and ``emit_*`` methods read.
    """

    def __init__(
        self,
        logger: Logger,
        config: DatahubDagsterSourceConfig,
        dagster_environment: DagsterEnvironment,
    ) -> None:
        """
        :param logger: Logger - used for informational messages
        :param config: DatahubDagsterSourceConfig - plugin configuration
        :param dagster_environment: DagsterEnvironment - deployment description
        """
        self.logger = logger
        self.config = config
        self.dagster_environment = dagster_environment

    def _qualified_id(self, name: str) -> str:
        """Prefix ``name`` with the module, and with the branch first on cloud deployments."""
        environment = self.dagster_environment
        if environment.is_cloud:
            return f"{environment.branch}/{environment.module}/{name}"
        return f"{environment.module}/{name}"

    def _run_url(self, run_id: str) -> Optional[str]:
        """Return the Dagster UI URL of a run, or None when no Dagster URL is configured."""
        if not self.config.dagster_url:
            return None
        if self.dagster_environment.is_cloud:
            return f"{self.config.dagster_url}/{self.dagster_environment.branch}/runs/{run_id}"
        return f"{self.config.dagster_url}/runs/{run_id}"

    @staticmethod
    def _non_null_attrs(keys: Sequence[str], *sources: Any) -> Dict[str, str]:
        """Collect ``str(getattr(source, key))`` for every key that is present and not None.

        Sources are visited in order for each key, so a later source overrides an earlier one.
        """
        collected: Dict[str, str] = {}
        for key in keys:
            for source in sources:
                value = getattr(source, key, None)
                if value is not None:
                    collected[key] = str(value)
        return collected

    def path_metadata_resolver(self, value: PathMetadataValue) -> Optional[DatasetUrn]:
        """
        Resolve path metadata to dataset urn

        :param value: PathMetadataValue - path metadata; its ``value`` is the path string
        :return: Optional[DatasetUrn] - None when the path is empty; otherwise a urn whose
            platform is the (normalised) URL scheme, or "file" for paths without a scheme
        """
        path = value.value
        if not path:
            return None

        if "://" not in path:
            return DatasetUrn(platform="file", name=path)

        url = urlsplit(path)
        # Some schemes are aliases of a single platform name.
        scheme_aliases = {"s3a": "s3", "s3n": "s3", "gs": "gcs"}
        scheme = scheme_aliases.get(url.scheme, url.scheme)

        # NOTE(review): url.path excludes the network location (e.g. the bucket in
        # s3://bucket/key). Kept as-is because changing it would alter emitted urns;
        # confirm whether dropping it is intended.
        return DatasetUrn(platform=scheme, name=url.path)

    def metadata_resolver(self, metadata: Any) -> Optional[DatasetUrn]:
        """
        Resolve metadata to dataset urn

        :param metadata: Any - a metadata value; only PathMetadataValue is supported
        :return: Optional[DatasetUrn] - the resolved urn, or None (after logging) for
            unsupported metadata types
        """
        if isinstance(metadata, PathMetadataValue):
            return self.path_metadata_resolver(metadata)

        self.logger.info(f"Unknown Metadata: {metadata} of type {type(metadata)}")
        return None

    def generate_dataflow(
        self,
        job_snapshot: JobSnapshot,
        env: str,
        platform_instance: Optional[str] = None,
    ) -> DataFlow:
        """
        Generates a Dataflow object from a Dagster Job Snapshot
        :param job_snapshot: JobSnapshot - Job snapshot object
        :param env: str
        :param platform_instance: Optional[str]
        :return: DataFlow - Data generated dataflow
        """
        dataflow = DataFlow(
            orchestrator=Constant.ORCHESTRATOR,
            id=self._qualified_id(job_snapshot.name),
            env=env,
            name=job_snapshot.name,
            platform_instance=platform_instance,
        )
        dataflow.description = job_snapshot.description
        dataflow.tags = set(job_snapshot.tags)

        if self.config.dagster_url:
            base_url = job_url_generator(
                dagster_url=self.config.dagster_url,
                dagster_environment=self.dagster_environment,
            )
            dataflow.url = f"{base_url}/jobs/{job_snapshot.name}"

        # Job metadata values are stringified so they fit the str -> str property bag.
        dataflow.properties = {
            key: str(metadata_value)
            for key, metadata_value in job_snapshot.metadata.items()
        }
        return dataflow

    def generate_datajob(
        self,
        job_snapshot: JobSnapshot,
        step_deps: Dict[str, List],
        op_def_snap: OpDefSnap,
        env: str,
        input_datasets: Dict[str, Set[DatasetUrn]],
        output_datasets: Dict[str, Set[DatasetUrn]],
        platform_instance: Optional[str] = None,
    ) -> DataJob:
        """
        Generates a Datajob object from a Dagster op snapshot
        :param job_snapshot: JobSnapshot - Job snapshot object
        :param step_deps: Dict[str, List] - upstream op names keyed by op name; must
            contain an entry for this op
        :param op_def_snap: OpDefSnap - Op def snapshot object
        :param env: str
        :param input_datasets: dict[str, Set[DatasetUrn]] - input datasets for each op
        :param output_datasets: dict[str, Set[DatasetUrn]] - output datasets for each op
        :param platform_instance: Optional[str]
        :return: DataJob - Data generated datajob
        """
        op_name = op_def_snap.name

        dataflow_urn = DataFlowUrn.create_from_ids(
            orchestrator=Constant.ORCHESTRATOR,
            flow_id=self._qualified_id(job_snapshot.name),
            env=env,
            platform_instance=platform_instance,
        )
        datajob = DataJob(
            id=self._qualified_id(op_name),
            flow_urn=dataflow_urn,
            name=op_name,
        )

        if self.config.dagster_url:
            base_url = job_url_generator(
                dagster_url=self.config.dagster_url,
                dagster_environment=self.dagster_environment,
            )
            datajob.url = f"{base_url}/jobs/{job_snapshot.name}/{op_name}"

        datajob.description = op_def_snap.description
        datajob.tags = set(op_def_snap.tags)

        # Add upstream dependencies for this op
        for upstream_op_name in step_deps[op_name]:
            datajob.upstream_urns.append(
                DataJobUrn.create_from_ids(
                    data_flow_urn=str(dataflow_urn),
                    job_id=self._qualified_id(upstream_op_name),
                )
            )

        if input_datasets:
            inlets = list(input_datasets.get(op_name, []))
            self.logger.info(f"Input datasets for {op_name} are {inlets}")
            datajob.inlets = inlets

        if output_datasets:
            outlets = list(output_datasets.get(op_name, []))
            self.logger.info(f"Output datasets for {op_name} are {outlets}")
            datajob.outlets = outlets

        # For all op inputs/outputs:
        # Add input/output details like its type, description, metadata etc in datajob properties.
        # Also, add datahub inputs/outputs if present in input/output metadata.
        # NOTE(review): input keys use "input." while output keys use "output_"; the
        # keys are emitted to DataHub, so they are preserved exactly.
        job_property_bag: Dict[str, str] = {}
        for input_def_snap in op_def_snap.input_def_snaps:
            job_property_bag[f"input.{input_def_snap.name}"] = str(
                input_def_snap._asdict()
            )
            if Constant.DATAHUB_INPUTS in input_def_snap.metadata:
                datajob.inlets.extend(
                    _str_urn_to_dataset_urn(
                        input_def_snap.metadata[Constant.DATAHUB_INPUTS].value  # type: ignore
                    )
                )

        for output_def_snap in op_def_snap.output_def_snaps:
            job_property_bag[f"output_{output_def_snap.name}"] = str(
                output_def_snap._asdict()
            )
            if Constant.DATAHUB_OUTPUTS in output_def_snap.metadata:
                datajob.outlets.extend(
                    _str_urn_to_dataset_urn(
                        output_def_snap.metadata[Constant.DATAHUB_OUTPUTS].value  # type: ignore
                    )
                )

        datajob.properties = job_property_bag

        return datajob

    def emit_job_run(
        self,
        graph: DataHubGraph,
        dataflow: DataFlow,
        run: DagsterRun,
        run_stats: DagsterRunStatsSnapshot,
    ) -> None:
        """
        Emit a latest job run
        :param graph: DataHubGraph - client used to emit the run events
        :param dataflow: DataFlow - DataFlow object
        :param run: DagsterRun - Dagster Run object
        :param run_stats: DagsterRunStatsSnapshot - latest job run stats
        :raises Exception: when the run status is not success, failure or canceled
        """
        dpi = DataProcessInstance.from_dataflow(dataflow=dataflow, id=run_stats.run_id)

        run_url = self._run_url(run.run_id)
        if run_url is not None:
            dpi.url = run_url

        # Add below details in dpi properties; run_stats values win over run values.
        allowed_job_run_keys = (
            Constant.JOB_SNAPSHOT_ID,
            Constant.EXECUTION_PLAN_SNAPSHOT_ID,
            Constant.ROOT_RUN_ID,
            Constant.PARENT_RUN_ID,
            Constant.HAS_REPOSITORY_LOAD_DATA,
            Constant.TAGS,
            Constant.STEPS_SUCCEEDED,
            Constant.STEPS_FAILED,
            Constant.MATERIALIZATIONS,
            Constant.EXPECTATIONS,
            Constant.ENQUEUED_TIME,
            Constant.LAUNCH_TIME,
            Constant.START_TIME,
            Constant.END_TIME,
        )
        dpi.properties.update(
            self._non_null_attrs(allowed_job_run_keys, run, run_stats)
        )

        status_result_map = {
            DagsterRunStatus.SUCCESS: InstanceRunResult.SUCCESS,
            DagsterRunStatus.FAILURE: InstanceRunResult.FAILURE,
            DagsterRunStatus.CANCELED: InstanceRunResult.SKIPPED,
        }

        # Validate before emitting anything so an unsupported status emits no events.
        if run.status not in status_result_map:
            raise Exception(
                f"Job run status should be either complete, failed or cancelled and it was "
                f"{run.status }"
            )

        # Dagster timestamps are multiplied by 1000 to get the millisecond values expected here.
        if run_stats.start_time is not None:
            dpi.emit_process_start(
                emitter=graph,
                start_timestamp_millis=int(run_stats.start_time * 1000),
            )

        if run_stats.end_time is not None:
            dpi.emit_process_end(
                emitter=graph,
                end_timestamp_millis=int(run_stats.end_time * 1000),
                result=status_result_map[run.status],
                result_type=Constant.ORCHESTRATOR,
            )

    def emit_op_run(
        self,
        graph: DataHubGraph,
        datajob: DataJob,
        run_step_stats: RunStepKeyStatsSnapshot,
    ) -> None:
        """
        Emit an op run
        :param graph: DataHubGraph
        :param datajob: DataJob - DataJob object
        :param run_step_stats: RunStepKeyStatsSnapshot - step(op) run stats
        :raises Exception: when the step status is not success, failure or skipped
        """
        dpi = DataProcessInstance.from_datajob(
            datajob=datajob,
            id=f"{run_step_stats.run_id}.{datajob.id}",
            clone_inlets=True,
            clone_outlets=True,
        )

        # The op run links to the page of the run it belongs to.
        run_url = self._run_url(run_step_stats.run_id)
        if run_url is not None:
            dpi.url = run_url

        # Add below details in dpi properties
        allowed_op_run_keys = (
            Constant.STEP_KEY,
            Constant.ATTEMPTS,
            Constant.START_TIME,
            Constant.END_TIME,
        )
        dpi.properties.update(
            self._non_null_attrs(allowed_op_run_keys, run_step_stats)
        )

        status_result_map = {
            StepEventStatus.SUCCESS: InstanceRunResult.SUCCESS,
            StepEventStatus.FAILURE: InstanceRunResult.FAILURE,
            StepEventStatus.SKIPPED: InstanceRunResult.SKIPPED,
        }

        # Validate before emitting anything so an unsupported status emits no events.
        if run_step_stats.status not in status_result_map:
            raise Exception(
                f"Step run status should be either complete, failed or cancelled and it was "
                f"{run_step_stats.status }"
            )

        if run_step_stats.start_time is not None:
            dpi.emit_process_start(
                emitter=graph,
                start_timestamp_millis=int(run_step_stats.start_time * 1000),
            )

        if run_step_stats.end_time is not None:
            dpi.emit_process_end(
                emitter=graph,
                end_timestamp_millis=int(run_step_stats.end_time * 1000),
                result=status_result_map[run_step_stats.status],
                result_type=Constant.ORCHESTRATOR,
            )

    def dataset_urn_from_asset(self, asset_key: Sequence[str]) -> DatasetUrn:
        """
        Generate dataset urn from asset key

        :param asset_key: Sequence[str] - asset key parts, joined with "/" to form the name
        :return: DatasetUrn - urn on the "dagster" platform in the configured env
        """
        return DatasetUrn(
            platform="dagster", env=self.config.env, name="/".join(asset_key)
        )

    def emit_asset(
        self,
        graph: DataHubGraph,
        asset_key: Sequence[str],
        description: Optional[str],
        properties: Optional[Dict[str, str]],
    ) -> str:
        """
        Emit asset to datahub

        :param graph: DataHubGraph - client used to emit the metadata
        :param asset_key: Sequence[str] - asset key parts; the last part is the dataset
            name, so the key must not be empty
        :param description: Optional[str] - dataset description
        :param properties: Optional[Dict[str, str]] - dataset properties
        :return: str - urn of the emitted dataset
        """
        dataset_urn = self.dataset_urn_from_asset(asset_key)
        urn = dataset_urn.urn()

        dataset = Dataset(
            id=None,
            urn=urn,
            platform="dagster",
            name=asset_key[-1],
            schema=None,
            downstreams=None,
            subtype="Asset",
            subtypes=None,
            description=description,
            env=self.config.env,
            properties=properties,
        )
        for mcp in dataset.generate_mcp():
            graph.emit_mcp(mcp)

        # Explicitly mark the dataset with the "Asset" subtype.
        graph.emit_mcp(
            MetadataChangeProposalWrapper(
                entityUrn=urn,
                aspect=SubTypesClass(typeNames=["Asset"]),
            )
        )

        # Attach the platform instance only when one is configured.
        if self.config.platform_instance:
            graph.emit_mcp(
                MetadataChangeProposalWrapper(
                    entityUrn=urn,
                    aspect=DataPlatformInstanceClass(
                        instance=make_dataplatform_instance_urn(
                            instance=self.config.platform_instance,
                            platform="dagster",
                        ),
                        platform=make_data_platform_urn("dagster"),
                    ),
                )
            )
        return urn
|
|
File without changes
|
|
@@ -0,0 +1,439 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import traceback
|
|
3
|
+
from typing import Dict, List, Optional, Sequence, Set, Tuple
|
|
4
|
+
|
|
5
|
+
from dagster import (
|
|
6
|
+
DagsterRunStatus,
|
|
7
|
+
EventLogEntry,
|
|
8
|
+
RunStatusSensorContext,
|
|
9
|
+
SensorDefinition,
|
|
10
|
+
SkipReason,
|
|
11
|
+
run_status_sensor,
|
|
12
|
+
sensor,
|
|
13
|
+
)
|
|
14
|
+
from dagster._core.definitions.asset_selection import CoercibleToAssetSelection
|
|
15
|
+
from dagster._core.definitions.sensor_definition import (
|
|
16
|
+
DefaultSensorStatus,
|
|
17
|
+
RawSensorEvaluationFunctionReturn,
|
|
18
|
+
)
|
|
19
|
+
from dagster._core.definitions.target import ExecutableDefinition
|
|
20
|
+
from dagster._core.events import DagsterEventType, HandledOutputData, LoadedInputData
|
|
21
|
+
from dagster._core.execution.stats import RunStepKeyStatsSnapshot
|
|
22
|
+
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
23
|
+
from datahub.ingestion.graph.client import DataHubGraph
|
|
24
|
+
from datahub.metadata.schema_classes import SubTypesClass
|
|
25
|
+
|
|
26
|
+
from datahub_dagster_plugin.client.dagster_generator import (
|
|
27
|
+
DagsterEnvironment,
|
|
28
|
+
DagsterGenerator,
|
|
29
|
+
DatahubDagsterSourceConfig,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def make_datahub_sensor(
    config: DatahubDagsterSourceConfig,
    name: Optional[str] = None,
    minimum_interval_seconds: Optional[int] = None,
    description: Optional[str] = None,
    job: Optional[ExecutableDefinition] = None,
    jobs: Optional[Sequence[ExecutableDefinition]] = None,
    default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,
    asset_selection: Optional[CoercibleToAssetSelection] = None,
    required_resource_keys: Optional[Set[str]] = None,
) -> SensorDefinition:
    """Create the sensor that emits Dagster run metadata and lineage to DataHub.

    The returned sensor does not request runs itself. On every tick it builds
    a ``DatahubSensors`` instance from ``config`` and evaluates each of that
    instance's run status sensors against the tick's context.

    Args:
        config (DatahubDagsterSourceConfig): DataHub sensor configuration.
        name (Optional[str]): The name of the sensor. Defaults to
            "datahub_sensor".
        minimum_interval_seconds (Optional[int]): The minimum number of
            seconds that will elapse between sensor evaluations.
        description (Optional[str]): Passed through to Dagster's ``@sensor``.
        job (Optional[ExecutableDefinition]): Passed through to ``@sensor``.
        jobs (Optional[Sequence[ExecutableDefinition]]): Passed through to
            ``@sensor``.
        default_status (DefaultSensorStatus): Whether the sensor starts as
            running or not. The default status can be overridden from Dagit
            or via the GraphQL API.
        asset_selection (Optional[CoercibleToAssetSelection]): Passed through
            to ``@sensor``.
        required_resource_keys (Optional[Set[str]]): Passed through to
            ``@sensor``.

    Returns:
        SensorDefinition: The sensor to register next to the observed jobs.

    Examples:
        .. code-block:: python

            datahub_sensor = make_datahub_sensor(
                config
            )

            @repository
            def my_repo():
                return [my_job, datahub_sensor]

    """
    as_sensor = sensor(
        name=name,
        minimum_interval_seconds=minimum_interval_seconds,
        description=description,
        job=job,
        jobs=jobs,
        default_status=default_status,
        asset_selection=asset_selection,
        required_resource_keys=required_resource_keys,
    )

    def datahub_sensor(context):
        """
        Instantiate the DataHub run status sensors and evaluate each of them
        for the current tick.
        """
        # A new DatahubSensors (and therefore a new DataHub client) is built
        # on every evaluation of this sensor.
        run_status_sensors = DatahubSensors(config).sensors
        for status_sensor in run_status_sensors:
            status_sensor.evaluate_tick(context)
        return SkipReason("Trigger run status sensors if any new runs present...")

    return as_sensor(datahub_sensor)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class DatahubSensors:
|
|
90
|
+
def __init__(self, config: Optional[DatahubDagsterSourceConfig] = None):
    """
    Store the Dagster source configuration, connect to DataHub, and build
    one run status sensor per watched run status.

    Args:
        config: Plugin configuration. A default ``DatahubDagsterSourceConfig``
            is created when none (or a falsy value) is supplied.
    """
    self.config = config if config else DatahubDagsterSourceConfig()

    self.graph = DataHubGraph(self.config.datahub_client_config)
    self.graph.test_connection()

    # Every watched status is handled by the same callback; only the sensor
    # name and the run status differ.
    watched_statuses = (
        ("datahub_success_sensor", DagsterRunStatus.SUCCESS),
        ("datahub_failure_sensor", DagsterRunStatus.FAILURE),
        ("datahub_canceled_sensor", DagsterRunStatus.CANCELED),
    )
    self.sensors: List[SensorDefinition] = [
        run_status_sensor(name=sensor_name, run_status=status)(self._emit_metadata)
        for sensor_name, status in watched_statuses
    ]
|
|
121
|
+
|
|
122
|
+
def get_dagster_environment(
    self, context: RunStatusSensorContext
) -> Optional[DagsterEnvironment]:
    """
    Describe where the run was executed, based on the run's code origin and
    the Dagster Cloud environment variables.

    Args:
        context: Run status sensor context whose ``dagster_run`` is inspected.

    Returns:
        A ``DagsterEnvironment``, or ``None`` (after logging an error) when
        the run carries no code pointer or the code pointer has no ``module``
        attribute.
    """
    job_origin = context.dagster_run.job_code_origin
    if not (
        job_origin
        and job_origin.repository_origin
        and job_origin.repository_origin.code_pointer
    ):
        context.log.error("Unable to get Dagster Environment...")
        return None

    code_pointer = job_origin.repository_origin.code_pointer
    context.log.debug(f"code_pointer: {code_pointer}")

    # Not every code pointer exposes ``attribute``; the repository is optional.
    repository = getattr(code_pointer, "attribute", None)

    if not hasattr(code_pointer, "module"):
        context.log.error("Unable to get Module")
        return None
    module = code_pointer.module

    # Environment variables are always strings, so the flag has to be compared
    # with "1". The previous comparison with the integer 1 could never be
    # true, which left is_branch_deployment permanently False.
    branch_deployment_flag = os.getenv("DAGSTER_CLOUD_IS_BRANCH_DEPLOYMENT")

    return DagsterEnvironment(
        is_cloud=branch_deployment_flag is not None,
        is_branch_deployment=branch_deployment_flag == "1",
        branch=os.getenv("DAGSTER_CLOUD_DEPLOYMENT_NAME", "prod"),
        module=module,
        repository=repository,
    )
|
|
163
|
+
|
|
164
|
+
def process_asset_logs(
    self,
    dagster_generator: DagsterGenerator,
    log: EventLogEntry,
    dataset_inputs: Dict[str, set],
    dataset_outputs: Dict[str, set],
) -> None:
    """
    Emit the asset referenced by an asset event and record its urn per step.

    Asset materializations are recorded in ``dataset_outputs`` and asset
    observations in ``dataset_inputs``, both keyed by the step key of the log
    entry. Entries without a Dagster event or step key, and events of any
    other type, are ignored. Both dictionaries are updated in place.
    """
    if not log.dagster_event or not log.step_key:
        return

    event_type = log.dagster_event.event_type
    if event_type == DagsterEventType.ASSET_MATERIALIZATION:
        step_urns = dataset_outputs.setdefault(log.step_key, set())
        asset_event = log.asset_materialization
    elif event_type == DagsterEventType.ASSET_OBSERVATION:
        step_urns = dataset_inputs.setdefault(log.step_key, set())
        asset_event = log.asset_observation
    else:
        return

    # The per-step entry above is created even when the event carries no
    # payload, so the step key is still present in the result.
    if not asset_event:
        return

    # Metadata values are stringified before being passed as properties.
    properties = {
        name: str(entry)
        for name, entry in asset_event.metadata.items()  # type: ignore
    }
    dataset_urn = dagster_generator.emit_asset(
        self.graph,
        asset_event.asset_key.path,  # type: ignore
        asset_event.description,
        properties,  # type: ignore
    )
    step_urns.add(dataset_urn)
|
|
211
|
+
|
|
212
|
+
def process_handle_input_output(
    self,
    context: RunStatusSensorContext,
    log: EventLogEntry,
    dagster_generator: DagsterGenerator,
    dataset_inputs: Dict[str, set],
    dataset_outputs: Dict[str, set],
) -> None:
    """
    Resolve the ``path`` metadata of a handled-output / loaded-input event to
    a dataset urn and record it for the step.

    HANDLED_OUTPUT events feed ``dataset_outputs`` and LOADED_INPUT events
    feed ``dataset_inputs``, both keyed by step key and updated in place.
    Nothing happens unless ``capture_input_output`` is enabled in the config.
    """
    if not log.dagster_event or not log.step_key:
        return
    if not self.config.capture_input_output:
        return

    event_type = log.dagster_event.event_type
    if event_type == DagsterEventType.HANDLED_OUTPUT:
        step_urns = dataset_outputs.setdefault(log.step_key, set())
        expected_payload, direction = HandledOutputData, "Output"
    elif event_type == DagsterEventType.LOADED_INPUT:
        step_urns = dataset_inputs.setdefault(log.step_key, set())
        expected_payload, direction = LoadedInputData, "Input"
    else:
        return

    payload = log.dagster_event.event_specific_data
    if not isinstance(payload, expected_payload):
        return

    path = payload.metadata.get("path")
    context.log.debug(f"{direction} Path: {path}")
    context.log.debug(f"Metadata: {path}")
    if not path:
        return

    # The resolver may not recognise the path; only resolved urns are kept.
    urn = dagster_generator.metadata_resolver(path)
    if urn:
        context.log.debug(f"{direction} Urn: {urn}")
        step_urns.add(urn)
|
|
262
|
+
|
|
263
|
+
def process_dagster_logs(
    self, context: RunStatusSensorContext, dagster_generator: DagsterGenerator
) -> Tuple[Dict[str, set], Dict[str, set]]:
    """
    Scan the event log of the run and collect dataset urns per step.

    Returns:
        A ``(dataset_inputs, dataset_outputs)`` pair, each mapping a step key
        to the set of dataset urns gathered from the run's log entries.
    """
    inputs_by_step: Dict[str, set] = {}
    outputs_by_step: Dict[str, set] = {}

    # Only these event types are requested from the instance.
    relevant_event_types = {
        DagsterEventType.ASSET_MATERIALIZATION,
        DagsterEventType.ASSET_OBSERVATION,
        DagsterEventType.HANDLED_OUTPUT,
        DagsterEventType.LOADED_INPUT,
    }
    run_logs = context.instance.all_logs(
        context.dagster_run.run_id,
        relevant_event_types,
    )

    for entry in run_logs:
        if not (entry.dagster_event and entry.step_key):
            continue

        context.log.debug(f"Log: {entry.step_key} - {entry.dagster_event}")
        context.log.debug(f"Event Type: {entry.dagster_event.event_type}")

        if self.config.capture_input_output:
            self.process_handle_input_output(
                context=context,
                log=entry,
                dagster_generator=dagster_generator,
                dataset_inputs=inputs_by_step,
                dataset_outputs=outputs_by_step,
            )

        if self.config.capture_asset_materialization:
            self.process_asset_logs(
                dagster_generator=dagster_generator,
                log=entry,
                dataset_inputs=inputs_by_step,
                dataset_outputs=outputs_by_step,
            )

    return inputs_by_step, outputs_by_step
|
|
302
|
+
|
|
303
|
+
@staticmethod
def merge_dicts(dict1: Dict[str, Set], dict2: Dict[str, Set]) -> Dict[str, Set]:
    """
    Merge ``dict2`` into ``dict1`` by taking the union of the sets per key.

    ``dict1`` is updated in place and returned; ``dict2`` and the sets it
    holds are left untouched.
    """
    for key, value in dict2.items():
        # Always store a fresh set, so that dict1 never shares a set object
        # with dict2 (a later in-place change to one would otherwise leak
        # into the other).
        dict1[key] = dict1.get(key, set()).union(value)
    return dict1
|
|
314
|
+
|
|
315
|
+
def _emit_metadata(
    self, context: RunStatusSensorContext
) -> RawSensorEvaluationFunctionReturn:
    """
    Emit the metadata of a finished Dagster run to DataHub.

    This is the callback shared by the success, failure and canceled run
    status sensors. For the run in ``context`` it emits the job, the job run,
    every op of the job (with the dataset lineage gathered from the optional
    ``asset_lineage_extractor`` and from the run's event log) and, where step
    statistics exist, the op run.

    Returns:
        Always a ``SkipReason``: the sensor never requests runs. Any exception
        raised while emitting is logged with its traceback and reported as a
        ``SkipReason`` as well, so a DataHub failure does not fail the tick.
    """
    try:
        context.log.info("Emitting metadata...")

        run = context.dagster_run
        # Explicit checks instead of ``assert`` so the validation still runs
        # under ``python -O`` and the logged error says what was missing.
        if not run.job_snapshot_id:
            raise ValueError("Dagster run has no job_snapshot_id")
        if not run.execution_plan_snapshot_id:
            raise ValueError("Dagster run has no execution_plan_snapshot_id")

        dagster_environment = self.get_dagster_environment(context)
        if not dagster_environment:
            return SkipReason(
                "Unable to get Dagster Environment from DataHub Sensor"
            )

        context.log.debug(f"Dagster Environment: {dagster_environment}")

        dagster_generator = DagsterGenerator(
            logger=context.log,
            config=self.config,
            dagster_environment=dagster_environment,
        )

        job_snapshot = context.instance.get_job_snapshot(
            snapshot_id=run.job_snapshot_id
        )

        dataset_inputs: Dict[str, Set] = {}
        dataset_outputs: Dict[str, Set] = {}

        # Lineage supplied by the user-configured extractor, if any.
        if self.config.asset_lineage_extractor:
            asset_lineages = self.config.asset_lineage_extractor(
                context, dagster_generator, self.graph
            )
            for key, value in asset_lineages.items():
                dataset_inputs[key] = dataset_inputs.get(key, set()).union(
                    value.inputs
                )
                dataset_outputs[key] = dataset_outputs.get(key, set()).union(
                    value.outputs
                )

        # Lineage derived from the run's event log.
        (
            dataset_inputs_from_logs,
            dataset_outputs_from_logs,
        ) = self.process_dagster_logs(context, dagster_generator)

        dataset_inputs = DatahubSensors.merge_dicts(
            dataset_inputs, dataset_inputs_from_logs
        )
        dataset_outputs = DatahubSensors.merge_dicts(
            dataset_outputs, dataset_outputs_from_logs
        )

        context.log.debug(f"Outputs: {dataset_outputs}")
        # Emit dagster job entity which get mapped with datahub dataflow entity
        dataflow = dagster_generator.generate_dataflow(
            job_snapshot=job_snapshot,
            env=self.config.env,
            platform_instance=self.config.platform_instance,
        )
        dataflow.emit(self.graph)

        # Emit dagster job run which get mapped with datahub data process instance entity
        dagster_generator.emit_job_run(
            graph=self.graph,
            dataflow=dataflow,
            run=run,
            run_stats=context.instance.get_run_stats(run.run_id),
        )

        # Execution plan snapshot contains all steps(ops) dependency.
        execution_plan_snapshot = context.instance.get_execution_plan_snapshot(
            snapshot_id=run.execution_plan_snapshot_id
        )

        # Map step key with its run step stats
        run_step_stats: Dict[str, RunStepKeyStatsSnapshot] = {
            run_step_stat.step_key: run_step_stat
            for run_step_stat in context.instance.get_run_step_stats(run.run_id)
        }

        # For all dagster ops present in job:
        # Emit op entity which get mapped with datahub datajob entity.
        # Emit op run which get mapped with datahub data process instance entity.
        for op_def_snap in job_snapshot.node_defs_snapshot.op_def_snaps:
            datajob = dagster_generator.generate_datajob(
                job_snapshot=job_snapshot,
                step_deps=execution_plan_snapshot.step_deps,
                op_def_snap=op_def_snap,
                env=self.config.env,
                platform_instance=self.config.platform_instance,
                output_datasets=dataset_outputs,
                input_datasets=dataset_inputs,
            )
            context.log.info(f"Generated Datajob: {datajob}")
            datajob.emit(self.graph)

            self.graph.emit_mcp(
                mcp=MetadataChangeProposalWrapper(
                    entityUrn=str(datajob.urn),
                    aspect=SubTypesClass(
                        typeNames=["Op"],
                    ),
                )
            )

            # This callback also handles failed and canceled runs, so an op
            # may have no recorded step stats. Skip only its op run instead of
            # raising KeyError and dropping every remaining op.
            op_step_stats = run_step_stats.get(op_def_snap.name)
            if op_step_stats is None:
                context.log.warning(
                    f"No run step stats found for op {op_def_snap.name}, "
                    "skipping op run emission"
                )
                continue

            dagster_generator.emit_op_run(
                graph=self.graph,
                datajob=datajob,
                run_step_stats=op_step_stats,
            )

        return SkipReason("Pipeline metadata is emitted to DataHub")
    except Exception as e:
        # Deliberate best-effort boundary: report the problem, never fail the tick.
        context.log.error(
            f"Error in emitting metadata to DataHub: {e}. Traceback: {traceback.format_exc()}"
        )
        return SkipReason("Error in emitting metadata to DataHub")
|