acryl-datahub-dagster-plugin 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,115 @@
1
+ Metadata-Version: 2.1
2
+ Name: acryl-datahub-dagster-plugin
3
+ Version: 0.0.0.dev0
4
+ Summary: Datahub Dagster plugin to capture executions and send to Datahub
5
+ Home-page: https://datahubproject.io/
6
+ License: Apache License 2.0
7
+ Project-URL: Documentation, https://datahubproject.io/docs/
8
+ Project-URL: Source, https://github.com/datahub-project/datahub
9
+ Project-URL: Changelog, https://github.com/datahub-project/datahub/releases
10
+ Classifier: Development Status :: 5 - Production/Stable
11
+ Classifier: Programming Language :: Python
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3 :: Only
14
+ Classifier: Programming Language :: Python :: 3.8
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Intended Audience :: Developers
18
+ Classifier: Intended Audience :: Information Technology
19
+ Classifier: Intended Audience :: System Administrators
20
+ Classifier: License :: OSI Approved
21
+ Classifier: License :: OSI Approved :: Apache Software License
22
+ Classifier: Operating System :: Unix
23
+ Classifier: Operating System :: POSIX :: Linux
24
+ Classifier: Environment :: Console
25
+ Classifier: Environment :: MacOS X
26
+ Classifier: Topic :: Software Development
27
+ Requires-Python: >=3.8
28
+ Description-Content-Type: text/markdown
29
+ Requires-Dist: acryl-datahub[datahub-rest]
30
+ Requires-Dist: dagster >=1.3.3
31
+ Requires-Dist: requests-file
32
+ Requires-Dist: requests
33
+ Requires-Dist: dagit >=1.3.3
34
+ Provides-Extra: dev
35
+ Requires-Dist: dagster >=1.3.3 ; extra == 'dev'
36
+ Requires-Dist: types-freezegun ; extra == 'dev'
37
+ Requires-Dist: types-cachetools ; extra == 'dev'
38
+ Requires-Dist: tox ; extra == 'dev'
39
+ Requires-Dist: requests-file ; extra == 'dev'
40
+ Requires-Dist: types-click ==0.1.12 ; extra == 'dev'
41
+ Requires-Dist: flake8 >=6.0.0 ; extra == 'dev'
42
+ Requires-Dist: packaging ; extra == 'dev'
43
+ Requires-Dist: jsonpickle ; extra == 'dev'
44
+ Requires-Dist: types-dataclasses ; extra == 'dev'
45
+ Requires-Dist: black ==22.12.0 ; extra == 'dev'
46
+ Requires-Dist: build ; extra == 'dev'
47
+ Requires-Dist: flake8-tidy-imports >=4.3.0 ; extra == 'dev'
48
+ Requires-Dist: types-requests ; extra == 'dev'
49
+ Requires-Dist: mypy >=1.4.0 ; extra == 'dev'
50
+ Requires-Dist: pytest-cov >=2.8.1 ; extra == 'dev'
51
+ Requires-Dist: types-toml ; extra == 'dev'
52
+ Requires-Dist: acryl-datahub[datahub-rest] ; extra == 'dev'
53
+ Requires-Dist: types-pytz ; extra == 'dev'
54
+ Requires-Dist: types-PyYAML ; extra == 'dev'
55
+ Requires-Dist: requests-mock ; extra == 'dev'
56
+ Requires-Dist: pytest >=6.2.2 ; extra == 'dev'
57
+ Requires-Dist: requests ; extra == 'dev'
58
+ Requires-Dist: pytest-asyncio >=0.16.0 ; extra == 'dev'
59
+ Requires-Dist: types-tabulate ; extra == 'dev'
60
+ Requires-Dist: deepdiff ; extra == 'dev'
61
+ Requires-Dist: dagit >=1.3.3 ; extra == 'dev'
62
+ Requires-Dist: freezegun ; extra == 'dev'
63
+ Requires-Dist: flake8-bugbear ==23.3.12 ; extra == 'dev'
64
+ Requires-Dist: types-pkg-resources ; extra == 'dev'
65
+ Requires-Dist: isort >=5.7.0 ; extra == 'dev'
66
+ Requires-Dist: twine ; extra == 'dev'
67
+ Requires-Dist: types-python-dateutil ; extra == 'dev'
68
+ Requires-Dist: types-six ; extra == 'dev'
69
+ Requires-Dist: sqlalchemy-stubs ; extra == 'dev'
70
+ Requires-Dist: coverage >=5.1 ; extra == 'dev'
71
+ Requires-Dist: pydantic !=1.10.3,>=1.10.0 ; extra == 'dev'
72
+ Provides-Extra: ignore
73
+ Provides-Extra: integration-tests
74
+ Requires-Dist: dagster >=1.3.3 ; extra == 'integration-tests'
75
+ Requires-Dist: types-freezegun ; extra == 'integration-tests'
76
+ Requires-Dist: types-cachetools ; extra == 'integration-tests'
77
+ Requires-Dist: tox ; extra == 'integration-tests'
78
+ Requires-Dist: requests-file ; extra == 'integration-tests'
79
+ Requires-Dist: types-click ==0.1.12 ; extra == 'integration-tests'
80
+ Requires-Dist: flake8 >=6.0.0 ; extra == 'integration-tests'
81
+ Requires-Dist: packaging ; extra == 'integration-tests'
82
+ Requires-Dist: jsonpickle ; extra == 'integration-tests'
83
+ Requires-Dist: types-dataclasses ; extra == 'integration-tests'
84
+ Requires-Dist: black ==22.12.0 ; extra == 'integration-tests'
85
+ Requires-Dist: build ; extra == 'integration-tests'
86
+ Requires-Dist: flake8-tidy-imports >=4.3.0 ; extra == 'integration-tests'
87
+ Requires-Dist: types-requests ; extra == 'integration-tests'
88
+ Requires-Dist: mypy >=1.4.0 ; extra == 'integration-tests'
89
+ Requires-Dist: pytest-cov >=2.8.1 ; extra == 'integration-tests'
90
+ Requires-Dist: types-toml ; extra == 'integration-tests'
91
+ Requires-Dist: acryl-datahub[datahub-rest] ; extra == 'integration-tests'
92
+ Requires-Dist: types-pytz ; extra == 'integration-tests'
93
+ Requires-Dist: types-PyYAML ; extra == 'integration-tests'
94
+ Requires-Dist: requests-mock ; extra == 'integration-tests'
95
+ Requires-Dist: pytest >=6.2.2 ; extra == 'integration-tests'
96
+ Requires-Dist: requests ; extra == 'integration-tests'
97
+ Requires-Dist: pytest-asyncio >=0.16.0 ; extra == 'integration-tests'
98
+ Requires-Dist: types-tabulate ; extra == 'integration-tests'
99
+ Requires-Dist: deepdiff ; extra == 'integration-tests'
100
+ Requires-Dist: dagit >=1.3.3 ; extra == 'integration-tests'
101
+ Requires-Dist: freezegun ; extra == 'integration-tests'
102
+ Requires-Dist: flake8-bugbear ==23.3.12 ; extra == 'integration-tests'
103
+ Requires-Dist: types-pkg-resources ; extra == 'integration-tests'
104
+ Requires-Dist: isort >=5.7.0 ; extra == 'integration-tests'
105
+ Requires-Dist: twine ; extra == 'integration-tests'
106
+ Requires-Dist: types-python-dateutil ; extra == 'integration-tests'
107
+ Requires-Dist: types-six ; extra == 'integration-tests'
108
+ Requires-Dist: sqlalchemy-stubs ; extra == 'integration-tests'
109
+ Requires-Dist: coverage >=5.1 ; extra == 'integration-tests'
110
+ Requires-Dist: pydantic !=1.10.3,>=1.10.0 ; extra == 'integration-tests'
111
+
112
+ # Datahub Dagster Plugin
113
+
114
+ See the DataHub Dagster docs for details.
115
+
@@ -0,0 +1,11 @@
1
+ datahub_dagster_plugin/__init__.py,sha256=KN5nSxBdJqXJ-oGx6T8Hp0ta2pGtYDF1FD5UWFmCMTs,530
2
+ datahub_dagster_plugin/datahub_dagster_plugin.py,sha256=szsT7ddxs16eE56mhCdNXDtlakbNLmuhd2wUbkba5Xc,64
3
+ datahub_dagster_plugin/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ datahub_dagster_plugin/client/dagster_generator.py,sha256=yi40Fs4kY-Ba9g5eeH6kxI9VGRB1uiRHum1SMGIvAdU,18594
5
+ datahub_dagster_plugin/sensors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ datahub_dagster_plugin/sensors/datahub_sensors.py,sha256=oWAgOMlLeiJs37WzijfTK8W-vwnmGE0pf4IArFgt4YM,16652
7
+ acryl_datahub_dagster_plugin-0.0.0.dev0.dist-info/METADATA,sha256=PXm9UvKsGUNH8lNgMvKArW94GfHZihPs_ghq78iV9P8,5609
8
+ acryl_datahub_dagster_plugin-0.0.0.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
9
+ acryl_datahub_dagster_plugin-0.0.0.dev0.dist-info/entry_points.txt,sha256=-CtPxtYb1u-zR36QnUQrvJJ6qbf1eDw9SruA22XBPZw,116
10
+ acryl_datahub_dagster_plugin-0.0.0.dev0.dist-info/top_level.txt,sha256=JS8QiBAj3eMTcMNcstm_EXGAcziiXVNT2nzOcfhdEMc,23
11
+ acryl_datahub_dagster_plugin-0.0.0.dev0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.43.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [dagster.plugins]
2
+ acryl-datahub-dagster-plugin = datahub_dagster_plugin.datahub_dagster_plugin:DatahubDagsterPlugin
@@ -0,0 +1 @@
1
+ datahub_dagster_plugin
@@ -0,0 +1,21 @@
1
+ # Published at https://pypi.org/project/acryl-datahub-dagster-plugin/.
2
__package_name__ = "acryl-datahub-dagster-plugin"
__version__ = "0.0.0.dev0"


def is_dev_mode() -> bool:
    """Return True when ``__version__`` ends with the ``dev0`` marker."""
    return __version__.endswith("dev0")


def nice_version_name() -> str:
    """Return the version for display.

    Development installs report a fixed placeholder instead of the raw
    version string.
    """
    return (
        "unavailable (installed in develop mode)" if is_dev_mode() else __version__
    )


def get_provider_info():
    """Return a dict with the package name, display name and description."""
    package = f"{__package_name__}"
    return {
        "package-name": package,
        "name": package,
        "description": "Datahub metadata collector plugin",
    }
File without changes
@@ -0,0 +1,504 @@
1
+ from dataclasses import dataclass
2
+ from logging import Logger
3
+ from typing import Any, Callable, Dict, List, NamedTuple, Optional, Sequence, Set
4
+ from urllib.parse import urlsplit
5
+
6
+ import pydantic
7
+ from dagster import DagsterRunStatus, PathMetadataValue, RunStatusSensorContext
8
+ from dagster._core.execution.stats import RunStepKeyStatsSnapshot, StepEventStatus
9
+ from dagster._core.snap import JobSnapshot
10
+ from dagster._core.snap.node import OpDefSnap
11
+ from dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatsSnapshot
12
+ from datahub.api.entities.datajob import DataFlow, DataJob
13
+ from datahub.api.entities.dataprocess.dataprocess_instance import (
14
+ DataProcessInstance,
15
+ InstanceRunResult,
16
+ )
17
+ from datahub.api.entities.dataset.dataset import Dataset
18
+ from datahub.configuration.source_common import DatasetSourceConfigMixin
19
+ from datahub.emitter.mce_builder import (
20
+ make_data_platform_urn,
21
+ make_dataplatform_instance_urn,
22
+ )
23
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
24
+ from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
25
+ from datahub.metadata.schema_classes import DataPlatformInstanceClass, SubTypesClass
26
+ from datahub.utilities.urns.data_flow_urn import DataFlowUrn
27
+ from datahub.utilities.urns.data_job_urn import DataJobUrn
28
+ from datahub.utilities.urns.dataset_urn import DatasetUrn
29
+
30
+
31
class Constant:
    """String keys shared across the Dagster plugin."""

    # Orchestrator / platform identifier
    ORCHESTRATOR = "dagster"

    # Fallback DataHub REST endpoint
    DEFAULT_DATAHUB_REST_URL = "http://localhost:8080"

    # Names of environment variables
    DATAHUB_REST_URL = "DATAHUB_REST_URL"
    DATAHUB_ENV = "DATAHUB_ENV"
    DATAHUB_PLATFORM_INSTANCE = "DATAHUB_PLATFORM_INSTANCE"
    DAGSTER_UI_URL = "DAGSTER_UI_URL"

    # Metadata keys under which ops declare DataHub inputs/outputs
    DATAHUB_INPUTS = "datahub.inputs"
    DATAHUB_OUTPUTS = "datahub.outputs"

    # Attribute names copied from a job run into its properties
    JOB_SNAPSHOT_ID = "job_snapshot_id"
    EXECUTION_PLAN_SNAPSHOT_ID = "execution_plan_snapshot_id"
    ROOT_RUN_ID = "root_run_id"
    PARENT_RUN_ID = "parent_run_id"
    HAS_REPOSITORY_LOAD_DATA = "has_repository_load_data"
    TAGS = "tags"
    STEPS_SUCCEEDED = "steps_succeeded"
    STEPS_FAILED = "steps_failed"
    MATERIALIZATIONS = "materializations"
    EXPECTATIONS = "expectations"
    ENQUEUED_TIME = "enqueued_time"
    LAUNCH_TIME = "launch_time"
    START_TIME = "start_time"
    END_TIME = "end_time"

    # Attribute names copied from an op (step) run into its properties
    STEP_KEY = "step_key"
    ATTEMPTS = "attempts"
70
+
71
+
72
class DatasetLineage(NamedTuple):
    """Pair of input and output string sets describing lineage."""

    # Upstream side of the lineage
    inputs: Set[str]
    # Downstream side of the lineage
    outputs: Set[str]
75
+
76
+
77
class DatahubDagsterSourceConfig(DatasetSourceConfigMixin):
    """Settings for the DataHub Dagster integration.

    Extends ``DatasetSourceConfigMixin`` with the DataHub client connection,
    the Dagster UI location, and switches selecting which lineage sources
    are captured.
    """

    # DataHub client settings; defaults to a default-constructed client config.
    datahub_client_config: DatahubClientConfig = pydantic.Field(
        description="Datahub client config",
        default=DatahubClientConfig(),
    )

    # Base URL of the Dagster UI; optional.
    dagster_url: Optional[str] = pydantic.Field(
        description="Dagster UI URL. Like: https://myDagsterCloudEnvironment.dagster.cloud/prod",
        default=None,
    )

    # Enabled by default.
    capture_asset_materialization: bool = pydantic.Field(
        description="Whether to capture asset keys as Dataset on AssetMaterialization event",
        default=True,
    )

    # Disabled by default.
    capture_input_output: bool = pydantic.Field(
        description="Whether to capture and try to parse input and output from HANDLED_OUTPUT, LOADED_INPUT event. (currently only filepathvalue metadata supported",
        default=False,
    )

    # Optional user hook: receives the sensor context, the generator and the
    # graph client, and returns a mapping of string keys to DatasetLineage.
    asset_lineage_extractor: Optional[
        Callable[
            [RunStatusSensorContext, "DagsterGenerator", DataHubGraph],
            Dict[str, DatasetLineage],
        ]
    ] = pydantic.Field(
        description="Custom asset lineage extractor function. See details at [https://datahubproject.io/docs/lineage/dagster/#define-your-custom-logic-to-capture-asset-lineage-information]",
        default=None,
    )
107
+
108
+
109
def _str_urn_to_dataset_urn(urns: List[str]) -> List[DatasetUrn]:
    """Parse each URN string into a ``DatasetUrn``, keeping the input order."""
    return list(map(DatasetUrn.create_from_string, urns))
111
+
112
+
113
@dataclass
class DagsterEnvironment:
    """Describes the Dagster deployment that IDs and UI URLs are built for."""

    # Required; may be None.
    repository: Optional[str]
    # When True, the branch is included in generated IDs and URLs.
    is_cloud: bool = True
    is_branch_deployment: bool = False
    # Only used in generated IDs/URLs when ``is_cloud`` is True.
    branch: Optional[str] = "prod"
    # Becomes part of generated IDs and the "locations/<module>" URL segment.
    module: Optional[str] = None
120
+
121
+
122
def job_url_generator(dagster_url: str, dagster_environment: DagsterEnvironment) -> str:
    """Build the base UI URL under which job pages are addressed.

    :param dagster_url: str - Dagster UI base URL
    :param dagster_environment: DagsterEnvironment - supplies ``is_cloud``,
        ``branch`` and ``module``
    :return: str - ``dagster_url``, followed by ``/<branch>`` for cloud
        environments and by ``/locations/<module>`` when a module is set
    """
    environment = dagster_environment
    root = (
        f"{dagster_url}/{environment.branch}" if environment.is_cloud else dagster_url
    )
    if not environment.module:
        return root
    return f"{root}/locations/{environment.module}"
132
+
133
+
134
class DagsterGenerator:
    """Builds DataHub entities from Dagster snapshots and emits run metadata.

    The generator turns job/op snapshots into ``DataFlow``/``DataJob`` objects,
    emits job and op runs as ``DataProcessInstance`` start/end events, and emits
    Dagster assets as datasets.
    """

    def __init__(
        self,
        logger: Logger,
        config: DatahubDagsterSourceConfig,
        dagster_environment: DagsterEnvironment,
    ):
        """
        :param logger: Logger - logger used for informational messages
        :param config: DatahubDagsterSourceConfig - plugin configuration
        :param dagster_environment: DagsterEnvironment - deployment the IDs and URLs are built for
        """
        self.logger = logger
        self.config = config
        self.dagster_environment = dagster_environment

    def _qualified_id(self, name: str) -> str:
        """
        Prefix a job/op name with the module, and with the branch first on cloud
        :param name: str - job or op name
        :return: str - "<branch>/<module>/<name>" on cloud, "<module>/<name>" otherwise
        """
        environment = self.dagster_environment
        if environment.is_cloud:
            return f"{environment.branch}/{environment.module}/{name}"
        return f"{environment.module}/{name}"

    def _run_url(self, run_id: str) -> Optional[str]:
        """
        Build the Dagster UI URL of a run
        :param run_id: str - Dagster run id
        :return: Optional[str] - run URL, or None when no Dagster UI URL is configured
        """
        dagster_url = self.config.dagster_url
        if not dagster_url:
            return None
        if self.dagster_environment.is_cloud:
            return f"{dagster_url}/{self.dagster_environment.branch}/runs/{run_id}"
        return f"{dagster_url}/runs/{run_id}"

    def path_metadata_resolver(self, value: PathMetadataValue) -> Optional[DatasetUrn]:
        """
        Resolve path metadata to dataset urn
        :param value: PathMetadataValue - path metadata whose ``value`` is a path or URL
        :return: Optional[DatasetUrn] - None when the path is empty
        """
        path = value.value
        if not path:
            return None

        if "://" not in path:
            # Plain paths are reported on the "file" platform.
            return DatasetUrn(platform="file", name=path)

        url = urlsplit(path)
        scheme = url.scheme

        # Some schemes are renamed before being used as the platform.
        if scheme in ("s3a", "s3n"):
            scheme = "s3"
        elif scheme == "gs":
            scheme = "gcs"

        # NOTE(review): only the URL path is used as the dataset name; the network
        # location (e.g. a bucket name) is not included - confirm this is intended.
        return DatasetUrn(platform=scheme, name=url.path)

    def metadata_resolver(self, metadata: Any) -> Optional[DatasetUrn]:
        """
        Resolve metadata to dataset urn
        :param metadata: Any - metadata value; only PathMetadataValue is handled
        :return: Optional[DatasetUrn] - None for any other metadata type
        """
        if isinstance(metadata, PathMetadataValue):
            return self.path_metadata_resolver(metadata)

        self.logger.info(f"Unknown Metadata: {metadata} of type {type(metadata)}")
        return None

    def generate_dataflow(
        self,
        job_snapshot: JobSnapshot,
        env: str,
        platform_instance: Optional[str] = None,
    ) -> DataFlow:
        """
        Generates a Dataflow object from a Dagster Job Snapshot
        :param job_snapshot: JobSnapshot - Job snapshot object
        :param env: str
        :param platform_instance: Optional[str]
        :return: DataFlow - Data generated dataflow
        """
        dataflow = DataFlow(
            orchestrator=Constant.ORCHESTRATOR,
            id=self._qualified_id(job_snapshot.name),
            env=env,
            name=job_snapshot.name,
            platform_instance=platform_instance,
        )
        dataflow.description = job_snapshot.description
        dataflow.tags = set(job_snapshot.tags.keys())

        if self.config.dagster_url:
            base_url = job_url_generator(
                dagster_url=self.config.dagster_url,
                dagster_environment=self.dagster_environment,
            )
            dataflow.url = f"{base_url}/jobs/{job_snapshot.name}"

        # Job metadata values are stringified into the flow properties.
        flow_property_bag: Dict[str, str] = {
            key: str(value) for key, value in job_snapshot.metadata.items()
        }
        dataflow.properties = flow_property_bag
        return dataflow

    def generate_datajob(
        self,
        job_snapshot: JobSnapshot,
        step_deps: Dict[str, List],
        op_def_snap: OpDefSnap,
        env: str,
        input_datasets: Dict[str, Set[DatasetUrn]],
        output_datasets: Dict[str, Set[DatasetUrn]],
        platform_instance: Optional[str] = None,
    ) -> DataJob:
        """
        Generates a Datajob object from a Dagster op snapshot
        :param job_snapshot: JobSnapshot - Job snapshot object
        :param step_deps: Dict[str, List] - upstream op names keyed by op name; must contain this op's name
        :param op_def_snap: OpDefSnap - Op def snapshot object
        :param env: str
        :param input_datasets: Dict[str, Set[DatasetUrn]] - input datasets for each op
        :param output_datasets: Dict[str, Set[DatasetUrn]] - output datasets for each op
        :param platform_instance: Optional[str]
        :return: DataJob - Data generated datajob
        """
        op_name = op_def_snap.name

        dataflow_urn = DataFlowUrn.create_from_ids(
            orchestrator=Constant.ORCHESTRATOR,
            flow_id=self._qualified_id(job_snapshot.name),
            env=env,
            platform_instance=platform_instance,
        )
        datajob = DataJob(
            id=self._qualified_id(op_name),
            flow_urn=dataflow_urn,
            name=op_name,
        )

        if self.config.dagster_url:
            base_url = job_url_generator(
                dagster_url=self.config.dagster_url,
                dagster_environment=self.dagster_environment,
            )
            datajob.url = f"{base_url}/jobs/{job_snapshot.name}/{op_name}"

        datajob.description = op_def_snap.description
        datajob.tags = set(op_def_snap.tags.keys())

        # Add upstream dependencies for this op
        for upstream_op_name in step_deps[op_name]:
            upstream_op_urn = DataJobUrn.create_from_ids(
                data_flow_urn=str(dataflow_urn),
                job_id=self._qualified_id(upstream_op_name),
            )
            datajob.upstream_urns.append(upstream_op_urn)

        job_property_bag: Dict[str, str] = {}
        if input_datasets:
            inlets = list(input_datasets.get(op_name, []))
            self.logger.info(f"Input datasets for {op_name} are {inlets}")
            datajob.inlets = inlets

        if output_datasets:
            outlets = list(output_datasets.get(op_name, []))
            self.logger.info(f"Output datasets for {op_name} are {outlets}")
            datajob.outlets = outlets

        # For all op inputs/outputs:
        # Add input/output details like its type, description, metadata etc in datajob properties.
        # Also, add datahub inputs/outputs if present in input/output metadata.
        for input_def_snap in op_def_snap.input_def_snaps:
            job_property_bag[f"input.{input_def_snap.name}"] = str(
                input_def_snap._asdict()
            )
            if Constant.DATAHUB_INPUTS in input_def_snap.metadata:
                datajob.inlets.extend(
                    _str_urn_to_dataset_urn(
                        input_def_snap.metadata[Constant.DATAHUB_INPUTS].value  # type: ignore
                    )
                )

        for output_def_snap in op_def_snap.output_def_snaps:
            job_property_bag[f"output_{output_def_snap.name}"] = str(
                output_def_snap._asdict()
            )
            if Constant.DATAHUB_OUTPUTS in output_def_snap.metadata:
                datajob.outlets.extend(
                    _str_urn_to_dataset_urn(
                        output_def_snap.metadata[Constant.DATAHUB_OUTPUTS].value  # type: ignore
                    )
                )

        datajob.properties = job_property_bag

        return datajob

    def emit_job_run(
        self,
        graph: DataHubGraph,
        dataflow: DataFlow,
        run: DagsterRun,
        run_stats: DagsterRunStatsSnapshot,
    ) -> None:
        """
        Emit a latest job run
        :param graph: DataHubGraph - client used as the emitter
        :param dataflow: DataFlow - DataFlow object
        :param run: DagsterRun - Dagster Run object
        :param run_stats: DagsterRunStatsSnapshot - latest job run stats
        :raises Exception: when the run status is not SUCCESS, FAILURE or CANCELED
        """
        dpi = DataProcessInstance.from_dataflow(dataflow=dataflow, id=run_stats.run_id)
        run_url = self._run_url(run.run_id)
        if run_url is not None:
            dpi.url = run_url

        # Add below details in dpi properties
        dpi_property_bag: Dict[str, str] = {}
        allowed_job_run_keys = (
            Constant.JOB_SNAPSHOT_ID,
            Constant.EXECUTION_PLAN_SNAPSHOT_ID,
            Constant.ROOT_RUN_ID,
            Constant.PARENT_RUN_ID,
            Constant.HAS_REPOSITORY_LOAD_DATA,
            Constant.TAGS,
            Constant.STEPS_SUCCEEDED,
            Constant.STEPS_FAILED,
            Constant.MATERIALIZATIONS,
            Constant.EXPECTATIONS,
            Constant.ENQUEUED_TIME,
            Constant.LAUNCH_TIME,
            Constant.START_TIME,
            Constant.END_TIME,
        )
        for key in allowed_job_run_keys:
            # run_stats is read second, so its value wins when both objects
            # carry a non-None attribute of the same name.
            for source in (run, run_stats):
                value = getattr(source, key, None)
                if value is not None:
                    dpi_property_bag[key] = str(value)
        dpi.properties.update(dpi_property_bag)

        status_result_map = {
            DagsterRunStatus.SUCCESS: InstanceRunResult.SUCCESS,
            DagsterRunStatus.FAILURE: InstanceRunResult.FAILURE,
            DagsterRunStatus.CANCELED: InstanceRunResult.SKIPPED,
        }

        # Checked before anything is emitted.
        if run.status not in status_result_map:
            raise Exception(
                f"Job run status should be either complete, failed or cancelled and it was "
                f"{run.status}"
            )

        # Run stats times are multiplied by 1000 to obtain milliseconds.
        if run_stats.start_time is not None:
            dpi.emit_process_start(
                emitter=graph,
                start_timestamp_millis=int(run_stats.start_time * 1000),
            )

        if run_stats.end_time is not None:
            dpi.emit_process_end(
                emitter=graph,
                end_timestamp_millis=int(run_stats.end_time * 1000),
                result=status_result_map[run.status],
                result_type=Constant.ORCHESTRATOR,
            )

    def emit_op_run(
        self,
        graph: DataHubGraph,
        datajob: DataJob,
        run_step_stats: RunStepKeyStatsSnapshot,
    ) -> None:
        """
        Emit an op run
        :param graph: DataHubGraph
        :param datajob: DataJob - DataJob object
        :param run_step_stats: RunStepKeyStatsSnapshot - step(op) run stats
        :raises Exception: when the step status is not SUCCESS, FAILURE or SKIPPED
        """
        dpi = DataProcessInstance.from_datajob(
            datajob=datajob,
            id=f"{run_step_stats.run_id}.{datajob.id}",
            clone_inlets=True,
            clone_outlets=True,
        )
        # The op run links to the page of the run it belongs to.
        run_url = self._run_url(run_step_stats.run_id)
        if run_url is not None:
            dpi.url = run_url

        # Add below details in dpi properties
        dpi_property_bag: Dict[str, str] = {}
        allowed_op_run_keys = (
            Constant.STEP_KEY,
            Constant.ATTEMPTS,
            Constant.START_TIME,
            Constant.END_TIME,
        )
        for key in allowed_op_run_keys:
            value = getattr(run_step_stats, key, None)
            if value is not None:
                dpi_property_bag[key] = str(value)
        dpi.properties.update(dpi_property_bag)

        status_result_map = {
            StepEventStatus.SUCCESS: InstanceRunResult.SUCCESS,
            StepEventStatus.FAILURE: InstanceRunResult.FAILURE,
            StepEventStatus.SKIPPED: InstanceRunResult.SKIPPED,
        }

        # Checked before anything is emitted.
        if run_step_stats.status not in status_result_map:
            raise Exception(
                f"Step run status should be either complete, failed or cancelled and it was "
                f"{run_step_stats.status}"
            )

        # Step stats times are multiplied by 1000 to obtain milliseconds.
        if run_step_stats.start_time is not None:
            dpi.emit_process_start(
                emitter=graph,
                start_timestamp_millis=int(run_step_stats.start_time * 1000),
            )

        if run_step_stats.end_time is not None:
            dpi.emit_process_end(
                emitter=graph,
                end_timestamp_millis=int(run_step_stats.end_time * 1000),
                result=status_result_map[run_step_stats.status],
                result_type=Constant.ORCHESTRATOR,
            )

    def dataset_urn_from_asset(self, asset_key: Sequence[str]) -> DatasetUrn:
        """
        Generate dataset urn from asset key
        :param asset_key: Sequence[str] - asset key parts, joined with "/" to form the name
        :return: DatasetUrn - urn on the "dagster" platform in the configured env
        """
        return DatasetUrn(
            platform="dagster", env=self.config.env, name="/".join(asset_key)
        )

    def emit_asset(
        self,
        graph: DataHubGraph,
        asset_key: Sequence[str],
        description: Optional[str],
        properties: Optional[Dict[str, str]],
    ) -> str:
        """
        Emit asset to datahub
        :param graph: DataHubGraph - client used to emit the proposals
        :param asset_key: Sequence[str] - asset key parts; the last part is the dataset name
        :param description: Optional[str] - dataset description
        :param properties: Optional[Dict[str, str]] - dataset properties
        :return: str - urn of the emitted dataset
        """
        dataset_urn = self.dataset_urn_from_asset(asset_key)
        urn = dataset_urn.urn()
        dataset = Dataset(
            id=None,
            urn=urn,
            platform="dagster",
            name=asset_key[-1],
            schema=None,
            downstreams=None,
            subtype="Asset",
            subtypes=None,
            description=description,
            env=self.config.env,
            properties=properties,
        )
        for dataset_mcp in dataset.generate_mcp():
            graph.emit_mcp(dataset_mcp)

        # The "Asset" subtype is additionally emitted as its own aspect.
        subtype_mcp = MetadataChangeProposalWrapper(
            entityUrn=urn,
            aspect=SubTypesClass(typeNames=["Asset"]),
        )
        graph.emit_mcp(subtype_mcp)

        if self.config.platform_instance:
            instance_mcp = MetadataChangeProposalWrapper(
                entityUrn=urn,
                aspect=DataPlatformInstanceClass(
                    instance=make_dataplatform_instance_urn(
                        instance=self.config.platform_instance,
                        platform="dagster",
                    ),
                    platform=make_data_platform_urn("dagster"),
                ),
            )
            graph.emit_mcp(instance_mcp)
        return urn
@@ -0,0 +1,2 @@
1
class DatahubDagsterPlugin:
    """Plugin class that only carries the plugin's name."""

    name = "datahub_dagster_plugin"
File without changes
@@ -0,0 +1,439 @@
1
+ import os
2
+ import traceback
3
+ from typing import Dict, List, Optional, Sequence, Set, Tuple
4
+
5
+ from dagster import (
6
+ DagsterRunStatus,
7
+ EventLogEntry,
8
+ RunStatusSensorContext,
9
+ SensorDefinition,
10
+ SkipReason,
11
+ run_status_sensor,
12
+ sensor,
13
+ )
14
+ from dagster._core.definitions.asset_selection import CoercibleToAssetSelection
15
+ from dagster._core.definitions.sensor_definition import (
16
+ DefaultSensorStatus,
17
+ RawSensorEvaluationFunctionReturn,
18
+ )
19
+ from dagster._core.definitions.target import ExecutableDefinition
20
+ from dagster._core.events import DagsterEventType, HandledOutputData, LoadedInputData
21
+ from dagster._core.execution.stats import RunStepKeyStatsSnapshot
22
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
23
+ from datahub.ingestion.graph.client import DataHubGraph
24
+ from datahub.metadata.schema_classes import SubTypesClass
25
+
26
+ from datahub_dagster_plugin.client.dagster_generator import (
27
+ DagsterEnvironment,
28
+ DagsterGenerator,
29
+ DatahubDagsterSourceConfig,
30
+ )
31
+
32
+
33
def make_datahub_sensor(
    config: DatahubDagsterSourceConfig,
    name: Optional[str] = None,
    minimum_interval_seconds: Optional[int] = None,
    description: Optional[str] = None,
    job: Optional[ExecutableDefinition] = None,
    jobs: Optional[Sequence[ExecutableDefinition]] = None,
    default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,
    asset_selection: Optional[CoercibleToAssetSelection] = None,
    required_resource_keys: Optional[Set[str]] = None,
) -> SensorDefinition:
    """Create a sensor that emits metadata and lineage to DataHub on job status changes.

    On every evaluation the sensor builds a fresh ``DatahubSensors(config)``
    and calls ``evaluate_tick`` on each of its run status sensors.

    Args:
        config (DatahubDagsterSourceConfig): DataHub Sensor config
        name: (Optional[str]): The name of the sensor. Defaults to "datahub_sensor".
        minimum_interval_seconds: (Optional[int]): The minimum number of seconds that will elapse
            between sensor evaluations.
        description: (Optional[str]): Forwarded unchanged to dagster's ``sensor`` decorator.
        job: (Optional[ExecutableDefinition]): Forwarded unchanged to dagster's ``sensor`` decorator.
        jobs: (Optional[Sequence[ExecutableDefinition]]): Forwarded unchanged to dagster's
            ``sensor`` decorator.
        default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default
            status can be overridden from Dagit or via the GraphQL API.
        asset_selection: (Optional[CoercibleToAssetSelection]): Forwarded unchanged to dagster's
            ``sensor`` decorator.
        required_resource_keys: (Optional[Set[str]]): Forwarded unchanged to dagster's
            ``sensor`` decorator.

    Returns:
        The sensor definition wrapping the DataHub evaluation function.

    Examples:
        .. code-block:: python

            datahub_sensor = make_datahub_sensor(
                config
            )

            @repository
            def my_repo():
                return [my_job, datahub_sensor]

    """

    # The inner function keeps the name "datahub_sensor" on purpose: it is
    # what the docstring above advertises as the default sensor name.
    def datahub_sensor(context):
        """
        Sensor which instigate all run status sensors and trigger them based upon run status
        """
        status_sensors = DatahubSensors(config).sensors
        for status_sensor in status_sensors:
            status_sensor.evaluate_tick(context)
        return SkipReason("Trigger run status sensors if any new runs present...")

    build_sensor = sensor(
        name=name,
        minimum_interval_seconds=minimum_interval_seconds,
        description=description,
        job=job,
        jobs=jobs,
        default_status=default_status,
        asset_selection=asset_selection,
        required_resource_keys=required_resource_keys,
    )
    return build_sensor(datahub_sensor)
87
+
88
+
89
class DatahubSensors:
    """
    Run status sensors that publish Dagster run metadata to DataHub.

    An instance holds the plugin configuration, a DataHub graph client and
    three run status sensors (success, failure, canceled) which all invoke
    ``_emit_metadata``.
    """

    def __init__(self, config: Optional[DatahubDagsterSourceConfig] = None):
        """
        Set dagster source configurations and initialize datahub emitter and dagster run status sensors

        Args:
            config: Plugin configuration. A default ``DatahubDagsterSourceConfig``
                is created when none is given.
        """
        self.config = config if config else DatahubDagsterSourceConfig()
        self.graph = DataHubGraph(
            self.config.datahub_client_config,
        )
        self.graph.test_connection()

        # One run status sensor per terminal run status, all sharing one handler.
        watched_statuses = (
            ("datahub_success_sensor", DagsterRunStatus.SUCCESS),
            ("datahub_failure_sensor", DagsterRunStatus.FAILURE),
            ("datahub_canceled_sensor", DagsterRunStatus.CANCELED),
        )
        self.sensors: List[SensorDefinition] = [
            run_status_sensor(name=sensor_name, run_status=run_status)(
                self._emit_metadata
            )
            for sensor_name, run_status in watched_statuses
        ]

    def get_dagster_environment(
        self, context: RunStatusSensorContext
    ) -> Optional[DagsterEnvironment]:
        """
        Describe where the run's code lives and which deployment it runs in.

        Args:
            context: Sensor context whose ``dagster_run`` carries the job code origin.

        Returns:
            A DagsterEnvironment, or None (after logging an error) when the
            code pointer or its ``module`` attribute is not available.
        """
        job_origin = context.dagster_run.job_code_origin
        repository_origin = job_origin.repository_origin if job_origin else None
        code_pointer = repository_origin.code_pointer if repository_origin else None
        if not code_pointer:
            context.log.error("Unable to get Dagster Environment...")
            return None

        context.log.debug(f"code_pointer: {code_pointer}")

        if not hasattr(code_pointer, "module"):
            context.log.error("Unable to get Module")
            return None

        # os.getenv returns a string (or None when unset), so the flag has to
        # be compared against "1"; comparing it with the integer 1 never matches.
        branch_deployment_flag = os.getenv("DAGSTER_CLOUD_IS_BRANCH_DEPLOYMENT")
        return DagsterEnvironment(
            # The mere presence of the variable marks a cloud deployment.
            is_cloud=branch_deployment_flag is not None,
            is_branch_deployment=branch_deployment_flag == "1",
            branch=os.getenv("DAGSTER_CLOUD_DEPLOYMENT_NAME", "prod"),
            module=code_pointer.module,
            # Not every code pointer has an ``attribute``; fall back to None.
            repository=getattr(code_pointer, "attribute", None),
        )

    def _emit_asset_event(
        self, dagster_generator: DagsterGenerator, asset_event, step_datasets: Set
    ) -> None:
        """Emit the asset behind a materialization/observation and record its urn."""
        properties = {
            key: str(value) for (key, value) in asset_event.metadata.items()
        }
        dataset_urn = dagster_generator.emit_asset(
            self.graph,
            asset_event.asset_key.path,
            asset_event.description,
            properties,
        )
        step_datasets.add(dataset_urn)

    def process_asset_logs(
        self,
        dagster_generator: DagsterGenerator,
        log: EventLogEntry,
        dataset_inputs: Dict[str, set],
        dataset_outputs: Dict[str, set],
    ) -> None:
        """
        Emit assets found in an asset event and record them per step.

        Materializations are recorded in ``dataset_outputs`` and observations
        in ``dataset_inputs``, both keyed by the log's step key. Entries
        without a dagster event or step key, and other event types, are ignored.

        Args:
            dagster_generator: Generator used to emit the asset to DataHub.
            log: Event log entry to inspect.
            dataset_inputs: Step key -> input dataset urns; updated in place.
            dataset_outputs: Step key -> output dataset urns; updated in place.
        """
        if not log.dagster_event or not log.step_key:
            return

        event_type = log.dagster_event.event_type
        if event_type == DagsterEventType.ASSET_MATERIALIZATION:
            # The step gets an entry even when the event carries no payload.
            step_outputs = dataset_outputs.setdefault(log.step_key, set())
            materialization = log.asset_materialization
            if materialization:
                self._emit_asset_event(
                    dagster_generator, materialization, step_outputs
                )
        elif event_type == DagsterEventType.ASSET_OBSERVATION:
            step_inputs = dataset_inputs.setdefault(log.step_key, set())
            asset_observation = log.asset_observation
            if asset_observation:
                self._emit_asset_event(
                    dagster_generator, asset_observation, step_inputs
                )

    @staticmethod
    def _record_io_urn(
        context: RunStatusSensorContext,
        dagster_generator: DagsterGenerator,
        metadata,
        direction: str,
        step_urns: Set,
    ) -> None:
        """Resolve the "path" entry of an input/output event's metadata to a urn and record it."""
        context.log.debug(f"{direction} Path: {metadata.get('path')}")
        path_metadata = metadata.get("path")
        context.log.debug(f"Metadata: {path_metadata}")
        if not path_metadata:
            return
        urn = dagster_generator.metadata_resolver(path_metadata)
        if urn:
            context.log.debug(f"{direction} Urn: {urn}")
            step_urns.add(urn)

    def process_handle_input_output(
        self,
        context: RunStatusSensorContext,
        log: EventLogEntry,
        dagster_generator: DagsterGenerator,
        dataset_inputs: Dict[str, set],
        dataset_outputs: Dict[str, set],
    ) -> None:
        """
        Record dataset urns resolved from handled-output / loaded-input events.

        Does nothing unless ``self.config.capture_input_output`` is enabled.
        The "path" metadata of the event is passed to the generator's
        ``metadata_resolver``; a resolved urn is added to the step's outputs
        (HANDLED_OUTPUT) or inputs (LOADED_INPUT).

        Args:
            context: Sensor context, used for debug logging.
            log: Event log entry to inspect.
            dagster_generator: Generator providing ``metadata_resolver``.
            dataset_inputs: Step key -> input dataset urns; updated in place.
            dataset_outputs: Step key -> output dataset urns; updated in place.
        """
        if not log.dagster_event or not log.step_key:
            return
        if not self.config.capture_input_output:
            return

        event = log.dagster_event
        if event.event_type == DagsterEventType.HANDLED_OUTPUT:
            step_outputs = dataset_outputs.setdefault(log.step_key, set())
            event_specific_data = event.event_specific_data
            if isinstance(event_specific_data, HandledOutputData):
                self._record_io_urn(
                    context,
                    dagster_generator,
                    event_specific_data.metadata,
                    "Output",
                    step_outputs,
                )
        elif event.event_type == DagsterEventType.LOADED_INPUT:
            step_inputs = dataset_inputs.setdefault(log.step_key, set())
            event_specific_data = event.event_specific_data
            if isinstance(event_specific_data, LoadedInputData):
                self._record_io_urn(
                    context,
                    dagster_generator,
                    event_specific_data.metadata,
                    "Input",
                    step_inputs,
                )

    def process_dagster_logs(
        self, context: RunStatusSensorContext, dagster_generator: DagsterGenerator
    ) -> Tuple[Dict[str, set], Dict[str, set]]:
        """
        Collect per-step input and output dataset urns from the run's event log.

        Args:
            context: Sensor context giving access to the instance and the run.
            dagster_generator: Generator used to emit assets and resolve urns.

        Returns:
            A ``(dataset_inputs, dataset_outputs)`` tuple, each mapping a step
            key to a set of dataset urns.
        """
        dataset_outputs: Dict[str, set] = {}
        dataset_inputs: Dict[str, set] = {}

        logs = context.instance.all_logs(
            context.dagster_run.run_id,
            {
                DagsterEventType.ASSET_MATERIALIZATION,
                DagsterEventType.ASSET_OBSERVATION,
                DagsterEventType.HANDLED_OUTPUT,
                DagsterEventType.LOADED_INPUT,
            },
        )

        for log in logs:
            if not log.dagster_event or not log.step_key:
                continue
            context.log.debug(f"Log: {log.step_key} - {log.dagster_event}")
            context.log.debug(f"Event Type: {log.dagster_event.event_type}")
            if self.config.capture_input_output:
                self.process_handle_input_output(
                    context=context,
                    log=log,
                    dagster_generator=dagster_generator,
                    dataset_inputs=dataset_inputs,
                    dataset_outputs=dataset_outputs,
                )

            if self.config.capture_asset_materialization:
                self.process_asset_logs(
                    dagster_generator=dagster_generator,
                    log=log,
                    dataset_inputs=dataset_inputs,
                    dataset_outputs=dataset_outputs,
                )

        return dataset_inputs, dataset_outputs

    @staticmethod
    def merge_dicts(dict1: Dict[str, Set], dict2: Dict[str, Set]) -> Dict[str, Set]:
        """
        Merge ``dict2`` into ``dict1``, unioning the sets of shared keys.

        ``dict1`` is updated in place and returned. Every merged value is a
        new set, so later changes to the result never leak into ``dict2``.

        Args:
            dict1: Mapping that receives the merged entries.
            dict2: Mapping whose entries are merged in; left untouched.

        Returns:
            ``dict1`` after merging.
        """
        for key, value in dict2.items():
            dict1[key] = dict1.get(key, set()).union(value)
        return dict1

    def _emit_metadata(
        self, context: RunStatusSensorContext
    ) -> RawSensorEvaluationFunctionReturn:
        """
        Function to emit metadata for datahub rest.

        Emits the job as a dataflow, the run as a data process instance, and
        for every op in the job snapshot a datajob with an "Op" sub-type plus,
        when step stats exist for it, the op run.

        Args:
            context: Run status sensor context of the finished run.

        Returns:
            A SkipReason describing the outcome. Any exception is logged with
            its traceback and reported as a SkipReason instead of being raised.
        """
        try:
            context.log.info("Emitting metadata...")

            # Both snapshot ids are needed below; a missing one raises
            # AssertionError, which the handler at the bottom reports.
            assert context.dagster_run.job_snapshot_id
            assert context.dagster_run.execution_plan_snapshot_id

            dagster_environment = self.get_dagster_environment(context)
            if not dagster_environment:
                return SkipReason(
                    "Unable to get Dagster Environment from DataHub Sensor"
                )

            context.log.debug(f"Dagster Environment: {dagster_environment}")

            dagster_generator = DagsterGenerator(
                logger=context.log,
                config=self.config,
                dagster_environment=dagster_environment,
            )

            job_snapshot = context.instance.get_job_snapshot(
                snapshot_id=context.dagster_run.job_snapshot_id
            )

            dataset_inputs: Dict[str, Set] = {}
            dataset_outputs: Dict[str, Set] = {}

            # Lineage from the optional user-supplied extractor comes first.
            if self.config.asset_lineage_extractor:
                asset_lineages = self.config.asset_lineage_extractor(
                    context, dagster_generator, self.graph
                )
                for key, value in asset_lineages.items():
                    dataset_inputs[key] = dataset_inputs.get(key, set()).union(
                        value.inputs
                    )
                    dataset_outputs[key] = dataset_outputs.get(key, set()).union(
                        value.outputs
                    )

            (
                dataset_inputs_from_logs,
                dataset_outputs_from_logs,
            ) = self.process_dagster_logs(context, dagster_generator)

            dataset_inputs = DatahubSensors.merge_dicts(
                dataset_inputs, dataset_inputs_from_logs
            )
            dataset_outputs = DatahubSensors.merge_dicts(
                dataset_outputs, dataset_outputs_from_logs
            )

            context.log.debug(f"Outputs: {dataset_outputs}")
            # Emit dagster job entity which get mapped with datahub dataflow entity
            dataflow = dagster_generator.generate_dataflow(
                job_snapshot=job_snapshot,
                env=self.config.env,
                platform_instance=self.config.platform_instance,
            )
            dataflow.emit(self.graph)

            # Emit dagster job run which get mapped with datahub data process instance entity
            dagster_generator.emit_job_run(
                graph=self.graph,
                dataflow=dataflow,
                run=context.dagster_run,
                run_stats=context.instance.get_run_stats(context.dagster_run.run_id),
            )

            # Execution plan snapshot contains all steps(ops) dependency.
            execution_plan_snapshot = context.instance.get_execution_plan_snapshot(
                snapshot_id=context.dagster_run.execution_plan_snapshot_id
            )

            # Map step key with its run step stats
            run_step_stats: Dict[str, RunStepKeyStatsSnapshot] = {
                run_step_stat.step_key: run_step_stat
                for run_step_stat in context.instance.get_run_step_stats(
                    context.dagster_run.run_id
                )
            }

            # For all dagster ops present in job:
            # Emit op entity which get mapped with datahub datajob entity.
            # Emit op run which get mapped with datahub data process instance entity.
            for op_def_snap in job_snapshot.node_defs_snapshot.op_def_snaps:
                datajob = dagster_generator.generate_datajob(
                    job_snapshot=job_snapshot,
                    step_deps=execution_plan_snapshot.step_deps,
                    op_def_snap=op_def_snap,
                    env=self.config.env,
                    platform_instance=self.config.platform_instance,
                    output_datasets=dataset_outputs,
                    input_datasets=dataset_inputs,
                )
                context.log.info(f"Generated Datajob: {datajob}")
                datajob.emit(self.graph)

                self.graph.emit_mcp(
                    mcp=MetadataChangeProposalWrapper(
                        entityUrn=str(datajob.urn),
                        aspect=SubTypesClass(
                            typeNames=["Op"],
                        ),
                    )
                )

                # An op without recorded step stats must not abort the loop:
                # a KeyError here would drop the metadata of every later op.
                step_stats = run_step_stats.get(op_def_snap.name)
                if step_stats is None:
                    context.log.warning(
                        f"No run step stats found for op {op_def_snap.name}, skipping op run"
                    )
                    continue

                dagster_generator.emit_op_run(
                    graph=self.graph,
                    datajob=datajob,
                    run_step_stats=step_stats,
                )

            return SkipReason("Pipeline metadata is emitted to DataHub")
        except Exception as e:
            # Sensor boundary: report the failure instead of failing the tick.
            context.log.error(
                f"Error in emitting metadata to DataHub: {e}. Traceback: {traceback.format_exc()}"
            )
            return SkipReason("Error in emitting metadata to DataHub")