acryl-datahub 1.1.1rc3__py3-none-any.whl → 1.1.1rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (29) hide show
  1. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.1.1rc4.dist-info}/METADATA +2613 -2613
  2. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.1.1rc4.dist-info}/RECORD +29 -27
  3. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.1.1rc4.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -0
  6. datahub/cli/ingest_cli.py +9 -1
  7. datahub/emitter/response_helper.py +86 -1
  8. datahub/emitter/rest_emitter.py +1 -1
  9. datahub/ingestion/source/datahub/config.py +11 -0
  10. datahub/ingestion/source/datahub/datahub_database_reader.py +186 -33
  11. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  12. datahub/ingestion/source/openapi.py +12 -0
  13. datahub/ingestion/source/openapi_parser.py +56 -37
  14. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  15. datahub/ingestion/source/snowflake/snowflake_v2.py +17 -6
  16. datahub/metadata/_internal_schema_classes.py +514 -514
  17. datahub/metadata/_urns/urn_defs.py +1785 -1785
  18. datahub/metadata/schema.avsc +17354 -17725
  19. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  20. datahub/metadata/schemas/__init__.py +3 -3
  21. datahub/sdk/__init__.py +4 -0
  22. datahub/sdk/_all_entities.py +4 -0
  23. datahub/sdk/_shared.py +2 -1
  24. datahub/sdk/dataflow.py +302 -0
  25. datahub/sdk/datajob.py +335 -0
  26. datahub/sdk/entity_client.py +8 -0
  27. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.1.1rc4.dist-info}/entry_points.txt +0 -0
  28. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.1.1rc4.dist-info}/licenses/LICENSE +0 -0
  29. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.1.1rc4.dist-info}/top_level.txt +0 -0
datahub/sdk/datajob.py ADDED
@@ -0,0 +1,335 @@
1
+ from __future__ import annotations
2
+
3
+ import warnings
4
+ from datetime import datetime
5
+ from typing import Dict, List, Optional, Type
6
+
7
+ from typing_extensions import Self
8
+
9
+ import datahub.metadata.schema_classes as models
10
+ from datahub.cli.cli_utils import first_non_null
11
+ from datahub.errors import IngestionAttributionWarning
12
+ from datahub.metadata.urns import (
13
+ DataFlowUrn,
14
+ DataJobUrn,
15
+ DatasetUrn,
16
+ Urn,
17
+ )
18
+ from datahub.sdk._attribution import is_ingestion_attribution
19
+ from datahub.sdk._shared import (
20
+ DataflowUrnOrStr,
21
+ DatasetUrnOrStr,
22
+ DomainInputType,
23
+ HasContainer,
24
+ HasDomain,
25
+ HasInstitutionalMemory,
26
+ HasOwnership,
27
+ HasPlatformInstance,
28
+ HasSubtype,
29
+ HasTags,
30
+ HasTerms,
31
+ LinksInputType,
32
+ OwnersInputType,
33
+ TagsInputType,
34
+ TermsInputType,
35
+ make_time_stamp,
36
+ parse_time_stamp,
37
+ )
38
+ from datahub.sdk.dataflow import DataFlow
39
+ from datahub.sdk.entity import Entity, ExtraAspectsType
40
+
41
+
42
class DataJob(
    HasPlatformInstance,
    HasSubtype,
    HasContainer,
    HasOwnership,
    HasInstitutionalMemory,
    HasTags,
    HasTerms,
    HasDomain,
    Entity,
):
    """Represents a data job in DataHub.

    A data job is an executable unit of a data pipeline, such as an Airflow
    task or a Spark job. Every job belongs to a parent :class:`DataFlow`,
    from which its platform, platform instance, and browse path are derived.
    """

    __slots__ = ()

    @classmethod
    def get_urn_type(cls) -> Type[DataJobUrn]:
        """Get the URN type for data jobs."""
        return DataJobUrn

    def __init__(
        self,
        *,
        name: str,
        flow: Optional[DataFlow] = None,
        flow_urn: Optional[DataflowUrnOrStr] = None,
        platform_instance: Optional[str] = None,
        display_name: Optional[str] = None,
        description: Optional[str] = None,
        external_url: Optional[str] = None,
        custom_properties: Optional[Dict[str, str]] = None,
        created: Optional[datetime] = None,
        last_modified: Optional[datetime] = None,
        # Standard aspects
        subtype: Optional[str] = None,
        owners: Optional[OwnersInputType] = None,
        links: Optional[LinksInputType] = None,
        tags: Optional[TagsInputType] = None,
        terms: Optional[TermsInputType] = None,
        domain: Optional[DomainInputType] = None,
        extra_aspects: ExtraAspectsType = None,
        inlets: Optional[List[DatasetUrnOrStr]] = None,
        outlets: Optional[List[DatasetUrnOrStr]] = None,
    ):
        """
        Initialize a DataJob with either a DataFlow or a DataFlowUrn with platform instance.

        Args:
            name: Name of the data job (required).
            flow: A DataFlow object (optional).
            flow_urn: A DataFlowUrn object or string (optional; used when ``flow``
                is not supplied).
            platform_instance: Platform instance name (optional, required if
                ``flow_urn`` is provided).
            display_name: Human-readable name; falls back to ``name``.
            description: Job description.
            external_url: Link to the job in the source system.
            custom_properties: Arbitrary key/value metadata.
            created: Creation timestamp.
            last_modified: Last-modification timestamp.
            subtype: Entity subtype label.
            owners: Ownership aspect input.
            links: Institutional-memory links.
            tags: Tag associations.
            terms: Glossary-term associations.
            domain: Domain association.
            extra_aspects: Additional pre-built aspects to attach.
            inlets: Input dataset URNs.
            outlets: Output dataset URNs.

        Raises:
            ValueError: If neither flow nor (flow_urn and platform_instance) are provided.
        """
        if flow is None:
            if flow_urn is None or platform_instance is None:
                raise ValueError(
                    "You must provide either: 1. a DataFlow object, or 2. a DataFlowUrn (and a platform_instance config if required)"
                )
            flow_urn = DataFlowUrn.from_string(flow_urn)
            # The flow_id may embed the platform instance as an
            # "<instance>." prefix; strip it so the reconstructed DataFlow
            # gets the bare flow name.
            if flow_urn.flow_id.startswith(f"{platform_instance}."):
                flow_name = flow_urn.flow_id[len(platform_instance) + 1 :]
            else:
                flow_name = flow_urn.flow_id
            flow = DataFlow(
                platform=flow_urn.orchestrator,
                name=flow_name,
                platform_instance=platform_instance,
            )
        urn = DataJobUrn.create_from_ids(
            job_id=name,
            data_flow_urn=str(flow.urn),
        )
        super().__init__(urn)
        self._set_extra_aspects(extra_aspects)
        self._set_platform_instance(flow.urn.orchestrator, flow.platform_instance)
        self._set_browse_path_from_flow(flow)

        # Initialize DataJobInfoClass with default type
        job_info = models.DataJobInfoClass(
            name=display_name or name,
            type=models.AzkabanJobTypeClass.COMMAND,  # Default type
        )
        self._setdefault_aspect(job_info)
        self._ensure_datajob_props().flowUrn = str(flow.urn)

        # Set properties if provided
        if description is not None:
            self.set_description(description)
        if external_url is not None:
            self.set_external_url(external_url)
        if custom_properties is not None:
            self.set_custom_properties(custom_properties)
        if created is not None:
            self.set_created(created)
        if last_modified is not None:
            self.set_last_modified(last_modified)

        # Set standard aspects
        if subtype is not None:
            self.set_subtype(subtype)
        if owners is not None:
            self.set_owners(owners)
        if links is not None:
            self.set_links(links)
        if tags is not None:
            self.set_tags(tags)
        if terms is not None:
            self.set_terms(terms)
        if domain is not None:
            self.set_domain(domain)
        if inlets is not None:
            self.set_inlets(inlets)
        if outlets is not None:
            self.set_outlets(outlets)

    @classmethod
    def _new_from_graph(cls, urn: Urn, current_aspects: models.AspectBag) -> Self:
        """Reconstruct a DataJob from a URN plus aspects fetched from the graph."""
        assert isinstance(urn, DataJobUrn)
        # Extracting platform from the DataFlowUrn inside the DataJobUrn
        data_flow_urn = urn.get_data_flow_urn()

        entity = cls(
            flow=DataFlow(
                platform=data_flow_urn.orchestrator,
                name=data_flow_urn.flow_id,
            ),
            name=urn.job_id,
        )
        return entity._init_from_graph(current_aspects)

    @property
    def urn(self) -> DataJobUrn:
        return self._urn  # type: ignore

    def _ensure_datajob_props(self) -> models.DataJobInfoClass:
        """Return the DataJobInfo aspect, creating a default one if absent."""
        props = self._get_aspect(models.DataJobInfoClass)
        if props is None:
            # Use name from URN as fallback with default type
            props = models.DataJobInfoClass(
                name=self.urn.job_id, type=models.AzkabanJobTypeClass.COMMAND
            )
            self._set_aspect(props)
        return props

    def _get_datajob_inputoutput_props(
        self,
    ) -> Optional[models.DataJobInputOutputClass]:
        return self._get_aspect(models.DataJobInputOutputClass)

    def _ensure_datajob_inputoutput_props(
        self,
    ) -> models.DataJobInputOutputClass:
        """Return the DataJobInputOutput aspect, creating an empty one if absent."""
        return self._setdefault_aspect(
            models.DataJobInputOutputClass(inputDatasets=[], outputDatasets=[])
        )

    def _get_editable_props(self) -> Optional[models.EditableDataJobPropertiesClass]:
        return self._get_aspect(models.EditableDataJobPropertiesClass)

    def _ensure_editable_props(self) -> models.EditableDataJobPropertiesClass:
        return self._setdefault_aspect(models.EditableDataJobPropertiesClass())

    @property
    def description(self) -> Optional[str]:
        """Get the description of the data job.

        The UI-editable description takes precedence over the ingested one.
        """
        editable_props = self._get_editable_props()
        return first_non_null(
            [
                editable_props.description if editable_props is not None else None,
                self._ensure_datajob_props().description,
            ]
        )

    def set_description(self, description: str) -> None:
        """Set the description of the data job.

        In ingestion-attribution mode the ingested description is written (and
        any conflicting UI edit is cleared, with a warning); otherwise the
        editable aspect is updated so UI edits are preserved.
        """
        if is_ingestion_attribution():
            editable_props = self._get_editable_props()
            if editable_props is not None and editable_props.description is not None:
                warnings.warn(
                    "Overwriting non-ingestion description from ingestion is an anti-pattern.",
                    category=IngestionAttributionWarning,
                    stacklevel=2,
                )
                # Force the ingestion description to show up.
                editable_props.description = None

            self._ensure_datajob_props().description = description
        else:
            self._ensure_editable_props().description = description

    @property
    def name(self) -> str:
        """Get the name of the data job."""
        return self.urn.job_id

    @property
    def display_name(self) -> Optional[str]:
        """Get the display name of the data job."""
        return self._ensure_datajob_props().name

    def set_display_name(self, display_name: str) -> None:
        """Set the display name of the data job."""
        self._ensure_datajob_props().name = display_name

    @property
    def external_url(self) -> Optional[str]:
        """Get the external URL of the data job."""
        return self._ensure_datajob_props().externalUrl

    def set_external_url(self, external_url: str) -> None:
        """Set the external URL of the data job."""
        self._ensure_datajob_props().externalUrl = external_url

    @property
    def custom_properties(self) -> Dict[str, str]:
        """Get the custom properties of the data job."""
        return self._ensure_datajob_props().customProperties

    def set_custom_properties(self, custom_properties: Dict[str, str]) -> None:
        """Set the custom properties of the data job."""
        self._ensure_datajob_props().customProperties = custom_properties

    @property
    def created(self) -> Optional[datetime]:
        """Get the creation timestamp of the data job."""
        return parse_time_stamp(self._ensure_datajob_props().created)

    def set_created(self, created: datetime) -> None:
        """Set the creation timestamp of the data job."""
        self._ensure_datajob_props().created = make_time_stamp(created)

    @property
    def last_modified(self) -> Optional[datetime]:
        """Get the last modification timestamp of the data job."""
        return parse_time_stamp(self._ensure_datajob_props().lastModified)

    def set_last_modified(self, last_modified: datetime) -> None:
        """Set the last modification timestamp of the data job."""
        self._ensure_datajob_props().lastModified = make_time_stamp(last_modified)

    @property
    def flow_urn(self) -> DataFlowUrn:
        """Get the URN of the data flow that contains this data job."""
        return self.urn.get_data_flow_urn()

    def _set_browse_path_from_flow(self, flow: DataFlow) -> None:
        """Derive this job's browse path from its parent flow's browse path."""
        flow_browse_path = flow._get_aspect(models.BrowsePathsV2Class)

        # Copy the flow's browse path entries, then append the flow itself as
        # the final element so the job is nested under its parent flow.
        browse_path = []
        if flow_browse_path is not None:
            for entry in flow_browse_path.path:
                browse_path.append(
                    models.BrowsePathEntryClass(id=entry.id, urn=entry.urn)
                )

        browse_path.append(models.BrowsePathEntryClass(id=flow.name, urn=str(flow.urn)))
        # Set the browse path aspect
        self._set_aspect(models.BrowsePathsV2Class(path=browse_path))

    @property
    def inlets(self) -> List[DatasetUrn]:
        """Get the inlets of the data job."""
        inlets = self._ensure_datajob_inputoutput_props().inputDatasets
        return [DatasetUrn.from_string(inlet) for inlet in inlets]

    def set_inlets(self, inlets: List[DatasetUrnOrStr]) -> None:
        """Set the inlets (input datasets) of the data job.

        Replaces any previously recorded inlets rather than appending, so
        repeated calls do not accumulate duplicate entries.
        """
        # DatasetUrn.from_string validates each entry before storing it.
        props = self._ensure_datajob_inputoutput_props()
        props.inputDatasets = [
            str(DatasetUrn.from_string(inlet)) for inlet in inlets
        ]

    @property
    def outlets(self) -> List[DatasetUrn]:
        """Get the outlets of the data job."""
        outlets = self._ensure_datajob_inputoutput_props().outputDatasets
        return [DatasetUrn.from_string(outlet) for outlet in outlets]

    def set_outlets(self, outlets: List[DatasetUrnOrStr]) -> None:
        """Set the outlets (output datasets) of the data job.

        Replaces any previously recorded outlets rather than appending, so
        repeated calls do not accumulate duplicate entries.
        """
        # DatasetUrn.from_string validates each entry before storing it.
        props = self._ensure_datajob_inputoutput_props()
        props.outputDatasets = [
            str(DatasetUrn.from_string(outlet)) for outlet in outlets
        ]
@@ -10,6 +10,8 @@ from datahub.errors import IngestionAttributionWarning, ItemNotFoundError, SdkUs
10
10
  from datahub.ingestion.graph.client import DataHubGraph
11
11
  from datahub.metadata.urns import (
12
12
  ContainerUrn,
13
+ DataFlowUrn,
14
+ DataJobUrn,
13
15
  DatasetUrn,
14
16
  MlModelGroupUrn,
15
17
  MlModelUrn,
@@ -18,6 +20,8 @@ from datahub.metadata.urns import (
18
20
  from datahub.sdk._all_entities import ENTITY_CLASSES
19
21
  from datahub.sdk._shared import UrnOrStr
20
22
  from datahub.sdk.container import Container
23
+ from datahub.sdk.dataflow import DataFlow
24
+ from datahub.sdk.datajob import DataJob
21
25
  from datahub.sdk.dataset import Dataset
22
26
  from datahub.sdk.entity import Entity
23
27
  from datahub.sdk.mlmodel import MLModel
@@ -57,6 +61,10 @@ class EntityClient:
57
61
  @overload
58
62
  def get(self, urn: MlModelGroupUrn) -> MLModelGroup: ...
59
63
  @overload
64
+ def get(self, urn: DataFlowUrn) -> DataFlow: ...
65
+ @overload
66
+ def get(self, urn: DataJobUrn) -> DataJob: ...
67
+ @overload
60
68
  def get(self, urn: Union[Urn, str]) -> Entity: ...
61
69
  def get(self, urn: UrnOrStr) -> Entity:
62
70
  """Retrieve an entity by its urn.