acryl-datahub 1.1.1rc3__py3-none-any.whl → 1.1.1rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (29) hide show
  1. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.1.1rc4.dist-info}/METADATA +2613 -2613
  2. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.1.1rc4.dist-info}/RECORD +29 -27
  3. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.1.1rc4.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -0
  6. datahub/cli/ingest_cli.py +9 -1
  7. datahub/emitter/response_helper.py +86 -1
  8. datahub/emitter/rest_emitter.py +1 -1
  9. datahub/ingestion/source/datahub/config.py +11 -0
  10. datahub/ingestion/source/datahub/datahub_database_reader.py +186 -33
  11. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  12. datahub/ingestion/source/openapi.py +12 -0
  13. datahub/ingestion/source/openapi_parser.py +56 -37
  14. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  15. datahub/ingestion/source/snowflake/snowflake_v2.py +17 -6
  16. datahub/metadata/_internal_schema_classes.py +514 -514
  17. datahub/metadata/_urns/urn_defs.py +1785 -1785
  18. datahub/metadata/schema.avsc +17354 -17725
  19. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  20. datahub/metadata/schemas/__init__.py +3 -3
  21. datahub/sdk/__init__.py +4 -0
  22. datahub/sdk/_all_entities.py +4 -0
  23. datahub/sdk/_shared.py +2 -1
  24. datahub/sdk/dataflow.py +302 -0
  25. datahub/sdk/datajob.py +335 -0
  26. datahub/sdk/entity_client.py +8 -0
  27. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.1.1rc4.dist-info}/entry_points.txt +0 -0
  28. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.1.1rc4.dist-info}/licenses/LICENSE +0 -0
  29. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.1.1rc4.dist-info}/top_level.txt +0 -0
@@ -20,6 +20,11 @@
20
20
  "doc": "Arguments provided to the task"
21
21
  },
22
22
  {
23
+ "Searchable": {
24
+ "fieldName": "executorId",
25
+ "fieldType": "KEYWORD",
26
+ "queryByDefault": false
27
+ },
23
28
  "type": "string",
24
29
  "name": "executorId",
25
30
  "doc": "Advanced: specify a specific executor to route the request to. If none is provided, a \"default\" executor is used."
@@ -15,10 +15,10 @@ import pathlib
15
15
def _load_schema(schema_name: str) -> str:
    """Read the Avro schema file ``<schema_name>.avsc`` that ships next to this module.

    Args:
        schema_name: Base name of the schema file, without the ``.avsc`` suffix.

    Returns:
        The raw Avro schema JSON as text.

    Raises:
        FileNotFoundError: If no matching ``.avsc`` file exists in the package.
    """
    return (pathlib.Path(__file__).parent / f"{schema_name}.avsc").read_text()


def getMetadataChangeEventSchema() -> str:
    """Return the MetadataChangeEvent Avro schema as raw JSON text."""
    return _load_schema("MetadataChangeEvent")


def getMetadataChangeProposalSchema() -> str:
    """Return the MetadataChangeProposal Avro schema as raw JSON text."""
    return _load_schema("MetadataChangeProposal")


# fmt: on
datahub/sdk/__init__.py CHANGED
@@ -19,8 +19,12 @@ from datahub.metadata.urns import (
19
19
  TagUrn,
20
20
  )
21
21
  from datahub.sdk.container import Container
22
+ from datahub.sdk.dataflow import DataFlow
23
+ from datahub.sdk.datajob import DataJob
22
24
  from datahub.sdk.dataset import Dataset
23
25
  from datahub.sdk.main_client import DataHubClient
26
+ from datahub.sdk.mlmodel import MLModel
27
+ from datahub.sdk.mlmodelgroup import MLModelGroup
24
28
  from datahub.sdk.search_filters import Filter, FilterDsl
25
29
 
26
30
  # We want to print out the warning if people do `from datahub.sdk import X`.
@@ -1,6 +1,8 @@
1
1
  from typing import Dict, List, Type
2
2
 
3
3
  from datahub.sdk.container import Container
4
+ from datahub.sdk.dataflow import DataFlow
5
+ from datahub.sdk.datajob import DataJob
4
6
  from datahub.sdk.dataset import Dataset
5
7
  from datahub.sdk.entity import Entity
6
8
  from datahub.sdk.mlmodel import MLModel
@@ -12,6 +14,8 @@ ENTITY_CLASSES_LIST: List[Type[Entity]] = [
12
14
  Dataset,
13
15
  MLModel,
14
16
  MLModelGroup,
17
+ DataFlow,
18
+ DataJob,
15
19
  ]
16
20
 
17
21
  ENTITY_CLASSES: Dict[str, Type[Entity]] = {
datahub/sdk/_shared.py CHANGED
@@ -29,6 +29,7 @@ from datahub.metadata.urns import (
29
29
  ContainerUrn,
30
30
  CorpGroupUrn,
31
31
  CorpUserUrn,
32
+ DataFlowUrn,
32
33
  DataJobUrn,
33
34
  DataPlatformInstanceUrn,
34
35
  DataPlatformUrn,
@@ -47,10 +48,10 @@ from datahub.utilities.urns.error import InvalidUrnError
47
48
 
48
49
  if TYPE_CHECKING:
49
50
  from datahub.sdk.container import Container
50
-
51
51
  UrnOrStr: TypeAlias = Union[Urn, str]
52
52
  DatasetUrnOrStr: TypeAlias = Union[str, DatasetUrn]
53
53
  DatajobUrnOrStr: TypeAlias = Union[str, DataJobUrn]
54
+ DataflowUrnOrStr: TypeAlias = Union[str, DataFlowUrn]
54
55
 
55
56
  ActorUrn: TypeAlias = Union[CorpUserUrn, CorpGroupUrn]
56
57
 
@@ -0,0 +1,302 @@
1
+ from __future__ import annotations
2
+
3
+ import warnings
4
+ from datetime import datetime
5
+ from typing import Dict, Optional, Type, Union
6
+
7
+ from typing_extensions import Self
8
+
9
+ import datahub.metadata.schema_classes as models
10
+ from datahub.cli.cli_utils import first_non_null
11
+ from datahub.emitter.mce_builder import DEFAULT_ENV
12
+ from datahub.errors import (
13
+ IngestionAttributionWarning,
14
+ )
15
+ from datahub.metadata.urns import DataFlowUrn, Urn
16
+ from datahub.sdk._attribution import is_ingestion_attribution
17
+ from datahub.sdk._shared import (
18
+ DomainInputType,
19
+ HasContainer,
20
+ HasDomain,
21
+ HasInstitutionalMemory,
22
+ HasOwnership,
23
+ HasPlatformInstance,
24
+ HasSubtype,
25
+ HasTags,
26
+ HasTerms,
27
+ LinksInputType,
28
+ OwnersInputType,
29
+ ParentContainerInputType,
30
+ TagsInputType,
31
+ TermsInputType,
32
+ make_time_stamp,
33
+ parse_time_stamp,
34
+ )
35
+ from datahub.sdk.entity import Entity, ExtraAspectsType
36
+ from datahub.utilities.sentinels import Unset, unset
37
+
38
+
39
class DataFlow(
    HasPlatformInstance,
    HasSubtype,
    HasOwnership,
    HasContainer,
    HasInstitutionalMemory,
    HasTags,
    HasTerms,
    HasDomain,
    Entity,
):
    """Represents a dataflow in DataHub.

    A dataflow is an orchestrated pipeline of work (identified by an
    orchestrator such as Airflow plus a flow id) that groups related data
    jobs. This class provides methods for managing dataflow metadata and
    standard aspects such as ownership, tags, terms, and domain.
    """

    __slots__ = ()

    @classmethod
    def get_urn_type(cls) -> Type[DataFlowUrn]:
        """Get the URN type for dataflows.

        Returns:
            The DataFlowUrn class.
        """
        return DataFlowUrn

    def __init__(
        self,
        *,
        # Identity.
        name: str,
        platform: str,
        display_name: Optional[str] = None,
        platform_instance: Optional[str] = None,
        env: str = DEFAULT_ENV,
        # Dataflow properties.
        description: Optional[str] = None,
        external_url: Optional[str] = None,
        custom_properties: Optional[Dict[str, str]] = None,
        created: Optional[datetime] = None,
        last_modified: Optional[datetime] = None,
        # Standard aspects.
        subtype: Optional[str] = None,
        owners: Optional[OwnersInputType] = None,
        links: Optional[LinksInputType] = None,
        tags: Optional[TagsInputType] = None,
        terms: Optional[TermsInputType] = None,
        domain: Optional[DomainInputType] = None,
        parent_container: ParentContainerInputType | Unset = unset,
        extra_aspects: ExtraAspectsType = None,
    ):
        """Initialize a new DataFlow instance.

        Args:
            name: The name (flow id) of the dataflow.
            platform: The orchestrator this dataflow belongs to (e.g. "airflow").
            display_name: Optional display name for the dataflow.
            platform_instance: Optional platform instance identifier.
            env: The environment this dataflow belongs to (default: DEFAULT_ENV).
            description: Optional description of the dataflow.
            external_url: Optional URL to external documentation or source.
            custom_properties: Optional dictionary of custom properties.
            created: Optional creation timestamp.
            last_modified: Optional last modification timestamp.
            subtype: Optional subtype of the dataflow.
            owners: Optional list of owners.
            links: Optional list of links.
            tags: Optional list of tags.
            terms: Optional list of glossary terms.
            domain: Optional domain this dataflow belongs to.
            parent_container: Optional parent container for the dataflow.
            extra_aspects: Optional list of additional aspects.
        """
        urn = DataFlowUrn.create_from_ids(
            orchestrator=platform,
            flow_id=name,
            env=env,
            platform_instance=platform_instance,
        )
        super().__init__(urn)
        self._set_extra_aspects(extra_aspects)

        self._set_platform_instance(urn.orchestrator, platform_instance)

        # Initialize DataFlowInfoClass directly with name.
        self._setdefault_aspect(models.DataFlowInfoClass(name=display_name or name))
        self._ensure_dataflow_props().env = env

        if description is not None:
            self.set_description(description)
        if display_name is not None:
            self.set_display_name(display_name)
        if external_url is not None:
            self.set_external_url(external_url)
        if custom_properties is not None:
            self.set_custom_properties(custom_properties)
        if created is not None:
            self.set_created(created)
        if last_modified is not None:
            self.set_last_modified(last_modified)
        if subtype is not None:
            self.set_subtype(subtype)
        if owners is not None:
            self.set_owners(owners)
        if links is not None:
            self.set_links(links)
        if tags is not None:
            self.set_tags(tags)
        if terms is not None:
            self.set_terms(terms)
        if domain is not None:
            self.set_domain(domain)
        if parent_container is not unset:
            self._set_container(parent_container)

    @classmethod
    def _new_from_graph(cls, urn: Urn, current_aspects: models.AspectBag) -> Self:
        """Reconstruct a DataFlow from a URN and its server-side aspects."""
        assert isinstance(urn, DataFlowUrn)
        # NOTE(review): only orchestrator and flow_id are recovered from the
        # URN; env and platform_instance fall back to constructor defaults, so
        # the rebuilt URN may differ from `urn` for non-default envs — confirm
        # this matches the server's cluster encoding.
        entity = cls(
            platform=urn.orchestrator,
            name=urn.flow_id,
        )
        return entity._init_from_graph(current_aspects)

    @property
    def urn(self) -> DataFlowUrn:
        return self._urn  # type: ignore

    def _ensure_dataflow_props(self) -> models.DataFlowInfoClass:
        props = self._get_aspect(models.DataFlowInfoClass)
        if props is None:
            # Use name from URN as fallback.
            props = models.DataFlowInfoClass(name=self.urn.flow_id)
            self._set_aspect(props)
        return props

    def _get_editable_props(self) -> Optional[models.EditableDataFlowPropertiesClass]:
        return self._get_aspect(models.EditableDataFlowPropertiesClass)

    def _ensure_editable_props(self) -> models.EditableDataFlowPropertiesClass:
        # Note that most of the fields in this aspect are not used.
        # The only one that's relevant for us is the description.
        return self._setdefault_aspect(models.EditableDataFlowPropertiesClass())

    @property
    def description(self) -> Optional[str]:
        """Get the description of the dataflow.

        Returns:
            The description if set, None otherwise.
        """
        editable_props = self._get_editable_props()
        return first_non_null(
            [
                editable_props.description if editable_props is not None else None,
                self._ensure_dataflow_props().description,
            ]
        )

    def set_description(self, description: str) -> None:
        """Set the description of the dataflow.

        Args:
            description: The description to set.

        Note:
            If called during ingestion, this will warn if overwriting
            a non-ingestion description.
        """
        if is_ingestion_attribution():
            editable_props = self._get_editable_props()
            if editable_props is not None and editable_props.description is not None:
                warnings.warn(
                    "Overwriting non-ingestion description from ingestion is an anti-pattern.",
                    category=IngestionAttributionWarning,
                    stacklevel=2,
                )
                # Force the ingestion description to show up.
                editable_props.description = None

            self._ensure_dataflow_props().description = description
        else:
            self._ensure_editable_props().description = description

    @property
    def name(self) -> str:
        """Get the name of the dataflow.

        Returns:
            The name (flow id) of the dataflow.
        """
        return self.urn.flow_id

    @property
    def display_name(self) -> Optional[str]:
        """Get the display name of the dataflow.

        Returns:
            The display name if set, None otherwise.
        """
        return self._ensure_dataflow_props().name

    def set_display_name(self, display_name: str) -> None:
        """Set the display name of the dataflow.

        Args:
            display_name: The display name to set.
        """
        self._ensure_dataflow_props().name = display_name

    @property
    def external_url(self) -> Optional[str]:
        """Get the external URL of the dataflow.

        Returns:
            The external URL if set, None otherwise.
        """
        return self._ensure_dataflow_props().externalUrl

    def set_external_url(self, external_url: str) -> None:
        """Set the external URL of the dataflow.

        Args:
            external_url: The external URL to set.
        """
        self._ensure_dataflow_props().externalUrl = external_url

    @property
    def custom_properties(self) -> Dict[str, str]:
        """Get the custom properties of the dataflow.

        Returns:
            Dictionary of custom properties.
        """
        return self._ensure_dataflow_props().customProperties

    def set_custom_properties(self, custom_properties: Dict[str, str]) -> None:
        """Set the custom properties of the dataflow.

        Args:
            custom_properties: Dictionary of custom properties to set.
        """
        self._ensure_dataflow_props().customProperties = custom_properties

    @property
    def created(self) -> Optional[datetime]:
        """Get the creation timestamp of the dataflow.

        Returns:
            The creation timestamp if set, None otherwise.
        """
        return parse_time_stamp(self._ensure_dataflow_props().created)

    def set_created(self, created: datetime) -> None:
        """Set the creation timestamp of the dataflow.

        Args:
            created: The creation timestamp to set.
        """
        self._ensure_dataflow_props().created = make_time_stamp(created)

    @property
    def last_modified(self) -> Optional[datetime]:
        """Get the last modification timestamp of the dataflow.

        Returns:
            The last modification timestamp if set, None otherwise.
        """
        return parse_time_stamp(self._ensure_dataflow_props().lastModified)

    def set_last_modified(self, last_modified: datetime) -> None:
        """Set the last modification timestamp of the dataflow.

        Args:
            last_modified: The last modification timestamp to set.
        """
        self._ensure_dataflow_props().lastModified = make_time_stamp(last_modified)

    @property
    def env(self) -> Optional[Union[str, models.FabricTypeClass]]:
        """Get the environment of the dataflow."""
        return self._ensure_dataflow_props().env