acryl-datahub 1.0.0rc4__py3-none-any.whl → 1.0.0rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/METADATA +2502 -2502
- {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/RECORD +62 -59
- {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/cli/ingest_cli.py +3 -1
- datahub/emitter/mcp_builder.py +4 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/run/pipeline.py +109 -143
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +11 -4
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -2
- datahub/ingestion/source/mlflow.py +30 -7
- datahub/ingestion/source/mode.py +7 -2
- datahub/ingestion/source/neo4j/neo4j_source.py +26 -6
- datahub/ingestion/source/nifi.py +29 -6
- datahub/ingestion/source/openapi_parser.py +46 -14
- datahub/ingestion/source/powerbi_report_server/report_server.py +25 -6
- datahub/ingestion/source/pulsar.py +1 -0
- datahub/ingestion/source/redash.py +29 -6
- datahub/ingestion/source/s3/config.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -6
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/sql/oracle.py +34 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/metadata/_schema_classes.py +534 -410
- datahub/metadata/_urns/urn_defs.py +1670 -1670
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +17379 -17637
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/MetadataChangeEvent.avsc +13 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +29 -12
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_entity.py +20 -1
- datahub/sdk/_shared.py +163 -13
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +23 -5
- datahub/sdk/dataset.py +109 -17
- datahub/sdk/main_client.py +17 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/split_statements.py +20 -13
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/top_level.txt +0 -0
datahub/sdk/_shared.py
CHANGED
|
@@ -1,14 +1,17 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import warnings
|
|
2
4
|
from datetime import datetime
|
|
3
5
|
from typing import (
|
|
4
6
|
TYPE_CHECKING,
|
|
7
|
+
Callable,
|
|
5
8
|
List,
|
|
6
9
|
Optional,
|
|
7
10
|
Tuple,
|
|
8
11
|
Union,
|
|
9
12
|
)
|
|
10
13
|
|
|
11
|
-
from typing_extensions import TypeAlias
|
|
14
|
+
from typing_extensions import TypeAlias, assert_never
|
|
12
15
|
|
|
13
16
|
import datahub.metadata.schema_classes as models
|
|
14
17
|
from datahub.emitter.mce_builder import (
|
|
@@ -20,6 +23,7 @@ from datahub.emitter.mce_builder import (
|
|
|
20
23
|
from datahub.emitter.mcp_builder import ContainerKey
|
|
21
24
|
from datahub.errors import MultipleSubtypesWarning, SdkUsageError
|
|
22
25
|
from datahub.metadata.urns import (
|
|
26
|
+
ContainerUrn,
|
|
23
27
|
CorpGroupUrn,
|
|
24
28
|
CorpUserUrn,
|
|
25
29
|
DataJobUrn,
|
|
@@ -33,6 +37,7 @@ from datahub.metadata.urns import (
|
|
|
33
37
|
Urn,
|
|
34
38
|
)
|
|
35
39
|
from datahub.sdk._entity import Entity
|
|
40
|
+
from datahub.sdk._utils import add_list_unique, remove_list_unique
|
|
36
41
|
from datahub.utilities.urns.error import InvalidUrnError
|
|
37
42
|
|
|
38
43
|
if TYPE_CHECKING:
|
|
@@ -83,6 +88,13 @@ class HasPlatformInstance(Entity):
|
|
|
83
88
|
)
|
|
84
89
|
)
|
|
85
90
|
|
|
91
|
+
@property
def platform(self) -> Optional[DataPlatformUrn]:
    """Return the platform recorded in the DataPlatformInstance aspect.

    Returns:
        The parsed ``DataPlatformUrn``, or ``None`` when the aspect is absent
        or its ``platform`` value is falsy.
    """
    instance_aspect = self._get_aspect(models.DataPlatformInstanceClass)
    if not instance_aspect or not instance_aspect.platform:
        return None
    return DataPlatformUrn.from_string(instance_aspect.platform)
|
|
97
|
+
|
|
86
98
|
@property
|
|
87
99
|
def platform_instance(self) -> Optional[DataPlatformInstanceUrn]:
|
|
88
100
|
dataPlatformInstance = self._get_aspect(models.DataPlatformInstanceClass)
|
|
@@ -112,11 +124,11 @@ class HasSubtype(Entity):
|
|
|
112
124
|
self._set_aspect(models.SubTypesClass(typeNames=[subtype]))
|
|
113
125
|
|
|
114
126
|
|
|
127
|
+
# TODO: Reference OwnershipTypeClass as the valid ownership type enum.
|
|
115
128
|
OwnershipTypeType: TypeAlias = Union[str, OwnershipTypeUrn]
|
|
116
129
|
OwnerInputType: TypeAlias = Union[
|
|
117
|
-
str,
|
|
118
130
|
ActorUrn,
|
|
119
|
-
Tuple[
|
|
131
|
+
Tuple[ActorUrn, OwnershipTypeType],
|
|
120
132
|
models.OwnerClass,
|
|
121
133
|
]
|
|
122
134
|
OwnersInputType: TypeAlias = List[OwnerInputType]
|
|
@@ -126,15 +138,17 @@ class HasOwnership(Entity):
|
|
|
126
138
|
__slots__ = ()
|
|
127
139
|
|
|
128
140
|
@staticmethod
|
|
129
|
-
def _parse_owner_class(owner: OwnerInputType) -> models.OwnerClass:
|
|
141
|
+
def _parse_owner_class(owner: OwnerInputType) -> Tuple[models.OwnerClass, bool]:
|
|
130
142
|
if isinstance(owner, models.OwnerClass):
|
|
131
|
-
return owner
|
|
143
|
+
return owner, False
|
|
132
144
|
|
|
145
|
+
was_type_specified = False
|
|
133
146
|
owner_type = models.OwnershipTypeClass.TECHNICAL_OWNER
|
|
134
147
|
owner_type_urn = None
|
|
135
148
|
|
|
136
149
|
if isinstance(owner, tuple):
|
|
137
150
|
raw_owner, raw_owner_type = owner
|
|
151
|
+
was_type_specified = True
|
|
138
152
|
|
|
139
153
|
if isinstance(raw_owner_type, OwnershipTypeUrn):
|
|
140
154
|
owner_type = models.OwnershipTypeClass.CUSTOM
|
|
@@ -151,17 +165,15 @@ class HasOwnership(Entity):
|
|
|
151
165
|
owner=make_user_urn(raw_owner),
|
|
152
166
|
type=owner_type,
|
|
153
167
|
typeUrn=owner_type_urn,
|
|
154
|
-
)
|
|
168
|
+
), was_type_specified
|
|
155
169
|
elif isinstance(raw_owner, Urn):
|
|
156
170
|
return models.OwnerClass(
|
|
157
171
|
owner=str(raw_owner),
|
|
158
172
|
type=owner_type,
|
|
159
173
|
typeUrn=owner_type_urn,
|
|
160
|
-
)
|
|
174
|
+
), was_type_specified
|
|
161
175
|
else:
|
|
162
|
-
|
|
163
|
-
f"Invalid owner {owner}: {type(owner)} is not a valid owner type"
|
|
164
|
-
)
|
|
176
|
+
assert_never(raw_owner)
|
|
165
177
|
|
|
166
178
|
# TODO: Return a custom type with deserialized urns, instead of the raw aspect.
|
|
167
179
|
# Ideally we'd also use first-class ownership type urns here, not strings.
|
|
@@ -173,21 +185,74 @@ class HasOwnership(Entity):
|
|
|
173
185
|
|
|
174
186
|
def set_owners(self, owners: OwnersInputType) -> None:
    """Replace the Ownership aspect with the given owners.

    Every input is normalized through ``_parse_owner_class``; only the parsed
    ``OwnerClass`` (the first element of the returned tuple) is stored.
    """
    # TODO: add docs on the default parsing + default ownership type
    owner_classes = []
    for raw_owner in owners:
        owner_classes.append(self._parse_owner_class(raw_owner)[0])
    self._set_aspect(models.OwnershipClass(owners=owner_classes))
|
|
178
190
|
|
|
191
|
+
@classmethod
def _owner_key_method(
    cls, consider_owner_type: bool
) -> Callable[[models.OwnerClass], Tuple[str, ...]]:
    """Select the key function used to compare owners.

    Args:
        consider_owner_type: When true, owners are keyed by owner and
            ownership type; otherwise by owner only.

    Returns:
        ``cls._typed_owner_key`` or ``cls._simple_owner_key``.
    """
    return cls._typed_owner_key if consider_owner_type else cls._simple_owner_key
|
|
179
199
|
|
|
180
|
-
|
|
200
|
+
@classmethod
def _typed_owner_key(cls, owner: models.OwnerClass) -> Tuple[str, str]:
    """Key an owner by its urn plus its ownership type.

    The custom ``typeUrn`` is used when it is truthy; otherwise the key falls
    back to the string form of ``owner.type``.
    """
    ownership_kind = owner.typeUrn if owner.typeUrn else str(owner.type)
    return owner.owner, ownership_kind
|
|
203
|
+
|
|
204
|
+
@classmethod
def _simple_owner_key(cls, owner: models.OwnerClass) -> Tuple[str,]:
    """Key an owner by its urn only, ignoring the ownership type."""
    owner_urn = owner.owner
    return (owner_urn,)
|
|
207
|
+
|
|
208
|
+
def _ensure_owners(self) -> List[models.OwnerClass]:
    """Return the owners list of the Ownership aspect.

    The aspect is obtained through ``_setdefault_aspect`` with an empty
    ``OwnershipClass`` as the default.
    """
    ownership_aspect = self._setdefault_aspect(models.OwnershipClass(owners=[]))
    return ownership_aspect.owners
|
|
211
|
+
|
|
212
|
+
def add_owner(self, owner: OwnerInputType) -> None:
    """Add an owner through ``add_list_unique``, keyed by owner and ownership type."""
    # Tricky: when adding an owner, we always use the ownership type.
    # For removals, we only use it if it was explicitly specified.
    new_owner, _type_was_given = self._parse_owner_class(owner)
    current_owners = self._ensure_owners()
    add_list_unique(current_owners, key=self._typed_owner_key, item=new_owner)
|
|
221
|
+
|
|
222
|
+
def remove_owner(self, owner: OwnerInputType) -> None:
    """Remove an owner through ``remove_list_unique``.

    The ownership type takes part in the match only when the caller supplied
    it explicitly, as reported by ``_parse_owner_class``; otherwise owners are
    matched on the owner urn alone.
    """
    target_owner, type_was_given = self._parse_owner_class(owner)
    current_owners = self._ensure_owners()
    key_fn = self._owner_key_method(type_was_given)
    remove_list_unique(current_owners, key=key_fn, item=target_owner)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
# Accepted ways to describe an entity's parent container / browse path:
# If you pass in a container object, we can build on top of its browse path.
# If you pass in a ContainerKey, we can use parent_key() to build the browse path.
# If you pass in a list of urns, we'll use that as the browse path. Any non-urn strings
# will be treated as raw ids.
ParentContainerInputType: TypeAlias = Union["Container", ContainerKey, List[UrnOrStr]]
|
|
181
236
|
|
|
182
237
|
|
|
183
238
|
class HasContainer(Entity):
|
|
184
239
|
__slots__ = ()
|
|
185
240
|
|
|
186
|
-
|
|
241
|
+
@staticmethod
def _maybe_parse_as_urn(urn: UrnOrStr) -> UrnOrStr:
    """Parse strings that look like urns; pass everything else through.

    ``Urn`` instances and strings that do not start with ``"urn:li:"`` are
    returned unchanged. Strings with that prefix are parsed with
    ``Urn.from_string``.
    """
    if not isinstance(urn, Urn) and urn.startswith("urn:li:"):
        return Urn.from_string(urn)
    return urn
|
|
249
|
+
|
|
250
|
+
def _set_container(self, container: Optional[ParentContainerInputType]) -> None:
|
|
187
251
|
# We need to allow container to be None. It won't happen for datasets much, but
|
|
188
252
|
# will be required for root containers.
|
|
189
253
|
from datahub.sdk.container import Container
|
|
190
254
|
|
|
255
|
+
container_urn: Optional[str]
|
|
191
256
|
browse_path: List[Union[str, models.BrowsePathEntryClass]] = []
|
|
192
257
|
if isinstance(container, Container):
|
|
193
258
|
container_urn = container.urn.urn()
|
|
@@ -204,6 +269,29 @@ class HasContainer(Entity):
|
|
|
204
269
|
urn=container_urn,
|
|
205
270
|
),
|
|
206
271
|
]
|
|
272
|
+
elif isinstance(container, list):
|
|
273
|
+
parsed_path = [self._maybe_parse_as_urn(entry) for entry in container]
|
|
274
|
+
|
|
275
|
+
# Use the last container in the path as the container urn.
|
|
276
|
+
container_urns = [
|
|
277
|
+
urn.urn() for urn in parsed_path if isinstance(urn, ContainerUrn)
|
|
278
|
+
]
|
|
279
|
+
container_urn = container_urns[-1] if container_urns else None
|
|
280
|
+
|
|
281
|
+
browse_path = [
|
|
282
|
+
(
|
|
283
|
+
models.BrowsePathEntryClass(
|
|
284
|
+
id=str(entry),
|
|
285
|
+
urn=str(entry),
|
|
286
|
+
)
|
|
287
|
+
if isinstance(entry, Urn)
|
|
288
|
+
else models.BrowsePathEntryClass(
|
|
289
|
+
id=entry,
|
|
290
|
+
urn=None,
|
|
291
|
+
)
|
|
292
|
+
)
|
|
293
|
+
for entry in parsed_path
|
|
294
|
+
]
|
|
207
295
|
elif container is not None:
|
|
208
296
|
container_urn = container.as_urn()
|
|
209
297
|
|
|
@@ -243,6 +331,24 @@ class HasContainer(Entity):
|
|
|
243
331
|
)
|
|
244
332
|
)
|
|
245
333
|
|
|
334
|
+
@property
def parent_container(self) -> Optional[ContainerUrn]:
    """Return the urn stored in the Container aspect, or ``None`` if it is not set."""
    container_aspect = self._get_aspect(models.ContainerClass)
    if not container_aspect:
        return None
    return ContainerUrn.from_string(container_aspect.container)
|
|
339
|
+
|
|
340
|
+
@property
def browse_path(self) -> Optional[List[UrnOrStr]]:
    """Return the BrowsePathsV2 entries as urns or raw ids.

    Returns:
        ``None`` when the aspect is missing (or falsy). Otherwise one item per
        path entry: a parsed ``Urn`` when the entry has a truthy ``urn``, else
        the entry's ``id``.
    """
    path_aspect = self._get_aspect(models.BrowsePathsV2Class)
    if not path_aspect:
        return None
    return [
        Urn.from_string(step.urn) if step.urn else step.id
        for step in path_aspect.path
    ]
|
|
351
|
+
|
|
246
352
|
|
|
247
353
|
TagInputType: TypeAlias = Union[str, TagUrn, models.TagAssociationClass]
|
|
248
354
|
TagsInputType: TypeAlias = List[TagInputType]
|
|
@@ -251,6 +357,9 @@ TagsInputType: TypeAlias = List[TagInputType]
|
|
|
251
357
|
class HasTags(Entity):
|
|
252
358
|
__slots__ = ()
|
|
253
359
|
|
|
360
|
+
def _ensure_tags(self) -> List[models.TagAssociationClass]:
    """Return the tag list of the GlobalTags aspect.

    The aspect is obtained through ``_setdefault_aspect`` with an empty
    ``GlobalTagsClass`` as the default.
    """
    tags_aspect = self._setdefault_aspect(models.GlobalTagsClass(tags=[]))
    return tags_aspect.tags
|
|
362
|
+
|
|
254
363
|
# TODO: Return a custom type with deserialized urns, instead of the raw aspect.
|
|
255
364
|
@property
|
|
256
365
|
def tags(self) -> Optional[List[models.TagAssociationClass]]:
|
|
@@ -275,6 +384,24 @@ class HasTags(Entity):
|
|
|
275
384
|
)
|
|
276
385
|
)
|
|
277
386
|
|
|
387
|
+
@classmethod
def _tag_key(cls, tag: models.TagAssociationClass) -> str:
    """Key a tag association by its ``tag`` value."""
    tag_urn = tag.tag
    return tag_urn
|
|
390
|
+
|
|
391
|
+
def add_tag(self, tag: TagInputType) -> None:
    """Add a tag to the entity through ``add_list_unique``, keyed by ``_tag_key``."""
    # Evaluation order is kept: fetch the tag list first, then parse the input.
    current_tags = self._ensure_tags()
    key_fn = self._tag_key
    new_tag = self._parse_tag_association_class(tag)
    add_list_unique(current_tags, key_fn, new_tag)
|
|
397
|
+
|
|
398
|
+
def remove_tag(self, tag: TagInputType) -> None:
    """Remove a tag from the entity through ``remove_list_unique``, keyed by ``_tag_key``."""
    # Evaluation order is kept: fetch the tag list first, then parse the input.
    current_tags = self._ensure_tags()
    key_fn = self._tag_key
    target_tag = self._parse_tag_association_class(tag)
    remove_list_unique(current_tags, key_fn, target_tag)
|
|
404
|
+
|
|
278
405
|
|
|
279
406
|
TermInputType: TypeAlias = Union[
|
|
280
407
|
str, GlossaryTermUrn, models.GlossaryTermAssociationClass
|
|
@@ -285,6 +412,11 @@ TermsInputType: TypeAlias = List[TermInputType]
|
|
|
285
412
|
class HasTerms(Entity):
|
|
286
413
|
__slots__ = ()
|
|
287
414
|
|
|
415
|
+
def _ensure_terms(self) -> List[models.GlossaryTermAssociationClass]:
    """Return the term list of the GlossaryTerms aspect.

    The aspect is obtained through ``_setdefault_aspect``; the default is an
    empty ``GlossaryTermsClass`` stamped with ``_terms_audit_stamp()``.
    """
    default_aspect = models.GlossaryTermsClass(
        terms=[], auditStamp=self._terms_audit_stamp()
    )
    terms_aspect = self._setdefault_aspect(default_aspect)
    return terms_aspect.terms
|
|
419
|
+
|
|
288
420
|
# TODO: Return a custom type with deserialized urns, instead of the raw aspect.
|
|
289
421
|
@property
|
|
290
422
|
def terms(self) -> Optional[List[models.GlossaryTermAssociationClass]]:
|
|
@@ -320,6 +452,24 @@ class HasTerms(Entity):
|
|
|
320
452
|
)
|
|
321
453
|
)
|
|
322
454
|
|
|
455
|
+
@classmethod
def _terms_key(cls, term: models.GlossaryTermAssociationClass) -> str:
    """Key a glossary term association by its ``urn`` value.

    The first parameter is named ``cls`` because this is a classmethod; it is
    bound automatically, so callers are unaffected by the rename.
    """
    return term.urn
|
|
458
|
+
|
|
459
|
+
def add_term(self, term: TermInputType) -> None:
    """Add a glossary term through ``add_list_unique``, keyed by ``_terms_key``."""
    # Evaluation order is kept: fetch the term list first, then parse the input.
    current_terms = self._ensure_terms()
    key_fn = self._terms_key
    new_term = self._parse_glossary_term_association_class(term)
    add_list_unique(current_terms, key_fn, new_term)
|
|
465
|
+
|
|
466
|
+
def remove_term(self, term: TermInputType) -> None:
    """Remove a glossary term through ``remove_list_unique``, keyed by ``_terms_key``."""
    # Evaluation order is kept: fetch the term list first, then parse the input.
    current_terms = self._ensure_terms()
    key_fn = self._terms_key
    target_term = self._parse_glossary_term_association_class(term)
    remove_list_unique(current_terms, key_fn, target_term)
|
|
472
|
+
|
|
323
473
|
|
|
324
474
|
DomainInputType: TypeAlias = Union[str, DomainUrn]
|
|
325
475
|
|
datahub/sdk/_utils.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from typing import Any, Callable, List, Protocol, TypeVar
|
|
2
|
+
|
|
3
|
+
from datahub.errors import ItemNotFoundError
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class _SupportsEq(Protocol):
    """Structural type for values that can be compared with ``==``."""

    def __eq__(self, other: Any) -> bool:
        ...
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# T: element type of the list handled by the helpers in this module.
T = TypeVar("T")
# K: type of the comparison key returned by a `key` callable; bounded by _SupportsEq.
K = TypeVar("K", bound=_SupportsEq)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def add_list_unique(lst: List[T], key: Callable[[T], K], item: T) -> None:
    """Insert ``item`` into ``lst``, replacing the first element with the same key.

    Args:
        lst: List to modify in place.
        key: Function that maps an element to its comparison key.
        item: Element to add.

    The first existing element whose key equals ``key(item)`` is overwritten
    in place; if there is none, ``item`` is appended.
    """
    wanted = key(item)
    match_index = next(
        (idx for idx, current in enumerate(lst) if key(current) == wanted),
        None,
    )
    if match_index is None:
        lst.append(item)
    else:
        lst[match_index] = item
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def remove_list_unique(
    lst: List[T], key: Callable[[T], K], item: T, *, missing_ok: bool = True
) -> None:
    """Remove every element of ``lst`` whose key equals ``key(item)``.

    Args:
        lst: List to modify in place.
        key: Function that maps an element to its comparison key.
        item: Element whose key selects what to remove.
        missing_ok: When false, raise if nothing matched.

    Raises:
        ItemNotFoundError: If no element matched and ``missing_ok`` is false.
    """
    # Poor man's patch implementation.
    item_key = key(item)

    # Build the survivors first instead of popping while iterating: popping
    # during enumeration skips the element that follows each removed one, so
    # adjacent duplicates would be left behind.
    kept = [existing for existing in lst if not (key(existing) == item_key)]

    if len(kept) == len(lst):
        if not missing_ok:
            raise ItemNotFoundError(f"Cannot remove item {item} from list: not found")
        return

    # Slice assignment keeps the caller's list object, as the original did.
    lst[:] = kept
|
datahub/sdk/container.py
CHANGED
|
@@ -16,7 +16,7 @@ from datahub.metadata.urns import (
|
|
|
16
16
|
ContainerUrn,
|
|
17
17
|
Urn,
|
|
18
18
|
)
|
|
19
|
-
from datahub.sdk._entity import Entity
|
|
19
|
+
from datahub.sdk._entity import Entity, ExtraAspectsType
|
|
20
20
|
from datahub.sdk._shared import (
|
|
21
21
|
DomainInputType,
|
|
22
22
|
HasContainer,
|
|
@@ -27,11 +27,13 @@ from datahub.sdk._shared import (
|
|
|
27
27
|
HasTags,
|
|
28
28
|
HasTerms,
|
|
29
29
|
OwnersInputType,
|
|
30
|
+
ParentContainerInputType,
|
|
30
31
|
TagsInputType,
|
|
31
32
|
TermsInputType,
|
|
32
33
|
make_time_stamp,
|
|
33
34
|
parse_time_stamp,
|
|
34
35
|
)
|
|
36
|
+
from datahub.utilities.sentinels import Auto, auto
|
|
35
37
|
|
|
36
38
|
|
|
37
39
|
class Container(
|
|
@@ -54,7 +56,7 @@ class Container(
|
|
|
54
56
|
self,
|
|
55
57
|
/,
|
|
56
58
|
# Identity.
|
|
57
|
-
container_key: ContainerKey
|
|
59
|
+
container_key: ContainerKey,
|
|
58
60
|
*,
|
|
59
61
|
# Container attributes.
|
|
60
62
|
display_name: str,
|
|
@@ -66,17 +68,22 @@ class Container(
|
|
|
66
68
|
created: Optional[datetime] = None,
|
|
67
69
|
last_modified: Optional[datetime] = None,
|
|
68
70
|
# Standard aspects.
|
|
71
|
+
parent_container: Auto | ParentContainerInputType | None = auto,
|
|
69
72
|
subtype: Optional[str] = None,
|
|
70
73
|
owners: Optional[OwnersInputType] = None,
|
|
71
74
|
tags: Optional[TagsInputType] = None,
|
|
72
75
|
terms: Optional[TermsInputType] = None,
|
|
73
76
|
domain: Optional[DomainInputType] = None,
|
|
77
|
+
extra_aspects: ExtraAspectsType = None,
|
|
74
78
|
):
|
|
79
|
+
# Hack: while the type annotations say container_key is always a ContainerKey,
|
|
80
|
+
# we allow ContainerUrn to make the graph-based constructor work.
|
|
75
81
|
if isinstance(container_key, ContainerUrn):
|
|
76
82
|
urn = container_key
|
|
77
83
|
else:
|
|
78
84
|
urn = ContainerUrn.from_string(container_key.as_urn())
|
|
79
85
|
super().__init__(urn)
|
|
86
|
+
self._set_extra_aspects(extra_aspects)
|
|
80
87
|
|
|
81
88
|
# This needs to come first to ensure that the display name is registered.
|
|
82
89
|
self._ensure_container_props(name=display_name)
|
|
@@ -85,8 +92,6 @@ class Container(
|
|
|
85
92
|
if isinstance(container_key, ContainerKey):
|
|
86
93
|
self._set_platform_instance(container_key.platform, container_key.instance)
|
|
87
94
|
|
|
88
|
-
self._set_container(container_key.parent_key())
|
|
89
|
-
|
|
90
95
|
self.set_custom_properties(
|
|
91
96
|
{
|
|
92
97
|
**container_key.property_dict(),
|
|
@@ -100,6 +105,18 @@ class Container(
|
|
|
100
105
|
env = container_key.env if container_key.env in ALL_ENV_TYPES else None
|
|
101
106
|
if _INCLUDE_ENV_IN_CONTAINER_PROPERTIES and env is not None:
|
|
102
107
|
self._ensure_container_props().env = env
|
|
108
|
+
else:
|
|
109
|
+
self.set_custom_properties(extra_properties or {})
|
|
110
|
+
|
|
111
|
+
if parent_container is auto:
|
|
112
|
+
if not isinstance(container_key, ContainerKey):
|
|
113
|
+
raise SdkUsageError(
|
|
114
|
+
"Either a container_key or parent_container must be provided"
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
self._set_container(container_key.parent_key())
|
|
118
|
+
else:
|
|
119
|
+
self._set_container(parent_container)
|
|
103
120
|
|
|
104
121
|
if description is not None:
|
|
105
122
|
self.set_description(description)
|
|
@@ -126,7 +143,8 @@ class Container(
|
|
|
126
143
|
@classmethod
def _new_from_graph(cls, urn: Urn, current_aspects: models.AspectBag) -> Self:
    """Construct a Container for ``urn`` and initialize it from ``current_aspects``."""
    assert isinstance(urn, ContainerUrn)

    # The constructor is annotated as taking a ContainerKey, but a ContainerUrn is
    # passed here (hence the type: ignore). display_name is a placeholder value;
    # presumably _init_from_graph replaces it with the real aspects - TODO confirm.
    # parent_container=None is passed explicitly: the `auto` default raises
    # SdkUsageError when container_key is not a ContainerKey.
    entity = cls(urn, display_name="__dummy_value__", parent_container=None)  # type: ignore[arg-type]
    return entity._init_from_graph(current_aspects)
|
|
131
149
|
|
|
132
150
|
def _ensure_container_props(
|
datahub/sdk/dataset.py
CHANGED
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import warnings
|
|
4
4
|
from datetime import datetime
|
|
5
|
-
from typing import Dict, List, Optional, Tuple, Type, Union
|
|
5
|
+
from typing import Dict, List, Optional, Sequence, Tuple, Type, Union
|
|
6
6
|
|
|
7
7
|
from typing_extensions import Self, TypeAlias, assert_never
|
|
8
8
|
|
|
@@ -13,13 +13,13 @@ from datahub.errors import (
|
|
|
13
13
|
IngestionAttributionWarning,
|
|
14
14
|
ItemNotFoundError,
|
|
15
15
|
SchemaFieldKeyError,
|
|
16
|
+
SdkUsageError,
|
|
16
17
|
)
|
|
17
18
|
from datahub.ingestion.source.sql.sql_types import resolve_sql_type
|
|
18
19
|
from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn, Urn
|
|
19
20
|
from datahub.sdk._attribution import is_ingestion_attribution
|
|
20
|
-
from datahub.sdk._entity import Entity
|
|
21
|
+
from datahub.sdk._entity import Entity, ExtraAspectsType
|
|
21
22
|
from datahub.sdk._shared import (
|
|
22
|
-
ContainerInputType,
|
|
23
23
|
DatasetUrnOrStr,
|
|
24
24
|
DomainInputType,
|
|
25
25
|
HasContainer,
|
|
@@ -30,20 +30,24 @@ from datahub.sdk._shared import (
|
|
|
30
30
|
HasTags,
|
|
31
31
|
HasTerms,
|
|
32
32
|
OwnersInputType,
|
|
33
|
+
ParentContainerInputType,
|
|
34
|
+
TagInputType,
|
|
33
35
|
TagsInputType,
|
|
36
|
+
TermInputType,
|
|
34
37
|
TermsInputType,
|
|
35
38
|
make_time_stamp,
|
|
36
39
|
parse_time_stamp,
|
|
37
40
|
)
|
|
41
|
+
from datahub.sdk._utils import add_list_unique, remove_list_unique
|
|
42
|
+
from datahub.utilities.sentinels import Unset, unset
|
|
38
43
|
|
|
39
44
|
SchemaFieldInputType: TypeAlias = Union[
|
|
40
|
-
str,
|
|
41
45
|
Tuple[str, str], # (name, type)
|
|
42
46
|
Tuple[str, str, str], # (name, type, description)
|
|
43
47
|
models.SchemaFieldClass,
|
|
44
48
|
]
|
|
45
49
|
SchemaFieldsInputType: TypeAlias = Union[
|
|
46
|
-
|
|
50
|
+
Sequence[SchemaFieldInputType],
|
|
47
51
|
models.SchemaMetadataClass,
|
|
48
52
|
]
|
|
49
53
|
|
|
@@ -271,6 +275,51 @@ class SchemaField:
|
|
|
271
275
|
tags=parsed_tags
|
|
272
276
|
)
|
|
273
277
|
|
|
278
|
+
def add_tag(self, tag: TagInputType) -> None:
    """Add a tag to this field's editable schema entry.

    Raises:
        SdkUsageError: When ``is_ingestion_attribution()`` is true.
    """
    new_tag = self._parent._parse_tag_association_class(tag)

    if is_ingestion_attribution():
        raise SdkUsageError(
            "Adding field tags in ingestion mode is not yet supported. "
            "Use set_tags instead."
        )

    field_overlay = self._ensure_editable_schema_field()
    if field_overlay.globalTags is None:
        # Create the tags holder on first use.
        field_overlay.globalTags = models.GlobalTagsClass(tags=[])

    add_list_unique(
        field_overlay.globalTags.tags,
        key=self._parent._tag_key,
        item=new_tag,
    )
|
|
296
|
+
|
|
297
|
+
def remove_tag(self, tag: TagInputType) -> None:
    """Remove a tag from this field's base and editable schema entries.

    Raises:
        SdkUsageError: When ``is_ingestion_attribution()`` is true.
    """
    parsed_tag = self._parent._parse_tag_association_class(tag)

    if is_ingestion_attribution():
        # The message previously said "Adding", copied from add_tag.
        raise SdkUsageError(
            "Removing field tags in ingestion mode is not yet supported. "
            "Use set_tags instead."
        )

    # The tag is removed from the base schema field as well as the editable one.
    base_field = self._base_schema_field()
    if base_field.globalTags is not None:
        remove_list_unique(
            base_field.globalTags.tags,
            key=self._parent._tag_key,
            item=parsed_tag,
            missing_ok=True,
        )

    editable_field = self._ensure_editable_schema_field()
    if editable_field.globalTags is not None:
        remove_list_unique(
            editable_field.globalTags.tags,
            key=self._parent._tag_key,
            item=parsed_tag,
            missing_ok=True,
        )
|
|
322
|
+
|
|
274
323
|
@property
|
|
275
324
|
def terms(self) -> Optional[List[models.GlossaryTermAssociationClass]]:
|
|
276
325
|
# TODO: Basically the same implementation as tags - can we share code?
|
|
@@ -287,7 +336,7 @@ class SchemaField:
|
|
|
287
336
|
|
|
288
337
|
return terms
|
|
289
338
|
|
|
290
|
-
def set_terms(self, terms:
|
|
339
|
+
def set_terms(self, terms: TermsInputType) -> None:
|
|
291
340
|
parsed_terms = [
|
|
292
341
|
self._parent._parse_glossary_term_association_class(term) for term in terms
|
|
293
342
|
]
|
|
@@ -318,6 +367,55 @@ class SchemaField:
|
|
|
318
367
|
)
|
|
319
368
|
)
|
|
320
369
|
|
|
370
|
+
def add_term(self, term: TermInputType) -> None:
    """Add a glossary term to this field's editable schema entry.

    Raises:
        SdkUsageError: When ``is_ingestion_attribution()`` is true.
    """
    new_term = self._parent._parse_glossary_term_association_class(term)

    if is_ingestion_attribution():
        raise SdkUsageError(
            "Adding field terms in ingestion mode is not yet supported. "
            "Use set_terms instead."
        )

    field_overlay = self._ensure_editable_schema_field()
    if field_overlay.glossaryTerms is None:
        # Create the terms holder on first use, stamped by the parent.
        field_overlay.glossaryTerms = models.GlossaryTermsClass(
            terms=[],
            auditStamp=self._parent._terms_audit_stamp(),
        )

    add_list_unique(
        field_overlay.glossaryTerms.terms,
        key=self._parent._terms_key,
        item=new_term,
    )
|
|
391
|
+
|
|
392
|
+
def remove_term(self, term: TermInputType) -> None:
    """Remove a glossary term from this field's base and editable schema entries.

    Raises:
        SdkUsageError: When ``is_ingestion_attribution()`` is true.
    """
    target_term = self._parent._parse_glossary_term_association_class(term)

    if is_ingestion_attribution():
        raise SdkUsageError(
            "Removing field terms in ingestion mode is not yet supported. "
            "Use set_terms instead."
        )

    # Base schema field first, then the editable one, as in the original order.
    raw_field = self._base_schema_field()
    if raw_field.glossaryTerms is not None:
        remove_list_unique(
            raw_field.glossaryTerms.terms,
            key=self._parent._terms_key,
            item=target_term,
            missing_ok=True,
        )

    field_overlay = self._ensure_editable_schema_field()
    if field_overlay.glossaryTerms is not None:
        remove_list_unique(
            field_overlay.glossaryTerms.terms,
            key=self._parent._terms_key,
            item=target_term,
            missing_ok=True,
        )
|
|
418
|
+
|
|
321
419
|
|
|
322
420
|
class Dataset(
|
|
323
421
|
HasPlatformInstance,
|
|
@@ -352,13 +450,14 @@ class Dataset(
|
|
|
352
450
|
created: Optional[datetime] = None,
|
|
353
451
|
last_modified: Optional[datetime] = None,
|
|
354
452
|
# Standard aspects.
|
|
453
|
+
parent_container: ParentContainerInputType | Unset = unset,
|
|
355
454
|
subtype: Optional[str] = None,
|
|
356
|
-
container: Optional[ContainerInputType] = None,
|
|
357
455
|
owners: Optional[OwnersInputType] = None,
|
|
358
456
|
tags: Optional[TagsInputType] = None,
|
|
359
457
|
terms: Optional[TermsInputType] = None,
|
|
360
458
|
# TODO structured_properties
|
|
361
459
|
domain: Optional[DomainInputType] = None,
|
|
460
|
+
extra_aspects: ExtraAspectsType = None,
|
|
362
461
|
# Dataset-specific aspects.
|
|
363
462
|
schema: Optional[SchemaFieldsInputType] = None,
|
|
364
463
|
upstreams: Optional[models.UpstreamLineageClass] = None,
|
|
@@ -370,6 +469,7 @@ class Dataset(
|
|
|
370
469
|
env=env,
|
|
371
470
|
)
|
|
372
471
|
super().__init__(urn)
|
|
472
|
+
self._set_extra_aspects(extra_aspects)
|
|
373
473
|
|
|
374
474
|
self._set_platform_instance(urn.platform, platform_instance)
|
|
375
475
|
|
|
@@ -393,10 +493,10 @@ class Dataset(
|
|
|
393
493
|
if last_modified is not None:
|
|
394
494
|
self.set_last_modified(last_modified)
|
|
395
495
|
|
|
496
|
+
if parent_container is not unset:
|
|
497
|
+
self._set_container(parent_container)
|
|
396
498
|
if subtype is not None:
|
|
397
499
|
self.set_subtype(subtype)
|
|
398
|
-
if container is not None:
|
|
399
|
-
self._set_container(container)
|
|
400
500
|
if owners is not None:
|
|
401
501
|
self.set_owners(owners)
|
|
402
502
|
if tags is not None:
|
|
@@ -537,14 +637,6 @@ class Dataset(
|
|
|
537
637
|
nativeDataType=field_type,
|
|
538
638
|
description=description,
|
|
539
639
|
)
|
|
540
|
-
elif isinstance(schema_field_input, str):
|
|
541
|
-
# TODO: Not sure this branch makes sense - we should probably just require types?
|
|
542
|
-
return models.SchemaFieldClass(
|
|
543
|
-
fieldPath=schema_field_input,
|
|
544
|
-
type=models.SchemaFieldDataTypeClass(models.NullTypeClass()),
|
|
545
|
-
nativeDataType="unknown",
|
|
546
|
-
description=None,
|
|
547
|
-
)
|
|
548
640
|
else:
|
|
549
641
|
assert_never(schema_field_input)
|
|
550
642
|
|
datahub/sdk/main_client.py
CHANGED
|
@@ -41,10 +41,24 @@ class DataHubClient:
|
|
|
41
41
|
|
|
42
42
|
@classmethod
def from_env(cls) -> "DataHubClient":
    """Create a DataHubClient from environment variables or the ~/.datahubenv file.

    DATAHUB_GMS_URL and DATAHUB_GMS_TOKEN are checked first. If they are not
    present, credentials are read from ~/.datahubenv, a file that the
    `datahub init` command can create.

    To pass the server/token directly in code, use the
    DataHubClient(server=..., token=...) constructor instead.

    Returns:
        A DataHubClient instance.
    """
    # Inspired by the DockerClient.from_env() method.
    # TODO: This one also reads from ~/.datahubenv, so the "from_env" name might be a bit confusing.
    # That file is part of the "environment", but is not a traditional "env variable".
    return cls(graph=get_default_graph())
|
|
49
63
|
|
|
50
64
|
@property
|
|
@@ -54,3 +68,6 @@ class DataHubClient:
|
|
|
54
68
|
@property
def resolve(self) -> ResolverClient:
    """Return a new ``ResolverClient`` wrapping this client."""
    resolver = ResolverClient(self)
    return resolver
|
|
71
|
+
|
|
72
|
+
# TODO: search client
|
|
73
|
+
# TODO: lineage client
|