acryl-datahub 0.15.0.6rc1__py3-none-any.whl → 0.15.0.6rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-0.15.0.6rc1.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/METADATA +2394 -2394
- {acryl_datahub-0.15.0.6rc1.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/RECORD +22 -11
- datahub/_version.py +1 -1
- datahub/errors.py +35 -0
- datahub/ingestion/source/mongodb.py +17 -16
- datahub/ingestion/source/s3/source.py +14 -5
- datahub/ingestion/source/snowflake/snowflake_schema.py +10 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +11 -14
- datahub/sdk/__init__.py +33 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_attribution.py +48 -0
- datahub/sdk/_entity.py +89 -0
- datahub/sdk/_shared.py +338 -0
- datahub/sdk/container.py +193 -0
- datahub/sdk/dataset.py +584 -0
- datahub/sdk/entity_client.py +115 -0
- datahub/sdk/main_client.py +56 -0
- datahub/sdk/resolver_client.py +101 -0
- {acryl_datahub-0.15.0.6rc1.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.6rc1.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.6rc1.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.6rc1.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/top_level.txt +0 -0
datahub/sdk/dataset.py
ADDED
|
@@ -0,0 +1,584 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import warnings
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Dict, List, Optional, Tuple, Type, Union
|
|
6
|
+
|
|
7
|
+
from typing_extensions import Self, TypeAlias, assert_never
|
|
8
|
+
|
|
9
|
+
import datahub.metadata.schema_classes as models
|
|
10
|
+
from datahub.cli.cli_utils import first_non_null
|
|
11
|
+
from datahub.emitter.mce_builder import DEFAULT_ENV
|
|
12
|
+
from datahub.errors import (
|
|
13
|
+
IngestionAttributionWarning,
|
|
14
|
+
ItemNotFoundError,
|
|
15
|
+
SchemaFieldKeyError,
|
|
16
|
+
)
|
|
17
|
+
from datahub.ingestion.source.sql.sql_types import resolve_sql_type
|
|
18
|
+
from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn, Urn
|
|
19
|
+
from datahub.sdk._attribution import is_ingestion_attribution
|
|
20
|
+
from datahub.sdk._entity import Entity
|
|
21
|
+
from datahub.sdk._shared import (
|
|
22
|
+
ContainerInputType,
|
|
23
|
+
DatasetUrnOrStr,
|
|
24
|
+
DomainInputType,
|
|
25
|
+
HasContainer,
|
|
26
|
+
HasDomain,
|
|
27
|
+
HasOwnership,
|
|
28
|
+
HasPlatformInstance,
|
|
29
|
+
HasSubtype,
|
|
30
|
+
HasTags,
|
|
31
|
+
HasTerms,
|
|
32
|
+
OwnersInputType,
|
|
33
|
+
TagsInputType,
|
|
34
|
+
TermsInputType,
|
|
35
|
+
make_time_stamp,
|
|
36
|
+
parse_time_stamp,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
SchemaFieldInputType: TypeAlias = Union[
|
|
40
|
+
str,
|
|
41
|
+
Tuple[str, str], # (name, type)
|
|
42
|
+
Tuple[str, str, str], # (name, type, description)
|
|
43
|
+
models.SchemaFieldClass,
|
|
44
|
+
]
|
|
45
|
+
SchemaFieldsInputType: TypeAlias = Union[
|
|
46
|
+
List[SchemaFieldInputType],
|
|
47
|
+
models.SchemaMetadataClass,
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
UpstreamInputType: TypeAlias = Union[
|
|
51
|
+
# Dataset upstream variants.
|
|
52
|
+
DatasetUrnOrStr,
|
|
53
|
+
models.UpstreamClass,
|
|
54
|
+
# Column upstream variants.
|
|
55
|
+
models.FineGrainedLineageClass,
|
|
56
|
+
]
|
|
57
|
+
# Mapping of { downstream_column -> [upstream_columns] }
|
|
58
|
+
ColumnLineageMapping: TypeAlias = Dict[str, List[str]]
|
|
59
|
+
UpstreamLineageInputType: TypeAlias = Union[
|
|
60
|
+
models.UpstreamLineageClass,
|
|
61
|
+
List[UpstreamInputType],
|
|
62
|
+
# Combined variant.
|
|
63
|
+
# Map of { upstream_dataset -> { downstream_column -> [upstream_column] } }
|
|
64
|
+
Dict[DatasetUrnOrStr, ColumnLineageMapping],
|
|
65
|
+
]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _parse_upstream_input(
    upstream_input: UpstreamInputType,
) -> Union[models.UpstreamClass, models.FineGrainedLineageClass]:
    """Normalize a single upstream entry into a lineage aspect object.

    Accepts an already-built ``UpstreamClass`` / ``FineGrainedLineageClass``
    (returned unchanged) or a bare dataset urn (string or ``DatasetUrn``),
    which is wrapped in a table-level ``UpstreamClass``.
    """
    if isinstance(
        upstream_input, (models.UpstreamClass, models.FineGrainedLineageClass)
    ):
        # Already a fully-formed lineage object - pass through unchanged.
        return upstream_input
    if isinstance(upstream_input, (str, DatasetUrn)):
        # Bare dataset urn -> table-level upstream with the default lineage type.
        return models.UpstreamClass(
            dataset=str(upstream_input),
            type=models.DatasetLineageTypeClass.TRANSFORMED,
        )
    assert_never(upstream_input)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _parse_cll_mapping(
    *,
    upstream: DatasetUrnOrStr,
    downstream: DatasetUrnOrStr,
    cll_mapping: ColumnLineageMapping,
) -> List[models.FineGrainedLineageClass]:
    """Convert a column-lineage mapping into fine-grained lineage aspects.

    :param upstream: Urn of the upstream dataset the source columns belong to.
    :param downstream: Urn of the downstream dataset the target columns belong to.
    :param cll_mapping: Mapping of ``{downstream_column: [upstream_columns]}``.
    :return: One ``FineGrainedLineageClass`` per downstream column.
    """
    # A comprehension replaces the original loop-and-append (same output order,
    # since dicts preserve insertion order).
    return [
        models.FineGrainedLineageClass(
            upstreamType=models.FineGrainedLineageUpstreamTypeClass.FIELD_SET,
            downstreamType=models.FineGrainedLineageDownstreamTypeClass.FIELD,
            upstreams=[
                SchemaFieldUrn(upstream, upstream_column).urn()
                for upstream_column in upstream_columns
            ],
            downstreams=[SchemaFieldUrn(downstream, downstream_column).urn()],
        )
        for downstream_column, upstream_columns in cll_mapping.items()
    ]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _parse_upstream_lineage_input(
    upstream_input: UpstreamLineageInputType, downstream_urn: DatasetUrn
) -> models.UpstreamLineageClass:
    """Normalize any supported upstream-lineage input form into an
    ``UpstreamLineageClass`` aspect.

    Supported forms: a ready-made ``UpstreamLineageClass``, a list of mixed
    table/column upstream entries, or a dict mapping upstream dataset urns to
    column-lineage mappings.
    """
    if isinstance(upstream_input, models.UpstreamLineageClass):
        # Already a complete aspect - nothing to do.
        return upstream_input

    if isinstance(upstream_input, list):
        # Parse each entry, then split table-level from column-level lineage
        # in a single pass.
        table_lineage: List[models.UpstreamClass] = []
        column_lineage: List[models.FineGrainedLineageClass] = []
        for entry in upstream_input:
            parsed = _parse_upstream_input(entry)
            if isinstance(parsed, models.UpstreamClass):
                table_lineage.append(parsed)
            else:
                column_lineage.append(parsed)

        # TODO: check that all things in cll are also in tll
        return models.UpstreamLineageClass(
            upstreams=table_lineage, fineGrainedLineages=column_lineage
        )

    if isinstance(upstream_input, dict):
        # Combined form: every key is an upstream dataset, and its value is a
        # column-lineage mapping against the downstream dataset.
        table_lineage = []
        column_lineage = []
        for dataset_urn, cll_mapping in upstream_input.items():
            table_lineage.append(
                models.UpstreamClass(
                    dataset=str(dataset_urn),
                    type=models.DatasetLineageTypeClass.TRANSFORMED,
                )
            )
            column_lineage.extend(
                _parse_cll_mapping(
                    upstream=dataset_urn,
                    downstream=downstream_urn,
                    cll_mapping=cll_mapping,
                )
            )

        return models.UpstreamLineageClass(
            upstreams=table_lineage, fineGrainedLineages=column_lineage
        )

    assert_never(upstream_input)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
class SchemaField:
    """A live view onto a single field of a :class:`Dataset`'s schema.

    Holds only a reference to the parent dataset and the field path; all reads
    and writes go through the parent's aspects, so mutations are reflected in
    the dataset immediately. Field-level metadata can live in two places: the
    base ``SchemaMetadata`` aspect (ingestion-owned) and the
    ``EditableSchemaMetadata`` aspect (UI/SDK edits). The setters below choose
    between them based on :func:`is_ingestion_attribution`.
    """

    __slots__ = ("_parent", "_field_path")

    def __init__(self, parent: Dataset, field_path: str):
        self._parent = parent
        self._field_path = field_path

    def _base_schema_field(self) -> models.SchemaFieldClass:
        """Return this field's entry in the base SchemaMetadata aspect.

        Raises KeyError if the field path is missing from the schema dict.
        """
        # This must exist - if it doesn't, we've got a larger bug.
        schema_dict = self._parent._schema_dict()
        return schema_dict[self._field_path]

    def _get_editable_schema_field(
        self,
    ) -> Optional[models.EditableSchemaFieldInfoClass]:
        """Return this field's editable-schema entry, or None if absent."""
        # This method does not make any mutations.
        editable_schema = self._parent._get_aspect(models.EditableSchemaMetadataClass)
        if editable_schema is None:
            return None
        for field in editable_schema.editableSchemaFieldInfo:
            if field.fieldPath == self._field_path:
                return field
        return None

    def _ensure_editable_schema_field(self) -> models.EditableSchemaFieldInfoClass:
        """Return this field's editable-schema entry, creating the aspect
        and/or the entry if they do not exist yet.

        Warns when called in ingestion-attribution mode, since ingestion is
        expected to write the base schema aspect instead.
        """
        if is_ingestion_attribution():
            warnings.warn(
                "This method should not be used in ingestion mode.",
                IngestionAttributionWarning,
                stacklevel=2,
            )
        editable_schema = self._parent._setdefault_aspect(
            models.EditableSchemaMetadataClass(editableSchemaFieldInfo=[])
        )
        for field in editable_schema.editableSchemaFieldInfo:
            if field.fieldPath == self._field_path:
                return field

        # If we don't have an entry for this field yet, create one.
        field = models.EditableSchemaFieldInfoClass(fieldPath=self._field_path)
        editable_schema.editableSchemaFieldInfo.append(field)
        return field

    @property
    def field_path(self) -> str:
        """The field's path within the schema (its identity)."""
        return self._field_path

    @property
    def mapped_type(self) -> models.SchemaFieldDataTypeClass:
        """The platform-agnostic (mapped) type of the field."""
        return self._base_schema_field().type

    @property
    def native_type(self) -> str:
        """The platform-native type string of the field."""
        return self._base_schema_field().nativeDataType

    # TODO expose nullability and primary/foreign key details

    @property
    def description(self) -> Optional[str]:
        """The effective description: editable (UI) value wins over the base
        schema value when both are set."""
        editable_field = self._get_editable_schema_field()
        return first_non_null(
            [
                editable_field.description if editable_field is not None else None,
                self._base_schema_field().description,
            ]
        )

    def set_description(self, description: str) -> None:
        """Set the field description on the aspect appropriate for the
        current attribution mode (base schema for ingestion, editable
        otherwise)."""
        if is_ingestion_attribution():
            editable_field = self._get_editable_schema_field()
            if editable_field and editable_field.description is not None:
                # An existing editable description would shadow this write.
                warnings.warn(
                    "The field description will be hidden by UI-based edits. "
                    "Change the edit mode to OVERWRITE_UI to override this behavior.",
                    category=IngestionAttributionWarning,
                    stacklevel=2,
                )

            self._base_schema_field().description = description
        else:
            self._ensure_editable_schema_field().description = description

    @property
    def tags(self) -> Optional[List[models.TagAssociationClass]]:
        """Union of base-schema and editable-schema tags, or None if neither
        aspect has a globalTags entry for this field."""
        # Tricky: if either has a non-null globalTags, this will not return None.
        tags = None

        if (base_tags := self._base_schema_field().globalTags) is not None:
            tags = tags or []
            tags.extend(base_tags.tags)

        if editable_field := self._get_editable_schema_field():
            if (editable_tags := editable_field.globalTags) is not None:
                tags = tags or []
                tags.extend(editable_tags.tags)

        return tags

    def set_tags(self, tags: TagsInputType) -> None:
        """Replace this field's tags, writing to the attribution-appropriate
        aspect and clearing the other aspect's tags so the result is
        unambiguous."""
        parsed_tags = [self._parent._parse_tag_association_class(tag) for tag in tags]

        if is_ingestion_attribution():
            editable_field = self._get_editable_schema_field()
            if editable_field and editable_field.globalTags:
                warnings.warn(
                    "Overwriting non-ingestion tags from ingestion is an anti-pattern.",
                    category=IngestionAttributionWarning,
                    stacklevel=2,
                )
                # Clear the editable copy so the ingestion-set tags take effect.
                editable_field.globalTags = None

            self._base_schema_field().globalTags = models.GlobalTagsClass(
                tags=parsed_tags
            )
        else:
            base_field = self._base_schema_field()
            if base_field.globalTags:
                # Clear the base copy so the editable tags are authoritative.
                base_field.globalTags = None

            self._ensure_editable_schema_field().globalTags = models.GlobalTagsClass(
                tags=parsed_tags
            )

    @property
    def terms(self) -> Optional[List[models.GlossaryTermAssociationClass]]:
        """Union of base-schema and editable-schema glossary terms, or None
        if neither aspect has terms for this field."""
        # TODO: Basically the same implementation as tags - can we share code?
        terms = None

        if (base_terms := self._base_schema_field().glossaryTerms) is not None:
            terms = terms or []
            terms.extend(base_terms.terms)

        if editable_field := self._get_editable_schema_field():
            if (editable_terms := editable_field.glossaryTerms) is not None:
                terms = terms or []
                terms.extend(editable_terms.terms)

        return terms

    def set_terms(self, terms: List[models.GlossaryTermAssociationClass]) -> None:
        """Replace this field's glossary terms, mirroring the tag-setting
        semantics: write to the attribution-appropriate aspect and clear the
        other one."""
        parsed_terms = [
            self._parent._parse_glossary_term_association_class(term) for term in terms
        ]

        if is_ingestion_attribution():
            editable_field = self._get_editable_schema_field()
            if editable_field and editable_field.glossaryTerms:
                warnings.warn(
                    "Overwriting non-ingestion terms from ingestion is an anti-pattern.",
                    category=IngestionAttributionWarning,
                    stacklevel=2,
                )
                editable_field.glossaryTerms = None

            self._base_schema_field().glossaryTerms = models.GlossaryTermsClass(
                terms=parsed_terms,
                auditStamp=self._parent._terms_audit_stamp(),
            )
        else:
            base_field = self._base_schema_field()
            if base_field.glossaryTerms:
                base_field.glossaryTerms = None

            self._ensure_editable_schema_field().glossaryTerms = (
                models.GlossaryTermsClass(
                    terms=parsed_terms,
                    auditStamp=self._parent._terms_audit_stamp(),
                )
            )
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
class Dataset(
    HasPlatformInstance,
    HasSubtype,
    HasContainer,
    HasOwnership,
    HasTags,
    HasTerms,
    HasDomain,
    Entity,
):
    """SDK entity wrapper for a DataHub dataset.

    Composes the shared aspect mixins (platform instance, subtype, container,
    ownership, tags, terms, domain) with dataset-specific aspects: dataset
    properties, schema metadata, and upstream lineage. Dataset properties
    that also have a UI-editable counterpart (currently: description) follow
    the attribution rules from :func:`is_ingestion_attribution`.
    """

    __slots__ = ()

    @classmethod
    def get_urn_type(cls) -> Type[DatasetUrn]:
        """Return the urn class associated with this entity type."""
        return DatasetUrn

    def __init__(
        self,
        *,
        # Identity.
        platform: str,
        name: str,
        platform_instance: Optional[str] = None,
        env: str = DEFAULT_ENV,
        # Dataset properties.
        description: Optional[str] = None,
        display_name: Optional[str] = None,
        qualified_name: Optional[str] = None,
        external_url: Optional[str] = None,
        custom_properties: Optional[Dict[str, str]] = None,
        created: Optional[datetime] = None,
        last_modified: Optional[datetime] = None,
        # Standard aspects.
        subtype: Optional[str] = None,
        container: Optional[ContainerInputType] = None,
        owners: Optional[OwnersInputType] = None,
        tags: Optional[TagsInputType] = None,
        terms: Optional[TermsInputType] = None,
        # TODO structured_properties
        domain: Optional[DomainInputType] = None,
        # Dataset-specific aspects.
        schema: Optional[SchemaFieldsInputType] = None,
        # NOTE: annotation widened from models.UpstreamLineageClass to match
        # set_upstreams(), which this value is forwarded to unchanged.
        upstreams: Optional[UpstreamLineageInputType] = None,
    ):
        """Create a dataset entity.

        Identity args (platform, name, platform_instance, env) determine the
        urn; all other args are optional aspect values applied via the
        corresponding setters.
        """
        urn = DatasetUrn.create_from_ids(
            platform_id=platform,
            table_name=name,
            platform_instance=platform_instance,
            env=env,
        )
        super().__init__(urn)

        self._set_platform_instance(urn.platform, platform_instance)

        if schema is not None:
            self._set_schema(schema)
        if upstreams is not None:
            self.set_upstreams(upstreams)

        if description is not None:
            self.set_description(description)
        if display_name is not None:
            self.set_display_name(display_name)
        if qualified_name is not None:
            self.set_qualified_name(qualified_name)
        if external_url is not None:
            self.set_external_url(external_url)
        if custom_properties is not None:
            self.set_custom_properties(custom_properties)
        if created is not None:
            self.set_created(created)
        if last_modified is not None:
            self.set_last_modified(last_modified)

        if subtype is not None:
            self.set_subtype(subtype)
        if container is not None:
            self._set_container(container)
        if owners is not None:
            self.set_owners(owners)
        if tags is not None:
            self.set_tags(tags)
        if terms is not None:
            self.set_terms(terms)
        if domain is not None:
            self.set_domain(domain)

    @classmethod
    def _new_from_graph(cls, urn: Urn, current_aspects: models.AspectBag) -> Self:
        """Construct a Dataset from aspects fetched from the graph."""
        assert isinstance(urn, DatasetUrn)
        entity = cls(
            platform=urn.platform,
            name=urn.name,
            env=urn.env,
        )
        return entity._init_from_graph(current_aspects)

    @property
    def urn(self) -> DatasetUrn:
        """The dataset's urn, narrowed to DatasetUrn."""
        return self._urn  # type: ignore

    def _ensure_dataset_props(self) -> models.DatasetPropertiesClass:
        """Return the DatasetProperties aspect, creating an empty one if absent."""
        return self._setdefault_aspect(models.DatasetPropertiesClass())

    def _get_editable_props(self) -> Optional[models.EditableDatasetPropertiesClass]:
        """Return the EditableDatasetProperties aspect, or None if absent."""
        return self._get_aspect(models.EditableDatasetPropertiesClass)

    def _ensure_editable_props(self) -> models.EditableDatasetPropertiesClass:
        """Return the EditableDatasetProperties aspect, creating it if absent."""
        # Note that most of the fields in this aspect are not used.
        # The only one that's relevant for us is the description.
        return self._setdefault_aspect(models.EditableDatasetPropertiesClass())

    @property
    def description(self) -> Optional[str]:
        """The effective description: the editable (UI) value wins over the
        base dataset-properties value when both are set."""
        editable_props = self._get_editable_props()
        return first_non_null(
            [
                editable_props.description if editable_props is not None else None,
                self._ensure_dataset_props().description,
            ]
        )

    def set_description(self, description: str) -> None:
        """Set the description on the attribution-appropriate aspect.

        In ingestion mode, writes the base aspect and clears any editable
        description (with a warning) so the new value is visible.
        """
        if is_ingestion_attribution():
            editable_props = self._get_editable_props()
            if editable_props is not None and editable_props.description is not None:
                warnings.warn(
                    "Overwriting non-ingestion description from ingestion is an anti-pattern.",
                    category=IngestionAttributionWarning,
                    stacklevel=2,
                )
                # Force the ingestion description to show up.
                editable_props.description = None

            self._ensure_dataset_props().description = description
        else:
            self._ensure_editable_props().description = description

    @property
    def display_name(self) -> Optional[str]:
        """The dataset's display name (DatasetProperties.name)."""
        return self._ensure_dataset_props().name

    def set_display_name(self, display_name: str) -> None:
        self._ensure_dataset_props().name = display_name

    @property
    def qualified_name(self) -> Optional[str]:
        """The fully qualified name from dataset properties."""
        return self._ensure_dataset_props().qualifiedName

    def set_qualified_name(self, qualified_name: str) -> None:
        self._ensure_dataset_props().qualifiedName = qualified_name

    @property
    def external_url(self) -> Optional[str]:
        """External link associated with the dataset, if any."""
        return self._ensure_dataset_props().externalUrl

    def set_external_url(self, external_url: str) -> None:
        self._ensure_dataset_props().externalUrl = external_url

    @property
    def custom_properties(self) -> Dict[str, str]:
        """Free-form key/value properties on the dataset."""
        return self._ensure_dataset_props().customProperties

    def set_custom_properties(self, custom_properties: Dict[str, str]) -> None:
        """Replace (not merge) the custom properties mapping."""
        self._ensure_dataset_props().customProperties = custom_properties

    @property
    def created(self) -> Optional[datetime]:
        """Creation timestamp from dataset properties, as a datetime."""
        return parse_time_stamp(self._ensure_dataset_props().created)

    def set_created(self, created: datetime) -> None:
        self._ensure_dataset_props().created = make_time_stamp(created)

    @property
    def last_modified(self) -> Optional[datetime]:
        """Last-modified timestamp from dataset properties, as a datetime."""
        return parse_time_stamp(self._ensure_dataset_props().lastModified)

    def set_last_modified(self, last_modified: datetime) -> None:
        self._ensure_dataset_props().lastModified = make_time_stamp(last_modified)

    def _schema_dict(self) -> Dict[str, models.SchemaFieldClass]:
        """Return {field_path: field} for the SchemaMetadata aspect.

        Raises ItemNotFoundError if no schema is set.
        """
        schema_metadata = self._get_aspect(models.SchemaMetadataClass)
        if schema_metadata is None:
            raise ItemNotFoundError(f"Schema is not set for dataset {self.urn}")
        return {field.fieldPath: field for field in schema_metadata.fields}

    @property
    def schema(self) -> List[SchemaField]:
        """The dataset's schema as a list of SchemaField views."""
        # TODO: Add some caching here to avoid iterating over the schema every time.
        schema_dict = self._schema_dict()
        return [SchemaField(self, field_path) for field_path in schema_dict]

    def _parse_schema_field_input(
        self, schema_field_input: SchemaFieldInputType
    ) -> models.SchemaFieldClass:
        """Normalize a single schema-field input (SchemaFieldClass, (name, type)
        tuple, (name, type, description) tuple, or bare name string) into a
        SchemaFieldClass. SQL type strings are resolved per the dataset's
        platform via resolve_sql_type."""
        if isinstance(schema_field_input, models.SchemaFieldClass):
            return schema_field_input
        elif isinstance(schema_field_input, tuple):
            # Support (name, type) and (name, type, description) forms
            if len(schema_field_input) == 2:
                name, field_type = schema_field_input
                description = None
            elif len(schema_field_input) == 3:
                name, field_type, description = schema_field_input
            else:
                assert_never(schema_field_input)
            return models.SchemaFieldClass(
                fieldPath=name,
                type=models.SchemaFieldDataTypeClass(
                    resolve_sql_type(
                        field_type,
                        platform=self.urn.get_data_platform_urn().platform_name,
                    )
                    # Unresolvable type strings fall back to NullType.
                    or models.NullTypeClass()
                ),
                nativeDataType=field_type,
                description=description,
            )
        elif isinstance(schema_field_input, str):
            # TODO: Not sure this branch makes sense - we should probably just require types?
            return models.SchemaFieldClass(
                fieldPath=schema_field_input,
                type=models.SchemaFieldDataTypeClass(models.NullTypeClass()),
                nativeDataType="unknown",
                description=None,
            )
        else:
            assert_never(schema_field_input)

    def _set_schema(self, schema: SchemaFieldsInputType) -> None:
        """Replace the SchemaMetadata aspect from constructor input."""
        # This method is not public. Ingestion/restatement users should be setting
        # the schema via the constructor. SDK users that got a dataset from the graph
        # probably shouldn't be adding/removing fields ad-hoc. The field-level mutators
        # can be used instead.
        if isinstance(schema, models.SchemaMetadataClass):
            self._set_aspect(schema)
        else:
            parsed_schema = [self._parse_schema_field_input(field) for field in schema]
            self._set_aspect(
                models.SchemaMetadataClass(
                    fields=parsed_schema,
                    # The rest of these fields are not used, and so we can set them to dummy/default values.
                    schemaName="",
                    platform=self.urn.platform,
                    version=0,
                    hash="",
                    platformSchema=models.SchemalessClass(),
                )
            )

    def __getitem__(self, field_path: str) -> SchemaField:
        """Look up a schema field by path: ``dataset["col_name"]``.

        Raises SchemaFieldKeyError for unknown paths.
        """
        # TODO: Automatically deal with field path v2?
        schema_dict = self._schema_dict()
        if field_path not in schema_dict:
            raise SchemaFieldKeyError(f"Field {field_path} not found in schema")
        return SchemaField(self, field_path)

    @property
    def upstreams(self) -> Optional[models.UpstreamLineageClass]:
        """The UpstreamLineage aspect, or None if not set."""
        return self._get_aspect(models.UpstreamLineageClass)

    def set_upstreams(self, upstreams: UpstreamLineageInputType) -> None:
        """Replace the dataset's upstream lineage from any supported input form."""
        self._set_aspect(_parse_upstream_lineage_input(upstreams, self.urn))
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import warnings
|
|
4
|
+
from typing import TYPE_CHECKING, Union, overload
|
|
5
|
+
|
|
6
|
+
import datahub.metadata.schema_classes as models
|
|
7
|
+
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
8
|
+
from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
|
|
9
|
+
from datahub.errors import IngestionAttributionWarning, ItemNotFoundError, SdkUsageError
|
|
10
|
+
from datahub.ingestion.graph.client import DataHubGraph
|
|
11
|
+
from datahub.metadata.urns import (
|
|
12
|
+
ContainerUrn,
|
|
13
|
+
DatasetUrn,
|
|
14
|
+
Urn,
|
|
15
|
+
)
|
|
16
|
+
from datahub.sdk._all_entities import ENTITY_CLASSES
|
|
17
|
+
from datahub.sdk._entity import Entity
|
|
18
|
+
from datahub.sdk._shared import UrnOrStr
|
|
19
|
+
from datahub.sdk.container import Container
|
|
20
|
+
from datahub.sdk.dataset import Dataset
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from datahub.sdk.main_client import DataHubClient
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class EntityClient:
    """CRUD operations for SDK entities against a DataHub graph.

    Thin wrapper over the parent :class:`DataHubClient`'s graph connection.
    ``get`` reads, ``create``/``upsert``/``update`` write; ``update`` also
    accepts a low-level :class:`MetadataPatchProposal`.
    """

    def __init__(self, client: DataHubClient):
        self._client = client

    # TODO: Make all of these methods sync by default.

    @property
    def _graph(self) -> DataHubGraph:
        # Convenience accessor for the underlying graph connection.
        return self._client._graph

    @overload
    def get(self, urn: ContainerUrn) -> Container: ...
    @overload
    def get(self, urn: DatasetUrn) -> Dataset: ...
    @overload
    def get(self, urn: Union[Urn, str]) -> Entity: ...
    def get(self, urn: UrnOrStr) -> Entity:
        """Fetch an entity by urn.

        The concrete entity class is selected from the urn's entity type via
        ENTITY_CLASSES. Raises ItemNotFoundError if the urn does not exist.
        """
        if not isinstance(urn, Urn):
            urn = Urn.from_string(urn)

        # TODO: add error handling around this with a suggested alternative if not yet supported
        EntityClass = ENTITY_CLASSES[urn.entity_type]

        if not self._graph.exists(str(urn)):
            raise ItemNotFoundError(f"Entity {urn} not found")

        aspects = self._graph.get_entity_semityped(str(urn))

        # TODO: save the timestamp so we can use If-Unmodified-Since on the updates
        return EntityClass._new_from_graph(urn, aspects)

    def create(self, entity: Entity) -> None:
        """Create a new entity; raises SdkUsageError if it already exists."""
        mcps = []

        if self._graph.exists(str(entity.urn)):
            raise SdkUsageError(
                f"Entity {entity.urn} already exists. Use client.entities.upsert() to update it."
            )

        # Extra safety check: by putting this first, we can ensure that
        # the request fails if the entity already exists.
        mcps.append(
            MetadataChangeProposalWrapper(
                entityUrn=str(entity.urn),
                aspect=entity.urn.to_key_aspect(),
                changeType=models.ChangeTypeClass.CREATE_ENTITY,
            )
        )
        mcps.extend(entity._as_mcps(models.ChangeTypeClass.CREATE))

        self._graph.emit_mcps(mcps)

    def upsert(self, entity: Entity) -> None:
        """Create-or-update an entity.

        Warns when upserting a locally-constructed entity (no previous
        aspects) over one that already exists on the server, since only the
        aspects present locally are written.
        """
        if entity._prev_aspects is None and self._graph.exists(str(entity.urn)):
            warnings.warn(
                f"The entity {entity.urn} already exists. This operation will partially overwrite the existing entity.",
                IngestionAttributionWarning,
                stacklevel=2,
            )
            # TODO: If there are no previous aspects but the entity exists, should we delete aspects that are not present here?

        mcps = entity._as_mcps(models.ChangeTypeClass.UPSERT)
        self._graph.emit_mcps(mcps)

    def update(self, entity: Union[Entity, MetadataPatchProposal]) -> None:
        """Update an entity previously fetched from the graph, or apply a
        patch proposal.

        Raises SdkUsageError for locally-constructed entities, which should
        go through create()/upsert() instead.
        """
        if isinstance(entity, MetadataPatchProposal):
            return self._update_patch(entity)

        if entity._prev_aspects is None:
            raise SdkUsageError(
                f"For entities created via {entity.__class__.__name__}(...), use client.entities.create() or client.entities.upsert() instead"
            )

        # TODO: respect If-Unmodified-Since?
        # -> probably add a "mode" parameter that can be "update" (e.g. if not modified) or "update_force"

        mcps = entity._as_mcps(models.ChangeTypeClass.UPSERT)
        self._graph.emit_mcps(mcps)

    def _update_patch(
        self, updater: MetadataPatchProposal, check_exists: bool = True
    ) -> None:
        """Emit a patch proposal, optionally verifying the target exists first."""
        if check_exists and not self._graph.exists(updater.urn):
            raise SdkUsageError(
                f"Entity {updater.urn} does not exist, and hence cannot be updated. "
                "You can bypass this check by setting check_exists=False."
            )

        mcps = updater.build()
        self._graph.emit_mcps(mcps)
|