acryl-datahub 0.15.0.5rc10__py3-none-any.whl → 0.15.0.6rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (35)
  1. {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/METADATA +2482 -2482
  2. {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/RECORD +35 -24
  3. datahub/_version.py +1 -1
  4. datahub/errors.py +35 -0
  5. datahub/ingestion/source/common/subtypes.py +1 -0
  6. datahub/ingestion/source/mongodb.py +17 -16
  7. datahub/ingestion/source/powerbi/config.py +1 -0
  8. datahub/ingestion/source/powerbi/powerbi.py +28 -3
  9. datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py +6 -2
  10. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +11 -36
  11. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +17 -4
  12. datahub/ingestion/source/s3/source.py +14 -5
  13. datahub/ingestion/source/snowflake/constants.py +1 -0
  14. datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
  15. datahub/ingestion/source/snowflake/snowflake_queries.py +45 -10
  16. datahub/ingestion/source/snowflake/snowflake_query.py +20 -1
  17. datahub/ingestion/source/snowflake/snowflake_report.py +6 -0
  18. datahub/ingestion/source/snowflake/snowflake_schema.py +108 -4
  19. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +298 -69
  20. datahub/ingestion/source/snowflake/snowflake_utils.py +17 -8
  21. datahub/ingestion/source/snowflake/snowflake_v2.py +15 -3
  22. datahub/sdk/__init__.py +33 -0
  23. datahub/sdk/_all_entities.py +15 -0
  24. datahub/sdk/_attribution.py +48 -0
  25. datahub/sdk/_entity.py +89 -0
  26. datahub/sdk/_shared.py +338 -0
  27. datahub/sdk/container.py +193 -0
  28. datahub/sdk/dataset.py +584 -0
  29. datahub/sdk/entity_client.py +115 -0
  30. datahub/sdk/main_client.py +56 -0
  31. datahub/sdk/resolver_client.py +101 -0
  32. {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/LICENSE +0 -0
  33. {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/WHEEL +0 -0
  34. {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/entry_points.txt +0 -0
  35. {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/top_level.txt +0 -0
datahub/sdk/dataset.py ADDED
@@ -0,0 +1,584 @@
+ from __future__ import annotations
+
+ import warnings
+ from datetime import datetime
+ from typing import Dict, List, Optional, Tuple, Type, Union
+
+ from typing_extensions import Self, TypeAlias, assert_never
+
+ import datahub.metadata.schema_classes as models
+ from datahub.cli.cli_utils import first_non_null
+ from datahub.emitter.mce_builder import DEFAULT_ENV
+ from datahub.errors import (
+     IngestionAttributionWarning,
+     ItemNotFoundError,
+     SchemaFieldKeyError,
+ )
+ from datahub.ingestion.source.sql.sql_types import resolve_sql_type
+ from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn, Urn
+ from datahub.sdk._attribution import is_ingestion_attribution
+ from datahub.sdk._entity import Entity
+ from datahub.sdk._shared import (
+     ContainerInputType,
+     DatasetUrnOrStr,
+     DomainInputType,
+     HasContainer,
+     HasDomain,
+     HasOwnership,
+     HasPlatformInstance,
+     HasSubtype,
+     HasTags,
+     HasTerms,
+     OwnersInputType,
+     TagsInputType,
+     TermsInputType,
+     make_time_stamp,
+     parse_time_stamp,
+ )
+
+ SchemaFieldInputType: TypeAlias = Union[
+     str,
+     Tuple[str, str],  # (name, type)
+     Tuple[str, str, str],  # (name, type, description)
+     models.SchemaFieldClass,
+ ]
+ SchemaFieldsInputType: TypeAlias = Union[
+     List[SchemaFieldInputType],
+     models.SchemaMetadataClass,
+ ]
+
+ UpstreamInputType: TypeAlias = Union[
+     # Dataset upstream variants.
+     DatasetUrnOrStr,
+     models.UpstreamClass,
+     # Column upstream variants.
+     models.FineGrainedLineageClass,
+ ]
+ # Mapping of { downstream_column -> [upstream_columns] }
+ ColumnLineageMapping: TypeAlias = Dict[str, List[str]]
+ UpstreamLineageInputType: TypeAlias = Union[
+     models.UpstreamLineageClass,
+     List[UpstreamInputType],
+     # Combined variant.
+     # Map of { upstream_dataset -> { downstream_column -> [upstream_column] } }
+     Dict[DatasetUrnOrStr, ColumnLineageMapping],
+ ]
+
+
+ def _parse_upstream_input(
+     upstream_input: UpstreamInputType,
+ ) -> Union[models.UpstreamClass, models.FineGrainedLineageClass]:
+     if isinstance(upstream_input, models.UpstreamClass):
+         return upstream_input
+     elif isinstance(upstream_input, models.FineGrainedLineageClass):
+         return upstream_input
+     elif isinstance(upstream_input, (str, DatasetUrn)):
+         return models.UpstreamClass(
+             dataset=str(upstream_input),
+             type=models.DatasetLineageTypeClass.TRANSFORMED,
+         )
+     else:
+         assert_never(upstream_input)
+
+
+ def _parse_cll_mapping(
+     *,
+     upstream: DatasetUrnOrStr,
+     downstream: DatasetUrnOrStr,
+     cll_mapping: ColumnLineageMapping,
+ ) -> List[models.FineGrainedLineageClass]:
+     cll = []
+     for downstream_column, upstream_columns in cll_mapping.items():
+         cll.append(
+             models.FineGrainedLineageClass(
+                 upstreamType=models.FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                 downstreamType=models.FineGrainedLineageDownstreamTypeClass.FIELD,
+                 upstreams=[
+                     SchemaFieldUrn(upstream, upstream_column).urn()
+                     for upstream_column in upstream_columns
+                 ],
+                 downstreams=[SchemaFieldUrn(downstream, downstream_column).urn()],
+             )
+         )
+     return cll
+
+
+ def _parse_upstream_lineage_input(
+     upstream_input: UpstreamLineageInputType, downstream_urn: DatasetUrn
+ ) -> models.UpstreamLineageClass:
+     if isinstance(upstream_input, models.UpstreamLineageClass):
+         return upstream_input
+     elif isinstance(upstream_input, list):
+         upstreams = [_parse_upstream_input(upstream) for upstream in upstream_input]
+
+         # Partition into table and column lineages.
+         tll = [
+             upstream
+             for upstream in upstreams
+             if isinstance(upstream, models.UpstreamClass)
+         ]
+         cll = [
+             upstream
+             for upstream in upstreams
+             if not isinstance(upstream, models.UpstreamClass)
+         ]
+
+         # TODO: check that all things in cll are also in tll
+         return models.UpstreamLineageClass(upstreams=tll, fineGrainedLineages=cll)
+     elif isinstance(upstream_input, dict):
+         tll = []
+         cll = []
+         for dataset_urn, column_lineage in upstream_input.items():
+             tll.append(
+                 models.UpstreamClass(
+                     dataset=str(dataset_urn),
+                     type=models.DatasetLineageTypeClass.TRANSFORMED,
+                 )
+             )
+             cll.extend(
+                 _parse_cll_mapping(
+                     upstream=dataset_urn,
+                     downstream=downstream_urn,
+                     cll_mapping=column_lineage,
+                 )
+             )
+
+         return models.UpstreamLineageClass(upstreams=tll, fineGrainedLineages=cll)
+     else:
+         assert_never(upstream_input)
+
+
+ class SchemaField:
+     __slots__ = ("_parent", "_field_path")
+
+     def __init__(self, parent: Dataset, field_path: str):
+         self._parent = parent
+         self._field_path = field_path
+
+     def _base_schema_field(self) -> models.SchemaFieldClass:
+         # This must exist - if it doesn't, we've got a larger bug.
+         schema_dict = self._parent._schema_dict()
+         return schema_dict[self._field_path]
+
+     def _get_editable_schema_field(
+         self,
+     ) -> Optional[models.EditableSchemaFieldInfoClass]:
+         # This method does not make any mutations.
+         editable_schema = self._parent._get_aspect(models.EditableSchemaMetadataClass)
+         if editable_schema is None:
+             return None
+         for field in editable_schema.editableSchemaFieldInfo:
+             if field.fieldPath == self._field_path:
+                 return field
+         return None
+
+     def _ensure_editable_schema_field(self) -> models.EditableSchemaFieldInfoClass:
+         if is_ingestion_attribution():
+             warnings.warn(
+                 "This method should not be used in ingestion mode.",
+                 IngestionAttributionWarning,
+                 stacklevel=2,
+             )
+         editable_schema = self._parent._setdefault_aspect(
+             models.EditableSchemaMetadataClass(editableSchemaFieldInfo=[])
+         )
+         for field in editable_schema.editableSchemaFieldInfo:
+             if field.fieldPath == self._field_path:
+                 return field
+
+         # If we don't have an entry for this field yet, create one.
+         field = models.EditableSchemaFieldInfoClass(fieldPath=self._field_path)
+         editable_schema.editableSchemaFieldInfo.append(field)
+         return field
+
+     @property
+     def field_path(self) -> str:
+         return self._field_path
+
+     @property
+     def mapped_type(self) -> models.SchemaFieldDataTypeClass:
+         return self._base_schema_field().type
+
+     @property
+     def native_type(self) -> str:
+         return self._base_schema_field().nativeDataType
+
+     # TODO expose nullability and primary/foreign key details
+
+     @property
+     def description(self) -> Optional[str]:
+         editable_field = self._get_editable_schema_field()
+         return first_non_null(
+             [
+                 editable_field.description if editable_field is not None else None,
+                 self._base_schema_field().description,
+             ]
+         )
+
+     def set_description(self, description: str) -> None:
+         if is_ingestion_attribution():
+             editable_field = self._get_editable_schema_field()
+             if editable_field and editable_field.description is not None:
+                 warnings.warn(
+                     "The field description will be hidden by UI-based edits. "
+                     "Change the edit mode to OVERWRITE_UI to override this behavior.",
+                     category=IngestionAttributionWarning,
+                     stacklevel=2,
+                 )
+
+             self._base_schema_field().description = description
+         else:
+             self._ensure_editable_schema_field().description = description
+
+     @property
+     def tags(self) -> Optional[List[models.TagAssociationClass]]:
+         # Tricky: if either has a non-null globalTags, this will not return None.
+         tags = None
+
+         if (base_tags := self._base_schema_field().globalTags) is not None:
+             tags = tags or []
+             tags.extend(base_tags.tags)
+
+         if editable_field := self._get_editable_schema_field():
+             if (editable_tags := editable_field.globalTags) is not None:
+                 tags = tags or []
+                 tags.extend(editable_tags.tags)
+
+         return tags
+
+     def set_tags(self, tags: TagsInputType) -> None:
+         parsed_tags = [self._parent._parse_tag_association_class(tag) for tag in tags]
+
+         if is_ingestion_attribution():
+             editable_field = self._get_editable_schema_field()
+             if editable_field and editable_field.globalTags:
+                 warnings.warn(
+                     "Overwriting non-ingestion tags from ingestion is an anti-pattern.",
+                     category=IngestionAttributionWarning,
+                     stacklevel=2,
+                 )
+                 editable_field.globalTags = None
+
+             self._base_schema_field().globalTags = models.GlobalTagsClass(
+                 tags=parsed_tags
+             )
+         else:
+             base_field = self._base_schema_field()
+             if base_field.globalTags:
+                 base_field.globalTags = None
+
+             self._ensure_editable_schema_field().globalTags = models.GlobalTagsClass(
+                 tags=parsed_tags
+             )
+
+     @property
+     def terms(self) -> Optional[List[models.GlossaryTermAssociationClass]]:
+         # TODO: Basically the same implementation as tags - can we share code?
+         terms = None
+
+         if (base_terms := self._base_schema_field().glossaryTerms) is not None:
+             terms = terms or []
+             terms.extend(base_terms.terms)
+
+         if editable_field := self._get_editable_schema_field():
+             if (editable_terms := editable_field.glossaryTerms) is not None:
+                 terms = terms or []
+                 terms.extend(editable_terms.terms)
+
+         return terms
+
+     def set_terms(self, terms: List[models.GlossaryTermAssociationClass]) -> None:
+         parsed_terms = [
+             self._parent._parse_glossary_term_association_class(term) for term in terms
+         ]
+
+         if is_ingestion_attribution():
+             editable_field = self._get_editable_schema_field()
+             if editable_field and editable_field.glossaryTerms:
+                 warnings.warn(
+                     "Overwriting non-ingestion terms from ingestion is an anti-pattern.",
+                     category=IngestionAttributionWarning,
+                     stacklevel=2,
+                 )
+                 editable_field.glossaryTerms = None
+
+             self._base_schema_field().glossaryTerms = models.GlossaryTermsClass(
+                 terms=parsed_terms,
+                 auditStamp=self._parent._terms_audit_stamp(),
+             )
+         else:
+             base_field = self._base_schema_field()
+             if base_field.glossaryTerms:
+                 base_field.glossaryTerms = None
+
+             self._ensure_editable_schema_field().glossaryTerms = (
+                 models.GlossaryTermsClass(
+                     terms=parsed_terms,
+                     auditStamp=self._parent._terms_audit_stamp(),
+                 )
+             )
+
+
+ class Dataset(
+     HasPlatformInstance,
+     HasSubtype,
+     HasContainer,
+     HasOwnership,
+     HasTags,
+     HasTerms,
+     HasDomain,
+     Entity,
+ ):
+     __slots__ = ()
+
+     @classmethod
+     def get_urn_type(cls) -> Type[DatasetUrn]:
+         return DatasetUrn
+
+     def __init__(
+         self,
+         *,
+         # Identity.
+         platform: str,
+         name: str,
+         platform_instance: Optional[str] = None,
+         env: str = DEFAULT_ENV,
+         # Dataset properties.
+         description: Optional[str] = None,
+         display_name: Optional[str] = None,
+         qualified_name: Optional[str] = None,
+         external_url: Optional[str] = None,
+         custom_properties: Optional[Dict[str, str]] = None,
+         created: Optional[datetime] = None,
+         last_modified: Optional[datetime] = None,
+         # Standard aspects.
+         subtype: Optional[str] = None,
+         container: Optional[ContainerInputType] = None,
+         owners: Optional[OwnersInputType] = None,
+         tags: Optional[TagsInputType] = None,
+         terms: Optional[TermsInputType] = None,
+         # TODO structured_properties
+         domain: Optional[DomainInputType] = None,
+         # Dataset-specific aspects.
+         schema: Optional[SchemaFieldsInputType] = None,
+         upstreams: Optional[models.UpstreamLineageClass] = None,
+     ):
+         urn = DatasetUrn.create_from_ids(
+             platform_id=platform,
+             table_name=name,
+             platform_instance=platform_instance,
+             env=env,
+         )
+         super().__init__(urn)
+
+         self._set_platform_instance(urn.platform, platform_instance)
+
+         if schema is not None:
+             self._set_schema(schema)
+         if upstreams is not None:
+             self.set_upstreams(upstreams)
+
+         if description is not None:
+             self.set_description(description)
+         if display_name is not None:
+             self.set_display_name(display_name)
+         if qualified_name is not None:
+             self.set_qualified_name(qualified_name)
+         if external_url is not None:
+             self.set_external_url(external_url)
+         if custom_properties is not None:
+             self.set_custom_properties(custom_properties)
+         if created is not None:
+             self.set_created(created)
+         if last_modified is not None:
+             self.set_last_modified(last_modified)
+
+         if subtype is not None:
+             self.set_subtype(subtype)
+         if container is not None:
+             self._set_container(container)
+         if owners is not None:
+             self.set_owners(owners)
+         if tags is not None:
+             self.set_tags(tags)
+         if terms is not None:
+             self.set_terms(terms)
+         if domain is not None:
+             self.set_domain(domain)
+
+     @classmethod
+     def _new_from_graph(cls, urn: Urn, current_aspects: models.AspectBag) -> Self:
+         assert isinstance(urn, DatasetUrn)
+         entity = cls(
+             platform=urn.platform,
+             name=urn.name,
+             env=urn.env,
+         )
+         return entity._init_from_graph(current_aspects)
+
+     @property
+     def urn(self) -> DatasetUrn:
+         return self._urn  # type: ignore
+
+     def _ensure_dataset_props(self) -> models.DatasetPropertiesClass:
+         return self._setdefault_aspect(models.DatasetPropertiesClass())
+
+     def _get_editable_props(self) -> Optional[models.EditableDatasetPropertiesClass]:
+         return self._get_aspect(models.EditableDatasetPropertiesClass)
+
+     def _ensure_editable_props(self) -> models.EditableDatasetPropertiesClass:
+         # Note that most of the fields in this aspect are not used.
+         # The only one that's relevant for us is the description.
+         return self._setdefault_aspect(models.EditableDatasetPropertiesClass())
+
+     @property
+     def description(self) -> Optional[str]:
+         editable_props = self._get_editable_props()
+         return first_non_null(
+             [
+                 editable_props.description if editable_props is not None else None,
+                 self._ensure_dataset_props().description,
+             ]
+         )
+
+     def set_description(self, description: str) -> None:
+         if is_ingestion_attribution():
+             editable_props = self._get_editable_props()
+             if editable_props is not None and editable_props.description is not None:
+                 warnings.warn(
+                     "Overwriting non-ingestion description from ingestion is an anti-pattern.",
+                     category=IngestionAttributionWarning,
+                     stacklevel=2,
+                 )
+                 # Force the ingestion description to show up.
+                 editable_props.description = None
+
+             self._ensure_dataset_props().description = description
+         else:
+             self._ensure_editable_props().description = description
+
+     @property
+     def display_name(self) -> Optional[str]:
+         return self._ensure_dataset_props().name
+
+     def set_display_name(self, display_name: str) -> None:
+         self._ensure_dataset_props().name = display_name
+
+     @property
+     def qualified_name(self) -> Optional[str]:
+         return self._ensure_dataset_props().qualifiedName
+
+     def set_qualified_name(self, qualified_name: str) -> None:
+         self._ensure_dataset_props().qualifiedName = qualified_name
+
+     @property
+     def external_url(self) -> Optional[str]:
+         return self._ensure_dataset_props().externalUrl
+
+     def set_external_url(self, external_url: str) -> None:
+         self._ensure_dataset_props().externalUrl = external_url
+
+     @property
+     def custom_properties(self) -> Dict[str, str]:
+         return self._ensure_dataset_props().customProperties
+
+     def set_custom_properties(self, custom_properties: Dict[str, str]) -> None:
+         self._ensure_dataset_props().customProperties = custom_properties
+
+     @property
+     def created(self) -> Optional[datetime]:
+         return parse_time_stamp(self._ensure_dataset_props().created)
+
+     def set_created(self, created: datetime) -> None:
+         self._ensure_dataset_props().created = make_time_stamp(created)
+
+     @property
+     def last_modified(self) -> Optional[datetime]:
+         return parse_time_stamp(self._ensure_dataset_props().lastModified)
+
+     def set_last_modified(self, last_modified: datetime) -> None:
+         self._ensure_dataset_props().lastModified = make_time_stamp(last_modified)
+
+     def _schema_dict(self) -> Dict[str, models.SchemaFieldClass]:
+         schema_metadata = self._get_aspect(models.SchemaMetadataClass)
+         if schema_metadata is None:
+             raise ItemNotFoundError(f"Schema is not set for dataset {self.urn}")
+         return {field.fieldPath: field for field in schema_metadata.fields}
+
+     @property
+     def schema(self) -> List[SchemaField]:
+         # TODO: Add some caching here to avoid iterating over the schema every time.
+         schema_dict = self._schema_dict()
+         return [SchemaField(self, field_path) for field_path in schema_dict]
+
+     def _parse_schema_field_input(
+         self, schema_field_input: SchemaFieldInputType
+     ) -> models.SchemaFieldClass:
+         if isinstance(schema_field_input, models.SchemaFieldClass):
+             return schema_field_input
+         elif isinstance(schema_field_input, tuple):
+             # Support (name, type) and (name, type, description) forms
+             if len(schema_field_input) == 2:
+                 name, field_type = schema_field_input
+                 description = None
+             elif len(schema_field_input) == 3:
+                 name, field_type, description = schema_field_input
+             else:
+                 assert_never(schema_field_input)
+             return models.SchemaFieldClass(
+                 fieldPath=name,
+                 type=models.SchemaFieldDataTypeClass(
+                     resolve_sql_type(
+                         field_type,
+                         platform=self.urn.get_data_platform_urn().platform_name,
+                     )
+                     or models.NullTypeClass()
+                 ),
+                 nativeDataType=field_type,
+                 description=description,
+             )
+         elif isinstance(schema_field_input, str):
+             # TODO: Not sure this branch makes sense - we should probably just require types?
+             return models.SchemaFieldClass(
+                 fieldPath=schema_field_input,
+                 type=models.SchemaFieldDataTypeClass(models.NullTypeClass()),
+                 nativeDataType="unknown",
+                 description=None,
+             )
+         else:
+             assert_never(schema_field_input)
+
+     def _set_schema(self, schema: SchemaFieldsInputType) -> None:
+         # This method is not public. Ingestion/restatement users should be setting
+         # the schema via the constructor. SDK users that got a dataset from the graph
+         # probably shouldn't be adding/removing fields ad-hoc. The field-level mutators
+         # can be used instead.
+         if isinstance(schema, models.SchemaMetadataClass):
+             self._set_aspect(schema)
+         else:
+             parsed_schema = [self._parse_schema_field_input(field) for field in schema]
+             self._set_aspect(
+                 models.SchemaMetadataClass(
+                     fields=parsed_schema,
+                     # The rest of these fields are not used, and so we can set them to dummy/default values.
+                     schemaName="",
+                     platform=self.urn.platform,
+                     version=0,
+                     hash="",
+                     platformSchema=models.SchemalessClass(),
+                 )
+             )
+
+     def __getitem__(self, field_path: str) -> SchemaField:
+         # TODO: Automatically deal with field path v2?
+         schema_dict = self._schema_dict()
+         if field_path not in schema_dict:
+             raise SchemaFieldKeyError(f"Field {field_path} not found in schema")
+         return SchemaField(self, field_path)
+
+     @property
+     def upstreams(self) -> Optional[models.UpstreamLineageClass]:
+         return self._get_aspect(models.UpstreamLineageClass)
+
+     def set_upstreams(self, upstreams: UpstreamLineageInputType) -> None:
+         self._set_aspect(_parse_upstream_lineage_input(upstreams, self.urn))
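The Dataset class above gives the new SDK a declarative way to describe a dataset: schema fields can be passed as (name, type) or (name, type, description) tuples per SchemaFieldInputType, and upstream lineage accepts either aspect objects or the combined dict form described by UpstreamLineageInputType. Below is a minimal sketch of how this might be used; the platform, table names, and upstream URN are illustrative placeholders. Note that the constructor annotates upstreams as UpstreamLineageClass, but it forwards to set_upstreams(), which also accepts the dict form at runtime.

from datahub.sdk.dataset import Dataset

orders = Dataset(
    platform="snowflake",  # illustrative platform
    name="analytics.public.orders",  # illustrative table name
    description="Orders fact table",
    subtype="Table",
    # SchemaFieldInputType tuples: (name, type) or (name, type, description).
    schema=[
        ("order_id", "NUMBER", "Primary key"),
        ("amount", "NUMBER"),
        ("created_at", "TIMESTAMP_NTZ"),
    ],
    # Combined lineage form: { upstream_dataset -> { downstream_column -> [upstream_columns] } }.
    upstreams={
        "urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics.staging.orders,PROD)": {
            "order_id": ["id"],
            "amount": ["amount_usd"],
        }
    },
)

# Fields are addressable by field path; mutators write to the editable aspect
# unless ingestion attribution is active.
orders["amount"].set_description("Order amount in USD")
print([field.field_path for field in orders.schema])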
datahub/sdk/entity_client.py ADDED
@@ -0,0 +1,115 @@
+ from __future__ import annotations
+
+ import warnings
+ from typing import TYPE_CHECKING, Union, overload
+
+ import datahub.metadata.schema_classes as models
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
+ from datahub.errors import IngestionAttributionWarning, ItemNotFoundError, SdkUsageError
+ from datahub.ingestion.graph.client import DataHubGraph
+ from datahub.metadata.urns import (
+     ContainerUrn,
+     DatasetUrn,
+     Urn,
+ )
+ from datahub.sdk._all_entities import ENTITY_CLASSES
+ from datahub.sdk._entity import Entity
+ from datahub.sdk._shared import UrnOrStr
+ from datahub.sdk.container import Container
+ from datahub.sdk.dataset import Dataset
+
+ if TYPE_CHECKING:
+     from datahub.sdk.main_client import DataHubClient
+
+
+ class EntityClient:
+     def __init__(self, client: DataHubClient):
+         self._client = client
+
+     # TODO: Make all of these methods sync by default.
+
+     @property
+     def _graph(self) -> DataHubGraph:
+         return self._client._graph
+
+     @overload
+     def get(self, urn: ContainerUrn) -> Container: ...
+     @overload
+     def get(self, urn: DatasetUrn) -> Dataset: ...
+     @overload
+     def get(self, urn: Union[Urn, str]) -> Entity: ...
+     def get(self, urn: UrnOrStr) -> Entity:
+         if not isinstance(urn, Urn):
+             urn = Urn.from_string(urn)
+
+         # TODO: add error handling around this with a suggested alternative if not yet supported
+         EntityClass = ENTITY_CLASSES[urn.entity_type]
+
+         if not self._graph.exists(str(urn)):
+             raise ItemNotFoundError(f"Entity {urn} not found")
+
+         aspects = self._graph.get_entity_semityped(str(urn))
+
+         # TODO: save the timestamp so we can use If-Unmodified-Since on the updates
+         return EntityClass._new_from_graph(urn, aspects)
+
+     def create(self, entity: Entity) -> None:
+         mcps = []
+
+         if self._graph.exists(str(entity.urn)):
+             raise SdkUsageError(
+                 f"Entity {entity.urn} already exists. Use client.entities.upsert() to update it."
+             )
+
+         # Extra safety check: by putting this first, we can ensure that
+         # the request fails if the entity already exists.
+         mcps.append(
+             MetadataChangeProposalWrapper(
+                 entityUrn=str(entity.urn),
+                 aspect=entity.urn.to_key_aspect(),
+                 changeType=models.ChangeTypeClass.CREATE_ENTITY,
+             )
+         )
+         mcps.extend(entity._as_mcps(models.ChangeTypeClass.CREATE))
+
+         self._graph.emit_mcps(mcps)
+
+     def upsert(self, entity: Entity) -> None:
+         if entity._prev_aspects is None and self._graph.exists(str(entity.urn)):
+             warnings.warn(
+                 f"The entity {entity.urn} already exists. This operation will partially overwrite the existing entity.",
+                 IngestionAttributionWarning,
+                 stacklevel=2,
+             )
+             # TODO: If there are no previous aspects but the entity exists, should we delete aspects that are not present here?
+
+         mcps = entity._as_mcps(models.ChangeTypeClass.UPSERT)
+         self._graph.emit_mcps(mcps)
+
+     def update(self, entity: Union[Entity, MetadataPatchProposal]) -> None:
+         if isinstance(entity, MetadataPatchProposal):
+             return self._update_patch(entity)
+
+         if entity._prev_aspects is None:
+             raise SdkUsageError(
+                 f"For entities created via {entity.__class__.__name__}(...), use client.entities.create() or client.entities.upsert() instead"
+             )
+
+         # TODO: respect If-Unmodified-Since?
+         # -> probably add a "mode" parameter that can be "update" (e.g. if not modified) or "update_force"
+
+         mcps = entity._as_mcps(models.ChangeTypeClass.UPSERT)
+         self._graph.emit_mcps(mcps)
+
+     def _update_patch(
+         self, updater: MetadataPatchProposal, check_exists: bool = True
+     ) -> None:
+         if check_exists and not self._graph.exists(updater.urn):
+             raise SdkUsageError(
+                 f"Entity {updater.urn} does not exist, and hence cannot be updated. "
+                 "You can bypass this check by setting check_exists=False."
+             )
+
+         mcps = updater.build()
+         self._graph.emit_mcps(mcps)
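EntityClient makes the read/write semantics explicit: create() raises SdkUsageError when the URN already exists and also emits a CREATE_ENTITY key aspect first as a server-side safety check, upsert() warns before partially overwriting a pre-existing entity, and update() only accepts entities that were read back from the graph (or a MetadataPatchProposal). A hedged sketch of that lifecycle follows; it assumes the client is reached through DataHubClient from main_client.py, whose constructor is not shown in this diff (the construction below is a placeholder), and that it exposes the client.entities property referenced in the error messages above.

from datahub.errors import SdkUsageError
from datahub.sdk.dataset import Dataset
from datahub.sdk.main_client import DataHubClient

client = DataHubClient(...)  # constructor arguments live in main_client.py, not shown in this diff

local = Dataset(platform="snowflake", name="analytics.public.orders")  # illustrative names

try:
    # Raises SdkUsageError if the URN already exists; the CREATE_ENTITY key aspect
    # provides an additional server-side guard.
    client.entities.create(local)
except SdkUsageError:
    # Partial overwrite; warns when the entity already exists server-side.
    client.entities.upsert(local)

# update() requires an entity that carries _prev_aspects, i.e. one fetched via get().
fetched = client.entities.get(local.urn)
fetched.set_description("Orders fact table")
client.entities.update(fetched)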