acryl-datahub 1.0.0rc8__py3-none-any.whl → 1.0.0rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub has been flagged as potentially problematic; see the package's registry page for details.

Files changed (46)
  1. {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc9.dist-info}/METADATA +2445 -2445
  2. {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc9.dist-info}/RECORD +46 -42
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataset/dataset.py +731 -42
  5. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  6. datahub/cli/specific/dataset_cli.py +128 -14
  7. datahub/ingestion/graph/client.py +15 -11
  8. datahub/ingestion/graph/filters.py +64 -37
  9. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  10. datahub/ingestion/source/preset.py +7 -4
  11. datahub/ingestion/source/superset.py +158 -24
  12. datahub/metadata/_schema_classes.py +157 -14
  13. datahub/metadata/_urns/urn_defs.py +58 -58
  14. datahub/metadata/schema.avsc +23 -10
  15. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  16. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  17. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  18. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  19. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  20. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  21. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  22. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  23. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
  24. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  25. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  26. datahub/metadata/schemas/PostKey.avsc +2 -1
  27. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  28. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  29. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  30. datahub/pydantic/__init__.py +0 -0
  31. datahub/pydantic/compat.py +58 -0
  32. datahub/sdk/__init__.py +1 -0
  33. datahub/sdk/_all_entities.py +1 -1
  34. datahub/sdk/_shared.py +88 -3
  35. datahub/sdk/container.py +7 -1
  36. datahub/sdk/dataset.py +7 -1
  37. datahub/sdk/{_entity.py → entity.py} +4 -0
  38. datahub/sdk/entity_client.py +1 -1
  39. datahub/sdk/main_client.py +7 -1
  40. datahub/sdk/resolver_client.py +17 -29
  41. datahub/sdk/search_client.py +50 -0
  42. datahub/sdk/search_filters.py +374 -0
  43. {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc9.dist-info}/LICENSE +0 -0
  44. {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc9.dist-info}/WHEEL +0 -0
  45. {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc9.dist-info}/entry_points.txt +0 -0
  46. {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc9.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,6 @@
1
1
  import json
2
2
  import logging
3
+ from dataclasses import dataclass, field
3
4
  from datetime import datetime
4
5
  from functools import lru_cache
5
6
  from typing import Any, Dict, Iterable, List, Optional
@@ -22,6 +23,7 @@ from datahub.emitter.mce_builder import (
22
23
  make_dataset_urn,
23
24
  make_dataset_urn_with_platform_instance,
24
25
  make_domain_urn,
26
+ make_user_urn,
25
27
  )
26
28
  from datahub.emitter.mcp_builder import add_domain_to_entity_wu
27
29
  from datahub.ingestion.api.common import PipelineContext
@@ -46,7 +48,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
46
48
  StatefulIngestionSourceBase,
47
49
  )
48
50
  from datahub.metadata.com.linkedin.pegasus2avro.common import (
49
- AuditStamp,
50
51
  ChangeAuditStamps,
51
52
  Status,
52
53
  TimeStamp,
@@ -65,17 +66,22 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
65
66
  SchemaMetadata,
66
67
  )
67
68
  from datahub.metadata.schema_classes import (
69
+ AuditStampClass,
68
70
  ChartInfoClass,
69
71
  ChartTypeClass,
70
72
  DashboardInfoClass,
71
73
  DatasetLineageTypeClass,
72
74
  DatasetPropertiesClass,
73
75
  GlobalTagsClass,
76
+ OwnerClass,
77
+ OwnershipClass,
78
+ OwnershipTypeClass,
74
79
  TagAssociationClass,
75
80
  UpstreamClass,
76
81
  UpstreamLineageClass,
77
82
  )
78
83
  from datahub.utilities import config_clean
84
+ from datahub.utilities.lossy_collections import LossyList
79
85
  from datahub.utilities.registries.domain_registry import DomainRegistry
80
86
 
81
87
  logger = logging.getLogger(__name__)
@@ -103,6 +109,14 @@ chart_type_from_viz_type = {
103
109
  platform_without_databases = ["druid"]
104
110
 
105
111
 
112
+ @dataclass
113
+ class SupersetSourceReport(StaleEntityRemovalSourceReport):
114
+ filtered: LossyList[str] = field(default_factory=LossyList)
115
+
116
+ def report_dropped(self, name: str) -> None:
117
+ self.filtered.append(name)
118
+
119
+
106
120
  class SupersetDataset(BaseModel):
107
121
  id: int
108
122
  table_name: str
@@ -138,6 +152,18 @@ class SupersetConfig(
138
152
  default=dict(),
139
153
  description="regex patterns for tables to filter to assign domain_key. ",
140
154
  )
155
+ dataset_pattern: AllowDenyPattern = Field(
156
+ default=AllowDenyPattern.allow_all(),
157
+ description="Regex patterns for dataset to filter in ingestion.",
158
+ )
159
+ chart_pattern: AllowDenyPattern = Field(
160
+ AllowDenyPattern.allow_all(),
161
+ description="Patterns for selecting chart names that are to be included",
162
+ )
163
+ dashboard_pattern: AllowDenyPattern = Field(
164
+ AllowDenyPattern.allow_all(),
165
+ description="Patterns for selecting dashboard names that are to be included",
166
+ )
141
167
  username: Optional[str] = Field(default=None, description="Superset username.")
142
168
  password: Optional[str] = Field(default=None, description="Superset password.")
143
169
  # Configuration for stateful ingestion
@@ -218,7 +244,7 @@ class SupersetSource(StatefulIngestionSourceBase):
218
244
  """
219
245
 
220
246
  config: SupersetConfig
221
- report: StaleEntityRemovalSourceReport
247
+ report: SupersetSourceReport
222
248
  platform = "superset"
223
249
 
224
250
  def __hash__(self):
@@ -227,13 +253,14 @@ class SupersetSource(StatefulIngestionSourceBase):
227
253
  def __init__(self, ctx: PipelineContext, config: SupersetConfig):
228
254
  super().__init__(config, ctx)
229
255
  self.config = config
230
- self.report = StaleEntityRemovalSourceReport()
256
+ self.report = SupersetSourceReport()
231
257
  if self.config.domain:
232
258
  self.domain_registry = DomainRegistry(
233
259
  cached_domains=[domain_id for domain_id in self.config.domain],
234
260
  graph=self.ctx.graph,
235
261
  )
236
262
  self.session = self.login()
263
+ self.owner_info = self.parse_owner_info()
237
264
 
238
265
  def login(self) -> requests.Session:
239
266
  login_response = requests.post(
@@ -273,7 +300,7 @@ class SupersetSource(StatefulIngestionSourceBase):
273
300
 
274
301
  while current_page * page_size < total_items:
275
302
  response = self.session.get(
276
- f"{self.config.connect_uri}/api/v1/{entity_type}/",
303
+ f"{self.config.connect_uri}/api/v1/{entity_type}",
277
304
  params={"q": f"(page:{current_page},page_size:{page_size})"},
278
305
  )
279
306
 
@@ -289,6 +316,25 @@ class SupersetSource(StatefulIngestionSourceBase):
289
316
 
290
317
  current_page += 1
291
318
 
319
+ def parse_owner_info(self) -> Dict[str, Any]:
320
+ entity_types = ["dataset", "dashboard", "chart"]
321
+ owners_info = {}
322
+
323
+ for entity in entity_types:
324
+ for owner in self.paginate_entity_api_results(f"{entity}/related/owners"):
325
+ owner_id = owner.get("value")
326
+ if owner_id:
327
+ owners_info[owner_id] = owner.get("extra", {}).get("email", "")
328
+
329
+ return owners_info
330
+
331
+ def build_owner_urn(self, data: Dict[str, Any]) -> List[str]:
332
+ return [
333
+ make_user_urn(self.owner_info.get(owner.get("id"), ""))
334
+ for owner in data.get("owners", [])
335
+ if owner.get("id")
336
+ ]
337
+
292
338
  @lru_cache(maxsize=None)
293
339
  def get_dataset_info(self, dataset_id: int) -> dict:
294
340
  dataset_response = self.session.get(
@@ -346,15 +392,16 @@ class SupersetSource(StatefulIngestionSourceBase):
346
392
  aspects=[Status(removed=False)],
347
393
  )
348
394
 
349
- modified_actor = f"urn:li:corpuser:{(dashboard_data.get('changed_by') or {}).get('username', 'unknown')}"
395
+ modified_actor = f"urn:li:corpuser:{self.owner_info.get((dashboard_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
350
396
  modified_ts = int(
351
397
  dp.parse(dashboard_data.get("changed_on_utc", "now")).timestamp() * 1000
352
398
  )
353
399
  title = dashboard_data.get("dashboard_title", "")
354
400
  # note: the API does not currently supply created_by usernames due to a bug
355
- last_modified = ChangeAuditStamps(
356
- created=None,
357
- lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
401
+ last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
402
+
403
+ change_audit_stamps = ChangeAuditStamps(
404
+ created=None, lastModified=last_modified
358
405
  )
359
406
  dashboard_url = f"{self.config.display_uri}{dashboard_data.get('url', '')}"
360
407
 
@@ -380,7 +427,7 @@ class SupersetSource(StatefulIngestionSourceBase):
380
427
  "IsPublished": str(dashboard_data.get("published", False)).lower(),
381
428
  "Owners": ", ".join(
382
429
  map(
383
- lambda owner: owner.get("username", "unknown"),
430
+ lambda owner: self.owner_info.get(owner.get("id", -1), "unknown"),
384
431
  dashboard_data.get("owners", []),
385
432
  )
386
433
  ),
@@ -400,16 +447,39 @@ class SupersetSource(StatefulIngestionSourceBase):
400
447
  description="",
401
448
  title=title,
402
449
  charts=chart_urns,
403
- lastModified=last_modified,
404
450
  dashboardUrl=dashboard_url,
405
451
  customProperties=custom_properties,
452
+ lastModified=change_audit_stamps,
406
453
  )
407
454
  dashboard_snapshot.aspects.append(dashboard_info)
455
+
456
+ dashboard_owners_list = self.build_owner_urn(dashboard_data)
457
+ owners_info = OwnershipClass(
458
+ owners=[
459
+ OwnerClass(
460
+ owner=urn,
461
+ type=OwnershipTypeClass.TECHNICAL_OWNER,
462
+ )
463
+ for urn in (dashboard_owners_list or [])
464
+ ],
465
+ lastModified=last_modified,
466
+ )
467
+ dashboard_snapshot.aspects.append(owners_info)
468
+
408
469
  return dashboard_snapshot
409
470
 
410
471
  def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
411
- for dashboard_data in self.paginate_entity_api_results("dashboard", PAGE_SIZE):
472
+ for dashboard_data in self.paginate_entity_api_results("dashboard/", PAGE_SIZE):
412
473
  try:
474
+ dashboard_id = str(dashboard_data.get("id"))
475
+ dashboard_title = dashboard_data.get("dashboard_title", "")
476
+
477
+ if not self.config.dashboard_pattern.allowed(dashboard_title):
478
+ self.report.report_dropped(
479
+ f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
480
+ )
481
+ continue
482
+
413
483
  dashboard_snapshot = self.construct_dashboard_from_api_data(
414
484
  dashboard_data
415
485
  )
@@ -422,7 +492,7 @@ class SupersetSource(StatefulIngestionSourceBase):
422
492
  mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
423
493
  yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
424
494
  yield from self._get_domain_wu(
425
- title=dashboard_data.get("dashboard_title", ""),
495
+ title=dashboard_title,
426
496
  entity_urn=dashboard_snapshot.urn,
427
497
  )
428
498
 
@@ -437,17 +507,19 @@ class SupersetSource(StatefulIngestionSourceBase):
437
507
  aspects=[Status(removed=False)],
438
508
  )
439
509
 
440
- modified_actor = f"urn:li:corpuser:{(chart_data.get('changed_by') or {}).get('username', 'unknown')}"
510
+ modified_actor = f"urn:li:corpuser:{self.owner_info.get((chart_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
441
511
  modified_ts = int(
442
512
  dp.parse(chart_data.get("changed_on_utc", "now")).timestamp() * 1000
443
513
  )
444
514
  title = chart_data.get("slice_name", "")
445
515
 
446
516
  # note: the API does not currently supply created_by usernames due to a bug
447
- last_modified = ChangeAuditStamps(
448
- created=None,
449
- lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
517
+ last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
518
+
519
+ change_audit_stamps = ChangeAuditStamps(
520
+ created=None, lastModified=last_modified
450
521
  )
522
+
451
523
  chart_type = chart_type_from_viz_type.get(chart_data.get("viz_type", ""))
452
524
  chart_url = f"{self.config.display_uri}{chart_data.get('url', '')}"
453
525
 
@@ -504,23 +576,61 @@ class SupersetSource(StatefulIngestionSourceBase):
504
576
  type=chart_type,
505
577
  description="",
506
578
  title=title,
507
- lastModified=last_modified,
508
579
  chartUrl=chart_url,
509
580
  inputs=[datasource_urn] if datasource_urn else None,
510
581
  customProperties=custom_properties,
582
+ lastModified=change_audit_stamps,
511
583
  )
512
584
  chart_snapshot.aspects.append(chart_info)
585
+
586
+ chart_owners_list = self.build_owner_urn(chart_data)
587
+ owners_info = OwnershipClass(
588
+ owners=[
589
+ OwnerClass(
590
+ owner=urn,
591
+ type=OwnershipTypeClass.TECHNICAL_OWNER,
592
+ )
593
+ for urn in (chart_owners_list or [])
594
+ ],
595
+ lastModified=last_modified,
596
+ )
597
+ chart_snapshot.aspects.append(owners_info)
513
598
  return chart_snapshot
514
599
 
515
600
  def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
516
- for chart_data in self.paginate_entity_api_results("chart", PAGE_SIZE):
601
+ for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE):
517
602
  try:
603
+ chart_id = str(chart_data.get("id"))
604
+ chart_name = chart_data.get("slice_name", "")
605
+
606
+ if not self.config.chart_pattern.allowed(chart_name):
607
+ self.report.report_dropped(
608
+ f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
609
+ )
610
+ continue
611
+
612
+ # Emit a warning if charts use data from a dataset that will be filtered out
613
+ if self.config.dataset_pattern != AllowDenyPattern.allow_all():
614
+ datasource_id = chart_data.get("datasource_id")
615
+ if datasource_id:
616
+ dataset_response = self.get_dataset_info(datasource_id)
617
+ dataset_name = dataset_response.get("result", {}).get(
618
+ "table_name", ""
619
+ )
620
+
621
+ if dataset_name and not self.config.dataset_pattern.allowed(
622
+ dataset_name
623
+ ):
624
+ self.report.warning(
625
+ f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
626
+ )
627
+
518
628
  chart_snapshot = self.construct_chart_from_chart_data(chart_data)
519
629
 
520
630
  mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
521
631
  except Exception as e:
522
632
  self.report.warning(
523
- f"Failed to construct chart snapshot. Chart name: {chart_data.get('table_name')}. Error: \n{e}"
633
+ f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
524
634
  )
525
635
  continue
526
636
  # Emit the chart
@@ -583,6 +693,12 @@ class SupersetSource(StatefulIngestionSourceBase):
583
693
  )
584
694
  dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"
585
695
 
696
+ modified_actor = f"urn:li:corpuser:{self.owner_info.get((dataset_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
697
+ modified_ts = int(
698
+ dp.parse(dataset_data.get("changed_on_utc", "now")).timestamp() * 1000
699
+ )
700
+ last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
701
+
586
702
  upstream_warehouse_platform = (
587
703
  dataset_response.get("result", {}).get("database", {}).get("backend")
588
704
  )
@@ -618,10 +734,8 @@ class SupersetSource(StatefulIngestionSourceBase):
618
734
  dataset_info = DatasetPropertiesClass(
619
735
  name=dataset.table_name,
620
736
  description="",
621
- lastModified=(
622
- TimeStamp(time=dataset.modified_ts) if dataset.modified_ts else None
623
- ),
624
737
  externalUrl=dataset_url,
738
+ lastModified=TimeStamp(time=modified_ts),
625
739
  )
626
740
  global_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
627
741
 
@@ -640,13 +754,33 @@ class SupersetSource(StatefulIngestionSourceBase):
640
754
  aspects=aspects_items,
641
755
  )
642
756
 
643
- logger.info(f"Constructed dataset {datasource_urn}")
757
+ dataset_owners_list = self.build_owner_urn(dataset_data)
758
+ owners_info = OwnershipClass(
759
+ owners=[
760
+ OwnerClass(
761
+ owner=urn,
762
+ type=OwnershipTypeClass.TECHNICAL_OWNER,
763
+ )
764
+ for urn in (dataset_owners_list or [])
765
+ ],
766
+ lastModified=last_modified,
767
+ )
768
+ aspects_items.append(owners_info)
644
769
 
645
770
  return dataset_snapshot
646
771
 
647
772
  def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
648
- for dataset_data in self.paginate_entity_api_results("dataset", PAGE_SIZE):
773
+ for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE):
649
774
  try:
775
+ dataset_name = dataset_data.get("table_name", "")
776
+
777
+ # Check if dataset should be filtered by dataset name
778
+ if not self.config.dataset_pattern.allowed(dataset_name):
779
+ self.report.report_dropped(
780
+ f"Dataset '{dataset_name}' filtered by dataset_pattern"
781
+ )
782
+ continue
783
+
650
784
  dataset_snapshot = self.construct_dataset_from_dataset_data(
651
785
  dataset_data
652
786
  )