acryl-datahub 1.2.0.10rc1__py3-none-any.whl → 1.2.0.10rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (29)
  1. {acryl_datahub-1.2.0.10rc1.dist-info → acryl_datahub-1.2.0.10rc3.dist-info}/METADATA +2616 -2616
  2. {acryl_datahub-1.2.0.10rc1.dist-info → acryl_datahub-1.2.0.10rc3.dist-info}/RECORD +29 -29
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/autogenerated/capability_summary.json +12 -0
  5. datahub/ingestion/source/dbt/dbt_common.py +65 -5
  6. datahub/ingestion/source/ge_data_profiler.py +15 -2
  7. datahub/ingestion/source/looker/looker_common.py +75 -74
  8. datahub/ingestion/source/looker/looker_source.py +445 -548
  9. datahub/ingestion/source/looker/lookml_source.py +46 -88
  10. datahub/ingestion/source/redash.py +1 -1
  11. datahub/ingestion/source/superset.py +121 -13
  12. datahub/ingestion/source/tableau/tableau.py +48 -8
  13. datahub/ingestion/source/tableau/tableau_common.py +5 -0
  14. datahub/ingestion/source/tableau/tableau_constant.py +1 -0
  15. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  16. datahub/metadata/_internal_schema_classes.py +202 -2
  17. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  18. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  19. datahub/metadata/schema.avsc +98 -2
  20. datahub/metadata/schemas/GlobalSettingsInfo.avsc +72 -0
  21. datahub/metadata/schemas/InstitutionalMemory.avsc +22 -0
  22. datahub/metadata/schemas/LogicalParent.avsc +2 -1
  23. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
  24. datahub/metadata/schemas/MetadataChangeEvent.avsc +22 -0
  25. datahub/sdk/dashboard.py +0 -2
  26. {acryl_datahub-1.2.0.10rc1.dist-info → acryl_datahub-1.2.0.10rc3.dist-info}/WHEEL +0 -0
  27. {acryl_datahub-1.2.0.10rc1.dist-info → acryl_datahub-1.2.0.10rc3.dist-info}/entry_points.txt +0 -0
  28. {acryl_datahub-1.2.0.10rc1.dist-info → acryl_datahub-1.2.0.10rc3.dist-info}/licenses/LICENSE +0 -0
  29. {acryl_datahub-1.2.0.10rc1.dist-info → acryl_datahub-1.2.0.10rc3.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/looker/lookml_source.py

@@ -4,7 +4,7 @@ import tempfile
 from collections import OrderedDict
 from dataclasses import dataclass
 from datetime import datetime, timezone
-from typing import Dict, Iterable, List, Optional, Set, Tuple
+from typing import Dict, Iterable, List, Optional, Set, Tuple, Union

 import lkml
 import lkml.simple
@@ -12,8 +12,7 @@ from looker_sdk.error import SDKError

 from datahub.configuration.git import GitInfo
 from datahub.emitter.mce_builder import make_schema_field_urn
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import gen_containers
+from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -77,7 +76,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.common import BrowsePaths, Status
+from datahub.metadata.com.linkedin.pegasus2avro.common import Status
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     DatasetLineageTypeClass,
     FineGrainedLineageDownstreamType,
@@ -85,18 +84,15 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     UpstreamLineage,
     ViewProperties,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
     AuditStampClass,
-    BrowsePathEntryClass,
-    BrowsePathsV2Class,
-    ContainerClass,
     DatasetPropertiesClass,
     FineGrainedLineageClass,
     FineGrainedLineageUpstreamTypeClass,
-    SubTypesClass,
 )
+from datahub.sdk.container import Container
+from datahub.sdk.dataset import Dataset
+from datahub.sdk.entity import Entity
 from datahub.sql_parsing.sqlglot_lineage import ColumnRef

 VIEW_LANGUAGE_LOOKML: str = "lookml"
@@ -428,69 +424,40 @@ class LookMLSource(StatefulIngestionSourceBase):

         return dataset_props

-    def _build_dataset_mcps(
-        self, looker_view: LookerView
-    ) -> List[MetadataChangeProposalWrapper]:
-        view_urn = looker_view.id.get_urn(self.source_config)
-
-        subTypeEvent = MetadataChangeProposalWrapper(
-            entityUrn=view_urn,
-            aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW]),
-        )
-        events = [subTypeEvent]
+    def _build_dataset_entities(self, looker_view: LookerView) -> Iterable[Dataset]:
+        dataset_extra_aspects: List[Union[ViewProperties, Status]] = [
+            Status(removed=False)
+        ]
         if looker_view.view_details is not None:
-            viewEvent = MetadataChangeProposalWrapper(
-                entityUrn=view_urn,
-                aspect=looker_view.view_details,
-            )
-            events.append(viewEvent)
-
-        project_key = gen_project_key(self.source_config, looker_view.id.project_name)
-
-        container = ContainerClass(container=project_key.as_urn())
-        events.append(
-            MetadataChangeProposalWrapper(entityUrn=view_urn, aspect=container)
-        )
-
-        events.append(
-            MetadataChangeProposalWrapper(
-                entityUrn=view_urn,
-                aspect=looker_view.id.get_browse_path_v2(self.source_config),
-            )
-        )
-
-        return events
-
-    def _build_dataset_mce(self, looker_view: LookerView) -> MetadataChangeEvent:
-        """
-        Creates MetadataChangeEvent for the dataset, creating upstream lineage links
-        """
-        logger.debug(f"looker_view = {looker_view.id}")
+            dataset_extra_aspects.append(looker_view.view_details)

-        dataset_snapshot = DatasetSnapshot(
-            urn=looker_view.id.get_urn(self.source_config),
-            aspects=[],  # we append to this list later on
-        )
-        browse_paths = BrowsePaths(
-            paths=[looker_view.id.get_browse_path(self.source_config)]
-        )
-
-        dataset_snapshot.aspects.append(browse_paths)
-        dataset_snapshot.aspects.append(Status(removed=False))
-        upstream_lineage = self._get_upstream_lineage(looker_view)
-        if upstream_lineage is not None:
-            dataset_snapshot.aspects.append(upstream_lineage)
         schema_metadata = LookerUtil._get_schema(
             self.source_config.platform_name,
             looker_view.id.view_name,
             looker_view.fields,
             self.reporter,
         )
-        if schema_metadata is not None:
-            dataset_snapshot.aspects.append(schema_metadata)
-        dataset_snapshot.aspects.append(self._get_custom_properties(looker_view))

-        return MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
+        custom_properties: DatasetPropertiesClass = self._get_custom_properties(
+            looker_view
+        )
+
+        yield Dataset(
+            platform=self.source_config.platform_name,
+            name=looker_view.id.get_view_dataset_name(self.source_config),
+            display_name=looker_view.id.view_name,
+            platform_instance=self.source_config.platform_instance,
+            env=self.source_config.env,
+            subtype=DatasetSubTypes.VIEW,
+            parent_container=looker_view.id.get_view_dataset_parent_container(
+                self.source_config
+            ),
+            schema=schema_metadata,
+            custom_properties=custom_properties.customProperties,
+            external_url=custom_properties.externalUrl,
+            upstreams=self._get_upstream_lineage(looker_view),
+            extra_aspects=dataset_extra_aspects,
+        )

     def get_project_name(self, model_name: str) -> str:
         if self.source_config.project_name is not None:
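
Note: the hunk above swaps hand-assembled MCE/MCP aspects for the high-level SDK Dataset entity. As a rough, hypothetical illustration (not part of this diff), constructing such an entity looks like the sketch below; the platform and name values are made up, and only parameters visible in the hunk are used:

    from datahub.metadata.com.linkedin.pegasus2avro.common import Status
    from datahub.sdk.dataset import Dataset

    # Low-level aspects such as Status travel with the entity via extra_aspects,
    # instead of being appended to a DatasetSnapshot one by one.
    view_dataset = Dataset(
        platform="looker",                      # assumed platform name
        name="lkml_samples.view.my_view",       # hypothetical dataset name
        display_name="my_view",
        subtype="View",
        extra_aspects=[Status(removed=False)],
    )

The source then yields such entities directly from get_workunits_internal(), which is why its return type widens to Iterable[Union[MetadataWorkUnit, Entity]] in the following hunks.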
@@ -554,7 +521,7 @@ class LookMLSource(StatefulIngestionSourceBase):
            ).workunit_processor,
        ]

-    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         with tempfile.TemporaryDirectory("lookml_tmp") as tmp_dir:
             # Clone the base_folder if necessary.
             if not self.source_config.base_folder:
@@ -715,7 +682,7 @@ class LookMLSource(StatefulIngestionSourceBase):
                tmp_dir, project, project_visited, manifest_constants
            )

-    def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]:  # noqa: C901
+    def get_internal_workunits(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:  # noqa: C901
         assert self.source_config.base_folder
         viewfile_loader = LookerViewFileLoader(
             self.source_config.project_name,
@@ -949,7 +916,7 @@ class LookMLSource(StatefulIngestionSourceBase):
                            maybe_looker_view.id.project_name
                            not in self.processed_projects
                        ):
-                            yield from self.gen_project_workunits(
+                            yield from self.gen_project_containers(
                                maybe_looker_view.id.project_name
                            )

@@ -957,15 +924,10 @@ class LookMLSource(StatefulIngestionSourceBase):
                            maybe_looker_view.id.project_name
                        )

-                        for mcp in self._build_dataset_mcps(
+                        yield from self._build_dataset_entities(
                            maybe_looker_view
-                        ):
-                            yield mcp.as_workunit()
-                        mce = self._build_dataset_mce(maybe_looker_view)
-                        yield MetadataWorkUnit(
-                            id=f"lookml-view-{maybe_looker_view.id}",
-                            mce=mce,
                        )
+
                        processed_view_files.add(include.include)
                    else:
                        (
@@ -994,28 +956,24 @@ class LookMLSource(StatefulIngestionSourceBase):
            self.source_config.tag_measures_and_dimensions
            and self.reporter.events_produced != 0
        ):
-            # Emit tag MCEs for measures and dimensions:
+            # Emit tag MCEs for measures and dimensions if we produced any explores:
            for tag_mce in LookerUtil.get_tag_mces():
-                yield MetadataWorkUnit(
-                    id=f"tag-{tag_mce.proposedSnapshot.urn}", mce=tag_mce
-                )
+                # Convert MCE to MCPs
+                for mcp in mcps_from_mce(tag_mce):
+                    yield mcp.as_workunit()

-    def gen_project_workunits(self, project_name: str) -> Iterable[MetadataWorkUnit]:
+    def gen_project_containers(self, project_name: str) -> Iterable[Container]:
        project_key = gen_project_key(
            self.source_config,
            project_name,
        )
-        yield from gen_containers(
+
+        yield Container(
            container_key=project_key,
-            name=project_name,
-            sub_types=[BIContainerSubTypes.LOOKML_PROJECT],
+            display_name=project_name,
+            subtype=BIContainerSubTypes.LOOKML_PROJECT,
+            parent_container=["Folders"],
        )
-        yield MetadataChangeProposalWrapper(
-            entityUrn=project_key.as_urn(),
-            aspect=BrowsePathsV2Class(
-                path=[BrowsePathEntryClass("Folders")],
-            ),
-        ).as_workunit()

    def report_skipped_unreachable_views(
        self,
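
Note: the LookML project container is now emitted as a first-class SDK Container entity, and legacy tag MCEs are split into MCPs via mcps_from_mce. A hypothetical, self-contained sketch of the same pattern (the function and the subtype label below are illustrative, not taken from the diff):

    from datahub.emitter.mcp_builder import mcps_from_mce
    from datahub.sdk.container import Container

    def emit_project(project_key, project_name, legacy_tag_mce):
        # One Container entity replaces the old gen_containers() call plus the
        # separate BrowsePathsV2 proposal.
        yield Container(
            container_key=project_key,
            display_name=project_name,
            subtype="LookML Project",      # assumed label for BIContainerSubTypes.LOOKML_PROJECT
            parent_container=["Folders"],
        )
        # Legacy MCEs (e.g. tag snapshots) are converted into MCP workunits.
        for mcp in mcps_from_mce(legacy_tag_mce):
            yield mcp.as_workunit()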

datahub/ingestion/source/redash.py

@@ -447,7 +447,7 @@ class RedashSource(StatefulIngestionSourceBase):
         dataset_urns = sql_parser_in_tables.in_tables
         if sql_parser_in_tables.debug_info.table_error:
             self.report.queries_problem_parsing.add(str(query_id))
-            self.error(
+            self.warn(
                 logger,
                 "sql-parsing",
                 f"exception {sql_parser_in_tables.debug_info.table_error} in parsing query-{query_id}-datasource-{data_source_id}",

datahub/ingestion/source/superset.py

@@ -12,6 +12,8 @@ import sqlglot
 from pydantic import BaseModel
 from pydantic.class_validators import root_validator, validator
 from pydantic.fields import Field
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry

 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
@@ -109,6 +111,12 @@ logger = logging.getLogger(__name__)

 PAGE_SIZE = 25

+# Retry configuration constants
+RETRY_MAX_TIMES = 3
+RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
+RETRY_BACKOFF_FACTOR = 1
+RETRY_ALLOWED_METHODS = ["GET"]
+

 chart_type_from_viz_type = {
     "line": ChartTypeClass.LINE,
@@ -282,6 +290,7 @@ def get_filter_name(filter_obj):
 )
 @capability(SourceCapability.DOMAINS, "Enabled by `domain` config to assign domain_key")
 @capability(SourceCapability.LINEAGE_COARSE, "Supported by default")
+@capability(SourceCapability.TAGS, "Supported by default")
 class SupersetSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
@@ -327,6 +336,19 @@ class SupersetSource(StatefulIngestionSourceBase):
         logger.debug("Got access token from superset")

         requests_session = requests.Session()
+
+        # Configure retry strategy for transient failures
+        retry_strategy = Retry(
+            total=RETRY_MAX_TIMES,
+            status_forcelist=RETRY_STATUS_CODES,
+            backoff_factor=RETRY_BACKOFF_FACTOR,
+            allowed_methods=RETRY_ALLOWED_METHODS,
+            raise_on_status=False,
+        )
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        requests_session.mount("http://", adapter)
+        requests_session.mount("https://", adapter)
+
         requests_session.headers.update(
             {
                 "Authorization": f"Bearer {self.access_token}",
@@ -359,8 +381,13 @@ class SupersetSource(StatefulIngestionSourceBase):
            )

            if response.status_code != 200:
-                logger.warning(f"Failed to get {entity_type} data: {response.text}")
-                continue
+                self.report.warning(
+                    title="Failed to fetch data from Superset API",
+                    message="Incomplete metadata extraction due to Superset API failure",
+                    context=f"Entity Type: {entity_type}, HTTP Status Code: {response.status_code}, Page: {current_page}. Response: {response.text}",
+                )
+                # we stop pagination for this entity type and we continue the overall ingestion
+                break

            payload = response.json()
            # Update total_items with the actual count from the response
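
Note: a failed page request now ends pagination for that entity type (break) instead of skipping a single page (continue), while the rest of the run proceeds. A hypothetical standalone sketch of that loop shape (endpoint path and query format are illustrative only):

    def fetch_all(session, base_url, entity_types, page_size=25):
        for entity_type in entity_types:
            page = 0
            while True:
                resp = session.get(
                    f"{base_url}/api/v1/{entity_type}/",
                    params={"q": f"(page:{page},page_size:{page_size})"},
                )
                if resp.status_code != 200:
                    # warn and give up on this entity type; other types still run
                    break
                results = resp.json().get("result", [])
                if not results:
                    break
                yield from results
                page += 1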
@@ -521,6 +548,11 @@ class SupersetSource(StatefulIngestionSourceBase):
         )
         dashboard_snapshot.aspects.append(owners_info)

+        superset_tags = self._extract_and_map_tags(dashboard_data.get("tags", []))
+        tags = self._merge_tags_with_existing(dashboard_urn, superset_tags)
+        if tags:
+            dashboard_snapshot.aspects.append(tags)
+
         return dashboard_snapshot

     def _process_dashboard(self, dashboard_data: Any) -> Iterable[MetadataWorkUnit]:
@@ -919,6 +951,12 @@ class SupersetSource(StatefulIngestionSourceBase):
             lastModified=last_modified,
         )
         chart_snapshot.aspects.append(owners_info)
+
+        superset_tags = self._extract_and_map_tags(chart_data.get("tags", []))
+        tags = self._merge_tags_with_existing(chart_urn, superset_tags)
+        if tags:
+            chart_snapshot.aspects.append(tags)
+

         yield MetadataWorkUnit(
             id=chart_urn, mce=MetadataChangeEvent(proposedSnapshot=chart_snapshot)
@@ -1288,17 +1326,18 @@ class SupersetSource(StatefulIngestionSourceBase):
             externalUrl=dataset_url,
             lastModified=TimeStamp(time=modified_ts),
         )
-        global_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
-
-        aspects_items: List[Any] = []
-        aspects_items.extend(
-            [
-                self.gen_schema_metadata(dataset_response),
-                dataset_info,
-                upstream_lineage,
-                global_tags,
-            ]
-        )
+
+        dataset_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
+        tags = self._merge_tags_with_existing(datasource_urn, dataset_tags)
+
+        aspects_items: List[Any] = [
+            self.gen_schema_metadata(dataset_response),
+            dataset_info,
+            upstream_lineage,
+        ]
+
+        if tags:
+            aspects_items.append(tags)

         dataset_snapshot = DatasetSnapshot(
             urn=datasource_urn,
@@ -1320,6 +1359,75 @@ class SupersetSource(StatefulIngestionSourceBase):

         return dataset_snapshot

+    def _extract_and_map_tags(
+        self, raw_tags: List[Dict[str, Any]]
+    ) -> Optional[GlobalTagsClass]:
+        """Extract and map Superset tags to DataHub GlobalTagsClass.
+
+        Filters out system-generated tags (type != 1) and only processes user-defined tags
+        from the Superset API response.
+
+        Args:
+            raw_tags: List of tag dictionaries from Superset API
+
+        Returns:
+            GlobalTagsClass with user-defined tags, or None if no tags found
+        """
+        user_tags = [
+            tag.get("name", "")
+            for tag in raw_tags
+            if tag.get("type") == 1 and tag.get("name")
+        ]
+
+        if not user_tags:
+            return None
+
+        tag_urns = [builder.make_tag_urn(tag) for tag in user_tags]
+        return GlobalTagsClass(
+            tags=[TagAssociationClass(tag=tag_urn) for tag_urn in tag_urns]
+        )
+
+    def _merge_tags_with_existing(
+        self, entity_urn: str, new_tags: Optional[GlobalTagsClass]
+    ) -> Optional[GlobalTagsClass]:
+        """Merge new tags with existing ones from DataHub to preserve manually added tags.
+
+        This method ensures that tags manually added via DataHub UI are not overwritten
+        during ingestion. It fetches existing tags from the graph and merges them with
+        new tags from the source system, avoiding duplicates.
+
+        Args:
+            entity_urn: URN of the entity to check for existing tags
+            new_tags: New tags to add as GlobalTagsClass object
+
+        Returns:
+            GlobalTagsClass with merged tags preserving existing ones, or None if no tags
+        """
+        if not new_tags or not new_tags.tags:
+            return None
+
+        # Fetch existing tags from DataHub
+        existing_global_tags = None
+        if self.ctx.graph:
+            existing_global_tags = self.ctx.graph.get_aspect(
+                entity_urn=entity_urn, aspect_type=GlobalTagsClass
+            )
+
+        # Merge existing tags with new ones, avoiding duplicates
+        all_tags = []
+        existing_tag_urns = set()
+
+        if existing_global_tags and existing_global_tags.tags:
+            all_tags.extend(existing_global_tags.tags)
+            existing_tag_urns = {tag.tag for tag in existing_global_tags.tags}
+
+        # Add new tags that don't already exist
+        for new_tag in new_tags.tags:
+            if new_tag.tag not in existing_tag_urns:
+                all_tags.append(new_tag)
+
+        return GlobalTagsClass(tags=all_tags) if all_tags else None
+
     def _process_dataset(self, dataset_data: Any) -> Iterable[MetadataWorkUnit]:
         dataset_name = ""
         try:
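
Note: the merge above reads the entity's current GlobalTags aspect from the DataHub graph so that tags added through the UI survive re-ingestion. A hypothetical standalone version of the same merge rule:

    from typing import List, Optional
    from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass

    def merge_tags(
        existing: Optional[GlobalTagsClass], new: Optional[GlobalTagsClass]
    ) -> Optional[GlobalTagsClass]:
        # Keep every existing association, then append only the new ones not already present.
        merged: List[TagAssociationClass] = list(existing.tags) if existing and existing.tags else []
        seen = {assoc.tag for assoc in merged}
        for assoc in (new.tags if new and new.tags else []):
            if assoc.tag not in seen:
                merged.append(assoc)
                seen.add(assoc.tag)
        return GlobalTagsClass(tags=merged) if merged else None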

datahub/ingestion/source/tableau/tableau.py

@@ -524,6 +524,10 @@ class TableauConfig(
         default=False,
         description="Ingest Owner from source. This will override Owner info entered from UI",
     )
+    use_email_as_username: bool = Field(
+        default=False,
+        description="Use email address instead of username for entity owners. Requires ingest_owner to be True.",
+    )
     ingest_tables_external: bool = Field(
         default=False,
         description="Ingest details for tables external to (not embedded in) tableau as entities.",
@@ -678,6 +682,14 @@ class TableauConfig(
             raise ValueError(
                 "tags_for_hidden_assets is only allowed with ingest_tags enabled. Be aware that this will overwrite tags entered from the UI."
             )
+
+        use_email_as_username = values.get("use_email_as_username")
+        ingest_owner = values.get("ingest_owner")
+        if use_email_as_username and not ingest_owner:
+            raise ValueError(
+                "use_email_as_username requires ingest_owner to be enabled."
+            )
+
         return values

@@ -839,6 +851,9 @@ class TableauSourceReport(
         default_factory=(lambda: defaultdict(int))
     )

+    # Owner extraction statistics
+    num_email_fallback_to_username: int = 0
+

 def report_user_role(report: TableauSourceReport, server: Server) -> None:
     title: str = "Insufficient Permissions"
@@ -2716,13 +2731,12 @@ class TableauSiteSource:
         dataset_snapshot.aspects.append(browse_paths)

         # Ownership
-        owner = (
-            self._get_ownership(datasource_info[c.OWNER][c.USERNAME])
-            if datasource_info
-            and datasource_info.get(c.OWNER)
-            and datasource_info[c.OWNER].get(c.USERNAME)
+        owner_identifier = (
+            self._get_owner_identifier(datasource_info[c.OWNER])
+            if datasource_info and datasource_info.get(c.OWNER)
             else None
         )
+        owner = self._get_ownership(owner_identifier) if owner_identifier else None
         if owner is not None:
             dataset_snapshot.aspects.append(owner)

@@ -3127,7 +3141,7 @@ class TableauSiteSource:

         creator: Optional[str] = None
         if workbook is not None and workbook.get(c.OWNER) is not None:
-            creator = workbook[c.OWNER].get(c.USERNAME)
+            creator = self._get_owner_identifier(workbook[c.OWNER])
         created_at = sheet.get(c.CREATED_AT, datetime.now())
         updated_at = sheet.get(c.UPDATED_AT, datetime.now())
         last_modified = self.get_last_modified(creator, created_at, updated_at)
@@ -3276,7 +3290,7 @@ class TableauSiteSource:

     def emit_workbook_as_container(self, workbook: Dict) -> Iterable[MetadataWorkUnit]:
         workbook_container_key = self.gen_workbook_key(workbook[c.ID])
-        creator = workbook.get(c.OWNER, {}).get(c.USERNAME)
+        creator = self._get_owner_identifier(workbook.get(c.OWNER, {}))

         owner_urn = (
             builder.make_user_urn(creator)
@@ -3458,7 +3472,7 @@ class TableauSiteSource:

         creator: Optional[str] = None
         if workbook is not None and workbook.get(c.OWNER) is not None:
-            creator = workbook[c.OWNER].get(c.USERNAME)
+            creator = self._get_owner_identifier(workbook[c.OWNER])
         created_at = dashboard.get(c.CREATED_AT, datetime.now())
         updated_at = dashboard.get(c.UPDATED_AT, datetime.now())
@@ -3605,6 +3619,20 @@ class TableauSiteSource:
         )
         return last_modified

+    def _get_owner_identifier(self, owner_dict: dict) -> Optional[str]:
+        """Extract owner identifier (email or username) based on configuration."""
+        if not owner_dict:
+            return None
+
+        if self.config.use_email_as_username:
+            email = owner_dict.get(c.EMAIL)
+            if email:
+                return email
+            # Fall back to username if email is not available
+            self.report.num_email_fallback_to_username += 1
+
+        return owner_dict.get(c.USERNAME)
+
     @lru_cache(maxsize=None)
     def _get_ownership(self, user: str) -> Optional[OwnershipClass]:
         if self.config.ingest_owner and user:
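
Note: _get_owner_identifier prefers the owner's email when use_email_as_username is enabled and falls back to the username (counting the fallback in the source report) when no email is present. A hypothetical standalone version of that lookup:

    from typing import Optional

    def owner_identifier(owner: dict, use_email_as_username: bool) -> Optional[str]:
        if not owner:
            return None
        if use_email_as_username and owner.get("email"):
            return owner["email"]
        return owner.get("username")  # fallback when email is missing or the option is off

    assert owner_identifier({"username": "jdoe", "email": "jdoe@corp.example"}, True) == "jdoe@corp.example"
    assert owner_identifier({"username": "jdoe"}, True) == "jdoe"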
@@ -3828,3 +3856,15 @@ class TableauSiteSource:
            self.report.emit_upstream_tables_timer[self.site_content_url] = (
                timer.elapsed_seconds(digits=2)
            )
+
+        # Log owner extraction statistics if there were fallbacks
+        if (
+            self.config.use_email_as_username
+            and self.config.ingest_owner
+            and self.report.num_email_fallback_to_username > 0
+        ):
+            logger.info(
+                f"Owner extraction summary for site '{self.site_content_url}': "
+                f"{self.report.num_email_fallback_to_username} entities fell back from email to username "
+                f"(email was not available)"
+            )

datahub/ingestion/source/tableau/tableau_common.py

@@ -65,6 +65,7 @@ workbook_graphql_query = """
   projectName
   owner {
     username
+    email
   }
   description
   uri
@@ -107,6 +108,7 @@ sheet_graphql_query = """
       luid
       owner {
         username
+        email
       }
     }
     datasourceFields {
@@ -185,6 +187,7 @@ dashboard_graphql_query = """
       luid
       owner {
         username
+        email
       }
     }
   }
@@ -268,6 +271,7 @@ embedded_datasource_graphql_query = """
       luid
       owner {
         username
+        email
       }
     }
   }
@@ -424,6 +428,7 @@ published_datasource_graphql_query = """
   }
   owner {
     username
+    email
   }
   description
   uri

datahub/ingestion/source/tableau/tableau_constant.py

@@ -59,6 +59,7 @@ LUID = "luid"
 EMBEDDED_DATA_SOURCE = "EmbeddedDatasource"
 OWNER = "owner"
 USERNAME = "username"
+EMAIL = "email"
 HAS_EXTRACTS = "hasExtracts"
 EXTRACT_LAST_REFRESH_TIME = "extractLastRefreshTime"
 EXTRACT_LAST_INCREMENTAL_UPDATE_TIME = "extractLastIncrementalUpdateTime"

datahub/ingestion/source/tableau/tableau_server_wrapper.py

@@ -1,4 +1,5 @@
 from dataclasses import dataclass
+from typing import Optional

 from tableauserverclient import Server, UserItem

@@ -10,6 +11,7 @@ class UserInfo:
     user_name: str
     site_role: str
     site_id: str
+    email: Optional[str] = None

     def has_site_administrator_explorer_privileges(self):
         return self.site_role in [
@@ -34,4 +36,5 @@ class UserInfo:
             user_name=user.name,
             site_role=user.site_role,
             site_id=server.site_id,
+            email=user.email,
         )