acryl-datahub 1.2.0.10rc1__py3-none-any.whl → 1.2.0.10rc3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- {acryl_datahub-1.2.0.10rc1.dist-info → acryl_datahub-1.2.0.10rc3.dist-info}/METADATA +2616 -2616
- {acryl_datahub-1.2.0.10rc1.dist-info → acryl_datahub-1.2.0.10rc3.dist-info}/RECORD +29 -29
- datahub/_version.py +1 -1
- datahub/ingestion/autogenerated/capability_summary.json +12 -0
- datahub/ingestion/source/dbt/dbt_common.py +65 -5
- datahub/ingestion/source/ge_data_profiler.py +15 -2
- datahub/ingestion/source/looker/looker_common.py +75 -74
- datahub/ingestion/source/looker/looker_source.py +445 -548
- datahub/ingestion/source/looker/lookml_source.py +46 -88
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/superset.py +121 -13
- datahub/ingestion/source/tableau/tableau.py +48 -8
- datahub/ingestion/source/tableau/tableau_common.py +5 -0
- datahub/ingestion/source/tableau/tableau_constant.py +1 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/metadata/_internal_schema_classes.py +202 -2
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/schema.avsc +98 -2
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +72 -0
- datahub/metadata/schemas/InstitutionalMemory.avsc +22 -0
- datahub/metadata/schemas/LogicalParent.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +22 -0
- datahub/sdk/dashboard.py +0 -2
- {acryl_datahub-1.2.0.10rc1.dist-info → acryl_datahub-1.2.0.10rc3.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.10rc1.dist-info → acryl_datahub-1.2.0.10rc3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.10rc1.dist-info → acryl_datahub-1.2.0.10rc3.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.10rc1.dist-info → acryl_datahub-1.2.0.10rc3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/looker/lookml_source.py

@@ -4,7 +4,7 @@ import tempfile
 from collections import OrderedDict
 from dataclasses import dataclass
 from datetime import datetime, timezone
-from typing import Dict, Iterable, List, Optional, Set, Tuple
+from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
 
 import lkml
 import lkml.simple
@@ -12,8 +12,7 @@ from looker_sdk.error import SDKError
 
 from datahub.configuration.git import GitInfo
 from datahub.emitter.mce_builder import make_schema_field_urn
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import gen_containers
+from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -77,7 +76,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.common import BrowsePaths, Status
+from datahub.metadata.com.linkedin.pegasus2avro.common import Status
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     DatasetLineageTypeClass,
     FineGrainedLineageDownstreamType,
@@ -85,18 +84,15 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     UpstreamLineage,
     ViewProperties,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
     AuditStampClass,
-    BrowsePathEntryClass,
-    BrowsePathsV2Class,
-    ContainerClass,
     DatasetPropertiesClass,
     FineGrainedLineageClass,
     FineGrainedLineageUpstreamTypeClass,
-    SubTypesClass,
 )
+from datahub.sdk.container import Container
+from datahub.sdk.dataset import Dataset
+from datahub.sdk.entity import Entity
 from datahub.sql_parsing.sqlglot_lineage import ColumnRef
 
 VIEW_LANGUAGE_LOOKML: str = "lookml"
@@ -428,69 +424,40 @@ class LookMLSource(StatefulIngestionSourceBase):
 
         return dataset_props
 
-    def _build_dataset_mcps(
-        self, looker_view: LookerView
-    ) -> List[MetadataChangeProposalWrapper]:
-        view_urn = looker_view.id.get_urn(self.source_config)
-
-        subTypeEvent = MetadataChangeProposalWrapper(
-            entityUrn=view_urn,
-            aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW]),
-        )
-        events = [subTypeEvent]
+    def _build_dataset_entities(self, looker_view: LookerView) -> Iterable[Dataset]:
+        dataset_extra_aspects: List[Union[ViewProperties, Status]] = [
+            Status(removed=False)
+        ]
         if looker_view.view_details is not None:
-            viewEvent = MetadataChangeProposalWrapper(
-                entityUrn=view_urn,
-                aspect=looker_view.view_details,
-            )
-            events.append(viewEvent)
-
-        project_key = gen_project_key(self.source_config, looker_view.id.project_name)
-
-        container = ContainerClass(container=project_key.as_urn())
-        events.append(
-            MetadataChangeProposalWrapper(entityUrn=view_urn, aspect=container)
-        )
-
-        events.append(
-            MetadataChangeProposalWrapper(
-                entityUrn=view_urn,
-                aspect=looker_view.id.get_browse_path_v2(self.source_config),
-            )
-        )
-
-        return events
-
-    def _build_dataset_mce(self, looker_view: LookerView) -> MetadataChangeEvent:
-        """
-        Creates MetadataChangeEvent for the dataset, creating upstream lineage links
-        """
-        logger.debug(f"looker_view = {looker_view.id}")
+            dataset_extra_aspects.append(looker_view.view_details)
 
-        dataset_snapshot = DatasetSnapshot(
-            urn=looker_view.id.get_urn(self.source_config),
-            aspects=[],  # we append to this list later on
-        )
-        browse_paths = BrowsePaths(
-            paths=[looker_view.id.get_browse_path(self.source_config)]
-        )
-
-        dataset_snapshot.aspects.append(browse_paths)
-        dataset_snapshot.aspects.append(Status(removed=False))
-        upstream_lineage = self._get_upstream_lineage(looker_view)
-        if upstream_lineage is not None:
-            dataset_snapshot.aspects.append(upstream_lineage)
         schema_metadata = LookerUtil._get_schema(
             self.source_config.platform_name,
             looker_view.id.view_name,
             looker_view.fields,
             self.reporter,
         )
-        if schema_metadata is not None:
-            dataset_snapshot.aspects.append(schema_metadata)
-        dataset_snapshot.aspects.append(self._get_custom_properties(looker_view))
 
-
+        custom_properties: DatasetPropertiesClass = self._get_custom_properties(
+            looker_view
+        )
+
+        yield Dataset(
+            platform=self.source_config.platform_name,
+            name=looker_view.id.get_view_dataset_name(self.source_config),
+            display_name=looker_view.id.view_name,
+            platform_instance=self.source_config.platform_instance,
+            env=self.source_config.env,
+            subtype=DatasetSubTypes.VIEW,
+            parent_container=looker_view.id.get_view_dataset_parent_container(
+                self.source_config
+            ),
+            schema=schema_metadata,
+            custom_properties=custom_properties.customProperties,
+            external_url=custom_properties.externalUrl,
+            upstreams=self._get_upstream_lineage(looker_view),
+            extra_aspects=dataset_extra_aspects,
+        )
 
     def get_project_name(self, model_name: str) -> str:
         if self.source_config.project_name is not None:
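The replacement of `_build_dataset_mce` with `_build_dataset_entities` moves this source onto the newer datahub.sdk entity classes: a single `Dataset` object now carries the subtype, container, schema, lineage, and extra aspects that previously required a `DatasetSnapshot` plus several follow-up MCPs. A minimal standalone sketch of the pattern (the platform, view name, and property values below are illustrative, not taken from this diff):

    from datahub.sdk.dataset import Dataset

    # One declarative Dataset replaces a DatasetSnapshot plus separate MCPs
    # for subtype, container membership, and browse path.
    view = Dataset(
        platform="looker",                      # hypothetical platform
        name="my_model.my_view",                # hypothetical dataset name
        display_name="my_view",
        env="PROD",
        subtype="View",
        custom_properties={"looker.file.path": "views/my_view.view.lkml"},
    )
    # A source simply yields the entity; the ingestion framework expands it
    # into the underlying aspects when producing workunits.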
@@ -554,7 +521,7 @@ class LookMLSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]
 
-    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         with tempfile.TemporaryDirectory("lookml_tmp") as tmp_dir:
             # Clone the base_folder if necessary.
             if not self.source_config.base_folder:
@@ -715,7 +682,7 @@ class LookMLSource(StatefulIngestionSourceBase):
                 tmp_dir, project, project_visited, manifest_constants
             )
 
-    def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]:  # noqa: C901
+    def get_internal_workunits(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:  # noqa: C901
         assert self.source_config.base_folder
         viewfile_loader = LookerViewFileLoader(
             self.source_config.project_name,
@@ -949,7 +916,7 @@ class LookMLSource(StatefulIngestionSourceBase):
                             maybe_looker_view.id.project_name
                             not in self.processed_projects
                         ):
-                            yield from self.gen_project_workunits(
+                            yield from self.gen_project_containers(
                                 maybe_looker_view.id.project_name
                             )
@@ -957,15 +924,10 @@ class LookMLSource(StatefulIngestionSourceBase):
                                 maybe_looker_view.id.project_name
                             )
 
-                            for mcp in self._build_dataset_mcps(
+                            yield from self._build_dataset_entities(
                                 maybe_looker_view
-                            ):
-                                yield mcp.as_workunit()
-                            mce = self._build_dataset_mce(maybe_looker_view)
-                            yield MetadataWorkUnit(
-                                id=f"lookml-view-{maybe_looker_view.id}",
-                                mce=mce,
                             )
+
                             processed_view_files.add(include.include)
                         else:
                             (
@@ -994,28 +956,24 @@ class LookMLSource(StatefulIngestionSourceBase):
             self.source_config.tag_measures_and_dimensions
             and self.reporter.events_produced != 0
         ):
-            # Emit tag MCEs for measures and dimensions:
+            # Emit tag MCEs for measures and dimensions if we produced any explores:
             for tag_mce in LookerUtil.get_tag_mces():
-                yield MetadataWorkUnit(
-                    id=f"tag-{tag_mce.proposedSnapshot.urn}", mce=tag_mce
-                )
+                # Convert MCE to MCPs
+                for mcp in mcps_from_mce(tag_mce):
+                    yield mcp.as_workunit()
 
-    def gen_project_workunits(self, project_name: str) -> Iterable[MetadataWorkUnit]:
+    def gen_project_containers(self, project_name: str) -> Iterable[Container]:
         project_key = gen_project_key(
             self.source_config,
             project_name,
         )
-        yield from gen_containers(
+
+        yield Container(
             container_key=project_key,
-            name=project_name,
-            sub_types=[BIContainerSubTypes.LOOKML_PROJECT],
+            display_name=project_name,
+            subtype=BIContainerSubTypes.LOOKML_PROJECT,
+            parent_container=["Folders"],
         )
-        yield MetadataChangeProposalWrapper(
-            entityUrn=project_key.as_urn(),
-            aspect=BrowsePathsV2Class(
-                path=[BrowsePathEntryClass("Folders")],
-            ),
-        ).as_workunit()
 
     def report_skipped_unreachable_views(
         self,
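Two details in this hunk are worth calling out. First, `gen_project_containers` now yields a single SDK `Container` whose `parent_container=["Folders"]` subsumes the hand-built `BrowsePathsV2Class` MCP that was previously emitted after `gen_containers`. Second, the tag MCEs are no longer emitted wholesale; each is converted to individual proposals via the newly imported `mcps_from_mce`. A rough sketch of what that conversion amounts to (a simplified illustration; the real helper lives in `datahub.emitter.mcp_builder` and may differ in detail):

    from datahub.emitter.mcp import MetadataChangeProposalWrapper

    def mcps_from_mce_sketch(mce):
        # One MCP per aspect of the legacy snapshot, all addressed to the
        # snapshot's URN - the aspect payloads themselves are unchanged.
        for aspect in mce.proposedSnapshot.aspects:
            yield MetadataChangeProposalWrapper(
                entityUrn=mce.proposedSnapshot.urn,
                aspect=aspect,
            )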
datahub/ingestion/source/redash.py

@@ -447,7 +447,7 @@ class RedashSource(StatefulIngestionSourceBase):
         dataset_urns = sql_parser_in_tables.in_tables
         if sql_parser_in_tables.debug_info.table_error:
             self.report.queries_problem_parsing.add(str(query_id))
-            self.error(
+            self.warn(
                 logger,
                 "sql-parsing",
                 f"exception {sql_parser_in_tables.debug_info.table_error} in parsing query-{query_id}-datasource-{data_source_id}",
datahub/ingestion/source/superset.py

@@ -12,6 +12,8 @@ import sqlglot
 from pydantic import BaseModel
 from pydantic.class_validators import root_validator, validator
 from pydantic.fields import Field
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
@@ -109,6 +111,12 @@ logger = logging.getLogger(__name__)
 
 PAGE_SIZE = 25
 
+# Retry configuration constants
+RETRY_MAX_TIMES = 3
+RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
+RETRY_BACKOFF_FACTOR = 1
+RETRY_ALLOWED_METHODS = ["GET"]
+
 
 chart_type_from_viz_type = {
     "line": ChartTypeClass.LINE,
@@ -282,6 +290,7 @@ def get_filter_name(filter_obj):
 )
 @capability(SourceCapability.DOMAINS, "Enabled by `domain` config to assign domain_key")
 @capability(SourceCapability.LINEAGE_COARSE, "Supported by default")
+@capability(SourceCapability.TAGS, "Supported by default")
 class SupersetSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
@@ -327,6 +336,19 @@ class SupersetSource(StatefulIngestionSourceBase):
         logger.debug("Got access token from superset")
 
         requests_session = requests.Session()
+
+        # Configure retry strategy for transient failures
+        retry_strategy = Retry(
+            total=RETRY_MAX_TIMES,
+            status_forcelist=RETRY_STATUS_CODES,
+            backoff_factor=RETRY_BACKOFF_FACTOR,
+            allowed_methods=RETRY_ALLOWED_METHODS,
+            raise_on_status=False,
+        )
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        requests_session.mount("http://", adapter)
+        requests_session.mount("https://", adapter)
+
         requests_session.headers.update(
             {
                 "Authorization": f"Bearer {self.access_token}",
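The session setup above is the standard requests/urllib3 retry recipe: a `Retry` policy attached to an `HTTPAdapter` that is mounted for both schemes. A self-contained sketch of the same wiring (the Superset URL is hypothetical):

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    session = requests.Session()
    adapter = HTTPAdapter(
        max_retries=Retry(
            total=3,                                     # at most 3 retries per request
            status_forcelist=[429, 500, 502, 503, 504],  # only transient HTTP statuses
            backoff_factor=1,                            # exponential backoff between attempts
            allowed_methods=["GET"],                     # never retry non-idempotent calls
            raise_on_status=False,                       # hand back the final response, don't raise
        )
    )
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    # Retries happen transparently inside the adapter; callers still observe the
    # final status code, which is why the non-200 handling below stays unchanged.
    response = session.get("https://superset.example.com/api/v1/dashboard/")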
@@ -359,8 +381,13 @@ class SupersetSource(StatefulIngestionSourceBase):
         )
 
         if response.status_code != 200:
-
-
+            self.report.warning(
+                title="Failed to fetch data from Superset API",
+                message="Incomplete metadata extraction due to Superset API failure",
+                context=f"Entity Type: {entity_type}, HTTP Status Code: {response.status_code}, Page: {current_page}. Response: {response.text}",
+            )
+            # we stop pagination for this entity type and we continue the overall ingestion
+            break
 
         payload = response.json()
         # Update total_items with the actual count from the response
@@ -521,6 +548,11 @@ class SupersetSource(StatefulIngestionSourceBase):
         )
         dashboard_snapshot.aspects.append(owners_info)
 
+        superset_tags = self._extract_and_map_tags(dashboard_data.get("tags", []))
+        tags = self._merge_tags_with_existing(dashboard_urn, superset_tags)
+        if tags:
+            dashboard_snapshot.aspects.append(tags)
+
         return dashboard_snapshot
 
     def _process_dashboard(self, dashboard_data: Any) -> Iterable[MetadataWorkUnit]:
@@ -919,6 +951,12 @@ class SupersetSource(StatefulIngestionSourceBase):
             lastModified=last_modified,
         )
         chart_snapshot.aspects.append(owners_info)
+
+        superset_tags = self._extract_and_map_tags(chart_data.get("tags", []))
+        tags = self._merge_tags_with_existing(chart_urn, superset_tags)
+        if tags:
+            chart_snapshot.aspects.append(tags)
+
         yield MetadataWorkUnit(
             id=chart_urn, mce=MetadataChangeEvent(proposedSnapshot=chart_snapshot)
         )
@@ -1288,17 +1326,18 @@ class SupersetSource(StatefulIngestionSourceBase):
             externalUrl=dataset_url,
             lastModified=TimeStamp(time=modified_ts),
         )
-
-
-
-
-
-
-
-
-
-
-
+
+        dataset_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
+        tags = self._merge_tags_with_existing(datasource_urn, dataset_tags)
+
+        aspects_items: List[Any] = [
+            self.gen_schema_metadata(dataset_response),
+            dataset_info,
+            upstream_lineage,
+        ]
+
+        if tags:
+            aspects_items.append(tags)
 
         dataset_snapshot = DatasetSnapshot(
             urn=datasource_urn,
@@ -1320,6 +1359,75 @@ class SupersetSource(StatefulIngestionSourceBase):
 
         return dataset_snapshot
 
+    def _extract_and_map_tags(
+        self, raw_tags: List[Dict[str, Any]]
+    ) -> Optional[GlobalTagsClass]:
+        """Extract and map Superset tags to DataHub GlobalTagsClass.
+
+        Filters out system-generated tags (type != 1) and only processes user-defined tags
+        from the Superset API response.
+
+        Args:
+            raw_tags: List of tag dictionaries from Superset API
+
+        Returns:
+            GlobalTagsClass with user-defined tags, or None if no tags found
+        """
+        user_tags = [
+            tag.get("name", "")
+            for tag in raw_tags
+            if tag.get("type") == 1 and tag.get("name")
+        ]
+
+        if not user_tags:
+            return None
+
+        tag_urns = [builder.make_tag_urn(tag) for tag in user_tags]
+        return GlobalTagsClass(
+            tags=[TagAssociationClass(tag=tag_urn) for tag_urn in tag_urns]
+        )
+
+    def _merge_tags_with_existing(
+        self, entity_urn: str, new_tags: Optional[GlobalTagsClass]
+    ) -> Optional[GlobalTagsClass]:
+        """Merge new tags with existing ones from DataHub to preserve manually added tags.
+
+        This method ensures that tags manually added via DataHub UI are not overwritten
+        during ingestion. It fetches existing tags from the graph and merges them with
+        new tags from the source system, avoiding duplicates.
+
+        Args:
+            entity_urn: URN of the entity to check for existing tags
+            new_tags: New tags to add as GlobalTagsClass object
+
+        Returns:
+            GlobalTagsClass with merged tags preserving existing ones, or None if no tags
+        """
+        if not new_tags or not new_tags.tags:
+            return None
+
+        # Fetch existing tags from DataHub
+        existing_global_tags = None
+        if self.ctx.graph:
+            existing_global_tags = self.ctx.graph.get_aspect(
+                entity_urn=entity_urn, aspect_type=GlobalTagsClass
+            )
+
+        # Merge existing tags with new ones, avoiding duplicates
+        all_tags = []
+        existing_tag_urns = set()
+
+        if existing_global_tags and existing_global_tags.tags:
+            all_tags.extend(existing_global_tags.tags)
+            existing_tag_urns = {tag.tag for tag in existing_global_tags.tags}
+
+        # Add new tags that don't already exist
+        for new_tag in new_tags.tags:
+            if new_tag.tag not in existing_tag_urns:
+                all_tags.append(new_tag)
+
+        return GlobalTagsClass(tags=all_tags) if all_tags else None
+
     def _process_dataset(self, dataset_data: Any) -> Iterable[MetadataWorkUnit]:
         dataset_name = ""
         try:
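The merge semantics are easiest to see on a toy input. The helper fetches the entity's current `GlobalTagsClass` via `ctx.graph`, keeps every existing association, and appends only source tags whose URN is not already present (the URNs below are hypothetical):

    from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass

    # Tags already on the entity in DataHub, e.g. added manually in the UI.
    existing = GlobalTagsClass(tags=[TagAssociationClass(tag="urn:li:tag:pii")])
    # Tags reported by Superset on this run.
    incoming = GlobalTagsClass(
        tags=[
            TagAssociationClass(tag="urn:li:tag:pii"),      # duplicate: dropped
            TagAssociationClass(tag="urn:li:tag:finance"),  # new: appended
        ]
    )

    merged = list(existing.tags)
    seen = {assoc.tag for assoc in merged}
    merged.extend(assoc for assoc in incoming.tags if assoc.tag not in seen)
    # merged == [pii, finance]: the UI-added tag survives, the duplicate is dropped.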
datahub/ingestion/source/tableau/tableau.py

@@ -524,6 +524,10 @@ class TableauConfig(
         default=False,
         description="Ingest Owner from source. This will override Owner info entered from UI",
     )
+    use_email_as_username: bool = Field(
+        default=False,
+        description="Use email address instead of username for entity owners. Requires ingest_owner to be True.",
+    )
     ingest_tables_external: bool = Field(
         default=False,
         description="Ingest details for tables external to (not embedded in) tableau as entities.",
@@ -678,6 +682,14 @@ class TableauConfig(
             raise ValueError(
                 "tags_for_hidden_assets is only allowed with ingest_tags enabled. Be aware that this will overwrite tags entered from the UI."
             )
+
+        use_email_as_username = values.get("use_email_as_username")
+        ingest_owner = values.get("ingest_owner")
+        if use_email_as_username and not ingest_owner:
+            raise ValueError(
+                "use_email_as_username requires ingest_owner to be enabled."
+            )
+
 
         return values
 
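In recipe terms, the validated combination looks like the following (a hypothetical excerpt, not taken from this diff); setting `use_email_as_username` without `ingest_owner` now fails configuration validation instead of being silently ignored:

    source:
      type: tableau
      config:
        connect_uri: https://tableau.example.com   # hypothetical server
        ingest_owner: true                         # prerequisite for owner extraction
        use_email_as_username: true                # resolve owners by email, fall back to username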
@@ -839,6 +851,9 @@ class TableauSourceReport(
         default_factory=(lambda: defaultdict(int))
     )
 
+    # Owner extraction statistics
+    num_email_fallback_to_username: int = 0
+
 
 def report_user_role(report: TableauSourceReport, server: Server) -> None:
     title: str = "Insufficient Permissions"
@@ -2716,13 +2731,12 @@ class TableauSiteSource:
         dataset_snapshot.aspects.append(browse_paths)
 
         # Ownership
-        owner = (
-            self._get_ownership(datasource_info[c.OWNER][c.USERNAME])
-            if datasource_info
-            and datasource_info.get(c.OWNER)
-            and datasource_info[c.OWNER].get(c.USERNAME)
+        owner_identifier = (
+            self._get_owner_identifier(datasource_info[c.OWNER])
+            if datasource_info and datasource_info.get(c.OWNER)
             else None
         )
+        owner = self._get_ownership(owner_identifier) if owner_identifier else None
         if owner is not None:
             dataset_snapshot.aspects.append(owner)
 
@@ -3127,7 +3141,7 @@ class TableauSiteSource:
 
         creator: Optional[str] = None
         if workbook is not None and workbook.get(c.OWNER) is not None:
-            creator = workbook[c.OWNER].get(c.USERNAME)
+            creator = self._get_owner_identifier(workbook[c.OWNER])
         created_at = sheet.get(c.CREATED_AT, datetime.now())
         updated_at = sheet.get(c.UPDATED_AT, datetime.now())
         last_modified = self.get_last_modified(creator, created_at, updated_at)
@@ -3276,7 +3290,7 @@ class TableauSiteSource:
 
     def emit_workbook_as_container(self, workbook: Dict) -> Iterable[MetadataWorkUnit]:
         workbook_container_key = self.gen_workbook_key(workbook[c.ID])
-        creator = workbook.get(c.OWNER, {}).get(c.USERNAME)
+        creator = self._get_owner_identifier(workbook.get(c.OWNER, {}))
 
         owner_urn = (
             builder.make_user_urn(creator)
@@ -3458,7 +3472,7 @@ class TableauSiteSource:
 
         creator: Optional[str] = None
         if workbook is not None and workbook.get(c.OWNER) is not None:
-            creator = workbook[c.OWNER].get(c.USERNAME)
+            creator = self._get_owner_identifier(workbook[c.OWNER])
         created_at = dashboard.get(c.CREATED_AT, datetime.now())
         updated_at = dashboard.get(c.UPDATED_AT, datetime.now())
         last_modified = self.get_last_modified(creator, created_at, updated_at)
@@ -3605,6 +3619,20 @@ class TableauSiteSource:
         )
         return last_modified
 
+    def _get_owner_identifier(self, owner_dict: dict) -> Optional[str]:
+        """Extract owner identifier (email or username) based on configuration."""
+        if not owner_dict:
+            return None
+
+        if self.config.use_email_as_username:
+            email = owner_dict.get(c.EMAIL)
+            if email:
+                return email
+            # Fall back to username if email is not available
+            self.report.num_email_fallback_to_username += 1
+
+        return owner_dict.get(c.USERNAME)
+
     @lru_cache(maxsize=None)
     def _get_ownership(self, user: str) -> Optional[OwnershipClass]:
         if self.config.ingest_owner and user:
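A toy restatement of the resolution order implemented by `_get_owner_identifier` (standalone, for illustration only; the owner dicts mimic the GraphQL payload and the values are hypothetical):

    from typing import Optional

    def resolve_owner(owner: dict, use_email_as_username: bool) -> Optional[str]:
        # Email wins when the flag is set and the field is populated;
        # otherwise fall back to username (counted in the source's report).
        if use_email_as_username and owner.get("email"):
            return owner["email"]
        return owner.get("username")

    assert resolve_owner({"username": "jdoe", "email": "jdoe@example.com"}, True) == "jdoe@example.com"
    assert resolve_owner({"username": "jdoe"}, True) == "jdoe"   # fallback case
    assert resolve_owner({"username": "jdoe", "email": "jdoe@example.com"}, False) == "jdoe"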
@@ -3828,3 +3856,15 @@ class TableauSiteSource:
         self.report.emit_upstream_tables_timer[self.site_content_url] = (
             timer.elapsed_seconds(digits=2)
         )
+
+        # Log owner extraction statistics if there were fallbacks
+        if (
+            self.config.use_email_as_username
+            and self.config.ingest_owner
+            and self.report.num_email_fallback_to_username > 0
+        ):
+            logger.info(
+                f"Owner extraction summary for site '{self.site_content_url}': "
+                f"{self.report.num_email_fallback_to_username} entities fell back from email to username "
+                f"(email was not available)"
+            )
datahub/ingestion/source/tableau/tableau_common.py

@@ -65,6 +65,7 @@ workbook_graphql_query = """
     projectName
     owner {
       username
+      email
     }
     description
     uri
@@ -107,6 +108,7 @@ sheet_graphql_query = """
         luid
         owner {
           username
+          email
         }
       }
       datasourceFields {
@@ -185,6 +187,7 @@ dashboard_graphql_query = """
         luid
         owner {
           username
+          email
         }
       }
     }
@@ -268,6 +271,7 @@ embedded_datasource_graphql_query = """
         luid
         owner {
           username
+          email
         }
       }
     }
@@ -424,6 +428,7 @@ published_datasource_graphql_query = """
     }
     owner {
       username
+      email
     }
     description
     uri
datahub/ingestion/source/tableau/tableau_constant.py

@@ -59,6 +59,7 @@ LUID = "luid"
 EMBEDDED_DATA_SOURCE = "EmbeddedDatasource"
 OWNER = "owner"
 USERNAME = "username"
+EMAIL = "email"
 HAS_EXTRACTS = "hasExtracts"
 EXTRACT_LAST_REFRESH_TIME = "extractLastRefreshTime"
 EXTRACT_LAST_INCREMENTAL_UPDATE_TIME = "extractLastIncrementalUpdateTime"
datahub/ingestion/source/tableau/tableau_server_wrapper.py

@@ -1,4 +1,5 @@
 from dataclasses import dataclass
+from typing import Optional
 
 from tableauserverclient import Server, UserItem
 
@@ -10,6 +11,7 @@ class UserInfo:
     user_name: str
     site_role: str
     site_id: str
+    email: Optional[str] = None
 
     def has_site_administrator_explorer_privileges(self):
         return self.site_role in [
@@ -34,4 +36,5 @@ class UserInfo:
             user_name=user.name,
             site_role=user.site_role,
             site_id=server.site_id,
+            email=user.email,
         )