acryl-datahub 1.2.0.9rc1__py3-none-any.whl → 1.2.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (120)
  1. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/METADATA +2568 -2626
  2. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/RECORD +120 -113
  3. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/dataproduct/dataproduct.py +6 -3
  8. datahub/api/entities/dataset/dataset.py +9 -18
  9. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  10. datahub/api/graphql/operation.py +10 -6
  11. datahub/cli/docker_check.py +2 -2
  12. datahub/configuration/common.py +29 -1
  13. datahub/configuration/connection_resolver.py +5 -2
  14. datahub/configuration/import_resolver.py +7 -4
  15. datahub/configuration/pydantic_migration_helpers.py +0 -9
  16. datahub/configuration/source_common.py +3 -2
  17. datahub/configuration/validate_field_deprecation.py +5 -2
  18. datahub/configuration/validate_field_removal.py +5 -2
  19. datahub/configuration/validate_field_rename.py +6 -5
  20. datahub/configuration/validate_multiline_string.py +5 -2
  21. datahub/ingestion/autogenerated/capability_summary.json +45 -1
  22. datahub/ingestion/run/pipeline_config.py +2 -2
  23. datahub/ingestion/source/azure/azure_common.py +1 -1
  24. datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
  25. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  26. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -0
  27. datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
  28. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  29. datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
  30. datahub/ingestion/source/datahub/config.py +8 -9
  31. datahub/ingestion/source/dbt/dbt_common.py +65 -5
  32. datahub/ingestion/source/delta_lake/config.py +1 -1
  33. datahub/ingestion/source/dremio/dremio_config.py +3 -4
  34. datahub/ingestion/source/feast.py +8 -10
  35. datahub/ingestion/source/fivetran/config.py +1 -1
  36. datahub/ingestion/source/gcs/gcs_source.py +19 -2
  37. datahub/ingestion/source/ge_data_profiler.py +15 -2
  38. datahub/ingestion/source/ge_profiling_config.py +26 -22
  39. datahub/ingestion/source/grafana/grafana_config.py +2 -2
  40. datahub/ingestion/source/grafana/models.py +12 -14
  41. datahub/ingestion/source/hex/hex.py +6 -1
  42. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  43. datahub/ingestion/source/kafka_connect/common.py +2 -2
  44. datahub/ingestion/source/looker/looker_common.py +76 -75
  45. datahub/ingestion/source/looker/looker_config.py +15 -4
  46. datahub/ingestion/source/looker/looker_source.py +493 -547
  47. datahub/ingestion/source/looker/lookml_config.py +1 -1
  48. datahub/ingestion/source/looker/lookml_source.py +46 -88
  49. datahub/ingestion/source/metabase.py +9 -2
  50. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  51. datahub/ingestion/source/metadata/lineage.py +1 -1
  52. datahub/ingestion/source/mode.py +13 -5
  53. datahub/ingestion/source/nifi.py +1 -1
  54. datahub/ingestion/source/powerbi/config.py +14 -21
  55. datahub/ingestion/source/preset.py +1 -1
  56. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  57. datahub/ingestion/source/redash.py +1 -1
  58. datahub/ingestion/source/redshift/config.py +6 -3
  59. datahub/ingestion/source/redshift/query.py +23 -19
  60. datahub/ingestion/source/s3/source.py +26 -24
  61. datahub/ingestion/source/salesforce.py +13 -9
  62. datahub/ingestion/source/schema/json_schema.py +14 -14
  63. datahub/ingestion/source/sigma/data_classes.py +3 -0
  64. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  65. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  66. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  67. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  68. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  69. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  70. datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
  71. datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
  72. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
  73. datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
  74. datahub/ingestion/source/sql/athena.py +2 -1
  75. datahub/ingestion/source/sql/clickhouse.py +12 -7
  76. datahub/ingestion/source/sql/cockroachdb.py +5 -3
  77. datahub/ingestion/source/sql/druid.py +2 -2
  78. datahub/ingestion/source/sql/hive.py +4 -3
  79. datahub/ingestion/source/sql/hive_metastore.py +7 -9
  80. datahub/ingestion/source/sql/mssql/source.py +2 -2
  81. datahub/ingestion/source/sql/mysql.py +2 -2
  82. datahub/ingestion/source/sql/oracle.py +3 -3
  83. datahub/ingestion/source/sql/presto.py +2 -1
  84. datahub/ingestion/source/sql/teradata.py +4 -4
  85. datahub/ingestion/source/sql/trino.py +2 -1
  86. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  87. datahub/ingestion/source/sql/vertica.py +1 -1
  88. datahub/ingestion/source/sql_queries.py +6 -6
  89. datahub/ingestion/source/state/checkpoint.py +5 -1
  90. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  91. datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
  92. datahub/ingestion/source/superset.py +122 -15
  93. datahub/ingestion/source/tableau/tableau.py +68 -14
  94. datahub/ingestion/source/tableau/tableau_common.py +5 -0
  95. datahub/ingestion/source/tableau/tableau_constant.py +1 -0
  96. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  97. datahub/ingestion/source/unity/config.py +7 -3
  98. datahub/ingestion/source/usage/usage_common.py +3 -3
  99. datahub/ingestion/source_config/pulsar.py +3 -1
  100. datahub/ingestion/transformer/set_browse_path.py +112 -0
  101. datahub/metadata/_internal_schema_classes.py +728 -528
  102. datahub/metadata/_urns/urn_defs.py +1702 -1702
  103. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  104. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  105. datahub/metadata/schema.avsc +17434 -17732
  106. datahub/metadata/schemas/GlobalSettingsInfo.avsc +72 -0
  107. datahub/metadata/schemas/InstitutionalMemory.avsc +22 -0
  108. datahub/metadata/schemas/LogicalParent.avsc +2 -1
  109. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
  110. datahub/metadata/schemas/MetadataChangeEvent.avsc +22 -0
  111. datahub/sdk/_shared.py +126 -0
  112. datahub/sdk/chart.py +87 -30
  113. datahub/sdk/dashboard.py +79 -34
  114. datahub/sdk/entity_client.py +11 -4
  115. datahub/sdk/lineage_client.py +3 -3
  116. datahub/sdk/search_filters.py +1 -7
  117. datahub/sql_parsing/split_statements.py +13 -0
  118. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/WHEEL +0 -0
  119. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/licenses/LICENSE +0 -0
  120. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/superset.py

@@ -9,9 +9,10 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
  import dateutil.parser as dp
  import requests
  import sqlglot
- from pydantic import BaseModel
- from pydantic.class_validators import root_validator, validator
+ from pydantic import BaseModel, root_validator, validator
  from pydantic.fields import Field
+ from requests.adapters import HTTPAdapter
+ from urllib3.util.retry import Retry
 
  import datahub.emitter.mce_builder as builder
  from datahub.configuration.common import AllowDenyPattern
@@ -109,6 +110,12 @@ logger = logging.getLogger(__name__)
 
  PAGE_SIZE = 25
 
+ # Retry configuration constants
+ RETRY_MAX_TIMES = 3
+ RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
+ RETRY_BACKOFF_FACTOR = 1
+ RETRY_ALLOWED_METHODS = ["GET"]
+
 
  chart_type_from_viz_type = {
  "line": ChartTypeClass.LINE,
@@ -282,6 +289,7 @@ def get_filter_name(filter_obj):
  )
  @capability(SourceCapability.DOMAINS, "Enabled by `domain` config to assign domain_key")
  @capability(SourceCapability.LINEAGE_COARSE, "Supported by default")
+ @capability(SourceCapability.TAGS, "Supported by default")
  class SupersetSource(StatefulIngestionSourceBase):
  """
  This plugin extracts the following:
@@ -327,6 +335,19 @@ class SupersetSource(StatefulIngestionSourceBase):
  logger.debug("Got access token from superset")
 
  requests_session = requests.Session()
+
+ # Configure retry strategy for transient failures
+ retry_strategy = Retry(
+ total=RETRY_MAX_TIMES,
+ status_forcelist=RETRY_STATUS_CODES,
+ backoff_factor=RETRY_BACKOFF_FACTOR,
+ allowed_methods=RETRY_ALLOWED_METHODS,
+ raise_on_status=False,
+ )
+ adapter = HTTPAdapter(max_retries=retry_strategy)
+ requests_session.mount("http://", adapter)
+ requests_session.mount("https://", adapter)
+
  requests_session.headers.update(
  {
  "Authorization": f"Bearer {self.access_token}",
@@ -359,8 +380,13 @@ class SupersetSource(StatefulIngestionSourceBase):
  )
 
  if response.status_code != 200:
- logger.warning(f"Failed to get {entity_type} data: {response.text}")
- continue
+ self.report.warning(
+ title="Failed to fetch data from Superset API",
+ message="Incomplete metadata extraction due to Superset API failure",
+ context=f"Entity Type: {entity_type}, HTTP Status Code: {response.status_code}, Page: {current_page}. Response: {response.text}",
+ )
+ # we stop pagination for this entity type and we continue the overall ingestion
+ break
 
  payload = response.json()
  # Update total_items with the actual count from the response
@@ -521,6 +547,11 @@ class SupersetSource(StatefulIngestionSourceBase):
  )
  dashboard_snapshot.aspects.append(owners_info)
 
+ superset_tags = self._extract_and_map_tags(dashboard_data.get("tags", []))
+ tags = self._merge_tags_with_existing(dashboard_urn, superset_tags)
+ if tags:
+ dashboard_snapshot.aspects.append(tags)
+
  return dashboard_snapshot
 
  def _process_dashboard(self, dashboard_data: Any) -> Iterable[MetadataWorkUnit]:
@@ -919,6 +950,12 @@ class SupersetSource(StatefulIngestionSourceBase):
  lastModified=last_modified,
  )
  chart_snapshot.aspects.append(owners_info)
+
+ superset_tags = self._extract_and_map_tags(chart_data.get("tags", []))
+ tags = self._merge_tags_with_existing(chart_urn, superset_tags)
+ if tags:
+ chart_snapshot.aspects.append(tags)
+
  yield MetadataWorkUnit(
  id=chart_urn, mce=MetadataChangeEvent(proposedSnapshot=chart_snapshot)
  )
@@ -1288,17 +1325,18 @@ class SupersetSource(StatefulIngestionSourceBase):
  externalUrl=dataset_url,
  lastModified=TimeStamp(time=modified_ts),
  )
- global_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
-
- aspects_items: List[Any] = []
- aspects_items.extend(
- [
- self.gen_schema_metadata(dataset_response),
- dataset_info,
- upstream_lineage,
- global_tags,
- ]
- )
+
+ dataset_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
+ tags = self._merge_tags_with_existing(datasource_urn, dataset_tags)
+
+ aspects_items: List[Any] = [
+ self.gen_schema_metadata(dataset_response),
+ dataset_info,
+ upstream_lineage,
+ ]
+
+ if tags:
+ aspects_items.append(tags)
 
  dataset_snapshot = DatasetSnapshot(
  urn=datasource_urn,
@@ -1320,6 +1358,75 @@ class SupersetSource(StatefulIngestionSourceBase):
 
  return dataset_snapshot
 
+ def _extract_and_map_tags(
+ self, raw_tags: List[Dict[str, Any]]
+ ) -> Optional[GlobalTagsClass]:
+ """Extract and map Superset tags to DataHub GlobalTagsClass.
+
+ Filters out system-generated tags (type != 1) and only processes user-defined tags
+ from the Superset API response.
+
+ Args:
+ raw_tags: List of tag dictionaries from Superset API
+
+ Returns:
+ GlobalTagsClass with user-defined tags, or None if no tags found
+ """
+ user_tags = [
+ tag.get("name", "")
+ for tag in raw_tags
+ if tag.get("type") == 1 and tag.get("name")
+ ]
+
+ if not user_tags:
+ return None
+
+ tag_urns = [builder.make_tag_urn(tag) for tag in user_tags]
+ return GlobalTagsClass(
+ tags=[TagAssociationClass(tag=tag_urn) for tag_urn in tag_urns]
+ )
+
+ def _merge_tags_with_existing(
+ self, entity_urn: str, new_tags: Optional[GlobalTagsClass]
+ ) -> Optional[GlobalTagsClass]:
+ """Merge new tags with existing ones from DataHub to preserve manually added tags.
+
+ This method ensures that tags manually added via DataHub UI are not overwritten
+ during ingestion. It fetches existing tags from the graph and merges them with
+ new tags from the source system, avoiding duplicates.
+
+ Args:
+ entity_urn: URN of the entity to check for existing tags
+ new_tags: New tags to add as GlobalTagsClass object
+
+ Returns:
+ GlobalTagsClass with merged tags preserving existing ones, or None if no tags
+ """
+ if not new_tags or not new_tags.tags:
+ return None
+
+ # Fetch existing tags from DataHub
+ existing_global_tags = None
+ if self.ctx.graph:
+ existing_global_tags = self.ctx.graph.get_aspect(
+ entity_urn=entity_urn, aspect_type=GlobalTagsClass
+ )
+
+ # Merge existing tags with new ones, avoiding duplicates
+ all_tags = []
+ existing_tag_urns = set()
+
+ if existing_global_tags and existing_global_tags.tags:
+ all_tags.extend(existing_global_tags.tags)
+ existing_tag_urns = {tag.tag for tag in existing_global_tags.tags}
+
+ # Add new tags that don't already exist
+ for new_tag in new_tags.tags:
+ if new_tag.tag not in existing_tag_urns:
+ all_tags.append(new_tag)
+
+ return GlobalTagsClass(tags=all_tags) if all_tags else None
+
  def _process_dataset(self, dataset_data: Any) -> Iterable[MetadataWorkUnit]:
  dataset_name = ""
  try:
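
Note: the merge semantics of the new `_merge_tags_with_existing` helper can be illustrated in isolation. This is a rough sketch, not package code; only `GlobalTagsClass` and `TagAssociationClass` come from the diff above, and the tag URNs are made up:

    from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass

    # Tags already on the entity in DataHub (e.g. added manually in the UI).
    existing = GlobalTagsClass(tags=[TagAssociationClass(tag="urn:li:tag:pii")])

    # Tags freshly extracted from Superset for the same entity.
    incoming = GlobalTagsClass(
        tags=[
            TagAssociationClass(tag="urn:li:tag:pii"),      # already present -> skipped
            TagAssociationClass(tag="urn:li:tag:finance"),  # new -> appended
        ]
    )

    # Same dedup-by-URN merge the helper performs: existing tags are kept first.
    seen = {t.tag for t in existing.tags}
    merged = GlobalTagsClass(
        tags=existing.tags + [t for t in incoming.tags if t.tag not in seen]
    )
    assert [t.tag for t in merged.tags] == ["urn:li:tag:pii", "urn:li:tag:finance"]
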

datahub/ingestion/source/tableau/tableau.py

@@ -3,6 +3,7 @@ import logging
  import re
  import time
  from collections import OrderedDict, defaultdict
+ from copy import deepcopy
  from dataclasses import dataclass, field as dataclass_field
  from datetime import datetime, timedelta, timezone
  from functools import lru_cache
@@ -474,6 +475,13 @@ class TableauPageSizeConfig(ConfigModel):
  return self.database_table_page_size or self.page_size
 
 
+ _IngestHiddenAssetsOptionsType = Literal["worksheet", "dashboard"]
+ _IngestHiddenAssetsOptions: List[_IngestHiddenAssetsOptionsType] = [
+ "worksheet",
+ "dashboard",
+ ]
+
+
  class TableauConfig(
  DatasetLineageProviderConfigBase,
  StatefulIngestionConfigBase,
@@ -524,6 +532,10 @@ class TableauConfig(
  default=False,
  description="Ingest Owner from source. This will override Owner info entered from UI",
  )
+ use_email_as_username: bool = Field(
+ default=False,
+ description="Use email address instead of username for entity owners. Requires ingest_owner to be True.",
+ )
  ingest_tables_external: bool = Field(
  default=False,
  description="Ingest details for tables external to (not embedded in) tableau as entities.",
@@ -582,13 +594,13 @@ class TableauConfig(
  )
 
  extract_lineage_from_unsupported_custom_sql_queries: bool = Field(
- default=False,
- description="[Experimental] Whether to extract lineage from unsupported custom sql queries using SQL parsing",
+ default=True,
+ description="[Experimental] Extract lineage from Custom SQL queries using DataHub's SQL parser in cases where the Tableau Catalog API fails to return lineage for the query.",
  )
 
  force_extraction_of_lineage_from_custom_sql_queries: bool = Field(
  default=False,
- description="[Experimental] Force extraction of lineage from custom sql queries using SQL parsing, ignoring Tableau metadata",
+ description="[Experimental] Force extraction of lineage from Custom SQL queries using DataHub's SQL parser, even when the Tableau Catalog API returns lineage already.",
  )
 
  sql_parsing_disable_schema_awareness: bool = Field(
@@ -621,8 +633,8 @@ class TableauConfig(
  description="Configuration settings for ingesting Tableau groups and their capabilities as custom properties.",
  )
 
- ingest_hidden_assets: Union[List[Literal["worksheet", "dashboard"]], bool] = Field(
- default=["worksheet", "dashboard"],
+ ingest_hidden_assets: Union[List[_IngestHiddenAssetsOptionsType], bool] = Field(
+ _IngestHiddenAssetsOptions,
  description=(
  "When enabled, hidden worksheets and dashboards are ingested into Datahub."
  " If a dashboard or worksheet is hidden in Tableau the luid is blank."
@@ -644,6 +656,11 @@ class TableauConfig(
  # pre = True because we want to take some decision before pydantic initialize the configuration to default values
  @root_validator(pre=True)
  def projects_backward_compatibility(cls, values: Dict) -> Dict:
+ # In-place update of the input dict would cause state contamination. This was discovered through test failures
+ # in test_hex.py where the same dict is reused.
+ # So a copy is performed first.
+ values = deepcopy(values)
+
  projects = values.get("projects")
  project_pattern = values.get("project_pattern")
  project_path_pattern = values.get("project_path_pattern")
@@ -655,6 +672,7 @@ class TableauConfig(
  values["project_pattern"] = AllowDenyPattern(
  allow=[f"^{prj}$" for prj in projects]
  )
+ values.pop("projects")
  elif (project_pattern or project_path_pattern) and projects:
  raise ValueError(
  "projects is deprecated. Please use project_path_pattern only."
@@ -666,7 +684,7 @@ class TableauConfig(
 
  return values
 
- @root_validator()
+ @root_validator(skip_on_failure=True)
  def validate_config_values(cls, values: Dict) -> Dict:
  tags_for_hidden_assets = values.get("tags_for_hidden_assets")
  ingest_tags = values.get("ingest_tags")
@@ -678,6 +696,14 @@ class TableauConfig(
  raise ValueError(
  "tags_for_hidden_assets is only allowed with ingest_tags enabled. Be aware that this will overwrite tags entered from the UI."
  )
+
+ use_email_as_username = values.get("use_email_as_username")
+ ingest_owner = values.get("ingest_owner")
+ if use_email_as_username and not ingest_owner:
+ raise ValueError(
+ "use_email_as_username requires ingest_owner to be enabled."
+ )
+
  return values
 
 
@@ -839,6 +865,9 @@ class TableauSourceReport(
  default_factory=(lambda: defaultdict(int))
  )
 
+ # Owner extraction statistics
+ num_email_fallback_to_username: int = 0
+
 
  def report_user_role(report: TableauSourceReport, server: Server) -> None:
  title: str = "Insufficient Permissions"
@@ -2716,13 +2745,12 @@ class TableauSiteSource:
  dataset_snapshot.aspects.append(browse_paths)
 
  # Ownership
- owner = (
- self._get_ownership(datasource_info[c.OWNER][c.USERNAME])
- if datasource_info
- and datasource_info.get(c.OWNER)
- and datasource_info[c.OWNER].get(c.USERNAME)
+ owner_identifier = (
+ self._get_owner_identifier(datasource_info[c.OWNER])
+ if datasource_info and datasource_info.get(c.OWNER)
  else None
  )
+ owner = self._get_ownership(owner_identifier) if owner_identifier else None
  if owner is not None:
  dataset_snapshot.aspects.append(owner)
 
@@ -3127,7 +3155,7 @@ class TableauSiteSource:
 
  creator: Optional[str] = None
  if workbook is not None and workbook.get(c.OWNER) is not None:
- creator = workbook[c.OWNER].get(c.USERNAME)
+ creator = self._get_owner_identifier(workbook[c.OWNER])
  created_at = sheet.get(c.CREATED_AT, datetime.now())
  updated_at = sheet.get(c.UPDATED_AT, datetime.now())
  last_modified = self.get_last_modified(creator, created_at, updated_at)
@@ -3276,7 +3304,7 @@ class TableauSiteSource:
 
  def emit_workbook_as_container(self, workbook: Dict) -> Iterable[MetadataWorkUnit]:
  workbook_container_key = self.gen_workbook_key(workbook[c.ID])
- creator = workbook.get(c.OWNER, {}).get(c.USERNAME)
+ creator = self._get_owner_identifier(workbook.get(c.OWNER, {}))
 
  owner_urn = (
  builder.make_user_urn(creator)
@@ -3458,7 +3486,7 @@ class TableauSiteSource:
 
  creator: Optional[str] = None
  if workbook is not None and workbook.get(c.OWNER) is not None:
- creator = workbook[c.OWNER].get(c.USERNAME)
+ creator = self._get_owner_identifier(workbook[c.OWNER])
  created_at = dashboard.get(c.CREATED_AT, datetime.now())
  updated_at = dashboard.get(c.UPDATED_AT, datetime.now())
  last_modified = self.get_last_modified(creator, created_at, updated_at)
@@ -3605,6 +3633,20 @@ class TableauSiteSource:
  )
  return last_modified
 
+ def _get_owner_identifier(self, owner_dict: dict) -> Optional[str]:
+ """Extract owner identifier (email or username) based on configuration."""
+ if not owner_dict:
+ return None
+
+ if self.config.use_email_as_username:
+ email = owner_dict.get(c.EMAIL)
+ if email:
+ return email
+ # Fall back to username if email is not available
+ self.report.num_email_fallback_to_username += 1
+
+ return owner_dict.get(c.USERNAME)
+
  @lru_cache(maxsize=None)
  def _get_ownership(self, user: str) -> Optional[OwnershipClass]:
  if self.config.ingest_owner and user:
@@ -3828,3 +3870,15 @@ class TableauSiteSource:
  self.report.emit_upstream_tables_timer[self.site_content_url] = (
  timer.elapsed_seconds(digits=2)
  )
+
+ # Log owner extraction statistics if there were fallbacks
+ if (
+ self.config.use_email_as_username
+ and self.config.ingest_owner
+ and self.report.num_email_fallback_to_username > 0
+ ):
+ logger.info(
+ f"Owner extraction summary for site '{self.site_content_url}': "
+ f"{self.report.num_email_fallback_to_username} entities fell back from email to username "
+ f"(email was not available)"
+ )
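
Note: a rough sketch of how the new `use_email_as_username` option might be used from Python. Only `ingest_owner` and `use_email_as_username` are taken from the diff above; the connection fields and values are placeholders and should be checked against the Tableau source documentation:

    from datahub.ingestion.run.pipeline import Pipeline

    pipeline = Pipeline.create(
        {
            "source": {
                "type": "tableau",
                "config": {
                    "connect_uri": "https://tableau.example.com",  # placeholder
                    "site": "acme",                                # placeholder
                    "token_name": "ingestion-token",               # placeholder
                    "token_value": "${TABLEAU_TOKEN}",             # placeholder
                    "ingest_owner": True,           # required by the validator above
                    "use_email_as_username": True,  # new option in this release
                },
            },
            "sink": {
                "type": "datahub-rest",
                "config": {"server": "http://localhost:8080"},     # placeholder
            },
        }
    )
    pipeline.run()
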

datahub/ingestion/source/tableau/tableau_common.py

@@ -65,6 +65,7 @@ workbook_graphql_query = """
  projectName
  owner {
  username
+ email
  }
  description
  uri
@@ -107,6 +108,7 @@ sheet_graphql_query = """
  luid
  owner {
  username
+ email
  }
  }
  datasourceFields {
@@ -185,6 +187,7 @@ dashboard_graphql_query = """
  luid
  owner {
  username
+ email
  }
  }
  }
@@ -268,6 +271,7 @@ embedded_datasource_graphql_query = """
  luid
  owner {
  username
+ email
  }
  }
  }
@@ -424,6 +428,7 @@ published_datasource_graphql_query = """
  }
  owner {
  username
+ email
  }
  description
  uri

datahub/ingestion/source/tableau/tableau_constant.py

@@ -59,6 +59,7 @@ LUID = "luid"
  EMBEDDED_DATA_SOURCE = "EmbeddedDatasource"
  OWNER = "owner"
  USERNAME = "username"
+ EMAIL = "email"
  HAS_EXTRACTS = "hasExtracts"
  EXTRACT_LAST_REFRESH_TIME = "extractLastRefreshTime"
  EXTRACT_LAST_INCREMENTAL_UPDATE_TIME = "extractLastIncrementalUpdateTime"

datahub/ingestion/source/tableau/tableau_server_wrapper.py

@@ -1,4 +1,5 @@
  from dataclasses import dataclass
+ from typing import Optional
 
  from tableauserverclient import Server, UserItem
 
@@ -10,6 +11,7 @@ class UserInfo:
  user_name: str
  site_role: str
  site_id: str
+ email: Optional[str] = None
 
  def has_site_administrator_explorer_privileges(self):
  return self.site_role in [
@@ -34,4 +36,5 @@ class UserInfo:
  user_name=user.name,
  site_role=user.site_role,
  site_id=server.site_id,
+ email=user.email,
  )

datahub/ingestion/source/unity/config.py

@@ -8,7 +8,12 @@ import pydantic
  from pydantic import Field
  from typing_extensions import Literal
 
- from datahub.configuration.common import AllowDenyPattern, ConfigEnum, ConfigModel
+ from datahub.configuration.common import (
+ AllowDenyPattern,
+ ConfigEnum,
+ ConfigModel,
+ HiddenFromDocs,
+ )
  from datahub.configuration.source_common import (
  DatasetSourceConfigMixin,
  LowerCaseDatasetUrnConfigMixin,
@@ -285,10 +290,9 @@ class UnityCatalogSourceConfig(
  description="Limit the number of columns to get column level lineage. ",
  )
 
- lineage_max_workers: int = pydantic.Field(
+ lineage_max_workers: HiddenFromDocs[int] = pydantic.Field(
  default=5 * (os.cpu_count() or 4),
  description="Number of worker threads to use for column lineage thread pool executor. Set to 1 to disable.",
- hidden_from_docs=True,
  )
 
  databricks_api_page_size: int = pydantic.Field(

datahub/ingestion/source/usage/usage_common.py

@@ -18,7 +18,7 @@ import pydantic
  from pydantic.fields import Field
 
  import datahub.emitter.mce_builder as builder
- from datahub.configuration.common import AllowDenyPattern
+ from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
  from datahub.configuration.time_window_config import (
  BaseTimeWindowConfig,
  BucketDuration,
@@ -194,13 +194,13 @@ class GenericAggregatedDataset(Generic[ResourceType]):
 
 
  class BaseUsageConfig(BaseTimeWindowConfig):
- queries_character_limit: int = Field(
+ queries_character_limit: HiddenFromDocs[int] = Field(
+ # Hidden since we don't want to encourage people to break elasticsearch.
  default=DEFAULT_QUERIES_CHARACTER_LIMIT,
  description=(
  "Total character limit for all queries in a single usage aspect."
  " Queries will be truncated to length `queries_character_limit / top_n_queries`."
  ),
- hidden_from_docs=True, # Don't want to encourage people to break elasticsearch
  )
 
  top_n_queries: pydantic.PositiveInt = Field(

datahub/ingestion/source_config/pulsar.py

@@ -2,6 +2,7 @@ import re
  from typing import Dict, List, Optional, Union
  from urllib.parse import urlparse
 
+ import pydantic
  from pydantic import Field, validator
 
  from datahub.configuration.common import AllowDenyPattern
@@ -121,7 +122,8 @@ class PulsarSourceConfig(
  )
  return client_secret
 
- @validator("web_service_url")
+ @pydantic.field_validator("web_service_url", mode="after")
+ @classmethod
  def web_service_url_scheme_host_port(cls, val: str) -> str:
  # Tokenize the web url
  url = urlparse(val)
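
Note: the change above moves the validator to pydantic v2 style. A minimal standalone sketch of the same pattern (assuming pydantic v2 is installed; the model and validation logic here are hypothetical, not the package's own):

    from urllib.parse import urlparse

    import pydantic


    class ExampleConfig(pydantic.BaseModel):
        web_service_url: str

        # field_validator replaces the legacy @validator decorator, and the
        # validator must now be declared as a classmethod.
        @pydantic.field_validator("web_service_url", mode="after")
        @classmethod
        def web_service_url_scheme_host_port(cls, val: str) -> str:
            url = urlparse(val)
            if url.scheme not in ("http", "https") or not url.netloc:
                raise ValueError(f"Not a valid web service URL: {val}")
            return val


    print(ExampleConfig(web_service_url="http://localhost:8080").web_service_url)
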

datahub/ingestion/transformer/set_browse_path.py (new file)

@@ -0,0 +1,112 @@
+ import re
+ from collections import defaultdict
+ from typing import Dict, List, Optional, cast
+
+ from datahub.configuration.common import (
+ TransformerSemanticsConfigModel,
+ )
+ from datahub.emitter.mce_builder import Aspect
+ from datahub.ingestion.api.common import PipelineContext
+ from datahub.ingestion.transformer.base_transformer import (
+ BaseTransformer,
+ SingleAspectTransformer,
+ )
+ from datahub.metadata.schema_classes import (
+ BrowsePathEntryClass,
+ BrowsePathsV2Class,
+ )
+ from datahub.utilities.urns.urn import guess_entity_type
+
+
+ class SetBrowsePathTransformerConfig(TransformerSemanticsConfigModel):
+ path: List[str]
+
+
+ class SetBrowsePathTransformer(BaseTransformer, SingleAspectTransformer):
+ ctx: PipelineContext
+ config: SetBrowsePathTransformerConfig
+
+ def __init__(self, config: SetBrowsePathTransformerConfig, ctx: PipelineContext):
+ super().__init__()
+ self.ctx = ctx
+ self.config = config
+
+ def aspect_name(self) -> str:
+ return "browsePathsV2"
+
+ def entity_types(self) -> List[str]:
+ # This is an arbitrary list, might be adjusted if it makes sense. It might be reasonable to make it configurable
+ return ["dataset", "dataJob", "dataFlow", "chart", "dashboard", "container"]
+
+ @classmethod
+ def create(
+ cls, config_dict: dict, ctx: PipelineContext
+ ) -> "SetBrowsePathTransformer":
+ config = SetBrowsePathTransformerConfig.parse_obj(config_dict)
+ return cls(config, ctx)
+
+ @staticmethod
+ def _build_model(existing_browse_paths: BrowsePathsV2Class) -> Dict[str, List[str]]:
+ template_vars: Dict[str, List[str]] = {}
+ model: Dict[str, List[str]] = defaultdict(list)
+ for entry in existing_browse_paths.path or []:
+ if entry.urn:
+ entity_type = guess_entity_type(entry.urn)
+ model[entity_type].append(entry.urn)
+
+ for entity_type, urns in model.items():
+ template_vars[f"{entity_type}[*]"] = urns
+ for i, urn in enumerate(urns):
+ template_vars[f"{entity_type}[{i}]"] = [urn]
+
+ return template_vars
+
+ @classmethod
+ def _expand_nodes(
+ cls, templates: List[str], template_vars: Dict[str, List[str]]
+ ) -> BrowsePathsV2Class:
+ expanded_nodes: List[str] = []
+ for node in templates:
+ resolved_nodes = cls._resolve_template_to_nodes(node, template_vars)
+ expanded_nodes.extend(resolved_nodes)
+
+ processed_entries: List[BrowsePathEntryClass] = []
+ for node in expanded_nodes:
+ if not node or node.isspace():
+ continue
+ processed_entries.append(
+ BrowsePathEntryClass(
+ id=node, urn=node if node.startswith("urn:") else None
+ )
+ )
+ return BrowsePathsV2Class(path=processed_entries)
+
+ def transform_aspect(
+ self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect]
+ ) -> Optional[Aspect]:
+ template_vars: Dict[str, List[str]] = {}
+ if aspect is not None:
+ assert isinstance(aspect, BrowsePathsV2Class)
+ template_vars = self._build_model(aspect)
+ new_browse_paths: BrowsePathsV2Class = self._expand_nodes(
+ self.config.path, template_vars
+ )
+ if aspect is not None and not self.config.replace_existing:
+ for node in aspect.path:
+ new_browse_paths.path.append(node)
+
+ return cast(Aspect, new_browse_paths)
+
+ @staticmethod
+ def _resolve_template_to_nodes(
+ template_str: str, template_vars: Dict[str, List[str]]
+ ) -> List[str]:
+ # This mechanism can be made simpler (match against known variables only) or more complex (e.g. by using a
+ # proper templating engine, like jinja).
+ template_str = template_str.strip()
+ var_pattern = re.findall(r"^\$([a-zA-Z]+\[[0-9*]+]$)", template_str)
+
+ if not var_pattern:
+ return [template_str]
+
+ return template_vars.get(var_pattern[0], [])
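
Note: the new transformer resolves `$<entityType>[<index or *>]` templates against the entity's existing browsePathsV2 entries. A rough, hypothetical invocation (the recipe name under which the transformer is registered is not shown in this diff, so the classes are used directly; the URNs are made up):

    from datahub.ingestion.api.common import PipelineContext
    from datahub.ingestion.transformer.set_browse_path import (
        SetBrowsePathTransformer,
        SetBrowsePathTransformerConfig,
    )
    from datahub.metadata.schema_classes import BrowsePathEntryClass, BrowsePathsV2Class

    # Prepend a fixed folder, then keep the entity's first container entry.
    config = SetBrowsePathTransformerConfig(
        path=["Curated", "$container[0]"],
        replace_existing=True,
    )
    transformer = SetBrowsePathTransformer(config, PipelineContext(run_id="demo"))

    existing = BrowsePathsV2Class(
        path=[BrowsePathEntryClass(id="urn:li:container:abc", urn="urn:li:container:abc")]
    )
    new_aspect = transformer.transform_aspect(
        entity_urn="urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",
        aspect_name="browsePathsV2",
        aspect=existing,  # type: ignore[arg-type]
    )
    # Resulting path ids: ["Curated", "urn:li:container:abc"]
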