acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (133)
  1. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
  2. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
  3. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/common/subtypes.py +2 -0
  35. datahub/ingestion/source/csv_enricher.py +1 -1
  36. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  37. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  38. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  39. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  40. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  41. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  42. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  43. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  44. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  45. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  46. datahub/ingestion/source/elastic_search.py +1 -1
  47. datahub/ingestion/source/feast.py +97 -6
  48. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  49. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  50. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  51. datahub/ingestion/source/ge_data_profiler.py +23 -1
  52. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  53. datahub/ingestion/source/kafka/kafka.py +39 -19
  54. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  55. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  56. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  57. datahub/ingestion/source/looker/view_upstream.py +65 -30
  58. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  59. datahub/ingestion/source/mode.py +0 -23
  60. datahub/ingestion/source/neo4j/__init__.py +0 -0
  61. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  62. datahub/ingestion/source/powerbi/__init__.py +0 -1
  63. datahub/ingestion/source/powerbi/config.py +3 -3
  64. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  65. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  66. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  67. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  68. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  69. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  70. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  71. datahub/ingestion/source/preset.py +1 -0
  72. datahub/ingestion/source/pulsar.py +21 -2
  73. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  74. datahub/ingestion/source/redash.py +13 -63
  75. datahub/ingestion/source/redshift/config.py +1 -0
  76. datahub/ingestion/source/redshift/redshift.py +3 -0
  77. datahub/ingestion/source/s3/source.py +2 -3
  78. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  79. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  80. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  81. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  82. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  83. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  84. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  85. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  86. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  87. datahub/ingestion/source/sql/athena.py +46 -22
  88. datahub/ingestion/source/sql/mssql/source.py +0 -2
  89. datahub/ingestion/source/sql/sql_common.py +34 -21
  90. datahub/ingestion/source/sql/sql_report.py +1 -0
  91. datahub/ingestion/source/sql/sql_types.py +85 -8
  92. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  93. datahub/ingestion/source/superset.py +215 -65
  94. datahub/ingestion/source/tableau/tableau.py +237 -76
  95. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  96. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  97. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  98. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  99. datahub/ingestion/source/unity/proxy_types.py +1 -0
  100. datahub/ingestion/source/unity/source.py +4 -0
  101. datahub/ingestion/source/unity/usage.py +20 -11
  102. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  103. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  104. datahub/integrations/assertion/common.py +1 -1
  105. datahub/lite/duckdb_lite.py +12 -17
  106. datahub/metadata/_schema_classes.py +512 -392
  107. datahub/metadata/_urns/urn_defs.py +1355 -1355
  108. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  109. datahub/metadata/schema.avsc +17222 -17499
  110. datahub/metadata/schemas/FormInfo.avsc +4 -0
  111. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  112. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  113. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  114. datahub/specific/chart.py +0 -39
  115. datahub/specific/dashboard.py +0 -39
  116. datahub/specific/datajob.py +7 -57
  117. datahub/sql_parsing/schema_resolver.py +23 -0
  118. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  119. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  120. datahub/sql_parsing/sqlglot_utils.py +8 -2
  121. datahub/telemetry/telemetry.py +23 -9
  122. datahub/testing/compare_metadata_json.py +1 -1
  123. datahub/testing/doctest.py +12 -0
  124. datahub/utilities/file_backed_collections.py +35 -2
  125. datahub/utilities/partition_executor.py +1 -1
  126. datahub/utilities/urn_encoder.py +2 -1
  127. datahub/utilities/urns/_urn_base.py +1 -1
  128. datahub/utilities/urns/structured_properties_urn.py +1 -1
  129. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  130. datahub/utilities/sql_parser.py +0 -94
  131. datahub/utilities/sql_parser_base.py +0 -21
  132. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  133. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
@@ -68,6 +68,7 @@ from datahub.ingestion.api.source import (
68
68
  CapabilityReport,
69
69
  MetadataWorkUnitProcessor,
70
70
  Source,
71
+ StructuredLogLevel,
71
72
  TestableSource,
72
73
  TestConnectionReport,
73
74
  )
@@ -110,6 +111,8 @@ from datahub.ingestion.source.tableau.tableau_common import (
110
111
  tableau_field_to_schema_field,
111
112
  workbook_graphql_query,
112
113
  )
114
+ from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
115
+ from datahub.ingestion.source.tableau.tableau_validation import check_user_role
113
116
  from datahub.metadata.com.linkedin.pegasus2avro.common import (
114
117
  AuditStamp,
115
118
  ChangeAuditStamps,
@@ -166,7 +169,7 @@ from datahub.utilities.urns.dataset_urn import DatasetUrn
166
169
 
167
170
  try:
168
171
  # On earlier versions of the tableauserverclient, the NonXMLResponseError
169
- # was thrown when reauthentication was needed. We'll keep both exceptions
172
+ # was thrown when reauthentication was necessary. We'll keep both exceptions
170
173
  # around for now, but can remove this in the future.
171
174
  from tableauserverclient.server.endpoint.exceptions import ( # type: ignore
172
175
  NotSignedInError,
@@ -289,16 +292,12 @@ class TableauConnectionConfig(ConfigModel):
289
292
  server.auth.sign_in(authentication)
290
293
  return server
291
294
  except ServerResponseError as e:
295
+ message = f"Unable to login (invalid/expired credentials or missing permissions): {str(e)}"
292
296
  if isinstance(authentication, PersonalAccessTokenAuth):
293
297
  # Docs on token expiry in Tableau:
294
298
  # https://help.tableau.com/current/server/en-us/security_personal_access_tokens.htm#token-expiry
295
- logger.info(
296
- "Error authenticating with Tableau. Note that Tableau personal access tokens "
297
- "expire if not used for 15 days or if over 1 year old"
298
- )
299
- raise ValueError(
300
- f"Unable to login (invalid/expired credentials or missing permissions): {str(e)}"
301
- ) from e
299
+ message = f"Error authenticating with Tableau. Note that Tableau personal access tokens expire if not used for 15 days or if over 1 year old: {str(e)}"
300
+ raise ValueError(message) from e
302
301
  except Exception as e:
303
302
  raise ValueError(
304
303
  f"Unable to login (check your Tableau connection and credentials): {str(e)}"
@@ -356,7 +355,7 @@ class TableauConfig(
356
355
 
357
356
  project_path_separator: str = Field(
358
357
  default="/",
359
- description="The separator used for the project_pattern field between project names. By default, we use a slash. "
358
+ description="The separator used for the project_path_pattern field between project names. By default, we use a slash. "
360
359
  "You can change this if your Tableau projects contain slashes in their names, and you'd like to filter by project.",
361
360
  )
362
361
 
@@ -488,6 +487,18 @@ class TableauConfig(
488
487
  description="Configuration settings for ingesting Tableau groups and their capabilities as custom properties.",
489
488
  )
490
489
 
490
+ ingest_hidden_assets: bool = Field(
491
+ True,
492
+ description="When enabled, hidden views and dashboards are ingested into Datahub. "
493
+ "If a dashboard or view is hidden in Tableau the luid is blank. Default of this config field is True.",
494
+ )
495
+
496
+ tags_for_hidden_assets: List[str] = Field(
497
+ default=[],
498
+ description="Tags to be added to hidden dashboards and views. If a dashboard or view is hidden in Tableau the luid is blank. "
499
+ "This can only be used with ingest_tags enabled as it will overwrite tags entered from the UI.",
500
+ )
501
+
491
502
  # pre = True because we want to take some decision before pydantic initialize the configuration to default values
492
503
  @root_validator(pre=True)
493
504
  def projects_backward_compatibility(cls, values: Dict) -> Dict:
@@ -513,6 +524,20 @@ class TableauConfig(
513
524
 
514
525
  return values
515
526
 
527
+ @root_validator()
528
+ def validate_config_values(cls, values: Dict) -> Dict:
529
+ tags_for_hidden_assets = values.get("tags_for_hidden_assets")
530
+ ingest_tags = values.get("ingest_tags")
531
+ if (
532
+ not ingest_tags
533
+ and tags_for_hidden_assets
534
+ and len(tags_for_hidden_assets) > 0
535
+ ):
536
+ raise ValueError(
537
+ "tags_for_hidden_assets is only allowed with ingest_tags enabled. Be aware that this will overwrite tags entered from the UI."
538
+ )
539
+ return values
540
+
516
541
 
517
542
  class WorkbookKey(ContainerKey):
518
543
  workbook_id: str
@@ -599,7 +624,43 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
599
624
  num_datasource_field_skipped_no_name: int = 0
600
625
  num_csql_field_skipped_no_name: int = 0
601
626
  num_table_field_skipped_no_name: int = 0
627
+ # lineage
628
+ num_tables_with_upstream_lineage: int = 0
629
+ num_upstream_table_lineage: int = 0
630
+ num_upstream_fine_grained_lineage: int = 0
602
631
  num_upstream_table_skipped_no_name: int = 0
632
+ num_upstream_table_skipped_no_columns: int = 0
633
+ num_upstream_table_failed_generate_reference: int = 0
634
+ num_upstream_table_lineage_failed_parse_sql: int = 0
635
+ num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
636
+ num_hidden_assets_skipped: int = 0
637
+ logged_in_user: List[UserInfo] = []
638
+
639
+
640
+ def report_user_role(report: TableauSourceReport, server: Server) -> None:
641
+ title: str = "Insufficient Permissions"
642
+ message: str = "The user must have the `Site Administrator Explorer` role to perform metadata ingestion."
643
+ try:
644
+ # TableauSiteSource instance is per site, so each time we need to find-out user detail
645
+ # the site-role might be different on another site
646
+ logged_in_user: UserInfo = UserInfo.from_server(server=server)
647
+
648
+ if not logged_in_user.is_site_administrator_explorer():
649
+ report.warning(
650
+ title=title,
651
+ message=message,
652
+ context=f"user-name={logged_in_user.user_name}, role={logged_in_user.site_role}, site_id={logged_in_user.site_id}",
653
+ )
654
+
655
+ report.logged_in_user.append(logged_in_user)
656
+
657
+ except Exception as e:
658
+ report.warning(
659
+ title=title,
660
+ message="Failed to verify the user's role. The user must have `Site Administrator Explorer` role.",
661
+ context=f"{e}",
662
+ exc=e,
663
+ )
603
664
 
604
665
 
605
666
  @platform_name("Tableau")
@@ -644,6 +705,7 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
644
705
  try:
645
706
  logger.info(f"Authenticated to Tableau site: '{site_content_url}'")
646
707
  self.server = self.config.make_tableau_client(site_content_url)
708
+ report_user_role(report=self.report, server=self.server)
647
709
  # Note that we're not catching ConfigurationError, since we want that to throw.
648
710
  except ValueError as e:
649
711
  self.report.failure(
@@ -657,9 +719,17 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
657
719
  test_report = TestConnectionReport()
658
720
  try:
659
721
  source_config = TableauConfig.parse_obj_allow_extras(config_dict)
660
- source_config.make_tableau_client(source_config.site)
722
+
723
+ server = source_config.make_tableau_client(source_config.site)
724
+
661
725
  test_report.basic_connectivity = CapabilityReport(capable=True)
726
+
727
+ test_report.capability_report = check_user_role(
728
+ logged_in_user=UserInfo.from_server(server=server)
729
+ )
730
+
662
731
  except Exception as e:
732
+ logger.warning(f"{e}", exc_info=e)
663
733
  test_report.basic_connectivity = CapabilityReport(
664
734
  capable=False, failure_reason=str(e)
665
735
  )
@@ -700,6 +770,7 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
700
770
  config=self.config,
701
771
  ctx=self.ctx,
702
772
  site=site,
773
+ site_id=site.id,
703
774
  report=self.report,
704
775
  server=self.server,
705
776
  platform=self.platform,
@@ -707,11 +778,19 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
707
778
  logger.info(f"Ingesting assets of site '{site.content_url}'.")
708
779
  yield from site_source.ingest_tableau_site()
709
780
  else:
710
- site = self.server.sites.get_by_id(self.server.site_id)
781
+ site = None
782
+ with self.report.report_exc(
783
+ title="Unable to fetch site details. Site hierarchy may be incomplete and external urls may be missing.",
784
+ message="This usually indicates missing permissions. Ensure that you have all necessary permissions.",
785
+ level=StructuredLogLevel.WARN,
786
+ ):
787
+ site = self.server.sites.get_by_id(self.server.site_id)
788
+
711
789
  site_source = TableauSiteSource(
712
790
  config=self.config,
713
791
  ctx=self.ctx,
714
792
  site=site,
793
+ site_id=self.server.site_id,
715
794
  report=self.report,
716
795
  server=self.server,
717
796
  platform=self.platform,
@@ -722,6 +801,7 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
722
801
  title="Failed to Retrieve Tableau Metadata",
723
802
  message="Unable to retrieve metadata from tableau.",
724
803
  context=str(md_exception),
804
+ exc=md_exception,
725
805
  )
726
806
 
727
807
  def close(self) -> None:
@@ -743,7 +823,8 @@ class TableauSiteSource:
743
823
  self,
744
824
  config: TableauConfig,
745
825
  ctx: PipelineContext,
746
- site: SiteItem,
826
+ site: Optional[SiteItem],
827
+ site_id: Optional[str],
747
828
  report: TableauSourceReport,
748
829
  server: Server,
749
830
  platform: str,
@@ -752,9 +833,16 @@ class TableauSiteSource:
752
833
  self.report = report
753
834
  self.server: Server = server
754
835
  self.ctx: PipelineContext = ctx
755
- self.site: SiteItem = site
756
836
  self.platform = platform
757
837
 
838
+ self.site: Optional[SiteItem] = site
839
+ if site_id is not None:
840
+ self.site_id: str = site_id
841
+ else:
842
+ assert self.site is not None, "site or site_id is required"
843
+ assert self.site.id is not None, "site_id is required when site is provided"
844
+ self.site_id = self.site.id
845
+
758
846
  self.database_tables: Dict[str, DatabaseTable] = {}
759
847
  self.tableau_stat_registry: Dict[str, UsageStat] = {}
760
848
  self.tableau_project_registry: Dict[str, TableauProject] = {}
@@ -781,6 +869,8 @@ class TableauSiteSource:
781
869
  # when emitting custom SQL data sources.
782
870
  self.custom_sql_ids_being_used: List[str] = []
783
871
 
872
+ report_user_role(report=report, server=server)
873
+
784
874
  @property
785
875
  def no_env_browse_prefix(self) -> str:
786
876
  # Prefix to use with browse path (v1)
@@ -808,7 +898,7 @@ class TableauSiteSource:
808
898
  def _re_authenticate(self):
809
899
  tableau_auth: Union[
810
900
  TableauAuth, PersonalAccessTokenAuth
811
- ] = self.config.get_tableau_auth(self.site.content_url)
901
+ ] = self.config.get_tableau_auth(self.site_id)
812
902
  self.server.auth.sign_in(tableau_auth)
813
903
 
814
904
  @property
@@ -826,6 +916,7 @@ class TableauSiteSource:
826
916
  if not view.id:
827
917
  continue
828
918
  self.tableau_stat_registry[view.id] = UsageStat(view_count=view.total_views)
919
+ logger.info(f"Got Tableau stats for {len(self.tableau_stat_registry)} assets")
829
920
  logger.debug("Tableau stats %s", self.tableau_stat_registry)
830
921
 
831
922
  def _populate_database_server_hostname_map(self) -> None:
@@ -876,7 +967,7 @@ class TableauSiteSource:
876
967
  ancestors = [cur_proj.name]
877
968
  while cur_proj.parent_id is not None:
878
969
  if cur_proj.parent_id not in all_project_map:
879
- self.report.report_warning(
970
+ self.report.warning(
880
971
  "project-issue",
881
972
  f"Parent project {cur_proj.parent_id} not found. We need Site Administrator Explorer permissions.",
882
973
  )
@@ -908,19 +999,36 @@ class TableauSiteSource:
908
999
  return is_allowed
909
1000
 
910
1001
  def _is_denied_project(self, project: TableauProject) -> bool:
911
- # Either project name or project path should exist in deny
912
- for deny_pattern in self.config.project_pattern.deny:
913
- # Either name or project path is denied
914
- if re.match(
915
- deny_pattern, project.name, self.config.project_pattern.regex_flags
916
- ) or re.match(
917
- deny_pattern,
918
- self._get_project_path(project),
919
- self.config.project_pattern.regex_flags,
920
- ):
921
- return True
922
- logger.info(f"project({project.name}) is not denied as per project_pattern")
923
- return False
1002
+ """
1003
+ Why use an explicit denial check instead of the `AllowDenyPattern.allowed` method?
1004
+
1005
+ Consider a scenario where a Tableau site contains four projects: A, B, C, and D, with the following hierarchical relationship:
1006
+
1007
+ - **A**
1008
+ - **B** (Child of A)
1009
+ - **C** (Child of A)
1010
+ - **D**
1011
+
1012
+ In this setup:
1013
+
1014
+ - `project_pattern` is configured with `allow: ["A"]` and `deny: ["B"]`.
1015
+ - `extract_project_hierarchy` is set to `True`.
1016
+
1017
+ The goal is to extract assets from project A and its children while explicitly denying the child project B.
1018
+
1019
+ If we rely solely on the `project_pattern.allowed()` method, project C's assets will not be ingested.
1020
+ This happens because project C is not explicitly included in the `allow` list, nor is it part of the `deny` list.
1021
+ However, since `extract_project_hierarchy` is enabled, project C should ideally be included in the ingestion process unless explicitly denied.
1022
+
1023
+ To address this, the function explicitly checks the deny regex to ensure that project C’s assets are ingested if it is not specifically denied in the deny list. This approach ensures that the hierarchy is respected while adhering to the configured allow/deny rules.
1024
+ """
1025
+
1026
+ # Either project_pattern or project_path_pattern is set in a recipe
1027
+ # TableauConfig.projects_backward_compatibility ensures that at least one of these properties is configured.
1028
+
1029
+ return self.config.project_pattern.denied(
1030
+ project.name
1031
+ ) or self.config.project_path_pattern.denied(self._get_project_path(project))
924
1032
 
925
1033
  def _init_tableau_project_registry(self, all_project_map: dict) -> None:
926
1034
  list_of_skip_projects: List[TableauProject] = []
@@ -948,9 +1056,11 @@ class TableauSiteSource:
948
1056
  for project in list_of_skip_projects:
949
1057
  if (
950
1058
  project.parent_id in projects_to_ingest
951
- and self._is_denied_project(project) is False
1059
+ and not self._is_denied_project(project)
952
1060
  ):
953
- logger.debug(f"Project {project.name} is added in project registry")
1061
+ logger.debug(
1062
+ f"Project {project.name} is added in project registry as it's a child project and not explicitly denied in `deny` list"
1063
+ )
954
1064
  projects_to_ingest[project.id] = project
955
1065
 
956
1066
  # We rely on automatic browse paths (v2) when creating containers. That's why we need to sort the projects here.
@@ -974,8 +1084,11 @@ class TableauSiteSource:
974
1084
  self.datasource_project_map[ds.id] = ds.project_id
975
1085
  except Exception as e:
976
1086
  self.report.get_all_datasources_query_failed = True
977
- logger.info(f"Get all datasources query failed due to error {e}")
978
- logger.debug("Error stack trace", exc_info=True)
1087
+ self.report.warning(
1088
+ title="Unexpected Query Error",
1089
+ message="Get all datasources query failed due to error",
1090
+ exc=e,
1091
+ )
979
1092
 
980
1093
  def _init_workbook_registry(self) -> None:
981
1094
  if self.server is None:
@@ -1024,6 +1137,11 @@ class TableauSiteSource:
1024
1137
  ),
1025
1138
  )
1026
1139
 
1140
+ def _is_hidden_view(self, dashboard_or_view: Dict) -> bool:
1141
+ # LUID is blank if the view is hidden in the workbook.
1142
+ # More info here: https://help.tableau.com/current/api/metadata_api/en-us/reference/view.doc.html
1143
+ return not dashboard_or_view.get(c.LUID)
1144
+
1027
1145
  def get_connection_object_page(
1028
1146
  self,
1029
1147
  query: str,
@@ -1141,7 +1259,6 @@ class TableauSiteSource:
1141
1259
  )
1142
1260
 
1143
1261
  if node_limit_errors:
1144
- logger.debug(f"Node Limit Error. query_data {query_data}")
1145
1262
  self.report.warning(
1146
1263
  title="Tableau Data Exceed Predefined Limit",
1147
1264
  message="The numbers of record in result set exceeds a predefined limit. Increase the tableau "
@@ -1213,7 +1330,6 @@ class TableauSiteSource:
1213
1330
  page_size = page_size_override or self.config.page_size
1214
1331
 
1215
1332
  filter_pages = get_filter_pages(query_filter, page_size)
1216
-
1217
1333
  for filter_page in filter_pages:
1218
1334
  has_next_page = 1
1219
1335
  current_cursor: Optional[str] = None
@@ -1257,9 +1373,10 @@ class TableauSiteSource:
1257
1373
  wrk_id: Optional[str] = workbook.get(c.ID)
1258
1374
  prj_name: Optional[str] = workbook.get(c.PROJECT_NAME)
1259
1375
 
1260
- logger.debug(
1261
- f"Skipping workbook {wrk_name}({wrk_id}) as it is project {prj_name}({project_luid}) not "
1262
- f"present in project registry"
1376
+ self.report.warning(
1377
+ title="Skipping Missing Workbook",
1378
+ message="Skipping workbook as its project is not present in project registry",
1379
+ context=f"workbook={wrk_name}({wrk_id}), project={prj_name}({project_luid})",
1263
1380
  )
1264
1381
  continue
1265
1382
 
@@ -1292,7 +1409,7 @@ class TableauSiteSource:
1292
1409
  datasource: dict,
1293
1410
  browse_path: Optional[str],
1294
1411
  is_embedded_ds: bool = False,
1295
- ) -> Tuple:
1412
+ ) -> Tuple[List[Upstream], List[FineGrainedLineage]]:
1296
1413
  upstream_tables: List[Upstream] = []
1297
1414
  fine_grained_lineages: List[FineGrainedLineage] = []
1298
1415
  table_id_to_urn = {}
@@ -1453,7 +1570,8 @@ class TableauSiteSource:
1453
1570
  c.COLUMNS_CONNECTION
1454
1571
  ].get("totalCount")
1455
1572
  if not is_custom_sql and not num_tbl_cols:
1456
- logger.debug(
1573
+ self.report.num_upstream_table_skipped_no_columns += 1
1574
+ logger.warning(
1457
1575
  f"Skipping upstream table with id {table[c.ID]}, no columns: {table}"
1458
1576
  )
1459
1577
  continue
@@ -1469,7 +1587,13 @@ class TableauSiteSource:
1469
1587
  table, default_schema_map=self.config.default_schema_map
1470
1588
  )
1471
1589
  except Exception as e:
1472
- logger.info(f"Failed to generate upstream reference for {table}: {e}")
1590
+ self.report.num_upstream_table_failed_generate_reference += 1
1591
+ self.report.warning(
1592
+ title="Potentially Missing Lineage Issue",
1593
+ message="Failed to generate upstream reference",
1594
+ exc=e,
1595
+ context=f"table={table}",
1596
+ )
1473
1597
  continue
1474
1598
 
1475
1599
  table_urn = ref.make_dataset_urn(
@@ -1635,15 +1759,7 @@ class TableauSiteSource:
1635
1759
  func_overridden_info=None, # Here we don't want to override any information from configuration
1636
1760
  )
1637
1761
 
1638
- if parsed_result is None:
1639
- logger.info(
1640
- f"Failed to extract column level lineage from datasource {datasource_urn}"
1641
- )
1642
- return []
1643
- if parsed_result.debug_info.error:
1644
- logger.info(
1645
- f"Failed to extract column level lineage from datasource {datasource_urn}: {parsed_result.debug_info.error}"
1646
- )
1762
+ if parsed_result is None or parsed_result.debug_info.error:
1647
1763
  return []
1648
1764
 
1649
1765
  cll: List[ColumnLineageInfo] = (
@@ -1917,10 +2033,12 @@ class TableauSiteSource:
1917
2033
  self.datasource_project_map[ds_result.id] = ds_result.project_id
1918
2034
  except Exception as e:
1919
2035
  self.report.num_get_datasource_query_failures += 1
1920
- logger.warning(
1921
- f"Failed to get datasource project_luid for {ds_luid} due to error {e}"
2036
+ self.report.warning(
2037
+ title="Unexpected Query Error",
2038
+ message="Failed to get datasource details",
2039
+ exc=e,
2040
+ context=f"ds_luid={ds_luid}",
1922
2041
  )
1923
- logger.debug("Error stack trace", exc_info=True)
1924
2042
 
1925
2043
  def _get_workbook_project_luid(self, wb: dict) -> Optional[str]:
1926
2044
  if wb.get(c.LUID) and self.workbook_project_map.get(wb[c.LUID]):
@@ -2005,6 +2123,8 @@ class TableauSiteSource:
2005
2123
  aspect_name=c.UPSTREAM_LINEAGE,
2006
2124
  aspect=upstream_lineage,
2007
2125
  )
2126
+ self.report.num_tables_with_upstream_lineage += 1
2127
+ self.report.num_upstream_table_lineage += len(upstream_tables)
2008
2128
 
2009
2129
  @staticmethod
2010
2130
  def _clean_tableau_query_parameters(query: str) -> str:
@@ -2104,7 +2224,7 @@ class TableauSiteSource:
2104
2224
  f"Overridden info upstream_db={upstream_db}, platform_instance={platform_instance}, platform={platform}"
2105
2225
  )
2106
2226
 
2107
- return create_lineage_sql_parsed_result(
2227
+ parsed_result = create_lineage_sql_parsed_result(
2108
2228
  query=query,
2109
2229
  default_db=upstream_db,
2110
2230
  platform=platform,
@@ -2114,6 +2234,21 @@ class TableauSiteSource:
2114
2234
  schema_aware=not self.config.sql_parsing_disable_schema_awareness,
2115
2235
  )
2116
2236
 
2237
+ assert parsed_result is not None
2238
+
2239
+ if parsed_result.debug_info.table_error:
2240
+ logger.warning(
2241
+ f"Failed to extract table lineage from datasource {datasource_urn}: {parsed_result.debug_info.table_error}"
2242
+ )
2243
+ self.report.num_upstream_table_lineage_failed_parse_sql += 1
2244
+ elif parsed_result.debug_info.column_error:
2245
+ logger.warning(
2246
+ f"Failed to extract column level lineage from datasource {datasource_urn}: {parsed_result.debug_info.column_error}"
2247
+ )
2248
+ self.report.num_upstream_fine_grained_lineage_failed_parse_sql += 1
2249
+
2250
+ return parsed_result
2251
+
2117
2252
  def _enrich_database_tables_with_parsed_schemas(
2118
2253
  self, parsing_result: SqlParsingResult
2119
2254
  ) -> None:
@@ -2148,9 +2283,6 @@ class TableauSiteSource:
2148
2283
  )
2149
2284
 
2150
2285
  if parsed_result is None:
2151
- logger.info(
2152
- f"Failed to extract table level lineage for datasource {csql_urn}"
2153
- )
2154
2286
  return
2155
2287
 
2156
2288
  self._enrich_database_tables_with_parsed_schemas(parsed_result)
@@ -2170,12 +2302,14 @@ class TableauSiteSource:
2170
2302
  upstreams=upstream_tables,
2171
2303
  fineGrainedLineages=fine_grained_lineages,
2172
2304
  )
2173
-
2174
2305
  yield self.get_metadata_change_proposal(
2175
2306
  csql_urn,
2176
2307
  aspect_name=c.UPSTREAM_LINEAGE,
2177
2308
  aspect=upstream_lineage,
2178
2309
  )
2310
+ self.report.num_tables_with_upstream_lineage += 1
2311
+ self.report.num_upstream_table_lineage += len(upstream_tables)
2312
+ self.report.num_upstream_fine_grained_lineage += len(fine_grained_lineages)
2179
2313
 
2180
2314
  def _get_schema_metadata_for_datasource(
2181
2315
  self, datasource_fields: List[dict]
@@ -2252,12 +2386,11 @@ class TableauSiteSource:
2252
2386
  )
2253
2387
 
2254
2388
  # Tags
2255
- if datasource_info:
2389
+ if datasource_info and self.config.ingest_tags:
2256
2390
  tags = self.get_tags(datasource_info)
2257
- if tags:
2258
- dataset_snapshot.aspects.append(
2259
- builder.make_global_tag_aspect_with_tag_list(tags)
2260
- )
2391
+ dataset_snapshot.aspects.append(
2392
+ builder.make_global_tag_aspect_with_tag_list(tags)
2393
+ )
2261
2394
 
2262
2395
  # Browse path
2263
2396
  if browse_path and is_embedded_ds and workbook and workbook.get(c.NAME):
@@ -2326,6 +2459,11 @@ class TableauSiteSource:
2326
2459
  aspect_name=c.UPSTREAM_LINEAGE,
2327
2460
  aspect=upstream_lineage,
2328
2461
  )
2462
+ self.report.num_tables_with_upstream_lineage += 1
2463
+ self.report.num_upstream_table_lineage += len(upstream_tables)
2464
+ self.report.num_upstream_fine_grained_lineage += len(
2465
+ fine_grained_lineages
2466
+ )
2329
2467
 
2330
2468
  # Datasource Fields
2331
2469
  schema_metadata = self._get_schema_metadata_for_datasource(
@@ -2643,7 +2781,13 @@ class TableauSiteSource:
2643
2781
  c.SHEETS_CONNECTION,
2644
2782
  sheets_filter,
2645
2783
  ):
2646
- yield from self.emit_sheets_as_charts(sheet, sheet.get(c.WORKBOOK))
2784
+ if self.config.ingest_hidden_assets or not self._is_hidden_view(sheet):
2785
+ yield from self.emit_sheets_as_charts(sheet, sheet.get(c.WORKBOOK))
2786
+ else:
2787
+ self.report.num_hidden_assets_skipped += 1
2788
+ logger.debug(
2789
+ f"Skip view {sheet.get(c.ID)} because it's hidden (luid is blank)."
2790
+ )
2647
2791
 
2648
2792
  def emit_sheets_as_charts(
2649
2793
  self, sheet: dict, workbook: Optional[Dict]
@@ -2734,11 +2878,17 @@ class TableauSiteSource:
2734
2878
  chart_snapshot.aspects.append(owner)
2735
2879
 
2736
2880
  # Tags
2737
- tags = self.get_tags(sheet)
2738
- if tags:
2881
+ if self.config.ingest_tags:
2882
+ tags = self.get_tags(sheet)
2883
+ if len(self.config.tags_for_hidden_assets) > 0 and self._is_hidden_view(
2884
+ sheet
2885
+ ):
2886
+ tags.extend(self.config.tags_for_hidden_assets)
2887
+
2739
2888
  chart_snapshot.aspects.append(
2740
2889
  builder.make_global_tag_aspect_with_tag_list(tags)
2741
2890
  )
2891
+
2742
2892
  yield self.get_metadata_change_event(chart_snapshot)
2743
2893
  if sheet_external_url is not None and self.config.ingest_embed_url is True:
2744
2894
  yield self.new_work_unit(
@@ -2820,7 +2970,7 @@ class TableauSiteSource:
2820
2970
  else None
2821
2971
  )
2822
2972
 
2823
- tags = self.get_tags(workbook)
2973
+ tags = self.get_tags(workbook) if self.config.ingest_tags else None
2824
2974
 
2825
2975
  parent_key = None
2826
2976
  project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
@@ -2951,17 +3101,23 @@ class TableauSiteSource:
2951
3101
  c.DASHBOARDS_CONNECTION,
2952
3102
  dashboards_filter,
2953
3103
  ):
2954
- yield from self.emit_dashboard(dashboard, dashboard.get(c.WORKBOOK))
3104
+ if self.config.ingest_hidden_assets or not self._is_hidden_view(dashboard):
3105
+ yield from self.emit_dashboard(dashboard, dashboard.get(c.WORKBOOK))
3106
+ else:
3107
+ self.report.num_hidden_assets_skipped += 1
3108
+ logger.debug(
3109
+ f"Skip dashboard {dashboard.get(c.ID)} because it's hidden (luid is blank)."
3110
+ )
2955
3111
 
2956
- def get_tags(self, obj: dict) -> Optional[List[str]]:
3112
+ def get_tags(self, obj: dict) -> List[str]:
2957
3113
  tag_list = obj.get(c.TAGS, [])
2958
- if tag_list and self.config.ingest_tags:
3114
+ if tag_list:
2959
3115
  tag_list_str = [
2960
3116
  t[c.NAME] for t in tag_list if t is not None and t.get(c.NAME)
2961
3117
  ]
2962
3118
 
2963
3119
  return tag_list_str
2964
- return None
3120
+ return []
2965
3121
 
2966
3122
  def emit_dashboard(
2967
3123
  self, dashboard: dict, workbook: Optional[Dict]
@@ -3012,8 +3168,13 @@ class TableauSiteSource:
3012
3168
  )
3013
3169
  dashboard_snapshot.aspects.append(dashboard_info_class)
3014
3170
 
3015
- tags = self.get_tags(dashboard)
3016
- if tags:
3171
+ if self.config.ingest_tags:
3172
+ tags = self.get_tags(dashboard)
3173
+ if len(self.config.tags_for_hidden_assets) > 0 and self._is_hidden_view(
3174
+ dashboard
3175
+ ):
3176
+ tags.extend(self.config.tags_for_hidden_assets)
3177
+
3017
3178
  dashboard_snapshot.aspects.append(
3018
3179
  builder.make_global_tag_aspect_with_tag_list(tags)
3019
3180
  )
@@ -3181,10 +3342,10 @@ class TableauSiteSource:
3181
3342
  else:
3182
3343
  # This is a root Tableau project since the parent_project_id is None.
3183
3344
  # For a root project, either the site is the parent, or the platform is the default parent.
3184
- if self.config.add_site_container and self.site and self.site.id:
3345
+ if self.config.add_site_container:
3185
3346
  # The site containers have already been generated by emit_site_container, so we
3186
3347
  # don't need to emit them again here.
3187
- parent_project_key = self.gen_site_key(self.site.id)
3348
+ parent_project_key = self.gen_site_key(self.site_id)
3188
3349
 
3189
3350
  yield from gen_containers(
3190
3351
  container_key=project_key,
@@ -3201,12 +3362,12 @@ class TableauSiteSource:
3201
3362
  yield from emit_project_in_topological_order(project)
3202
3363
 
3203
3364
  def emit_site_container(self):
3204
- if not self.site or not self.site.id:
3365
+ if not self.site:
3205
3366
  logger.warning("Can not ingest site container. No site information found.")
3206
3367
  return
3207
3368
 
3208
3369
  yield from gen_containers(
3209
- container_key=self.gen_site_key(self.site.id),
3370
+ container_key=self.gen_site_key(self.site_id),
3210
3371
  name=self.site.name or "Default",
3211
3372
  sub_types=[c.SITE],
3212
3373
  )
@@ -975,16 +975,22 @@ def get_filter_pages(query_filter: dict, page_size: int) -> List[dict]:
975
975
  # a few ten thousand, then tableau server responds with empty response
976
976
  # causing below error:
977
977
  # tableauserverclient.server.endpoint.exceptions.NonXMLResponseError: b''
978
+
979
+ # in practice, we only do pagination if len(query_filter.keys()) == 1
980
+ if len(query_filter.keys()) != 1:
981
+ return filter_pages
982
+
983
+ current_key = (list(query_filter.keys()))[0]
984
+
978
985
  if (
979
- len(query_filter.keys()) == 1
980
- and query_filter.get(c.ID_WITH_IN)
981
- and isinstance(query_filter[c.ID_WITH_IN], list)
982
- and len(query_filter[c.ID_WITH_IN]) > 100 * page_size
986
+ current_key in [c.ID_WITH_IN, c.PROJECT_NAME_WITH_IN]
987
+ and query_filter.get(current_key)
988
+ and isinstance(query_filter[current_key], list)
983
989
  ):
984
- ids = query_filter[c.ID_WITH_IN]
990
+ ids = query_filter[current_key]
985
991
  filter_pages = [
986
992
  {
987
- c.ID_WITH_IN: ids[
993
+ current_key: ids[
988
994
  start : (
989
995
  start + page_size if start + page_size < len(ids) else len(ids)
990
996
  )