acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1__py3-none-any.whl

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release: the registry has flagged this version of acryl-datahub for review.
Files changed (120):
  1. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2236 -2240
  2. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
  3. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
  5. datahub/__init__.py +1 -1
  6. datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
  7. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  8. datahub/configuration/common.py +2 -5
  9. datahub/configuration/source_common.py +13 -0
  10. datahub/emitter/mce_builder.py +20 -4
  11. datahub/emitter/mcp_builder.py +2 -7
  12. datahub/emitter/mcp_patch_builder.py +37 -13
  13. datahub/emitter/rest_emitter.py +25 -3
  14. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
  15. datahub/ingestion/api/closeable.py +3 -3
  16. datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
  17. datahub/ingestion/api/report.py +4 -1
  18. datahub/ingestion/api/sink.py +4 -3
  19. datahub/ingestion/api/source.py +4 -0
  20. datahub/ingestion/api/source_helpers.py +2 -6
  21. datahub/ingestion/glossary/classifier.py +2 -3
  22. datahub/ingestion/graph/client.py +6 -3
  23. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
  24. datahub/ingestion/source/aws/aws_common.py +231 -27
  25. datahub/ingestion/source/aws/glue.py +12 -2
  26. datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
  27. datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
  28. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
  29. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
  30. datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
  31. datahub/ingestion/source/datahub/config.py +22 -1
  32. datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
  33. datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
  34. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  35. datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
  36. datahub/ingestion/source/gc/datahub_gc.py +21 -5
  37. datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
  38. datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
  39. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
  40. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  41. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  42. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  43. datahub/ingestion/source/kafka_connect/common.py +202 -0
  44. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  45. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  46. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  47. datahub/ingestion/source/looker/looker_common.py +63 -2
  48. datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
  49. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  50. datahub/ingestion/source/looker/looker_source.py +31 -4
  51. datahub/ingestion/source/looker/looker_usage.py +23 -17
  52. datahub/ingestion/source/mlflow.py +30 -5
  53. datahub/ingestion/source/mode.py +40 -27
  54. datahub/ingestion/source/powerbi/config.py +1 -14
  55. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  56. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  57. datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
  58. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
  59. datahub/ingestion/source/s3/source.py +1 -1
  60. datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
  61. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
  62. datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
  63. datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
  64. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
  65. datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
  66. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
  67. datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
  68. datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
  69. datahub/ingestion/source/sql/hive.py +621 -8
  70. datahub/ingestion/source/sql/hive_metastore.py +7 -0
  71. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  72. datahub/ingestion/source/sql/mssql/source.py +15 -1
  73. datahub/ingestion/source/sql/sql_common.py +41 -102
  74. datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
  75. datahub/ingestion/source/sql/sql_report.py +2 -0
  76. datahub/ingestion/source/state/checkpoint.py +2 -1
  77. datahub/ingestion/source/tableau/tableau.py +122 -45
  78. datahub/ingestion/source/tableau/tableau_common.py +18 -0
  79. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  80. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  81. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  82. datahub/ingestion/source/unity/proxy.py +8 -27
  83. datahub/ingestion/source/usage/usage_common.py +15 -1
  84. datahub/ingestion/source_report/ingestion_stage.py +3 -0
  85. datahub/metadata/_schema_classes.py +256 -3
  86. datahub/metadata/_urns/urn_defs.py +168 -168
  87. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
  88. datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
  89. datahub/metadata/schema.avsc +252 -33
  90. datahub/metadata/schemas/DataJobKey.avsc +2 -1
  91. datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
  92. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  93. datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
  94. datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
  95. datahub/metadata/schemas/MLModelProperties.avsc +62 -2
  96. datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
  97. datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
  98. datahub/specific/aspect_helpers/__init__.py +0 -0
  99. datahub/specific/aspect_helpers/custom_properties.py +79 -0
  100. datahub/specific/aspect_helpers/ownership.py +67 -0
  101. datahub/specific/aspect_helpers/structured_properties.py +72 -0
  102. datahub/specific/aspect_helpers/tags.py +42 -0
  103. datahub/specific/aspect_helpers/terms.py +43 -0
  104. datahub/specific/chart.py +28 -184
  105. datahub/specific/dashboard.py +31 -196
  106. datahub/specific/datajob.py +34 -189
  107. datahub/specific/dataproduct.py +24 -86
  108. datahub/specific/dataset.py +48 -133
  109. datahub/specific/form.py +12 -32
  110. datahub/specific/structured_property.py +9 -9
  111. datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
  112. datahub/sql_parsing/sqlglot_lineage.py +15 -5
  113. datahub/sql_parsing/tool_meta_extractor.py +119 -5
  114. datahub/utilities/time.py +8 -3
  115. datahub/utilities/urns/_urn_base.py +5 -7
  116. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  117. datahub/specific/custom_properties.py +0 -37
  118. datahub/specific/ownership.py +0 -48
  119. datahub/specific/structured_properties.py +0 -53
  120. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
Selected hunks from the diff follow, grouped by source file.

datahub/ingestion/source/tableau/tableau.py:

```diff
@@ -2,9 +2,9 @@ import json
 import logging
 import re
 import time
-from collections import OrderedDict
-from dataclasses import dataclass
-from datetime import datetime
+from collections import OrderedDict, defaultdict
+from dataclasses import dataclass, field as dataclass_field
+from datetime import datetime, timedelta, timezone
 from functools import lru_cache
 from typing import (
     Any,
@@ -35,7 +35,10 @@ from tableauserverclient import (
     SiteItem,
     TableauAuth,
 )
-from tableauserverclient.server.endpoint.exceptions import NonXMLResponseError
+from tableauserverclient.server.endpoint.exceptions import (
+    InternalServerError,
+    NonXMLResponseError,
+)
 from urllib3 import Retry
 
 import datahub.emitter.mce_builder as builder
@@ -49,6 +52,7 @@ from datahub.configuration.source_common import (
     DatasetSourceConfigMixin,
 )
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
+from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import (
     ContainerKey,
@@ -105,6 +109,7 @@ from datahub.ingestion.source.tableau.tableau_common import (
     make_filter,
     make_fine_grained_lineage_class,
     make_upstream_class,
+    optimize_query_filter,
     published_datasource_graphql_query,
     query_metadata_cursor_based_pagination,
     sheet_graphql_query,
@@ -182,6 +187,20 @@ try:
 except ImportError:
     REAUTHENTICATE_ERRORS = (NonXMLResponseError,)
 
+RETRIABLE_ERROR_CODES = [
+    408,  # Request Timeout
+    429,  # Too Many Requests
+    500,  # Internal Server Error
+    502,  # Bad Gateway
+    503,  # Service Unavailable
+    504,  # Gateway Timeout
+]
+
+# From experience, this expiry time typically ranges from 50 minutes
+# to 2 hours but might as well be configurable. We will allow upto
+# 10 minutes of such expiry time
+REGULAR_AUTH_EXPIRY_PERIOD = timedelta(minutes=10)
+
 logger: logging.Logger = logging.getLogger(__name__)
 
 # Replace / with |
@@ -283,7 +302,7 @@ class TableauConnectionConfig(ConfigModel):
             max_retries=Retry(
                 total=self.max_retries,
                 backoff_factor=1,
-                status_forcelist=[429, 500, 502, 503, 504],
+                status_forcelist=RETRIABLE_ERROR_CODES,
             )
         )
         server._session.mount("http://", adapter)
```
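For context on how `status_forcelist` is consumed: the sketch below shows a urllib3 `Retry` policy with these codes mounted on a requests session. It is a minimal standalone illustration, not the actual `make_tableau_client` body; the retry total of 5 stands in for the source's `max_retries` setting.

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3 import Retry

RETRIABLE_ERROR_CODES = [408, 429, 500, 502, 503, 504]

session = requests.Session()
adapter = HTTPAdapter(
    max_retries=Retry(
        total=5,  # overall retry budget (illustrative; the source uses config.max_retries)
        backoff_factor=1,  # exponential backoff between attempts
        status_forcelist=RETRIABLE_ERROR_CODES,  # retry only on these HTTP statuses
    )
)
session.mount("http://", adapter)
session.mount("https://", adapter)
```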
datahub/ingestion/source/tableau/tableau.py (continued):

```diff
@@ -380,11 +399,6 @@ class TableauConfig(
         description="[advanced] Number of metadata objects (e.g. CustomSQLTable, PublishedDatasource, etc) to query at a time using the Tableau API.",
     )
 
-    fetch_size: int = Field(
-        default=250,
-        description="Specifies the number of records to retrieve in each batch during a query execution.",
-    )
-
     # We've found that even with a small workbook page size (e.g. 10), the Tableau API often
     # returns warnings like this:
     # {
@@ -499,6 +513,10 @@ class TableauConfig(
         "This can only be used with ingest_tags enabled as it will overwrite tags entered from the UI.",
     )
 
+    _fetch_size = pydantic_removed_field(
+        "fetch_size",
+    )
+
     # pre = True because we want to take some decision before pydantic initialize the configuration to default values
     @root_validator(pre=True)
     def projects_backward_compatibility(cls, values: Dict) -> Dict:
```
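`pydantic_removed_field` (imported above from `datahub.configuration.validate_field_removal`) retires a config key without breaking old recipes. A hedged sketch of the general pattern, not datahub's actual implementation: a pydantic v1 pre-validator that warns and drops the key.

```python
# Hedged sketch of the removed-field pattern; datahub's real helper lives in
# datahub/configuration/validate_field_removal.py and may differ in detail.
import warnings

from pydantic import BaseModel, root_validator


def removed_field(field_name: str):
    def _drop(cls, values: dict) -> dict:
        if field_name in values:
            warnings.warn(f"'{field_name}' was removed and is now ignored.")
            values.pop(field_name)
        return values

    # pre=True so the key is dropped before normal field validation runs
    return root_validator(pre=True, allow_reuse=True)(_drop)


class ExampleConfig(BaseModel):
    page_size: int = 10

    _removed = removed_field("fetch_size")


ExampleConfig.parse_obj({"page_size": 5, "fetch_size": 250})  # warns, then ignores fetch_size
```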
datahub/ingestion/source/tableau/tableau.py (continued):

```diff
@@ -618,6 +636,13 @@ class DatabaseTable:
         self.parsed_columns = parsed_columns
 
 
+@dataclass
+class SiteIdContentUrl:
+    site_id: str
+    site_content_url: str
+
+
+@dataclass
 class TableauSourceReport(StaleEntityRemovalSourceReport):
     get_all_datasources_query_failed: bool = False
     num_get_datasource_query_failures: int = 0
@@ -634,7 +659,14 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
     num_upstream_table_lineage_failed_parse_sql: int = 0
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
-    logged_in_user: List[UserInfo] = []
+    logged_in_user: List[UserInfo] = dataclass_field(default_factory=list)
+    last_authenticated_at: Optional[datetime] = None
+
+    num_expected_tableau_metadata_queries: int = 0
+    num_actual_tableau_metadata_queries: int = 0
+    tableau_server_error_stats: Dict[str, int] = dataclass_field(
+        default_factory=(lambda: defaultdict(int))
+    )
 
 
 def report_user_role(report: TableauSourceReport, server: Server) -> None:
```
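The new `tableau_server_error_stats` counter is backed by `defaultdict(int)`, so each error class can be tallied without a key-existence check:

```python
from collections import defaultdict

stats: defaultdict = defaultdict(int)
stats["NonXMLResponseError"] += 1  # first occurrence needs no initialization
stats["InternalServerError"] += 1
print(dict(stats))  # {'NonXMLResponseError': 1, 'InternalServerError': 1}
```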
datahub/ingestion/source/tableau/tableau.py (continued):

```diff
@@ -645,7 +677,7 @@ def report_user_role(report: TableauSourceReport, server: Server) -> None:
     # the site-role might be different on another site
     logged_in_user: UserInfo = UserInfo.from_server(server=server)
 
-    if not logged_in_user.is_site_administrator_explorer():
+    if not logged_in_user.has_site_administrator_explorer_privileges():
         report.warning(
             title=title,
             message=message,
@@ -705,6 +737,7 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
         try:
             logger.info(f"Authenticated to Tableau site: '{site_content_url}'")
             self.server = self.config.make_tableau_client(site_content_url)
+            self.report.last_authenticated_at = datetime.now(timezone.utc)
             report_user_role(report=self.report, server=self.server)
         # Note that we're not catching ConfigurationError, since we want that to throw.
         except ValueError as e:
@@ -770,7 +803,6 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
                 config=self.config,
                 ctx=self.ctx,
                 site=site,
-                site_id=site.id,
                 report=self.report,
                 server=self.server,
                 platform=self.platform,
@@ -789,8 +821,14 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
             site_source = TableauSiteSource(
                 config=self.config,
                 ctx=self.ctx,
-                site=site,
-                site_id=self.server.site_id,
+                site=(
+                    site
+                    if site
+                    else SiteIdContentUrl(
+                        site_id=self.server.site_id,
+                        site_content_url=self.config.site,
+                    )
+                ),
                 report=self.report,
                 server=self.server,
                 platform=self.platform,
@@ -823,8 +861,7 @@ class TableauSiteSource:
         self,
         config: TableauConfig,
         ctx: PipelineContext,
-        site: Optional[SiteItem],
-        site_id: Optional[str],
+        site: Union[SiteItem, SiteIdContentUrl],
         report: TableauSourceReport,
         server: Server,
         platform: str,
@@ -835,13 +872,18 @@ class TableauSiteSource:
         self.ctx: PipelineContext = ctx
         self.platform = platform
 
-        self.site: Optional[SiteItem] = site
-        if site_id is not None:
-            self.site_id: str = site_id
+        self.site: Optional[SiteItem] = None
+        if isinstance(site, SiteItem):
+            self.site = site
+            assert site.id is not None, "Site ID is required"
+            self.site_id = site.id
+            self.site_content_url = site.content_url
+        elif isinstance(site, SiteIdContentUrl):
+            self.site = None
+            self.site_id = site.site_id
+            self.site_content_url = site.site_content_url
         else:
-            assert self.site is not None, "site or site_id is required"
-            assert self.site.id is not None, "site_id is required when site is provided"
-            self.site_id = self.site.id
+            raise AssertionError("site or site id+content_url pair is required")
 
         self.database_tables: Dict[str, DatabaseTable] = {}
         self.tableau_stat_registry: Dict[str, UsageStat] = {}
```
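The constructor now takes a single `site` argument that is either a `SiteItem` or the new `SiteIdContentUrl`. A small self-contained sketch of the same dispatch, with a stub standing in for tableauserverclient's `SiteItem`:

```python
from dataclasses import dataclass
from typing import Tuple, Union


@dataclass
class SiteIdContentUrl:
    site_id: str
    site_content_url: str


@dataclass
class SiteItemStub:  # stand-in for tableauserverclient.SiteItem
    id: str
    content_url: str


def resolve_site_identity(
    site: Union[SiteItemStub, SiteIdContentUrl],
) -> Tuple[str, str]:
    """Mirrors the isinstance dispatch in TableauSiteSource.__init__."""
    if isinstance(site, SiteItemStub):
        return site.id, site.content_url
    if isinstance(site, SiteIdContentUrl):
        return site.site_id, site.site_content_url
    raise AssertionError("site or site id+content_url pair is required")


assert resolve_site_identity(SiteIdContentUrl("abc123", "marketing")) == (
    "abc123",
    "marketing",
)
```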
datahub/ingestion/source/tableau/tableau.py (continued):

```diff
@@ -895,17 +937,12 @@ class TableauSiteSource:
         # datasets also have the env in the browse path
         return f"/{self.config.env.lower()}{self.no_env_browse_prefix}"
 
-    def _re_authenticate(self):
-        tableau_auth: Union[
-            TableauAuth, PersonalAccessTokenAuth
-        ] = self.config.get_tableau_auth(self.site_id)
-        self.server.auth.sign_in(tableau_auth)
-
-    @property
-    def site_content_url(self) -> Optional[str]:
-        if self.site and self.site.content_url:
-            return self.site.content_url
-        return None
+    def _re_authenticate(self) -> None:
+        logger.info(f"Re-authenticating to Tableau site '{self.site_content_url}'")
+        # Sign-in again may not be enough because Tableau sometimes caches invalid sessions
+        # so we need to recreate the Tableau Server object
+        self.server = self.config.make_tableau_client(self.site_content_url)
+        self.report.last_authenticated_at = datetime.now(timezone.utc)
 
     def _populate_usage_stat_registry(self) -> None:
         if self.server is None:
@@ -1148,7 +1185,7 @@ class TableauSiteSource:
         connection_type: str,
         query_filter: str,
         current_cursor: Optional[str],
-        fetch_size: int = 250,
+        fetch_size: int,
         retry_on_auth_error: bool = True,
         retries_remaining: Optional[int] = None,
     ) -> Tuple[dict, Optional[str], int]:
@@ -1171,6 +1208,7 @@ class TableauSiteSource:
         )
         try:
             assert self.server is not None
+            self.report.num_actual_tableau_metadata_queries += 1
             query_data = query_metadata_cursor_based_pagination(
                 server=self.server,
                 main_query=query,
@@ -1180,24 +1218,56 @@ class TableauSiteSource:
                 qry_filter=query_filter,
             )
 
-        except REAUTHENTICATE_ERRORS:
-            if not retry_on_auth_error:
+        except REAUTHENTICATE_ERRORS as e:
+            self.report.tableau_server_error_stats[e.__class__.__name__] += 1
+            if not retry_on_auth_error or retries_remaining <= 0:
                 raise
 
-            # If ingestion has been running for over 2 hours, the Tableau
-            # temporary credentials will expire. If this happens, this exception
-            # will be thrown, and we need to re-authenticate and retry.
-            self._re_authenticate()
+            # We have been getting some irregular authorization errors like below well before the expected expiry time
+            # - within few seconds of initial authentication . We'll retry without re-auth for such cases.
+            # <class 'tableauserverclient.server.endpoint.exceptions.NonXMLResponseError'>:
+            # b'{"timestamp":"xxx","status":401,"error":"Unauthorized","path":"/relationship-service-war/graphql"}'
+            if self.report.last_authenticated_at and (
+                datetime.now(timezone.utc) - self.report.last_authenticated_at
+                > REGULAR_AUTH_EXPIRY_PERIOD
+            ):
+                # If ingestion has been running for over 2 hours, the Tableau
+                # temporary credentials will expire. If this happens, this exception
+                # will be thrown, and we need to re-authenticate and retry.
+                self._re_authenticate()
+
             return self.get_connection_object_page(
                 query=query,
                 connection_type=connection_type,
                 query_filter=query_filter,
                 fetch_size=fetch_size,
                 current_cursor=current_cursor,
-                retry_on_auth_error=False,
+                retry_on_auth_error=True,
                 retries_remaining=retries_remaining - 1,
             )
+
+        except InternalServerError as ise:
+            self.report.tableau_server_error_stats[InternalServerError.__name__] += 1
+            # In some cases Tableau Server returns 504 error, which is a timeout error, so it worths to retry.
+            # Extended with other retryable errors.
+            if ise.code in RETRIABLE_ERROR_CODES:
+                if retries_remaining <= 0:
+                    raise ise
+                logger.info(f"Retrying query due to error {ise.code}")
+                return self.get_connection_object_page(
+                    query=query,
+                    connection_type=connection_type,
+                    query_filter=query_filter,
+                    fetch_size=fetch_size,
+                    current_cursor=current_cursor,
+                    retry_on_auth_error=True,
+                    retries_remaining=retries_remaining - 1,
+                )
+            else:
+                raise ise
+
         except OSError:
+            self.report.tableau_server_error_stats[OSError.__name__] += 1
             # In tableauseverclient 0.26 (which was yanked and released in 0.28 on 2023-10-04),
             # the request logic was changed to use threads.
             # https://github.com/tableau/server-client-python/commit/307d8a20a30f32c1ce615cca7c6a78b9b9bff081
@@ -1212,7 +1282,7 @@ class TableauSiteSource:
                 query_filter=query_filter,
                 fetch_size=fetch_size,
                 current_cursor=current_cursor,
-                retry_on_auth_error=False,
+                retry_on_auth_error=True,
                 retries_remaining=retries_remaining - 1,
             )
```
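The net effect of the reworked `except REAUTHENTICATE_ERRORS` branch: spurious 401s that arrive within `REGULAR_AUTH_EXPIRY_PERIOD` of sign-in are retried on the existing session, and only plausibly expired sessions trigger `_re_authenticate()`. A condensed, illustrative restatement of that gate:

```python
from datetime import datetime, timedelta, timezone
from typing import Optional

REGULAR_AUTH_EXPIRY_PERIOD = timedelta(minutes=10)


def should_re_authenticate(last_authenticated_at: Optional[datetime]) -> bool:
    """Re-authenticate only if the session is old enough to have expired;
    otherwise retry the request on the existing (presumed valid) session."""
    if last_authenticated_at is None:
        return False
    age = datetime.now(timezone.utc) - last_authenticated_at
    return age > REGULAR_AUTH_EXPIRY_PERIOD
```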
datahub/ingestion/source/tableau/tableau.py (continued):

```diff
@@ -1300,7 +1370,7 @@ class TableauSiteSource:
                 query_filter=query_filter,
                 fetch_size=fetch_size,
                 current_cursor=current_cursor,
-                retry_on_auth_error=False,
+                retry_on_auth_error=True,
                 retries_remaining=retries_remaining,
             )
         raise RuntimeError(f"Query {connection_type} error: {errors}")
@@ -1325,6 +1395,8 @@ class TableauSiteSource:
         query_filter: dict = {},
         page_size_override: Optional[int] = None,
     ) -> Iterable[dict]:
+        query_filter = optimize_query_filter(query_filter)
+
         # Calls the get_connection_object_page function to get the objects,
         # and automatically handles pagination.
         page_size = page_size_override or self.config.page_size
@@ -1336,6 +1408,7 @@ class TableauSiteSource:
         while has_next_page:
             filter_: str = make_filter(filter_page)
 
+            self.report.num_expected_tableau_metadata_queries += 1
             (
                 connection_objects,
                 current_cursor,
@@ -1345,7 +1418,11 @@ class TableauSiteSource:
             ) = self.get_connection_object_page(
                 query=query,
                 connection_type=connection_type,
                 query_filter=filter_,
                 current_cursor=current_cursor,
-                fetch_size=self.config.fetch_size,
+                # `filter_page` contains metadata object IDs (e.g., Project IDs, Field IDs, Sheet IDs, etc.).
+                # The number of IDs is always less than or equal to page_size.
+                # If the IDs are primary keys, the number of metadata objects to load matches the number of records to return.
+                # In our case, mostly, the IDs are primary key, therefore, fetch_size is set equal to page_size.
+                fetch_size=page_size,
             )
 
             yield from connection_objects.get(c.NODES) or []
```
datahub/ingestion/source/tableau/tableau_common.py:

```diff
@@ -1,3 +1,4 @@
+import copy
 import html
 import json
 import logging
@@ -35,6 +36,7 @@ from datahub.metadata.schema_classes import (
     UpstreamClass,
 )
 from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult
+from datahub.utilities.ordered_set import OrderedSet
 
 logger = logging.getLogger(__name__)
 
@@ -1000,3 +1002,19 @@ def get_filter_pages(query_filter: dict, page_size: int) -> List[dict]:
     ]
 
     return filter_pages
+
+
+def optimize_query_filter(query_filter: dict) -> dict:
+    """
+    Duplicates in the filter cause duplicates in the result,
+    leading to entities/aspects being emitted multiple times unnecessarily
+    """
+    optimized_query = copy.deepcopy(query_filter)
+
+    if query_filter.get(c.ID_WITH_IN):
+        optimized_query[c.ID_WITH_IN] = list(OrderedSet(query_filter[c.ID_WITH_IN]))
+    if query_filter.get(c.PROJECT_NAME_WITH_IN):
+        optimized_query[c.PROJECT_NAME_WITH_IN] = list(
+            OrderedSet(query_filter[c.PROJECT_NAME_WITH_IN])
+        )
+    return optimized_query
```
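A quick usage illustration, assuming `c.ID_WITH_IN` resolves to the GraphQL filter key `"idWithin"` (see tableau_constant.py); `OrderedSet` de-duplicates while preserving first-seen order:

```python
from datahub.ingestion.source.tableau.tableau_common import optimize_query_filter

# Hypothetical LUID values; assumes c.ID_WITH_IN == "idWithin".
query_filter = {"idWithin": ["luid-1", "luid-1", "luid-2"]}
print(optimize_query_filter(query_filter))
# {'idWithin': ['luid-1', 'luid-2']}  (first-seen order preserved)
```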
datahub/ingestion/source/tableau/tableau_constant.py:

```diff
@@ -82,4 +82,6 @@ PROJECT = "Project"
 SITE = "Site"
 IS_UNSUPPORTED_CUSTOM_SQL = "isUnsupportedCustomSql"
 SITE_PERMISSION = "sitePermission"
-SITE_ROLE = "SiteAdministratorExplorer"
+ROLE_SITE_ADMIN_EXPLORER = "SiteAdministratorExplorer"
+ROLE_SITE_ADMIN_CREATOR = "SiteAdministratorCreator"
+ROLE_SERVER_ADMIN = "ServerAdministrator"
```
datahub/ingestion/source/tableau/tableau_server_wrapper.py:

```diff
@@ -11,8 +11,12 @@ class UserInfo:
     site_role: str
     site_id: str
 
-    def is_site_administrator_explorer(self):
-        return self.site_role == c.SITE_ROLE
+    def has_site_administrator_explorer_privileges(self):
+        return self.site_role in [
+            c.ROLE_SITE_ADMIN_EXPLORER,
+            c.ROLE_SITE_ADMIN_CREATOR,
+            c.ROLE_SERVER_ADMIN,
+        ]
 
     @staticmethod
     def from_server(server: Server) -> "UserInfo":
```
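A standalone restatement of the widened check; the three accepted role strings come straight from the constants above:

```python
ACCEPTED_ADMIN_ROLES = {
    "SiteAdministratorExplorer",
    "SiteAdministratorCreator",
    "ServerAdministrator",
}


def has_admin_privileges(site_role: str) -> bool:
    # Equivalent to UserInfo.has_site_administrator_explorer_privileges()
    return site_role in ACCEPTED_ADMIN_ROLES


assert has_admin_privileges("ServerAdministrator")
assert not has_admin_privileges("Explorer")
```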
datahub/ingestion/source/tableau/tableau_validation.py:

```diff
@@ -28,7 +28,7 @@ def check_user_role(
 
     try:
         # TODO: Add check for `Enable Derived Permissions`
-        if not logged_in_user.is_site_administrator_explorer():
+        if not logged_in_user.has_site_administrator_explorer_privileges():
             capability_dict[c.SITE_PERMISSION] = CapabilityReport(
                 capable=False,
                 failure_reason=f"{failure_reason} Their current role is {logged_in_user.site_role}.",
```
datahub/ingestion/source/unity/proxy.py:

```diff
@@ -4,7 +4,7 @@ Manage the communication with DataBricks Server and provide equivalent dataclass
 
 import dataclasses
 import logging
-from datetime import datetime, timezone
+from datetime import datetime
 from typing import Any, Dict, Iterable, List, Optional, Union, cast
 from unittest.mock import patch
 
@@ -27,6 +27,7 @@ from databricks.sdk.service.sql import (
 from databricks.sdk.service.workspace import ObjectType
 
 import datahub
+from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy
 from datahub.ingestion.source.unity.proxy_profiling import (
     UnityCatalogProxyProfilingMixin,
@@ -211,16 +212,8 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             id=obj.object_id,
             path=obj.path,
             language=obj.language,
-            created_at=(
-                datetime.fromtimestamp(obj.created_at / 1000, tz=timezone.utc)
-                if obj.created_at
-                else None
-            ),
-            modified_at=(
-                datetime.fromtimestamp(obj.modified_at / 1000, tz=timezone.utc)
-                if obj.modified_at
-                else None
-            ),
+            created_at=parse_ts_millis(obj.created_at),
+            modified_at=parse_ts_millis(obj.modified_at),
         )
 
     def query_history(
```
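`parse_ts_millis` is the new helper in `datahub.emitter.mce_builder` (see the mce_builder entry in the file list). Reconstructed from the inline expressions it replaces here, its behavior is presumably equivalent to the sketch below; the real helper may differ in detail (e.g. typing overloads).

```python
from datetime import datetime, timezone
from typing import Optional


def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
    """Epoch milliseconds -> timezone-aware UTC datetime; None passes through.
    Reconstruction from the removed inline code, not the actual implementation."""
    if ts is None:
        return None
    return datetime.fromtimestamp(ts / 1000, tz=timezone.utc)
```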
datahub/ingestion/source/unity/proxy.py (continued):

```diff
@@ -452,17 +445,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             properties=obj.properties or {},
             owner=obj.owner,
             generation=obj.generation,
-            created_at=(
-                datetime.fromtimestamp(obj.created_at / 1000, tz=timezone.utc)
-                if obj.created_at
-                else None
-            ),
+            created_at=(parse_ts_millis(obj.created_at) if obj.created_at else None),
             created_by=obj.created_by,
-            updated_at=(
-                datetime.fromtimestamp(obj.updated_at / 1000, tz=timezone.utc)
-                if obj.updated_at
-                else None
-            ),
+            updated_at=(parse_ts_millis(obj.updated_at) if obj.updated_at else None),
             updated_by=obj.updated_by,
             table_id=obj.table_id,
             comment=obj.comment,
@@ -500,12 +485,8 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             query_id=info.query_id,
             query_text=info.query_text,
             statement_type=info.statement_type,
-            start_time=datetime.fromtimestamp(
-                info.query_start_time_ms / 1000, tz=timezone.utc
-            ),
-            end_time=datetime.fromtimestamp(
-                info.query_end_time_ms / 1000, tz=timezone.utc
-            ),
+            start_time=parse_ts_millis(info.query_start_time_ms),
+            end_time=parse_ts_millis(info.query_end_time_ms),
             user_id=info.user_id,
             user_name=info.user_name,
             executed_as_user_id=info.executed_as_user_id,
```
datahub/ingestion/source/usage/usage_common.py:

```diff
@@ -54,6 +54,20 @@ def default_user_urn_builder(email: str) -> str:
     return builder.make_user_urn(email.split("@")[0])
 
 
+def extract_user_email(user: str) -> Optional[str]:
+    """Extracts user email from user input
+
+    >>> extract_user_email('urn:li:corpuser:abc@xyz.com')
+    'abc@xyz.com'
+    >>> extract_user_email('urn:li:corpuser:abc')
+    >>> extract_user_email('abc@xyz.com')
+    'abc@xyz.com'
+    """
+    if user.startswith(("urn:li:corpuser:", "urn:li:corpGroup:")):
+        user = user.split(":")[-1]
+    return user if "@" in user else None
+
+
 def make_usage_workunit(
     bucket_start_time: datetime,
     resource: ResourceType,
```
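The docstring examples above double as doctests. A hedged spot-check that also exercises the corpGroup branch (values invented for illustration; the module path matches the file list above):

```python
from datahub.ingestion.source.usage.usage_common import extract_user_email

assert extract_user_email("urn:li:corpGroup:data-team@example.com") == "data-team@example.com"
assert extract_user_email("urn:li:corpuser:abc") is None  # bare username: no email
```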
datahub/ingestion/source/usage/usage_common.py (continued):

```diff
@@ -104,7 +118,7 @@ def make_usage_workunit(
             DatasetUserUsageCountsClass(
                 user=user_urn_builder(user),
                 count=count,
-                userEmail=user if "@" in user else None,
+                userEmail=extract_user_email(user),
             )
             for user, count in user_freq
         ],
```
datahub/ingestion/source_report/ingestion_stage.py:

```diff
@@ -14,6 +14,8 @@ LINEAGE_EXTRACTION = "Lineage Extraction"
 USAGE_EXTRACTION_INGESTION = "Usage Extraction Ingestion"
 USAGE_EXTRACTION_OPERATIONAL_STATS = "Usage Extraction Operational Stats"
 USAGE_EXTRACTION_USAGE_AGGREGATION = "Usage Extraction Usage Aggregation"
+EXTERNAL_TABLE_DDL_LINEAGE = "External table DDL Lineage"
+VIEW_PARSING = "View Parsing"
 QUERIES_EXTRACTION = "Queries Extraction"
 PROFILING = "Profiling"
 
@@ -40,4 +42,5 @@ class IngestionStageReport:
         self._timer = PerfTimer()
 
         self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
+        logger.info(f"Stage started: {self.ingestion_stage}")
         self._timer.start()
```