acryl-datahub 0.15.0.1rc14__py3-none-any.whl → 0.15.0.1rc16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic.

acryl_datahub-0.15.0.1rc16.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
-datahub/__init__.py,sha256=tC4XcRTMJqr-bc6T1QdoRI7MvbIkSZk8AscmW3iOtOo,577
+datahub/__init__.py,sha256=0dgSJoggO_qJtX-oEnxH20rGzNGGCstuwsxqUKzbKUA,577
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
 datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -302,10 +302,10 @@ datahub/ingestion/source/fivetran/fivetran.py,sha256=uKbM5czPz-6LOseoh1FwavWDIuL
 datahub/ingestion/source/fivetran/fivetran_log_api.py,sha256=EAak3hJpe75WZSgz6wP_CyAT5Cian2N4a-lb8x1NKHk,12776
 datahub/ingestion/source/fivetran/fivetran_query.py,sha256=vLrTj7e-0NxZ2U4bWTB57pih42WirqPlUvwtIRfStlQ,5275
 datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/gc/datahub_gc.py,sha256=WOg3yIaNmwdbSTwytKeSfIUihsM7FMYBip9u2Dnwk3c,12849
+datahub/ingestion/source/gc/datahub_gc.py,sha256=W6uoeV7B4WIXdxT4tOEdDksdJm656WwwvkH79L7f_8Q,12969
 datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=86Tm3NNWMf0xM4TklNIEeNOjEingKpYy-XvCPeaAb4k,17125
-datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=sZbdkg3MuPVGf8eeeRg_2khGMZ01QoH4dgJiTxf7Srg,9813
-datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=LvDGTaAaI-T0OZ3fkaFwipLdzPePunuSVWoEuSBsfEM,11099
+datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=9jsyCIspWSSYSAVPHjKHr05885rXxM6FCH7KzTBceic,10139
+datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=2JpESfsqoJRdLskV3AHYU8nRj_NvNtIaLZ4_RRNIod4,11229
 datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/gcs/gcs_source.py,sha256=iwvj4JwjyVWRP1Vq106sUtQhh0GuOYVSu9zCa1wCZN0,6189
 datahub/ingestion/source/gcs/gcs_utils.py,sha256=_78KM863XXgkVLmZLtYGF5PJNnZas1go-XRtOq-79lo,1047
@@ -434,7 +434,7 @@ datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJ
 datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
 datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=5Lpy_irZlbOFJbvVkgsZSBjdLCT3VZNjlEvttzSQAU4,21121
 datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
-datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=Lhc5FAx8pXiUyfODGNkQJhjThSCIjPqG2R82dHN-jg0,26889
+datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=jTpnFWRqqFId6DKJvvAbNuFPxyNi1oQxxDUyMvh1iu4,26968
 datahub/ingestion/source/snowflake/snowflake_query.py,sha256=5po2FWz41UVowykJYbTFGxsltbmlHBCPcHG20VOhdOE,38469
 datahub/ingestion/source/snowflake/snowflake_report.py,sha256=_-rD7Q4MzKY8fYzJHSBnGX4gurwujL3UoRzcP_TZURs,6468
 datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=z5ZPgh-TILAz0DeIwDxRCsj980CM2BbftXiFpM1dV_Y,21674
@@ -491,7 +491,7 @@ datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider
 datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py,sha256=xsH7Ao_05VTjqpkzLkhdf5B1ULMzFoD8vkJJIJU9w-U,4077
 datahub/ingestion/source/state_provider/state_provider_registry.py,sha256=SVq4mIyGNmLXE9OZx1taOiNPqDoQp03-Ot9rYnB5F3k,401
 datahub/ingestion/source/tableau/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/tableau/tableau.py,sha256=fY--jFtPtCuDBAruiMStAoT7HqaTDYtiVEKzEYuzCag,140121
+datahub/ingestion/source/tableau/tableau.py,sha256=KAwyM9XiJUXFPwuVQM7GcHntcTFxMFAN4j3xSIOMbgg,142010
 datahub/ingestion/source/tableau/tableau_common.py,sha256=a3Nu0Upy6_pnrd7XpSMcYHdnYca1JBW7H0jMqkYr0ME,26871
 datahub/ingestion/source/tableau/tableau_constant.py,sha256=ZcAeHsQUXVVL26ORly0ByZk_GJAFbxaKuJAlX_sYMac,2686
 datahub/ingestion/source/tableau/tableau_server_wrapper.py,sha256=nSyx9RzC6TCQDm-cTVJ657qT8iDwzk_8JMKpohhmOc4,1046
@@ -511,7 +511,7 @@ datahub/ingestion/source/unity/usage.py,sha256=igRxYg8usukTAA229uJWi-0y-Zd0yOq9d
 datahub/ingestion/source/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/usage/clickhouse_usage.py,sha256=8nQqNAPKqivywjzsvqH0-HWFwjd4gECpw_xahLXk5ek,9970
 datahub/ingestion/source/usage/starburst_trino_usage.py,sha256=R1DDs98tYn2WW0_tGXQhk7lqEU0ru2SgrvMBtV305ps,10542
-datahub/ingestion/source/usage/usage_common.py,sha256=e7fcTd_vbUFv3xu5iY0mkEaAywjAufxV0Mw2Mu54IMY,11805
+datahub/ingestion/source/usage/usage_common.py,sha256=poNlVKx1VRPRfE4K3yAyIS96DkGAt3MC17vQlwqBNvw,12235
 datahub/ingestion/source_config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source_config/csv_enricher.py,sha256=IROxxfFJA56dHkmmbjjhb7h1pZSi33tzW9sQb7ZEgac,1733
 datahub/ingestion/source_config/operation_config.py,sha256=Q0NlqiEh4s4DFIII5NsAp5hxWTVyyJz-ldcQmH-B47s,3504
@@ -881,7 +881,7 @@ datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn
 datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
 datahub/sql_parsing/schema_resolver.py,sha256=9INZWdxA2dMSLK6RXaVqjbjyLY_VKMhCkQv_Xd6Ln3I,10848
 datahub/sql_parsing/split_statements.py,sha256=uZhAXLaRxDfmK0lPBW2oM_YVdJfSMhdgndnfd9iIXuA,5001
-datahub/sql_parsing/sql_parsing_aggregator.py,sha256=LBs1RjRqh3natrx4WfgRQGNpI56o12jtbABO5ipEBWA,69889
+datahub/sql_parsing/sql_parsing_aggregator.py,sha256=jVF6TbyM71XdJ34K0Setz3LgJALvJrJs1mVKdxU_6d4,69830
 datahub/sql_parsing/sql_parsing_common.py,sha256=h_V_m54hJ9EUh5kczq7cYOIeNeo4bgf0Px0H-Nq-UIg,2602
 datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
 datahub/sql_parsing/sqlglot_lineage.py,sha256=gUVq3NwZUzQByJs43JZXz8lZf0ZVzVt0FzaW5wZOwK4,47460
@@ -986,8 +986,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-0.15.0.1rc14.dist-info/METADATA,sha256=na5JJwiilGTUFiwOBRULg2a8NxVvzNRgwodacg0LOSU,173444
-acryl_datahub-0.15.0.1rc14.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-acryl_datahub-0.15.0.1rc14.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
-acryl_datahub-0.15.0.1rc14.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
-acryl_datahub-0.15.0.1rc14.dist-info/RECORD,,
+acryl_datahub-0.15.0.1rc16.dist-info/METADATA,sha256=hMvfZy8EYOj5eb7yygEhb_kZJbHtpVx-bWNE6H6eu_c,173444
+acryl_datahub-0.15.0.1rc16.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
+acryl_datahub-0.15.0.1rc16.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+acryl_datahub-0.15.0.1rc16.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-0.15.0.1rc16.dist-info/RECORD,,
acryl_datahub-0.15.0.1rc16.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.6.0)
+Generator: setuptools (75.7.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
 
 # Published at https://pypi.org/project/acryl-datahub/.
 __package_name__ = "acryl-datahub"
-__version__ = "0.15.0.1rc14"
+__version__ = "0.15.0.1rc16"
 
 
 def is_dev_mode() -> bool:
datahub/ingestion/source/gc/datahub_gc.py CHANGED
@@ -188,6 +188,9 @@ class DataHubGcSource(Source):
         self._truncate_timeseries_helper(
             aspect_name="dashboardUsageStatistics", entity_type="dashboard"
         )
+        self._truncate_timeseries_helper(
+            aspect_name="queryusagestatistics", entity_type="query"
+        )
 
     def _truncate_timeseries_helper(self, aspect_name: str, entity_type: str) -> None:
         self._truncate_timeseries_with_watch_optional(
datahub/ingestion/source/gc/execution_request_cleanup.py CHANGED
@@ -141,7 +141,9 @@ class DatahubExecutionRequestCleanup:
                 break
             if self.report.ergc_read_errors >= self.config.max_read_errors:
                 self.report.failure(
-                    f"ergc({self.instance_id}): too many read errors, aborting."
+                    title="Too many read errors, aborting",
+                    message="Too many read errors, aborting",
+                    context=str(self.instance_id),
                 )
                 break
             try:
@@ -158,8 +160,11 @@ class DatahubExecutionRequestCleanup:
                     break
                 params["scrollId"] = document["scrollId"]
             except Exception as e:
-                logger.error(
-                    f"ergc({self.instance_id}): failed to fetch next batch of execution requests: {e}"
+                self.report.failure(
+                    title="Failed to fetch next batch of execution requests",
+                    message="Failed to fetch next batch of execution requests",
+                    context=str(self.instance_id),
+                    exc=e,
                 )
                 self.report.ergc_read_errors += 1
 
@@ -231,8 +236,11 @@ class DatahubExecutionRequestCleanup:
                 self.graph.delete_entity(entry.urn, True)
             except Exception as e:
                 self.report.ergc_delete_errors += 1
-                logger.error(
-                    f"ergc({self.instance_id}): failed to delete ExecutionRequest {entry.request_id}: {e}"
+                self.report.failure(
+                    title="Failed to delete ExecutionRequest",
+                    message="Failed to delete ExecutionRequest",
+                    context=str(self.instance_id),
+                    exc=e,
                 )
 
     def _reached_runtime_limit(self) -> bool:
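All three hunks above swap ad-hoc logger.error calls for structured report.failure(...) entries, so the errors land in the ingestion report itself rather than only in the log stream, with the cleanup instance id carried as context. A minimal stand-in (not DataHub's SourceReport) showing the point of the title/message/context/exc convention: repeats of the same failure can group under one stable title while each occurrence keeps its own context.

    from collections import defaultdict
    from typing import Dict, List, Optional

    class MiniReport:
        """Illustrative stand-in for a structured source report."""

        def __init__(self) -> None:
            self.failures: Dict[str, List[str]] = defaultdict(list)

        def failure(
            self,
            title: str,
            message: str,
            context: str,
            exc: Optional[BaseException] = None,
        ) -> None:
            detail = f"{context}: {message}" + (f" ({exc!r})" if exc else "")
            self.failures[title].append(detail)

    report = MiniReport()
    for instance_id in ("ergc-a", "ergc-b"):
        report.failure(
            title="Failed to delete ExecutionRequest",
            message="Failed to delete ExecutionRequest",
            context=instance_id,
            exc=RuntimeError("HTTP 500"),
        )
    assert list(report.failures) == ["Failed to delete ExecutionRequest"]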
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py CHANGED
@@ -105,6 +105,8 @@ class SoftDeletedEntitiesReport(SourceReport):
     sample_hard_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field(
         default_factory=TopKDict
     )
+    runtime_limit_reached: bool = False
+    deletion_limit_reached: bool = False
 
 
 class SoftDeletedEntitiesCleanup:
@@ -163,6 +165,8 @@ class SoftDeletedEntitiesCleanup:
                 f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
             )
             return
+        if self._deletion_limit_reached() or self._times_up():
+            return
         self._increment_removal_started_count()
         self.ctx.graph.delete_entity(urn=urn, hard=True)
         self.ctx.graph.delete_references_to_urn(
@@ -203,11 +207,10 @@ class SoftDeletedEntitiesCleanup:
         for future in done:
             self._print_report()
             if future.exception():
-                logger.error(
-                    f"Failed to delete entity {futures[future]}: {future.exception()}"
-                )
                 self.report.failure(
-                    f"Failed to delete entity {futures[future]}",
+                    title="Failed to delete entity",
+                    message="Failed to delete entity",
+                    context=futures[future],
                     exc=future.exception(),
                 )
             self.report.num_soft_deleted_entity_processed += 1
@@ -255,7 +258,7 @@ class SoftDeletedEntitiesCleanup:
                 )
                 break
             scroll_across_entities = result.get("scrollAcrossEntities")
-            if not scroll_across_entities:
+            if not scroll_across_entities or not scroll_across_entities.get("count"):
                 break
             scroll_id = scroll_across_entities.get("nextScrollId")
             self.report.num_queries_found += scroll_across_entities.get("count")
@@ -274,6 +277,26 @@ class SoftDeletedEntitiesCleanup:
         )
         yield from self._get_soft_deleted_queries()
 
+    def _times_up(self) -> bool:
+        if (
+            self.config.runtime_limit_seconds
+            and time.time() - self.start_time > self.config.runtime_limit_seconds
+        ):
+            with self._report_lock:
+                self.report.runtime_limit_reached = True
+            return True
+        return False
+
+    def _deletion_limit_reached(self) -> bool:
+        if (
+            self.config.limit_entities_delete
+            and self.report.num_hard_deleted > self.config.limit_entities_delete
+        ):
+            with self._report_lock:
+                self.report.deletion_limit_reached = True
+            return True
+        return False
+
     def cleanup_soft_deleted_entities(self) -> None:
         if not self.config.enabled:
             return
@@ -285,24 +308,8 @@ class SoftDeletedEntitiesCleanup:
                 self._print_report()
                 while len(futures) >= self.config.futures_max_at_time:
                     futures = self._process_futures(futures)
-                if (
-                    self.config.limit_entities_delete
-                    and self.report.num_hard_deleted > self.config.limit_entities_delete
-                ):
-                    logger.info(
-                        f"Limit of {self.config.limit_entities_delete} entities reached. Stopped adding more."
-                    )
+                if self._deletion_limit_reached() or self._times_up():
                     break
-                if (
-                    self.config.runtime_limit_seconds
-                    and time.time() - self.start_time
-                    > self.config.runtime_limit_seconds
-                ):
-                    logger.info(
-                        f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached. Not submitting more futures."
-                    )
-                    break
-
                 future = executor.submit(self.delete_soft_deleted_entity, urn)
                 futures[future] = urn
 
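The two inline limit checks in cleanup_soft_deleted_entities are consolidated into _times_up() and _deletion_limit_reached(), which also record why the run stopped as report flags (taken under the report lock, since deletes run on executor threads) and are now consulted inside delete_soft_deleted_entity as well, so already-queued workers become no-ops once a limit trips instead of draining the whole backlog. A self-contained sketch of the pattern, with hypothetical names mirroring the real ones:

    import threading
    import time

    class CleanupGuards:
        """Illustrative only; mirrors the guard helpers added above."""

        def __init__(self, runtime_limit_seconds: float, limit_entities_delete: int) -> None:
            self.start_time = time.time()
            self.runtime_limit_seconds = runtime_limit_seconds
            self.limit_entities_delete = limit_entities_delete
            self.num_hard_deleted = 0
            self.runtime_limit_reached = False
            self.deletion_limit_reached = False
            self._report_lock = threading.Lock()

        def _times_up(self) -> bool:
            if time.time() - self.start_time > self.runtime_limit_seconds:
                with self._report_lock:
                    self.runtime_limit_reached = True  # recorded for the final report
                return True
            return False

        def _deletion_limit_reached(self) -> bool:
            if self.num_hard_deleted > self.limit_entities_delete:
                with self._report_lock:
                    self.deletion_limit_reached = True
                return True
            return False

    guards = CleanupGuards(runtime_limit_seconds=3600, limit_entities_delete=100)
    # Checked both before submitting new work and at the top of each worker:
    if guards._deletion_limit_reached() or guards._times_up():
        pass  # stop deleting; the report flags explain which limit tripped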
datahub/ingestion/source/snowflake/snowflake_queries.py CHANGED
@@ -61,6 +61,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     ColumnRef,
     DownstreamColumnRef,
 )
+from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
 from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList
 from datahub.utilities.perf_timer import PerfTimer
 
@@ -475,10 +476,11 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
         entry = PreparsedQuery(
             # Despite having Snowflake's fingerprints available, our own fingerprinting logic does a better
-            # job at eliminating redundant / repetitive queries. As such, we don't include the fingerprint
-            # here so that the aggregator auto-generates one.
-            # query_id=res["query_fingerprint"],
-            query_id=None,
+            # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
+            # here
+            query_id=get_query_fingerprint(
+                res["query_text"], self.identifiers.platform, fast=True
+            ),
             query_text=res["query_text"],
             upstreams=upstreams,
             downstream=downstream,
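Instead of leaving query_id=None and letting the aggregator generate an id, the Snowflake extractor now computes the fast fingerprint up front via the get_query_fingerprint import added above. A hedged usage sketch (fast=True selects the cheaper text-normalization path rather than a full sqlglot round-trip; the exact normalization is internal to sqlglot_utils):

    from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint

    # Deterministic id for a (query text, platform) pair; equal texts map to
    # equal ids, which is what lets repetitive queries collapse downstream.
    query_id = get_query_fingerprint(
        "SELECT id FROM db.sch.tbl WHERE id = 42",
        platform="snowflake",
        fast=True,
    )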
datahub/ingestion/source/tableau/tableau.py CHANGED
@@ -2,9 +2,9 @@ import json
 import logging
 import re
 import time
-from collections import OrderedDict
-from dataclasses import dataclass
-from datetime import datetime
+from collections import OrderedDict, defaultdict
+from dataclasses import dataclass, field as dataclass_field
+from datetime import datetime, timedelta, timezone
 from functools import lru_cache
 from typing import (
     Any,
@@ -196,6 +196,11 @@ RETRIABLE_ERROR_CODES = [
     504,  # Gateway Timeout
 ]
 
+# From experience, this expiry time typically ranges from 50 minutes
+# to 2 hours but might as well be configurable. We will allow upto
+# 10 minutes of such expiry time
+REGULAR_AUTH_EXPIRY_PERIOD = timedelta(minutes=10)
+
 logger: logging.Logger = logging.getLogger(__name__)
 
 # Replace / with |
@@ -637,6 +642,7 @@ class SiteIdContentUrl:
     site_content_url: str
 
 
+@dataclass
 class TableauSourceReport(StaleEntityRemovalSourceReport):
     get_all_datasources_query_failed: bool = False
     num_get_datasource_query_failures: int = 0
@@ -653,7 +659,14 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
     num_upstream_table_lineage_failed_parse_sql: int = 0
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
-    logged_in_user: List[UserInfo] = []
+    logged_in_user: List[UserInfo] = dataclass_field(default_factory=list)
+    last_authenticated_at: Optional[datetime] = None
+
+    num_expected_tableau_metadata_queries: int = 0
+    num_actual_tableau_metadata_queries: int = 0
+    tableau_server_error_stats: Dict[str, int] = dataclass_field(
+        default_factory=(lambda: defaultdict(int))
+    )
 
 
 def report_user_role(report: TableauSourceReport, server: Server) -> None:
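Two quiet fixes here: TableauSourceReport gains the @dataclass decorator its field declarations assume, and the class-level mutable default logged_in_user: List[UserInfo] = [] becomes dataclass_field(default_factory=list). A plain-Python illustration (not DataHub code) of why that matters:

    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class Report:
        # A bare `users: List[str] = []` here would raise ValueError at class
        # creation time ("mutable default ... is not allowed"); on a plain,
        # non-dataclass class it would silently be shared by every instance.
        users: List[str] = field(default_factory=list)

    a, b = Report(), Report()
    a.users.append("alice")
    assert b.users == []  # each instance gets its own list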
@@ -724,6 +737,7 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
         try:
             logger.info(f"Authenticated to Tableau site: '{site_content_url}'")
             self.server = self.config.make_tableau_client(site_content_url)
+            self.report.last_authenticated_at = datetime.now(timezone.utc)
             report_user_role(report=self.report, server=self.server)
         # Note that we're not catching ConfigurationError, since we want that to throw.
         except ValueError as e:
@@ -807,10 +821,13 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
             site_source = TableauSiteSource(
                 config=self.config,
                 ctx=self.ctx,
-                site=site
-                if site
-                else SiteIdContentUrl(
-                    site_id=self.server.site_id, site_content_url=self.config.site
+                site=(
+                    site
+                    if site
+                    else SiteIdContentUrl(
+                        site_id=self.server.site_id,
+                        site_content_url=self.config.site,
+                    )
                 ),
                 report=self.report,
                 server=self.server,
@@ -925,6 +942,7 @@ class TableauSiteSource:
         # Sign-in again may not be enough because Tableau sometimes caches invalid sessions
         # so we need to recreate the Tableau Server object
         self.server = self.config.make_tableau_client(self.site_content_url)
+        self.report.last_authenticated_at = datetime.now(timezone.utc)
 
     def _populate_usage_stat_registry(self) -> None:
         if self.server is None:
@@ -1190,6 +1208,7 @@ class TableauSiteSource:
         )
         try:
             assert self.server is not None
+            self.report.num_actual_tableau_metadata_queries += 1
             query_data = query_metadata_cursor_based_pagination(
                 server=self.server,
                 main_query=query,
@@ -1199,25 +1218,36 @@ class TableauSiteSource:
                 qry_filter=query_filter,
             )
 
-        except REAUTHENTICATE_ERRORS:
-            if not retry_on_auth_error:
+        except REAUTHENTICATE_ERRORS as e:
+            self.report.tableau_server_error_stats[e.__class__.__name__] += 1
+            if not retry_on_auth_error or retries_remaining <= 0:
                 raise
 
-            # If ingestion has been running for over 2 hours, the Tableau
-            # temporary credentials will expire. If this happens, this exception
-            # will be thrown, and we need to re-authenticate and retry.
-            self._re_authenticate()
+            # We have been getting some irregular authorization errors like below well before the expected expiry time
+            # - within few seconds of initial authentication . We'll retry without re-auth for such cases.
+            # <class 'tableauserverclient.server.endpoint.exceptions.NonXMLResponseError'>:
+            # b'{"timestamp":"xxx","status":401,"error":"Unauthorized","path":"/relationship-service-war/graphql"}'
+            if self.report.last_authenticated_at and (
+                datetime.now(timezone.utc) - self.report.last_authenticated_at
+                > REGULAR_AUTH_EXPIRY_PERIOD
+            ):
+                # If ingestion has been running for over 2 hours, the Tableau
+                # temporary credentials will expire. If this happens, this exception
+                # will be thrown, and we need to re-authenticate and retry.
+                self._re_authenticate()
+
             return self.get_connection_object_page(
                 query=query,
                 connection_type=connection_type,
                 query_filter=query_filter,
                 fetch_size=fetch_size,
                 current_cursor=current_cursor,
-                retry_on_auth_error=False,
+                retry_on_auth_error=True,
                 retries_remaining=retries_remaining - 1,
             )
 
         except InternalServerError as ise:
+            self.report.tableau_server_error_stats[InternalServerError.__name__] += 1
             # In some cases Tableau Server returns 504 error, which is a timeout error, so it worths to retry.
             # Extended with other retryable errors.
             if ise.code in RETRIABLE_ERROR_CODES:
@@ -1230,13 +1260,14 @@ class TableauSiteSource:
                     query_filter=query_filter,
                     fetch_size=fetch_size,
                     current_cursor=current_cursor,
-                    retry_on_auth_error=False,
+                    retry_on_auth_error=True,
                     retries_remaining=retries_remaining - 1,
                 )
             else:
                 raise ise
 
         except OSError:
+            self.report.tableau_server_error_stats[OSError.__name__] += 1
             # In tableauseverclient 0.26 (which was yanked and released in 0.28 on 2023-10-04),
             # the request logic was changed to use threads.
             # https://github.com/tableau/server-client-python/commit/307d8a20a30f32c1ce615cca7c6a78b9b9bff081
@@ -1251,7 +1282,7 @@ class TableauSiteSource:
                 query_filter=query_filter,
                 fetch_size=fetch_size,
                 current_cursor=current_cursor,
-                retry_on_auth_error=False,
+                retry_on_auth_error=True,
                 retries_remaining=retries_remaining - 1,
             )
 
@@ -1339,7 +1370,7 @@ class TableauSiteSource:
                     query_filter=query_filter,
                     fetch_size=fetch_size,
                     current_cursor=current_cursor,
-                    retry_on_auth_error=False,
+                    retry_on_auth_error=True,
                     retries_remaining=retries_remaining,
                 )
             raise RuntimeError(f"Query {connection_type} error: {errors}")
@@ -1377,6 +1408,7 @@ class TableauSiteSource:
         while has_next_page:
             filter_: str = make_filter(filter_page)
 
+            self.report.num_expected_tableau_metadata_queries += 1
             (
                 connection_objects,
                 current_cursor,
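The retry path now separates two kinds of 401: a genuine expiry of Tableau's temporary credentials (session older than REGULAR_AUTH_EXPIRY_PERIOD, so re-authenticate before retrying) and the spurious early 401s quoted in the comment (retried with the existing session). Recursing with retry_on_auth_error=True, bounded by retries_remaining, replaces the old one-shot False, and the new num_expected_/num_actual_tableau_metadata_queries counters plus tableau_server_error_stats make the retry amplification visible in the report. The core decision, as a condensed sketch:

    from datetime import datetime, timedelta, timezone

    REGULAR_AUTH_EXPIRY_PERIOD = timedelta(minutes=10)

    def should_reauthenticate(last_authenticated_at: datetime) -> bool:
        # Old session: the 401 is plausibly a real credential expiry, so sign
        # in again. Fresh session: treat it as a transient server-side 401 and
        # retry without re-authenticating.
        age = datetime.now(timezone.utc) - last_authenticated_at
        return age > REGULAR_AUTH_EXPIRY_PERIOD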
datahub/ingestion/source/usage/usage_common.py CHANGED
@@ -54,6 +54,20 @@ def default_user_urn_builder(email: str) -> str:
     return builder.make_user_urn(email.split("@")[0])
 
 
+def extract_user_email(user: str) -> Optional[str]:
+    """Extracts user email from user input
+
+    >>> extract_user_email('urn:li:corpuser:abc@xyz.com')
+    'abc@xyz.com'
+    >>> extract_user_email('urn:li:corpuser:abc')
+    >>> extract_user_email('abc@xyz.com')
+    'abc@xyz.com'
+    """
+    if user.startswith(("urn:li:corpuser:", "urn:li:corpGroup:")):
+        user = user.split(":")[-1]
+    return user if "@" in user else None
+
+
 def make_usage_workunit(
     bucket_start_time: datetime,
     resource: ResourceType,
@@ -104,7 +118,7 @@ def make_usage_workunit(
             DatasetUserUsageCountsClass(
                 user=user_urn_builder(user),
                 count=count,
-                userEmail=user if "@" in user else None,
+                userEmail=extract_user_email(user),
             )
             for user, count in user_freq
         ],
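extract_user_email centralizes the old inline `user if "@" in user else None` check and additionally strips corpuser/corpGroup URN prefixes before testing for an email, so userEmail is now populated for URN-shaped user values too. The docstring's >>> examples double as tests; one way to exercise them with the standard library:

    import doctest

    from datahub.ingestion.source.usage.usage_common import extract_user_email

    # Runs just the examples embedded in the function's docstring.
    doctest.run_docstring_examples(
        extract_user_email,
        {"extract_user_email": extract_user_email},
        verbose=True,
    )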
datahub/sql_parsing/sql_parsing_aggregator.py CHANGED
@@ -198,7 +198,7 @@ class TableSwap:
 
 @dataclasses.dataclass
 class PreparsedQuery:
-    # If not provided, we will generate one using the fast fingerprint generator.
+    # If not provided, we will generate one using the fingerprint generator.
     query_id: Optional[QueryId]
 
     query_text: str
@@ -622,7 +622,6 @@ class SqlParsingAggregator(Closeable):
         query_fingerprint = get_query_fingerprint(
             known_query_lineage.query_text,
             platform=self.platform.platform_name,
-            fast=True,
         )
         formatted_query = self._maybe_format_query(known_query_lineage.query_text)
 
@@ -848,7 +847,6 @@ class SqlParsingAggregator(Closeable):
         query_fingerprint = get_query_fingerprint(
             parsed.query_text,
             platform=self.platform.platform_name,
-            fast=True,
         )
 
         # Format the query.
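This is the mirror image of the snowflake_queries.py change above: the aggregator drops fast=True and returns to the default fingerprint for known query lineage and preparsed queries, while extractors that want the fast variant (like Snowflake) now opt in explicitly at the call site. One caveat worth stating, sketched under the assumption that the two modes normalize differently and therefore produce ids that should not be mixed:

    from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint

    q = "SELECT id FROM db.sch.tbl WHERE id = 42"
    default_id = get_query_fingerprint(q, platform="snowflake")  # parse-based
    fast_id = get_query_fingerprint(q, platform="snowflake", fast=True)  # text-based
    # Compare ids only within a single mode; an id generated with fast=True is
    # not expected to match the default-mode id for the same query text.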