acryl-datahub 0.15.0.1rc14__py3-none-any.whl → 0.15.0.1rc16__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc14.dist-info → acryl_datahub-0.15.0.1rc16.dist-info}/METADATA +2440 -2440
- {acryl_datahub-0.15.0.1rc14.dist-info → acryl_datahub-0.15.0.1rc16.dist-info}/RECORD +13 -13
- {acryl_datahub-0.15.0.1rc14.dist-info → acryl_datahub-0.15.0.1rc16.dist-info}/WHEEL +1 -1
- datahub/__init__.py +1 -1
- datahub/ingestion/source/gc/datahub_gc.py +3 -0
- datahub/ingestion/source/gc/execution_request_cleanup.py +13 -5
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +29 -22
- datahub/ingestion/source/snowflake/snowflake_queries.py +6 -4
- datahub/ingestion/source/tableau/tableau.py +50 -18
- datahub/ingestion/source/usage/usage_common.py +15 -1
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -3
- {acryl_datahub-0.15.0.1rc14.dist-info → acryl_datahub-0.15.0.1rc16.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc14.dist-info → acryl_datahub-0.15.0.1rc16.dist-info}/top_level.txt +0 -0
{acryl_datahub-0.15.0.1rc14.dist-info → acryl_datahub-0.15.0.1rc16.dist-info}/RECORD RENAMED
@@ -1,4 +1,4 @@
-datahub/__init__.py,sha256=
+datahub/__init__.py,sha256=0dgSJoggO_qJtX-oEnxH20rGzNGGCstuwsxqUKzbKUA,577
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
 datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -302,10 +302,10 @@ datahub/ingestion/source/fivetran/fivetran.py,sha256=uKbM5czPz-6LOseoh1FwavWDIuL
 datahub/ingestion/source/fivetran/fivetran_log_api.py,sha256=EAak3hJpe75WZSgz6wP_CyAT5Cian2N4a-lb8x1NKHk,12776
 datahub/ingestion/source/fivetran/fivetran_query.py,sha256=vLrTj7e-0NxZ2U4bWTB57pih42WirqPlUvwtIRfStlQ,5275
 datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/gc/datahub_gc.py,sha256=
+datahub/ingestion/source/gc/datahub_gc.py,sha256=W6uoeV7B4WIXdxT4tOEdDksdJm656WwwvkH79L7f_8Q,12969
 datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=86Tm3NNWMf0xM4TklNIEeNOjEingKpYy-XvCPeaAb4k,17125
-datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=
-datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=
+datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=9jsyCIspWSSYSAVPHjKHr05885rXxM6FCH7KzTBceic,10139
+datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=2JpESfsqoJRdLskV3AHYU8nRj_NvNtIaLZ4_RRNIod4,11229
 datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/gcs/gcs_source.py,sha256=iwvj4JwjyVWRP1Vq106sUtQhh0GuOYVSu9zCa1wCZN0,6189
 datahub/ingestion/source/gcs/gcs_utils.py,sha256=_78KM863XXgkVLmZLtYGF5PJNnZas1go-XRtOq-79lo,1047
@@ -434,7 +434,7 @@ datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJ
 datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
 datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=5Lpy_irZlbOFJbvVkgsZSBjdLCT3VZNjlEvttzSQAU4,21121
 datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
-datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=
+datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=jTpnFWRqqFId6DKJvvAbNuFPxyNi1oQxxDUyMvh1iu4,26968
 datahub/ingestion/source/snowflake/snowflake_query.py,sha256=5po2FWz41UVowykJYbTFGxsltbmlHBCPcHG20VOhdOE,38469
 datahub/ingestion/source/snowflake/snowflake_report.py,sha256=_-rD7Q4MzKY8fYzJHSBnGX4gurwujL3UoRzcP_TZURs,6468
 datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=z5ZPgh-TILAz0DeIwDxRCsj980CM2BbftXiFpM1dV_Y,21674
@@ -491,7 +491,7 @@ datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider
 datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py,sha256=xsH7Ao_05VTjqpkzLkhdf5B1ULMzFoD8vkJJIJU9w-U,4077
 datahub/ingestion/source/state_provider/state_provider_registry.py,sha256=SVq4mIyGNmLXE9OZx1taOiNPqDoQp03-Ot9rYnB5F3k,401
 datahub/ingestion/source/tableau/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/tableau/tableau.py,sha256=
+datahub/ingestion/source/tableau/tableau.py,sha256=KAwyM9XiJUXFPwuVQM7GcHntcTFxMFAN4j3xSIOMbgg,142010
 datahub/ingestion/source/tableau/tableau_common.py,sha256=a3Nu0Upy6_pnrd7XpSMcYHdnYca1JBW7H0jMqkYr0ME,26871
 datahub/ingestion/source/tableau/tableau_constant.py,sha256=ZcAeHsQUXVVL26ORly0ByZk_GJAFbxaKuJAlX_sYMac,2686
 datahub/ingestion/source/tableau/tableau_server_wrapper.py,sha256=nSyx9RzC6TCQDm-cTVJ657qT8iDwzk_8JMKpohhmOc4,1046
@@ -511,7 +511,7 @@ datahub/ingestion/source/unity/usage.py,sha256=igRxYg8usukTAA229uJWi-0y-Zd0yOq9d
 datahub/ingestion/source/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/usage/clickhouse_usage.py,sha256=8nQqNAPKqivywjzsvqH0-HWFwjd4gECpw_xahLXk5ek,9970
 datahub/ingestion/source/usage/starburst_trino_usage.py,sha256=R1DDs98tYn2WW0_tGXQhk7lqEU0ru2SgrvMBtV305ps,10542
-datahub/ingestion/source/usage/usage_common.py,sha256=
+datahub/ingestion/source/usage/usage_common.py,sha256=poNlVKx1VRPRfE4K3yAyIS96DkGAt3MC17vQlwqBNvw,12235
 datahub/ingestion/source_config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source_config/csv_enricher.py,sha256=IROxxfFJA56dHkmmbjjhb7h1pZSi33tzW9sQb7ZEgac,1733
 datahub/ingestion/source_config/operation_config.py,sha256=Q0NlqiEh4s4DFIII5NsAp5hxWTVyyJz-ldcQmH-B47s,3504
@@ -881,7 +881,7 @@ datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn
 datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
 datahub/sql_parsing/schema_resolver.py,sha256=9INZWdxA2dMSLK6RXaVqjbjyLY_VKMhCkQv_Xd6Ln3I,10848
 datahub/sql_parsing/split_statements.py,sha256=uZhAXLaRxDfmK0lPBW2oM_YVdJfSMhdgndnfd9iIXuA,5001
-datahub/sql_parsing/sql_parsing_aggregator.py,sha256=
+datahub/sql_parsing/sql_parsing_aggregator.py,sha256=jVF6TbyM71XdJ34K0Setz3LgJALvJrJs1mVKdxU_6d4,69830
 datahub/sql_parsing/sql_parsing_common.py,sha256=h_V_m54hJ9EUh5kczq7cYOIeNeo4bgf0Px0H-Nq-UIg,2602
 datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
 datahub/sql_parsing/sqlglot_lineage.py,sha256=gUVq3NwZUzQByJs43JZXz8lZf0ZVzVt0FzaW5wZOwK4,47460
@@ -986,8 +986,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
+acryl_datahub-0.15.0.1rc16.dist-info/METADATA,sha256=hMvfZy8EYOj5eb7yygEhb_kZJbHtpVx-bWNE6H6eu_c,173444
+acryl_datahub-0.15.0.1rc16.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
+acryl_datahub-0.15.0.1rc16.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+acryl_datahub-0.15.0.1rc16.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-0.15.0.1rc16.dist-info/RECORD,,
datahub/__init__.py CHANGED

datahub/ingestion/source/gc/datahub_gc.py CHANGED
@@ -188,6 +188,9 @@ class DataHubGcSource(Source):
         self._truncate_timeseries_helper(
             aspect_name="dashboardUsageStatistics", entity_type="dashboard"
         )
+        self._truncate_timeseries_helper(
+            aspect_name="queryusagestatistics", entity_type="query"
+        )
 
     def _truncate_timeseries_helper(self, aspect_name: str, entity_type: str) -> None:
         self._truncate_timeseries_with_watch_optional(
datahub/ingestion/source/gc/execution_request_cleanup.py CHANGED
@@ -141,7 +141,9 @@ class DatahubExecutionRequestCleanup:
                 break
             if self.report.ergc_read_errors >= self.config.max_read_errors:
                 self.report.failure(
-
+                    title="Too many read errors, aborting",
+                    message="Too many read errors, aborting",
+                    context=str(self.instance_id),
                 )
                 break
             try:
@@ -158,8 +160,11 @@ class DatahubExecutionRequestCleanup:
                     break
                 params["scrollId"] = document["scrollId"]
             except Exception as e:
-
-
+                self.report.failure(
+                    title="Failed to fetch next batch of execution requests",
+                    message="Failed to fetch next batch of execution requests",
+                    context=str(self.instance_id),
+                    exc=e,
                 )
                 self.report.ergc_read_errors += 1
 
@@ -231,8 +236,11 @@ class DatahubExecutionRequestCleanup:
                 self.graph.delete_entity(entry.urn, True)
             except Exception as e:
                 self.report.ergc_delete_errors += 1
-
-
+                self.report.failure(
+                    title="Failed to delete ExecutionRequest",
+                    message="Failed to delete ExecutionRequest",
+                    context=str(self.instance_id),
+                    exc=e,
                 )
 
     def _reached_runtime_limit(self) -> bool:
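The pattern running through these hunks is DataHub's structured error reporting: bare or empty failure calls become report.failure(...) with a title, message, context, and the originating exception. A minimal sketch of that pattern, assuming only the title/message/context/exc keywords that the diff itself uses (the instance id value is a hypothetical stand-in):

from datahub.ingestion.api.source import SourceReport

report = SourceReport()
instance_id = "ergc-demo"  # hypothetical stand-in for self.instance_id

try:
    raise TimeoutError("simulated scroll failure")
except Exception as e:
    # Record a structured failure on the report instead of only logging,
    # mirroring the change above; callers can still count errors separately.
    report.failure(
        title="Failed to fetch next batch of execution requests",
        message="Failed to fetch next batch of execution requests",
        context=str(instance_id),
        exc=e,
    )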
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py CHANGED
@@ -105,6 +105,8 @@ class SoftDeletedEntitiesReport(SourceReport):
     sample_hard_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field(
         default_factory=TopKDict
     )
+    runtime_limit_reached: bool = False
+    deletion_limit_reached: bool = False
 
 
 class SoftDeletedEntitiesCleanup:
@@ -163,6 +165,8 @@ class SoftDeletedEntitiesCleanup:
                 f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
             )
             return
+        if self._deletion_limit_reached() or self._times_up():
+            return
         self._increment_removal_started_count()
         self.ctx.graph.delete_entity(urn=urn, hard=True)
         self.ctx.graph.delete_references_to_urn(
@@ -203,11 +207,10 @@ class SoftDeletedEntitiesCleanup:
         for future in done:
             self._print_report()
             if future.exception():
-                logger.error(
-                    f"Failed to delete entity {futures[future]}: {future.exception()}"
-                )
                 self.report.failure(
-
+                    title="Failed to delete entity",
+                    message="Failed to delete entity",
+                    context=futures[future],
                     exc=future.exception(),
                 )
             self.report.num_soft_deleted_entity_processed += 1
@@ -255,7 +258,7 @@ class SoftDeletedEntitiesCleanup:
                 )
                 break
             scroll_across_entities = result.get("scrollAcrossEntities")
-            if not scroll_across_entities:
+            if not scroll_across_entities or not scroll_across_entities.get("count"):
                 break
             scroll_id = scroll_across_entities.get("nextScrollId")
             self.report.num_queries_found += scroll_across_entities.get("count")
@@ -274,6 +277,26 @@ class SoftDeletedEntitiesCleanup:
         )
         yield from self._get_soft_deleted_queries()
 
+    def _times_up(self) -> bool:
+        if (
+            self.config.runtime_limit_seconds
+            and time.time() - self.start_time > self.config.runtime_limit_seconds
+        ):
+            with self._report_lock:
+                self.report.runtime_limit_reached = True
+            return True
+        return False
+
+    def _deletion_limit_reached(self) -> bool:
+        if (
+            self.config.limit_entities_delete
+            and self.report.num_hard_deleted > self.config.limit_entities_delete
+        ):
+            with self._report_lock:
+                self.report.deletion_limit_reached = True
+            return True
+        return False
+
     def cleanup_soft_deleted_entities(self) -> None:
         if not self.config.enabled:
             return
@@ -285,24 +308,8 @@ class SoftDeletedEntitiesCleanup:
             self._print_report()
             while len(futures) >= self.config.futures_max_at_time:
                 futures = self._process_futures(futures)
-            if (
-                self.config.limit_entities_delete
-                and self.report.num_hard_deleted > self.config.limit_entities_delete
-            ):
-                logger.info(
-                    f"Limit of {self.config.limit_entities_delete} entities reached. Stopped adding more."
-                )
+            if self._deletion_limit_reached() or self._times_up():
                 break
-            if (
-                self.config.runtime_limit_seconds
-                and time.time() - self.start_time
-                > self.config.runtime_limit_seconds
-            ):
-                logger.info(
-                    f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached. Not submitting more futures."
-                )
-                break
-
             future = executor.submit(self.delete_soft_deleted_entity, urn)
             futures[future] = urn
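Both new guards share one shape: check a configured limit, record that it was hit on the report (under the report lock), and return True so that the submission loop and each in-flight worker both stop early. A self-contained sketch of that pattern using only the standard library (all names here are illustrative, not the module's real API):

import time
from concurrent.futures import ThreadPoolExecutor
from threading import Lock

RUNTIME_LIMIT_SECONDS = 5.0  # cf. config.runtime_limit_seconds
LIMIT_ENTITIES_DELETE = 100  # cf. config.limit_entities_delete

start_time = time.time()
num_hard_deleted = 0
counter_lock = Lock()  # the real code guards report writes similarly


def times_up() -> bool:
    return time.time() - start_time > RUNTIME_LIMIT_SECONDS


def deletion_limit_reached() -> bool:
    return num_hard_deleted > LIMIT_ENTITIES_DELETE


def delete_entity(urn: str) -> None:
    global num_hard_deleted
    # Checked inside the worker too: futures already submitted would
    # otherwise keep deleting after a limit is hit.
    if deletion_limit_reached() or times_up():
        return
    with counter_lock:
        num_hard_deleted += 1


with ThreadPoolExecutor(max_workers=4) as executor:
    for i in range(1000):
        # Stop submitting new futures once either limit is reached.
        if deletion_limit_reached() or times_up():
            break
        executor.submit(delete_entity, f"urn:li:dataset:example-{i}")

print(f"hard deleted {num_hard_deleted} entities")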
datahub/ingestion/source/snowflake/snowflake_queries.py CHANGED
@@ -61,6 +61,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     ColumnRef,
     DownstreamColumnRef,
 )
+from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
 from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList
 from datahub.utilities.perf_timer import PerfTimer
 
@@ -475,10 +476,11 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
         entry = PreparsedQuery(
             # Despite having Snowflake's fingerprints available, our own fingerprinting logic does a better
-            # job at eliminating redundant / repetitive queries. As such, we
-            # here
-
-
+            # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
+            # here
+            query_id=get_query_fingerprint(
+                res["query_text"], self.identifiers.platform, fast=True
+            ),
             query_text=res["query_text"],
             upstreams=upstreams,
             downstream=downstream,
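get_query_fingerprint is the helper imported above; fast=True selects a cheaper text-based fingerprint instead of one derived from a full SQL parse. A small sketch of the call as used for query_id (exact fingerprint values depend on the datahub version):

from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint

# Two queries differing only in whitespace and a literal value.
q1 = "SELECT a, b FROM tbl WHERE id = 1"
q2 = "SELECT a,  b FROM tbl  WHERE id = 2"

fp1 = get_query_fingerprint(q1, platform="snowflake", fast=True)
fp2 = get_query_fingerprint(q2, platform="snowflake", fast=True)

# The fingerprint normalizes the query text, so repetitive queries like
# these typically collapse to the same id and are deduplicated downstream.
print(fp1 == fp2)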
datahub/ingestion/source/tableau/tableau.py CHANGED
@@ -2,9 +2,9 @@ import json
 import logging
 import re
 import time
-from collections import OrderedDict
-from dataclasses import dataclass
-from datetime import datetime
+from collections import OrderedDict, defaultdict
+from dataclasses import dataclass, field as dataclass_field
+from datetime import datetime, timedelta, timezone
 from functools import lru_cache
 from typing import (
     Any,
@@ -196,6 +196,11 @@ RETRIABLE_ERROR_CODES = [
     504,  # Gateway Timeout
 ]
 
+# From experience, this expiry time typically ranges from 50 minutes
+# to 2 hours but might as well be configurable. We will allow upto
+# 10 minutes of such expiry time
+REGULAR_AUTH_EXPIRY_PERIOD = timedelta(minutes=10)
+
 logger: logging.Logger = logging.getLogger(__name__)
 
 # Replace / with |
@@ -637,6 +642,7 @@ class SiteIdContentUrl:
     site_content_url: str
 
 
+@dataclass
 class TableauSourceReport(StaleEntityRemovalSourceReport):
     get_all_datasources_query_failed: bool = False
     num_get_datasource_query_failures: int = 0
@@ -653,7 +659,14 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
     num_upstream_table_lineage_failed_parse_sql: int = 0
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
-    logged_in_user: List[UserInfo] =
+    logged_in_user: List[UserInfo] = dataclass_field(default_factory=list)
+    last_authenticated_at: Optional[datetime] = None
+
+    num_expected_tableau_metadata_queries: int = 0
+    num_actual_tableau_metadata_queries: int = 0
+    tableau_server_error_stats: Dict[str, int] = dataclass_field(
+        default_factory=(lambda: defaultdict(int))
+    )
 
 
 def report_user_role(report: TableauSourceReport, server: Server) -> None:
@@ -724,6 +737,7 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
         try:
             logger.info(f"Authenticated to Tableau site: '{site_content_url}'")
             self.server = self.config.make_tableau_client(site_content_url)
+            self.report.last_authenticated_at = datetime.now(timezone.utc)
             report_user_role(report=self.report, server=self.server)
         # Note that we're not catching ConfigurationError, since we want that to throw.
         except ValueError as e:
@@ -807,10 +821,13 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
         site_source = TableauSiteSource(
             config=self.config,
             ctx=self.ctx,
-            site=
-
-
-
+            site=(
+                site
+                if site
+                else SiteIdContentUrl(
+                    site_id=self.server.site_id,
+                    site_content_url=self.config.site,
+                )
             ),
             report=self.report,
             server=self.server,
@@ -925,6 +942,7 @@ class TableauSiteSource:
         # Sign-in again may not be enough because Tableau sometimes caches invalid sessions
         # so we need to recreate the Tableau Server object
         self.server = self.config.make_tableau_client(self.site_content_url)
+        self.report.last_authenticated_at = datetime.now(timezone.utc)
 
     def _populate_usage_stat_registry(self) -> None:
         if self.server is None:
@@ -1190,6 +1208,7 @@ class TableauSiteSource:
         )
         try:
             assert self.server is not None
+            self.report.num_actual_tableau_metadata_queries += 1
             query_data = query_metadata_cursor_based_pagination(
                 server=self.server,
                 main_query=query,
@@ -1199,25 +1218,36 @@ class TableauSiteSource:
                 qry_filter=query_filter,
             )
 
-        except REAUTHENTICATE_ERRORS:
-
+        except REAUTHENTICATE_ERRORS as e:
+            self.report.tableau_server_error_stats[e.__class__.__name__] += 1
+            if not retry_on_auth_error or retries_remaining <= 0:
                 raise
 
-            #
-            #
-            #
-
+            # We have been getting some irregular authorization errors like below well before the expected expiry time
+            # - within few seconds of initial authentication . We'll retry without re-auth for such cases.
+            # <class 'tableauserverclient.server.endpoint.exceptions.NonXMLResponseError'>:
+            # b'{"timestamp":"xxx","status":401,"error":"Unauthorized","path":"/relationship-service-war/graphql"}'
+            if self.report.last_authenticated_at and (
+                datetime.now(timezone.utc) - self.report.last_authenticated_at
+                > REGULAR_AUTH_EXPIRY_PERIOD
+            ):
+                # If ingestion has been running for over 2 hours, the Tableau
+                # temporary credentials will expire. If this happens, this exception
+                # will be thrown, and we need to re-authenticate and retry.
+                self._re_authenticate()
+
             return self.get_connection_object_page(
                 query=query,
                 connection_type=connection_type,
                 query_filter=query_filter,
                 fetch_size=fetch_size,
                 current_cursor=current_cursor,
-                retry_on_auth_error=
+                retry_on_auth_error=True,
                 retries_remaining=retries_remaining - 1,
             )
 
         except InternalServerError as ise:
+            self.report.tableau_server_error_stats[InternalServerError.__name__] += 1
             # In some cases Tableau Server returns 504 error, which is a timeout error, so it worths to retry.
             # Extended with other retryable errors.
             if ise.code in RETRIABLE_ERROR_CODES:
@@ -1230,13 +1260,14 @@ class TableauSiteSource:
                 query_filter=query_filter,
                 fetch_size=fetch_size,
                 current_cursor=current_cursor,
-                retry_on_auth_error=
+                retry_on_auth_error=True,
                 retries_remaining=retries_remaining - 1,
             )
             else:
                 raise ise
 
         except OSError:
+            self.report.tableau_server_error_stats[OSError.__name__] += 1
             # In tableauseverclient 0.26 (which was yanked and released in 0.28 on 2023-10-04),
             # the request logic was changed to use threads.
             # https://github.com/tableau/server-client-python/commit/307d8a20a30f32c1ce615cca7c6a78b9b9bff081
@@ -1251,7 +1282,7 @@ class TableauSiteSource:
                 query_filter=query_filter,
                 fetch_size=fetch_size,
                 current_cursor=current_cursor,
-                retry_on_auth_error=
+                retry_on_auth_error=True,
                 retries_remaining=retries_remaining - 1,
             )
 
@@ -1339,7 +1370,7 @@ class TableauSiteSource:
                 query_filter=query_filter,
                 fetch_size=fetch_size,
                 current_cursor=current_cursor,
-                retry_on_auth_error=
+                retry_on_auth_error=True,
                 retries_remaining=retries_remaining,
             )
         raise RuntimeError(f"Query {connection_type} error: {errors}")
@@ -1377,6 +1408,7 @@ class TableauSiteSource:
         while has_next_page:
             filter_: str = make_filter(filter_page)
 
+            self.report.num_expected_tableau_metadata_queries += 1
             (
                 connection_objects,
                 current_cursor,
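The net effect of REGULAR_AUTH_EXPIRY_PERIOD is a two-tier retry: auth errors raised within ten minutes of the last sign-in are treated as spurious and retried on the existing session, while later ones re-authenticate first. A standard-library sketch of just that decision (handle_auth_error and re_authenticate are illustrative names, not the source's API):

from datetime import datetime, timedelta, timezone
from typing import Callable, Optional

REGULAR_AUTH_EXPIRY_PERIOD = timedelta(minutes=10)


def handle_auth_error(
    last_authenticated_at: Optional[datetime],
    re_authenticate: Callable[[], None],
) -> None:
    # Re-authenticate only when the session is old enough for the 401 to
    # plausibly be a real credential expiry; otherwise retry as-is.
    if last_authenticated_at and (
        datetime.now(timezone.utc) - last_authenticated_at
        > REGULAR_AUTH_EXPIRY_PERIOD
    ):
        re_authenticate()
    # Either way the caller retries with retries_remaining - 1.


# A session created two hours ago triggers re-authentication.
handle_auth_error(
    datetime.now(timezone.utc) - timedelta(hours=2),
    lambda: print("re-authenticating"),
)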
datahub/ingestion/source/usage/usage_common.py CHANGED
@@ -54,6 +54,20 @@ def default_user_urn_builder(email: str) -> str:
     return builder.make_user_urn(email.split("@")[0])
 
 
+def extract_user_email(user: str) -> Optional[str]:
+    """Extracts user email from user input
+
+    >>> extract_user_email('urn:li:corpuser:abc@xyz.com')
+    'abc@xyz.com'
+    >>> extract_user_email('urn:li:corpuser:abc')
+    >>> extract_user_email('abc@xyz.com')
+    'abc@xyz.com'
+    """
+    if user.startswith(("urn:li:corpuser:", "urn:li:corpGroup:")):
+        user = user.split(":")[-1]
+    return user if "@" in user else None
+
+
 def make_usage_workunit(
     bucket_start_time: datetime,
     resource: ResourceType,
@@ -104,7 +118,7 @@ def make_usage_workunit(
             DatasetUserUsageCountsClass(
                 user=user_urn_builder(user),
                 count=count,
-                userEmail=user
+                userEmail=extract_user_email(user),
             )
             for user, count in user_freq
         ],
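The helper is pure, so its effect on userEmail is easy to reproduce standalone; this snippet just restates the logic added above for illustration:

from typing import Optional


def extract_user_email(user: str) -> Optional[str]:
    # Same logic as the helper added to usage_common.py above.
    if user.startswith(("urn:li:corpuser:", "urn:li:corpGroup:")):
        user = user.split(":")[-1]
    return user if "@" in user else None


for user in ("urn:li:corpuser:abc@xyz.com", "urn:li:corpuser:abc", "abc@xyz.com"):
    print(f"{user!r} -> {extract_user_email(user)!r}")
# 'urn:li:corpuser:abc@xyz.com' -> 'abc@xyz.com'
# 'urn:li:corpuser:abc' -> None
# 'abc@xyz.com' -> 'abc@xyz.com'

With this change, userEmail is only populated with an actual email address; bare usernames and URNs without an email now yield None instead of being stored verbatim.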
datahub/sql_parsing/sql_parsing_aggregator.py CHANGED
@@ -198,7 +198,7 @@ class TableSwap:
 
 @dataclasses.dataclass
 class PreparsedQuery:
-    # If not provided, we will generate one using the
+    # If not provided, we will generate one using the fingerprint generator.
     query_id: Optional[QueryId]
 
     query_text: str
@@ -622,7 +622,6 @@ class SqlParsingAggregator(Closeable):
         query_fingerprint = get_query_fingerprint(
             known_query_lineage.query_text,
             platform=self.platform.platform_name,
-            fast=True,
         )
         formatted_query = self._maybe_format_query(known_query_lineage.query_text)
 
@@ -848,7 +847,6 @@ class SqlParsingAggregator(Closeable):
         query_fingerprint = get_query_fingerprint(
             parsed.query_text,
             platform=self.platform.platform_name,
-            fast=True,
         )
 
         # Format the query.
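With fast=True removed, these two call sites fall back to the default parse-based fingerprint, while the Snowflake extractor above still passes the fast, text-based one explicitly as query_id. A one-line contrast of the two modes (fingerprint values are version-dependent):

from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint

q = "SELECT a FROM tbl WHERE id = 1"
full = get_query_fingerprint(q, platform="snowflake")  # parse-based (default)
fast = get_query_fingerprint(q, platform="snowflake", fast=True)  # text-based
print(full, fast)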
{acryl_datahub-0.15.0.1rc14.dist-info → acryl_datahub-0.15.0.1rc16.dist-info}/entry_points.txt RENAMED
File without changes