acryl-datahub 0.15.0.1rc7__py3-none-any.whl → 0.15.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc7.dist-info → acryl_datahub-0.15.0.1rc9.dist-info}/METADATA +2448 -2448
- {acryl_datahub-0.15.0.1rc7.dist-info → acryl_datahub-0.15.0.1rc9.dist-info}/RECORD +12 -12
- datahub/__init__.py +1 -1
- datahub/ingestion/source/aws/glue.py +12 -2
- datahub/ingestion/source/gc/datahub_gc.py +18 -5
- datahub/ingestion/source/gc/execution_request_cleanup.py +49 -12
- datahub/ingestion/source/mode.py +26 -20
- datahub/ingestion/source_report/ingestion_stage.py +1 -0
- datahub/sql_parsing/tool_meta_extractor.py +4 -1
- {acryl_datahub-0.15.0.1rc7.dist-info → acryl_datahub-0.15.0.1rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.1rc7.dist-info → acryl_datahub-0.15.0.1rc9.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc7.dist-info → acryl_datahub-0.15.0.1rc9.dist-info}/top_level.txt +0 -0
{acryl_datahub-0.15.0.1rc7.dist-info → acryl_datahub-0.15.0.1rc9.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-datahub/__init__.py,sha256=
+datahub/__init__.py,sha256=2793dupxo-Ov36jB1Z_p3H61xA3Rxhr1VhzHSdVOKhY,576
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
 datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -197,7 +197,7 @@ datahub/ingestion/source/glue_profiling_config.py,sha256=vpMJH4Lf_qgR32BZy58suab
 datahub/ingestion/source/ldap.py,sha256=Vnzg8tpwBYeyM-KBVVsUJvGZGBMJiCJ_i_FhxaFRQ9A,18627
 datahub/ingestion/source/metabase.py,sha256=oemiMdzjfr82Hx6rdwTNBzFM8962LDkosYh7SD_I5cY,31717
 datahub/ingestion/source/mlflow.py,sha256=-yWUuAEVBiNN-elz8Pgn0UeGsC3fVB20z1zKNIr4LXI,12309
-datahub/ingestion/source/mode.py,sha256=
+datahub/ingestion/source/mode.py,sha256=cq1KIpLxuplETF7sUW0hoMQIZG1cgga5BGHP54a28wE,63467
 datahub/ingestion/source/mongodb.py,sha256=vZue4Nz0xaBoCUsQr3_0OIRkWRxeE_IH_Y_QKZ1s7S0,21077
 datahub/ingestion/source/nifi.py,sha256=ttsjZ9aRUvINmewvKFIQD8Rwa4jcl35WFG-F-jPGPWQ,56146
 datahub/ingestion/source/openapi.py,sha256=3ea2ORz1cuq4e7L2hSjxG9Cw3__pVoJ5UNYTJS3EnKU,17386
@@ -217,7 +217,7 @@ datahub/ingestion/source/abs/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm
 datahub/ingestion/source/abs/source.py,sha256=pzxW-R_cWGKPneEhX8JWdTZiX2k1kAZOPKgMxp9mAEI,24533
 datahub/ingestion/source/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/aws/aws_common.py,sha256=DfdQgkJ_s2isFx8WvqKTlAcBk4KE8SgfpmA5BgC3fgY,17716
-datahub/ingestion/source/aws/glue.py,sha256=
+datahub/ingestion/source/aws/glue.py,sha256=r7y1MPDK__BKX_mrJjVa_CEmSXM3Pa02gt19o0sSLE8,56815
 datahub/ingestion/source/aws/s3_boto_utils.py,sha256=Wyp9k9tapsCuw9dyH4FCXJr_wmeLaYFoCtKvrV6SEDk,3892
 datahub/ingestion/source/aws/s3_util.py,sha256=OFypcgmVC6jnZM90-gjcPpAMtTV1lbnreCaMhCzNlzs,2149
 datahub/ingestion/source/aws/sagemaker.py,sha256=Bl2tkBYnrindgx61VHYgNovUF_Kp_fXNcivQn28vC2w,5254
@@ -302,9 +302,9 @@ datahub/ingestion/source/fivetran/fivetran.py,sha256=uKbM5czPz-6LOseoh1FwavWDIuL
 datahub/ingestion/source/fivetran/fivetran_log_api.py,sha256=EAak3hJpe75WZSgz6wP_CyAT5Cian2N4a-lb8x1NKHk,12776
 datahub/ingestion/source/fivetran/fivetran_query.py,sha256=vLrTj7e-0NxZ2U4bWTB57pih42WirqPlUvwtIRfStlQ,5275
 datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/gc/datahub_gc.py,sha256=
+datahub/ingestion/source/gc/datahub_gc.py,sha256=WOg3yIaNmwdbSTwytKeSfIUihsM7FMYBip9u2Dnwk3c,12849
 datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=u90XEmW1vRFbvp4CQ8ujPxTGJUyJqO2U6ApcI6mFrjE,16588
-datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=
+datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=sZbdkg3MuPVGf8eeeRg_2khGMZ01QoH4dgJiTxf7Srg,9813
 datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=wRnRaIVUG483tY4nyDkEn6Xi2RL5MjrVvoCoZimqwSg,7514
 datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/gcs/gcs_source.py,sha256=iwvj4JwjyVWRP1Vq106sUtQhh0GuOYVSu9zCa1wCZN0,6189
@@ -517,7 +517,7 @@ datahub/ingestion/source_config/csv_enricher.py,sha256=IROxxfFJA56dHkmmbjjhb7h1p
 datahub/ingestion/source_config/operation_config.py,sha256=Q0NlqiEh4s4DFIII5NsAp5hxWTVyyJz-ldcQmH-B47s,3504
 datahub/ingestion/source_config/pulsar.py,sha256=sklDkh62CrWV-i7Ifh6R3T3smYVso6gyRJG8HVc6RdA,5533
 datahub/ingestion/source_report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source_report/ingestion_stage.py,sha256=
+datahub/ingestion/source_report/ingestion_stage.py,sha256=gbYmnio3fAOsjh_RzU3j_5UGu7bYBwUM4bm7S8ID_IU,1649
 datahub/ingestion/source_report/pulsar.py,sha256=iKhzy644AjoFTV-gxyqBoXKMLwSMPxJFxU-3WDQRww0,1037
 datahub/ingestion/source_report/time_window.py,sha256=9yI5l2S1DcF7ClvUHLeN8m62I5vlhV9k-aQqSZh2l7w,229
 datahub/ingestion/transformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -882,7 +882,7 @@ datahub/sql_parsing/sql_parsing_common.py,sha256=h_V_m54hJ9EUh5kczq7cYOIeNeo4bgf
 datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
 datahub/sql_parsing/sqlglot_lineage.py,sha256=gUVq3NwZUzQByJs43JZXz8lZf0ZVzVt0FzaW5wZOwK4,47460
 datahub/sql_parsing/sqlglot_utils.py,sha256=n6yufzEGwSlFeCSU540hEldIuab0q8KGqm9x0vSawkc,14699
-datahub/sql_parsing/tool_meta_extractor.py,sha256=
+datahub/sql_parsing/tool_meta_extractor.py,sha256=qEPq8RFWyK0tmSPNlluvd5cxgwbd2v6m9ViSY4hm2QM,6822
 datahub/telemetry/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/telemetry/stats.py,sha256=YltbtC3fe6rl1kcxn1A-mSnVpECTPm5k-brrUt7QxTI,967
 datahub/telemetry/telemetry.py,sha256=gzla-QGNsynGg2FqFxiDDFQ0emG53MJ9lhOA2-UUg-Y,15047
@@ -982,8 +982,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
+acryl_datahub-0.15.0.1rc9.dist-info/METADATA,sha256=nUI5E0nMS2Ng9RLK_q6N4VmqhzakT3CIw34UEqv8E1E,173642
+acryl_datahub-0.15.0.1rc9.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+acryl_datahub-0.15.0.1rc9.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+acryl_datahub-0.15.0.1rc9.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-0.15.0.1rc9.dist-info/RECORD,,
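Each RECORD entry above pairs a file path with the urlsafe-base64 (unpadded) SHA-256 digest of its contents and its size in bytes; installers use these entries to verify wheel contents, which is why every touched module gets a new hash. A minimal sketch of how such an entry can be recomputed for a file on disk (the record_entry helper is illustrative, not part of the package):

import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    """Illustrative only: rebuild a wheel RECORD line for one file."""
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"

# e.g. record_entry("datahub/__init__.py") should reproduce the RECORD line above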
datahub/ingestion/source/aws/glue.py
CHANGED
@@ -52,6 +52,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.report import EntityFilterReport
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws import s3_util
@@ -115,7 +116,6 @@ from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_col
 
 logger = logging.getLogger(__name__)
 
-
 DEFAULT_PLATFORM = "glue"
 VALID_PLATFORMS = [DEFAULT_PLATFORM, "athena"]
 
@@ -220,6 +220,7 @@ class GlueSourceConfig(
 class GlueSourceReport(StaleEntityRemovalSourceReport):
     tables_scanned = 0
     filtered: List[str] = dataclass_field(default_factory=list)
+    databases: EntityFilterReport = EntityFilterReport.field(type="database")
 
     num_job_script_location_missing: int = 0
     num_job_script_location_invalid: int = 0
@@ -668,6 +669,7 @@ class GlueSource(StatefulIngestionSourceBase):
         return MetadataWorkUnit(id=f'{job_name}-{node["Id"]}', mce=mce)
 
     def get_all_databases(self) -> Iterable[Mapping[str, Any]]:
+        logger.debug("Getting all databases")
         # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetDatabases.html
         paginator = self.glue_client.get_paginator("get_databases")
 
@@ -684,10 +686,18 @@ class GlueSource(StatefulIngestionSourceBase):
             pattern += "[?!TargetDatabase]"
 
         for database in paginator_response.search(pattern):
-            if self.source_config.database_pattern.allowed(database["Name"])
+            if (not self.source_config.database_pattern.allowed(database["Name"])) or (
+                self.source_config.catalog_id
+                and database.get("CatalogId")
+                and database.get("CatalogId") != self.source_config.catalog_id
+            ):
+                self.report.databases.dropped(database["Name"])
+            else:
+                self.report.databases.processed(database["Name"])
                 yield database
 
     def get_tables_from_database(self, database: Mapping[str, Any]) -> Iterable[Dict]:
+        logger.debug(f"Getting tables from database {database['Name']}")
         # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetTables.html
         paginator = self.glue_client.get_paginator("get_tables")
         database_name = database["Name"]
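The glue.py change above drops a database either when it fails the configured database_pattern or when a catalog_id is configured and the database reports a different CatalogId, and records each decision on the new databases filter report. A standalone sketch of the same predicate, with a hypothetical should_ingest helper standing in for the inline condition:

from typing import Mapping, Optional

def should_ingest(
    database: Mapping[str, str],
    pattern_allows: bool,
    catalog_id: Optional[str],
) -> bool:
    """Illustrative only: mirrors the filtering logic added to get_all_databases."""
    if not pattern_allows:
        return False
    db_catalog = database.get("CatalogId")
    # When a catalog_id is configured, skip databases that belong to another catalog.
    if catalog_id and db_catalog and db_catalog != catalog_id:
        return False
    return True

assert should_ingest({"Name": "sales", "CatalogId": "111"}, True, "111")
assert not should_ingest({"Name": "sales", "CatalogId": "222"}, True, "111")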
datahub/ingestion/source/gc/datahub_gc.py
CHANGED

@@ -34,6 +34,7 @@ from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
     SoftDeletedEntitiesCleanupConfig,
     SoftDeletedEntitiesReport,
 )
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 
 logger = logging.getLogger(__name__)
 
@@ -86,6 +87,7 @@ class DataHubGcSourceReport(
     DataProcessCleanupReport,
     SoftDeletedEntitiesReport,
     DatahubExecutionRequestCleanupReport,
+    IngestionStageReport,
 ):
     expired_tokens_revoked: int = 0
 
@@ -139,31 +141,40 @@ class DataHubGcSource(Source):
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.cleanup_expired_tokens:
             try:
+                self.report.report_ingestion_stage_start("Expired Token Cleanup")
                 self.revoke_expired_tokens()
             except Exception as e:
                 self.report.failure("While trying to cleanup expired token ", exc=e)
         if self.config.truncate_indices:
             try:
+                self.report.report_ingestion_stage_start("Truncate Indices")
                 self.truncate_indices()
             except Exception as e:
                 self.report.failure("While trying to truncate indices ", exc=e)
         if self.config.soft_deleted_entities_cleanup.enabled:
             try:
+                self.report.report_ingestion_stage_start(
+                    "Soft Deleted Entities Cleanup"
+                )
                 self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
             except Exception as e:
                 self.report.failure(
                     "While trying to cleanup soft deleted entities ", exc=e
                 )
-        if self.config.execution_request_cleanup.enabled:
-            try:
-                self.execution_request_cleanup.run()
-            except Exception as e:
-                self.report.failure("While trying to cleanup execution request ", exc=e)
         if self.config.dataprocess_cleanup.enabled:
             try:
+                self.report.report_ingestion_stage_start("Data Process Cleanup")
                 yield from self.dataprocess_cleanup.get_workunits_internal()
             except Exception as e:
                 self.report.failure("While trying to cleanup data process ", exc=e)
+        if self.config.execution_request_cleanup.enabled:
+            try:
+                self.report.report_ingestion_stage_start("Execution request Cleanup")
+                self.execution_request_cleanup.run()
+            except Exception as e:
+                self.report.failure("While trying to cleanup execution request ", exc=e)
+        # Otherwise last stage's duration does not get calculated.
+        self.report.report_ingestion_stage_start("End")
         yield from []
 
     def truncate_indices(self) -> None:
@@ -281,6 +292,8 @@ class DataHubGcSource(Source):
             list_access_tokens = expired_tokens_res.get("listAccessTokens", {})
             tokens = list_access_tokens.get("tokens", [])
             total = list_access_tokens.get("total", 0)
+            if tokens == []:
+                break
             for token in tokens:
                 self.report.expired_tokens_revoked += 1
                 token_id = token["id"]
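Each cleanup step above now calls report_ingestion_stage_start before it runs, and a final "End" stage is started so the previous stage's duration can be closed out. A minimal sketch of that pattern, assuming the stage report simply timestamps the current stage and logs the elapsed time of the one before it (this is an illustration, not the IngestionStageReport implementation):

import time
from typing import Optional

class StageTimer:
    """Illustrative only: start a named stage, closing and timing the previous one."""

    def __init__(self) -> None:
        self._stage: Optional[str] = None
        self._started_at: float = 0.0

    def start_stage(self, name: str) -> None:
        if self._stage is not None:
            print(f"{self._stage} took {time.perf_counter() - self._started_at:.2f}s")
        self._stage = name
        self._started_at = time.perf_counter()

timer = StageTimer()
timer.start_stage("Expired Token Cleanup")
# ... do work ...
timer.start_stage("End")  # closes the previous stage so its duration is recorded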
datahub/ingestion/source/gc/execution_request_cleanup.py
CHANGED

@@ -1,3 +1,4 @@
+import datetime
 import logging
 import time
 from typing import Any, Dict, Iterator, Optional
@@ -42,16 +43,28 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
         description="Global switch for this cleanup task",
     )
 
+    runtime_limit_seconds: int = Field(
+        default=3600,
+        description="Maximum runtime in seconds for the cleanup task",
+    )
+
+    max_read_errors: int = Field(
+        default=10,
+        description="Maximum number of read errors before aborting",
+    )
+
     def keep_history_max_milliseconds(self):
         return self.keep_history_max_days * 24 * 3600 * 1000
 
 
 class DatahubExecutionRequestCleanupReport(SourceReport):
-
-
-
-
-
+    ergc_records_read: int = 0
+    ergc_records_preserved: int = 0
+    ergc_records_deleted: int = 0
+    ergc_read_errors: int = 0
+    ergc_delete_errors: int = 0
+    ergc_start_time: Optional[datetime.datetime] = None
+    ergc_end_time: Optional[datetime.datetime] = None
 
 
 class CleanupRecord(BaseModel):
@@ -124,6 +137,13 @@ class DatahubExecutionRequestCleanup:
         params.update(overrides)
 
         while True:
+            if self._reached_runtime_limit():
+                break
+            if self.report.ergc_read_errors >= self.config.max_read_errors:
+                self.report.failure(
+                    f"ergc({self.instance_id}): too many read errors, aborting."
+                )
+                break
             try:
                 url = f"{self.graph.config.server}/openapi/v2/entity/{DATAHUB_EXECUTION_REQUEST_ENTITY_NAME}"
                 response = self.graph._session.get(url, headers=headers, params=params)
@@ -141,7 +161,7 @@ class DatahubExecutionRequestCleanup:
                 logger.error(
                     f"ergc({self.instance_id}): failed to fetch next batch of execution requests: {e}"
                 )
-                self.report.
+                self.report.ergc_read_errors += 1
 
     def _scroll_garbage_records(self):
         state: Dict[str, Dict] = {}
@@ -150,7 +170,7 @@ class DatahubExecutionRequestCleanup:
         running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
 
         for entry in self._scroll_execution_requests():
-            self.report.
+            self.report.ergc_records_read += 1
             key = entry.ingestion_source
 
             # Always delete corrupted records
@@ -171,7 +191,7 @@ class DatahubExecutionRequestCleanup:
 
             # Do not delete if number of requests is below minimum
             if state[key]["count"] < self.config.keep_history_min_count:
-                self.report.
+                self.report.ergc_records_preserved += 1
                 continue
 
             # Do not delete if number of requests do not exceed allowed maximum,
@@ -179,7 +199,7 @@ class DatahubExecutionRequestCleanup:
             if (state[key]["count"] < self.config.keep_history_max_count) and (
                 entry.requested_at > state[key]["cutoffTimestamp"]
             ):
-                self.report.
+                self.report.ergc_records_preserved += 1
                 continue
 
             # Do not delete if status is RUNNING or PENDING and created within last month. If the record is >month old and it did not
@@ -188,7 +208,7 @@ class DatahubExecutionRequestCleanup:
                 "RUNNING",
                 "PENDING",
             ]:
-                self.report.
+                self.report.ergc_records_preserved += 1
                 continue
 
             # Otherwise delete current record
@@ -200,7 +220,7 @@ class DatahubExecutionRequestCleanup:
                     f"record timestamp: {entry.requested_at}."
                 )
             )
-            self.report.
+            self.report.ergc_records_deleted += 1
             yield entry
 
     def _delete_entry(self, entry: CleanupRecord) -> None:
@@ -210,17 +230,31 @@ class DatahubExecutionRequestCleanup:
             )
             self.graph.delete_entity(entry.urn, True)
         except Exception as e:
-            self.report.
+            self.report.ergc_delete_errors += 1
             logger.error(
                 f"ergc({self.instance_id}): failed to delete ExecutionRequest {entry.request_id}: {e}"
             )
 
+    def _reached_runtime_limit(self) -> bool:
+        if (
+            self.config.runtime_limit_seconds
+            and self.report.ergc_start_time
+            and (
+                datetime.datetime.now() - self.report.ergc_start_time
+                >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
+            )
+        ):
+            logger.info(f"ergc({self.instance_id}): max runtime reached.")
+            return True
+        return False
+
     def run(self) -> None:
         if not self.config.enabled:
             logger.info(
                 f"ergc({self.instance_id}): ExecutionRequest cleaner is disabled."
             )
             return
+        self.report.ergc_start_time = datetime.datetime.now()
 
         logger.info(
             (
@@ -232,8 +266,11 @@ class DatahubExecutionRequestCleanup:
         )
 
         for entry in self._scroll_garbage_records():
+            if self._reached_runtime_limit():
+                break
             self._delete_entry(entry)
 
+        self.report.ergc_end_time = datetime.datetime.now()
         logger.info(
             f"ergc({self.instance_id}): Finished cleanup of ExecutionRequest records."
         )
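The cleanup task now carries two safety valves: runtime_limit_seconds caps total wall-clock time (checked both while scrolling and while deleting) and max_read_errors aborts after repeated fetch failures. A sketch of how the limit check behaves, using plain datetime arithmetic as in the diff; the values shown are the defaults from the new Field declarations and the loop body is a placeholder:

import datetime

RUNTIME_LIMIT_SECONDS = 3600  # default of runtime_limit_seconds
MAX_READ_ERRORS = 10          # default of max_read_errors

def reached_runtime_limit(start: datetime.datetime, limit_seconds: int) -> bool:
    """Illustrative only: the same elapsed-time test as _reached_runtime_limit."""
    return datetime.datetime.now() - start >= datetime.timedelta(seconds=limit_seconds)

start_time = datetime.datetime.now()
read_errors = 0
while True:
    if reached_runtime_limit(start_time, RUNTIME_LIMIT_SECONDS):
        break  # stop once the time budget is exhausted
    if read_errors >= MAX_READ_ERRORS:
        break  # too many failed fetches, abort
    break  # placeholder for fetching and processing one batch of records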
datahub/ingestion/source/mode.py
CHANGED
@@ -5,6 +5,7 @@ import time
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import lru_cache
+from json import JSONDecodeError
 from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
 
 import dateutil.parser as dp
@@ -193,6 +194,9 @@ class HTTPError429(HTTPError):
     pass
 
 
+ModeRequestError = (HTTPError, JSONDecodeError)
+
+
 @dataclass
 class ModeSourceReport(StaleEntityRemovalSourceReport):
     filtered_spaces: LossyList[str] = dataclasses.field(default_factory=LossyList)
@@ -328,11 +332,11 @@ class ModeSource(StatefulIngestionSourceBase):
         # Test the connection
         try:
             self._get_request_json(f"{self.config.connect_uri}/api/verify")
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Connect",
                 message="Unable to verify connection to mode.",
-                context=f"Error: {str(
+                context=f"Error: {str(e)}",
             )
 
         self.workspace_uri = f"{self.config.connect_uri}/api/{self.config.workspace}"
@@ -521,11 +525,11 @@ class ModeSource(StatefulIngestionSourceBase):
                 if self.config.owner_username_instead_of_email
                 else user_json.get("email")
             )
-        except
+        except ModeRequestError as e:
             self.report.report_warning(
                 title="Failed to retrieve Mode creator",
                 message=f"Unable to retrieve user for {href}",
-                context=f"Reason: {str(
+                context=f"Reason: {str(e)}",
             )
         return user
 
@@ -571,11 +575,11 @@ class ModeSource(StatefulIngestionSourceBase):
                 logging.debug(f"Skipping space {space_name} due to space pattern")
                 continue
             space_info[s.get("token", "")] = s.get("name", "")
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Spaces",
                 message="Unable to retrieve spaces / collections for workspace.",
-                context=f"Workspace: {self.workspace_uri}, Error: {str(
+                context=f"Workspace: {self.workspace_uri}, Error: {str(e)}",
             )
 
         return space_info
@@ -721,11 +725,11 @@ class ModeSource(StatefulIngestionSourceBase):
         try:
             ds_json = self._get_request_json(f"{self.workspace_uri}/data_sources")
             data_sources = ds_json.get("_embedded", {}).get("data_sources", [])
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to retrieve Data Sources",
                 message="Unable to retrieve data sources from Mode.",
-                context=f"Error: {str(
+                context=f"Error: {str(e)}",
            )
 
         return data_sources
@@ -812,11 +816,11 @@ class ModeSource(StatefulIngestionSourceBase):
                 if definition.get("name", "") == definition_name:
                     return definition.get("source", "")
 
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Definition",
                 message="Unable to retrieve definition from Mode.",
-                context=f"Definition Name: {definition_name}, Error: {str(
+                context=f"Definition Name: {definition_name}, Error: {str(e)}",
             )
         return None
 
@@ -1382,11 +1386,11 @@ class ModeSource(StatefulIngestionSourceBase):
                 f"{self.workspace_uri}/spaces/{space_token}/reports"
             )
             reports = reports_json.get("_embedded", {}).get("reports", {})
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Reports for Space",
                 message="Unable to retrieve reports for space token.",
-                context=f"Space Token: {space_token}, Error: {str(
+                context=f"Space Token: {space_token}, Error: {str(e)}",
             )
         return reports
 
@@ -1400,11 +1404,11 @@ class ModeSource(StatefulIngestionSourceBase):
             url = f"{self.workspace_uri}/spaces/{space_token}/datasets"
             datasets_json = self._get_request_json(url)
             datasets = datasets_json.get("_embedded", {}).get("reports", [])
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Datasets for Space",
                 message=f"Unable to retrieve datasets for space token {space_token}.",
-                context=f"Error: {str(
+                context=f"Error: {str(e)}",
             )
         return datasets
 
@@ -1416,11 +1420,11 @@ class ModeSource(StatefulIngestionSourceBase):
                 f"{self.workspace_uri}/reports/{report_token}/queries"
             )
             queries = queries_json.get("_embedded", {}).get("queries", {})
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Queries",
                 message="Unable to retrieve queries for report token.",
-                context=f"Report Token: {report_token}, Error: {str(
+                context=f"Report Token: {report_token}, Error: {str(e)}",
             )
         return queries
 
@@ -1433,11 +1437,11 @@ class ModeSource(StatefulIngestionSourceBase):
                 f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs{query_run_id}"
             )
             queries = queries_json.get("_embedded", {}).get("queries", {})
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Queries for Report",
                 message="Unable to retrieve queries for report token.",
-                context=f"Report Token:{report_token}, Error: {str(
+                context=f"Report Token:{report_token}, Error: {str(e)}",
             )
             return {}
         return queries
@@ -1451,13 +1455,13 @@ class ModeSource(StatefulIngestionSourceBase):
                 f"/queries/{query_token}/charts"
             )
             charts = charts_json.get("_embedded", {}).get("charts", {})
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Charts",
                 message="Unable to retrieve charts from Mode.",
                 context=f"Report Token: {report_token}, "
                 f"Query token: {query_token}, "
-                f"Error: {str(
+                f"Error: {str(e)}",
             )
         return charts
 
@@ -1477,6 +1481,8 @@ class ModeSource(StatefulIngestionSourceBase):
             response = self.session.get(
                 url, timeout=self.config.api_options.timeout
             )
+            if response.status_code == 204:  # No content, don't parse json
+                return {}
             return response.json()
         except HTTPError as http_error:
             error_response = http_error.response
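Two related changes run through mode.py: every request helper now catches ModeRequestError, a tuple of HTTPError and JSONDecodeError, so a response body that fails to parse is reported the same way as an HTTP failure, and _get_request_json short-circuits 204 responses to an empty dict instead of calling .json() on an empty body. A hedged sketch of the same pattern with the requests library (the fetch_json helper, URL, and timeout are illustrative, and newer requests raises a JSONDecodeError subclass that this tuple still catches in typical installs):

from json import JSONDecodeError

import requests
from requests.exceptions import HTTPError

ModeRequestError = (HTTPError, JSONDecodeError)

def fetch_json(session: requests.Session, url: str) -> dict:
    """Illustrative only: mirrors the 204 handling and error tuple from the diff."""
    response = session.get(url, timeout=30)
    response.raise_for_status()
    if response.status_code == 204:  # No content, don't parse json
        return {}
    return response.json()

try:
    data = fetch_json(requests.Session(), "https://app.mode.com/api/verify")
except ModeRequestError as e:
    print(f"Error: {e}")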
datahub/sql_parsing/tool_meta_extractor.py
CHANGED

@@ -40,6 +40,7 @@ def _get_last_line(query: str) -> str:
 class ToolMetaExtractorReport(Report):
     num_queries_meta_extracted: Dict[str, int] = field(default_factory=int_top_k_dict)
     failures: List[str] = field(default_factory=list)
+    looker_user_mapping_missing: Optional[bool] = None
 
 
 class ToolMetaExtractor:
@@ -108,7 +109,9 @@ class ToolMetaExtractor:
             PlatformResource.search_by_filters(query=query, graph_client=graph)
         )
 
-        if len(platform_resources)
+        if len(platform_resources) == 0:
+            report.looker_user_mapping_missing = True
+        elif len(platform_resources) > 1:
             report.failures.append(
                 "Looker user metadata extraction failed. Found more than one looker user id mappings."
             )
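The tool_meta_extractor change distinguishes "no Looker user-ID mapping found", now recorded on the new looker_user_mapping_missing flag, from "more than one mapping found", which is still reported as a failure. A small sketch of that zero/one/many branching over a generic result list (the classify helper and Report dataclass are illustrative):

from dataclasses import dataclass, field
from typing import Any, List, Optional

@dataclass
class Report:
    failures: List[str] = field(default_factory=list)
    looker_user_mapping_missing: Optional[bool] = None

def classify(resources: List[Any], report: Report) -> Optional[Any]:
    """Illustrative only: zero means 'missing', many means 'ambiguous', one is usable."""
    if len(resources) == 0:
        report.looker_user_mapping_missing = True
        return None
    if len(resources) > 1:
        report.failures.append("Found more than one looker user id mappings.")
        return None
    return resources[0]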
{acryl_datahub-0.15.0.1rc7.dist-info → acryl_datahub-0.15.0.1rc9.dist-info}/WHEEL
RENAMED
File without changes
{acryl_datahub-0.15.0.1rc7.dist-info → acryl_datahub-0.15.0.1rc9.dist-info}/entry_points.txt
RENAMED
File without changes
{acryl_datahub-0.15.0.1rc7.dist-info → acryl_datahub-0.15.0.1rc9.dist-info}/top_level.txt
RENAMED
File without changes