acryl-datahub 0.15.0.1rc7-py3-none-any.whl → 0.15.0.1rc9-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only and reflects the changes between those versions.

Potentially problematic release: this version of acryl-datahub has been flagged for review.

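The version bump itself is recorded in datahub/__init__.py below. As a quick sanity check, a minimal sketch (assuming the wheel is installed in the current Python environment) for confirming which build is present:

    # Print the package name and version exposed by datahub/__init__.py.
    import datahub

    print(datahub.__package_name__)  # "acryl-datahub"
    print(datahub.__version__)       # "0.15.0.1rc9" once the new release is installed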
@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=dj0h5Hq8a33nXbLNFmlqql5K3OaWumjRX8IsgKQUCfs,576
+ datahub/__init__.py,sha256=2793dupxo-Ov36jB1Z_p3H61xA3Rxhr1VhzHSdVOKhY,576
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -197,7 +197,7 @@ datahub/ingestion/source/glue_profiling_config.py,sha256=vpMJH4Lf_qgR32BZy58suab
  datahub/ingestion/source/ldap.py,sha256=Vnzg8tpwBYeyM-KBVVsUJvGZGBMJiCJ_i_FhxaFRQ9A,18627
  datahub/ingestion/source/metabase.py,sha256=oemiMdzjfr82Hx6rdwTNBzFM8962LDkosYh7SD_I5cY,31717
  datahub/ingestion/source/mlflow.py,sha256=-yWUuAEVBiNN-elz8Pgn0UeGsC3fVB20z1zKNIr4LXI,12309
- datahub/ingestion/source/mode.py,sha256=fuDTByENqcbxViFyYjU70B86FyAYr3Pk9usIBI0Vl1U,63384
+ datahub/ingestion/source/mode.py,sha256=cq1KIpLxuplETF7sUW0hoMQIZG1cgga5BGHP54a28wE,63467
  datahub/ingestion/source/mongodb.py,sha256=vZue4Nz0xaBoCUsQr3_0OIRkWRxeE_IH_Y_QKZ1s7S0,21077
  datahub/ingestion/source/nifi.py,sha256=ttsjZ9aRUvINmewvKFIQD8Rwa4jcl35WFG-F-jPGPWQ,56146
  datahub/ingestion/source/openapi.py,sha256=3ea2ORz1cuq4e7L2hSjxG9Cw3__pVoJ5UNYTJS3EnKU,17386
@@ -217,7 +217,7 @@ datahub/ingestion/source/abs/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm
  datahub/ingestion/source/abs/source.py,sha256=pzxW-R_cWGKPneEhX8JWdTZiX2k1kAZOPKgMxp9mAEI,24533
  datahub/ingestion/source/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/aws/aws_common.py,sha256=DfdQgkJ_s2isFx8WvqKTlAcBk4KE8SgfpmA5BgC3fgY,17716
- datahub/ingestion/source/aws/glue.py,sha256=fX0dtaVVq174ZS0aBJvZFYK8ligfZX5EU3pdS3j1KQs,56215
+ datahub/ingestion/source/aws/glue.py,sha256=r7y1MPDK__BKX_mrJjVa_CEmSXM3Pa02gt19o0sSLE8,56815
  datahub/ingestion/source/aws/s3_boto_utils.py,sha256=Wyp9k9tapsCuw9dyH4FCXJr_wmeLaYFoCtKvrV6SEDk,3892
  datahub/ingestion/source/aws/s3_util.py,sha256=OFypcgmVC6jnZM90-gjcPpAMtTV1lbnreCaMhCzNlzs,2149
  datahub/ingestion/source/aws/sagemaker.py,sha256=Bl2tkBYnrindgx61VHYgNovUF_Kp_fXNcivQn28vC2w,5254
@@ -302,9 +302,9 @@ datahub/ingestion/source/fivetran/fivetran.py,sha256=uKbM5czPz-6LOseoh1FwavWDIuL
  datahub/ingestion/source/fivetran/fivetran_log_api.py,sha256=EAak3hJpe75WZSgz6wP_CyAT5Cian2N4a-lb8x1NKHk,12776
  datahub/ingestion/source/fivetran/fivetran_query.py,sha256=vLrTj7e-0NxZ2U4bWTB57pih42WirqPlUvwtIRfStlQ,5275
  datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/gc/datahub_gc.py,sha256=AHlKGwDD-E_TEHcJIpRtwk6ikjT-KiyfTo-BXZnMSk0,12114
+ datahub/ingestion/source/gc/datahub_gc.py,sha256=WOg3yIaNmwdbSTwytKeSfIUihsM7FMYBip9u2Dnwk3c,12849
  datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=u90XEmW1vRFbvp4CQ8ujPxTGJUyJqO2U6ApcI6mFrjE,16588
- datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=cHJmxz4NmA7VjTX2iGEo3wZ_SDrjC_rCQcnRxKgfUVI,8713
+ datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=sZbdkg3MuPVGf8eeeRg_2khGMZ01QoH4dgJiTxf7Srg,9813
  datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=wRnRaIVUG483tY4nyDkEn6Xi2RL5MjrVvoCoZimqwSg,7514
  datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/gcs/gcs_source.py,sha256=iwvj4JwjyVWRP1Vq106sUtQhh0GuOYVSu9zCa1wCZN0,6189
@@ -517,7 +517,7 @@ datahub/ingestion/source_config/csv_enricher.py,sha256=IROxxfFJA56dHkmmbjjhb7h1p
  datahub/ingestion/source_config/operation_config.py,sha256=Q0NlqiEh4s4DFIII5NsAp5hxWTVyyJz-ldcQmH-B47s,3504
  datahub/ingestion/source_config/pulsar.py,sha256=sklDkh62CrWV-i7Ifh6R3T3smYVso6gyRJG8HVc6RdA,5533
  datahub/ingestion/source_report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source_report/ingestion_stage.py,sha256=w6qTnJm_-eoTiGxwS7cFnhdIfsv8omC6H5e0qw5t4Jc,1587
+ datahub/ingestion/source_report/ingestion_stage.py,sha256=gbYmnio3fAOsjh_RzU3j_5UGu7bYBwUM4bm7S8ID_IU,1649
  datahub/ingestion/source_report/pulsar.py,sha256=iKhzy644AjoFTV-gxyqBoXKMLwSMPxJFxU-3WDQRww0,1037
  datahub/ingestion/source_report/time_window.py,sha256=9yI5l2S1DcF7ClvUHLeN8m62I5vlhV9k-aQqSZh2l7w,229
  datahub/ingestion/transformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -882,7 +882,7 @@ datahub/sql_parsing/sql_parsing_common.py,sha256=h_V_m54hJ9EUh5kczq7cYOIeNeo4bgf
  datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
  datahub/sql_parsing/sqlglot_lineage.py,sha256=gUVq3NwZUzQByJs43JZXz8lZf0ZVzVt0FzaW5wZOwK4,47460
  datahub/sql_parsing/sqlglot_utils.py,sha256=n6yufzEGwSlFeCSU540hEldIuab0q8KGqm9x0vSawkc,14699
- datahub/sql_parsing/tool_meta_extractor.py,sha256=7tY4FAClhFcqwc23lGVlnT6Dequ_5Xcpbt0hDvnlLzM,6670
+ datahub/sql_parsing/tool_meta_extractor.py,sha256=qEPq8RFWyK0tmSPNlluvd5cxgwbd2v6m9ViSY4hm2QM,6822
  datahub/telemetry/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/telemetry/stats.py,sha256=YltbtC3fe6rl1kcxn1A-mSnVpECTPm5k-brrUt7QxTI,967
  datahub/telemetry/telemetry.py,sha256=gzla-QGNsynGg2FqFxiDDFQ0emG53MJ9lhOA2-UUg-Y,15047
@@ -982,8 +982,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.15.0.1rc7.dist-info/METADATA,sha256=hl14lRgFU4pk8d2s_Qxx1Xtkbd2TQp6gEek2gpkea1o,173642
- acryl_datahub-0.15.0.1rc7.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- acryl_datahub-0.15.0.1rc7.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
- acryl_datahub-0.15.0.1rc7.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.15.0.1rc7.dist-info/RECORD,,
+ acryl_datahub-0.15.0.1rc9.dist-info/METADATA,sha256=nUI5E0nMS2Ng9RLK_q6N4VmqhzakT3CIw34UEqv8E1E,173642
+ acryl_datahub-0.15.0.1rc9.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ acryl_datahub-0.15.0.1rc9.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+ acryl_datahub-0.15.0.1rc9.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.15.0.1rc9.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
 
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.15.0.1rc7"
+ __version__ = "0.15.0.1rc9"
 
 
  def is_dev_mode() -> bool:
datahub/ingestion/source/aws/glue.py CHANGED
@@ -52,6 +52,7 @@ from datahub.ingestion.api.decorators import (
  platform_name,
  support_status,
  )
+ from datahub.ingestion.api.report import EntityFilterReport
  from datahub.ingestion.api.source import MetadataWorkUnitProcessor
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.aws import s3_util
@@ -115,7 +116,6 @@ from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_col
 
  logger = logging.getLogger(__name__)
 
-
  DEFAULT_PLATFORM = "glue"
  VALID_PLATFORMS = [DEFAULT_PLATFORM, "athena"]
 
@@ -220,6 +220,7 @@ class GlueSourceConfig(
  class GlueSourceReport(StaleEntityRemovalSourceReport):
  tables_scanned = 0
  filtered: List[str] = dataclass_field(default_factory=list)
+ databases: EntityFilterReport = EntityFilterReport.field(type="database")
 
  num_job_script_location_missing: int = 0
  num_job_script_location_invalid: int = 0
@@ -668,6 +669,7 @@ class GlueSource(StatefulIngestionSourceBase):
  return MetadataWorkUnit(id=f'{job_name}-{node["Id"]}', mce=mce)
 
  def get_all_databases(self) -> Iterable[Mapping[str, Any]]:
+ logger.debug("Getting all databases")
  # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetDatabases.html
  paginator = self.glue_client.get_paginator("get_databases")
 
@@ -684,10 +686,18 @@ class GlueSource(StatefulIngestionSourceBase):
  pattern += "[?!TargetDatabase]"
 
  for database in paginator_response.search(pattern):
- if self.source_config.database_pattern.allowed(database["Name"]):
+ if (not self.source_config.database_pattern.allowed(database["Name"])) or (
+ self.source_config.catalog_id
+ and database.get("CatalogId")
+ and database.get("CatalogId") != self.source_config.catalog_id
+ ):
+ self.report.databases.dropped(database["Name"])
+ else:
+ self.report.databases.processed(database["Name"])
  yield database
 
  def get_tables_from_database(self, database: Mapping[str, Any]) -> Iterable[Dict]:
+ logger.debug(f"Getting tables from database {database['Name']}")
  # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetTables.html
  paginator = self.glue_client.get_paginator("get_tables")
  database_name = database["Name"]
datahub/ingestion/source/gc/datahub_gc.py CHANGED
@@ -34,6 +34,7 @@ from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
  SoftDeletedEntitiesCleanupConfig,
  SoftDeletedEntitiesReport,
  )
+ from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 
  logger = logging.getLogger(__name__)
 
@@ -86,6 +87,7 @@ class DataHubGcSourceReport(
  DataProcessCleanupReport,
  SoftDeletedEntitiesReport,
  DatahubExecutionRequestCleanupReport,
+ IngestionStageReport,
  ):
  expired_tokens_revoked: int = 0
 
@@ -139,31 +141,40 @@ class DataHubGcSource(Source):
  ) -> Iterable[MetadataWorkUnit]:
  if self.config.cleanup_expired_tokens:
  try:
+ self.report.report_ingestion_stage_start("Expired Token Cleanup")
  self.revoke_expired_tokens()
  except Exception as e:
  self.report.failure("While trying to cleanup expired token ", exc=e)
  if self.config.truncate_indices:
  try:
+ self.report.report_ingestion_stage_start("Truncate Indices")
  self.truncate_indices()
  except Exception as e:
  self.report.failure("While trying to truncate indices ", exc=e)
  if self.config.soft_deleted_entities_cleanup.enabled:
  try:
+ self.report.report_ingestion_stage_start(
+ "Soft Deleted Entities Cleanup"
+ )
  self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
  except Exception as e:
  self.report.failure(
  "While trying to cleanup soft deleted entities ", exc=e
  )
- if self.config.execution_request_cleanup.enabled:
- try:
- self.execution_request_cleanup.run()
- except Exception as e:
- self.report.failure("While trying to cleanup execution request ", exc=e)
  if self.config.dataprocess_cleanup.enabled:
  try:
+ self.report.report_ingestion_stage_start("Data Process Cleanup")
  yield from self.dataprocess_cleanup.get_workunits_internal()
  except Exception as e:
  self.report.failure("While trying to cleanup data process ", exc=e)
+ if self.config.execution_request_cleanup.enabled:
+ try:
+ self.report.report_ingestion_stage_start("Execution request Cleanup")
+ self.execution_request_cleanup.run()
+ except Exception as e:
+ self.report.failure("While trying to cleanup execution request ", exc=e)
+ # Otherwise last stage's duration does not get calculated.
+ self.report.report_ingestion_stage_start("End")
  yield from []
 
  def truncate_indices(self) -> None:
@@ -281,6 +292,8 @@ class DataHubGcSource(Source):
  list_access_tokens = expired_tokens_res.get("listAccessTokens", {})
  tokens = list_access_tokens.get("tokens", [])
  total = list_access_tokens.get("total", 0)
+ if tokens == []:
+ break
  for token in tokens:
  self.report.expired_tokens_revoked += 1
  token_id = token["id"]
datahub/ingestion/source/gc/execution_request_cleanup.py CHANGED
@@ -1,3 +1,4 @@
+ import datetime
  import logging
  import time
  from typing import Any, Dict, Iterator, Optional
@@ -42,16 +43,28 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
  description="Global switch for this cleanup task",
  )
 
+ runtime_limit_seconds: int = Field(
+ default=3600,
+ description="Maximum runtime in seconds for the cleanup task",
+ )
+
+ max_read_errors: int = Field(
+ default=10,
+ description="Maximum number of read errors before aborting",
+ )
+
  def keep_history_max_milliseconds(self):
  return self.keep_history_max_days * 24 * 3600 * 1000
 
 
  class DatahubExecutionRequestCleanupReport(SourceReport):
- execution_request_cleanup_records_read: int = 0
- execution_request_cleanup_records_preserved: int = 0
- execution_request_cleanup_records_deleted: int = 0
- execution_request_cleanup_read_errors: int = 0
- execution_request_cleanup_delete_errors: int = 0
+ ergc_records_read: int = 0
+ ergc_records_preserved: int = 0
+ ergc_records_deleted: int = 0
+ ergc_read_errors: int = 0
+ ergc_delete_errors: int = 0
+ ergc_start_time: Optional[datetime.datetime] = None
+ ergc_end_time: Optional[datetime.datetime] = None
 
 
  class CleanupRecord(BaseModel):
@@ -124,6 +137,13 @@ class DatahubExecutionRequestCleanup:
  params.update(overrides)
 
  while True:
+ if self._reached_runtime_limit():
+ break
+ if self.report.ergc_read_errors >= self.config.max_read_errors:
+ self.report.failure(
+ f"ergc({self.instance_id}): too many read errors, aborting."
+ )
+ break
  try:
  url = f"{self.graph.config.server}/openapi/v2/entity/{DATAHUB_EXECUTION_REQUEST_ENTITY_NAME}"
  response = self.graph._session.get(url, headers=headers, params=params)
@@ -141,7 +161,7 @@ class DatahubExecutionRequestCleanup:
  logger.error(
  f"ergc({self.instance_id}): failed to fetch next batch of execution requests: {e}"
  )
- self.report.execution_request_cleanup_read_errors += 1
+ self.report.ergc_read_errors += 1
 
  def _scroll_garbage_records(self):
  state: Dict[str, Dict] = {}
@@ -150,7 +170,7 @@ class DatahubExecutionRequestCleanup:
  running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
 
  for entry in self._scroll_execution_requests():
- self.report.execution_request_cleanup_records_read += 1
+ self.report.ergc_records_read += 1
  key = entry.ingestion_source
 
  # Always delete corrupted records
@@ -171,7 +191,7 @@ class DatahubExecutionRequestCleanup:
 
  # Do not delete if number of requests is below minimum
  if state[key]["count"] < self.config.keep_history_min_count:
- self.report.execution_request_cleanup_records_preserved += 1
+ self.report.ergc_records_preserved += 1
  continue
 
  # Do not delete if number of requests do not exceed allowed maximum,
@@ -179,7 +199,7 @@ class DatahubExecutionRequestCleanup:
  if (state[key]["count"] < self.config.keep_history_max_count) and (
  entry.requested_at > state[key]["cutoffTimestamp"]
  ):
- self.report.execution_request_cleanup_records_preserved += 1
+ self.report.ergc_records_preserved += 1
  continue
 
  # Do not delete if status is RUNNING or PENDING and created within last month. If the record is >month old and it did not
@@ -188,7 +208,7 @@ class DatahubExecutionRequestCleanup:
  "RUNNING",
  "PENDING",
  ]:
- self.report.execution_request_cleanup_records_preserved += 1
+ self.report.ergc_records_preserved += 1
  continue
 
  # Otherwise delete current record
@@ -200,7 +220,7 @@ class DatahubExecutionRequestCleanup:
  f"record timestamp: {entry.requested_at}."
  )
  )
- self.report.execution_request_cleanup_records_deleted += 1
+ self.report.ergc_records_deleted += 1
  yield entry
 
  def _delete_entry(self, entry: CleanupRecord) -> None:
@@ -210,17 +230,31 @@ class DatahubExecutionRequestCleanup:
  )
  self.graph.delete_entity(entry.urn, True)
  except Exception as e:
- self.report.execution_request_cleanup_delete_errors += 1
+ self.report.ergc_delete_errors += 1
  logger.error(
  f"ergc({self.instance_id}): failed to delete ExecutionRequest {entry.request_id}: {e}"
  )
 
+ def _reached_runtime_limit(self) -> bool:
+ if (
+ self.config.runtime_limit_seconds
+ and self.report.ergc_start_time
+ and (
+ datetime.datetime.now() - self.report.ergc_start_time
+ >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
+ )
+ ):
+ logger.info(f"ergc({self.instance_id}): max runtime reached.")
+ return True
+ return False
+
  def run(self) -> None:
  if not self.config.enabled:
  logger.info(
  f"ergc({self.instance_id}): ExecutionRequest cleaner is disabled."
  )
  return
+ self.report.ergc_start_time = datetime.datetime.now()
 
  logger.info(
  (
@@ -232,8 +266,11 @@ class DatahubExecutionRequestCleanup:
  )
  )
  for entry in self._scroll_garbage_records():
+ if self._reached_runtime_limit():
+ break
  self._delete_entry(entry)
 
+ self.report.ergc_end_time = datetime.datetime.now()
  logger.info(
  f"ergc({self.instance_id}): Finished cleanup of ExecutionRequest records."
  )
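The new runtime_limit_seconds and max_read_errors fields above bound how long the execution-request cleanup loop may run and how many failed batch reads it tolerates before aborting. A hedged sketch of a config fragment for the GC source follows; the field names and default values come from this diff, while the enclosing dict layout is an illustrative assumption:

    # Hypothetical configuration fragment for the DataHub GC source.
    # Field names and defaults are taken from the diff above; the
    # surrounding structure is assumed for illustration only.
    gc_source_config = {
        "cleanup_expired_tokens": True,
        "execution_request_cleanup": {
            "enabled": True,
            "runtime_limit_seconds": 3600,  # stop the cleanup loop after one hour
            "max_read_errors": 10,          # abort after this many failed batch reads
        },
    }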
datahub/ingestion/source/mode.py CHANGED
@@ -5,6 +5,7 @@ import time
  from dataclasses import dataclass
  from datetime import datetime, timezone
  from functools import lru_cache
+ from json import JSONDecodeError
  from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
 
  import dateutil.parser as dp
@@ -193,6 +194,9 @@ class HTTPError429(HTTPError):
  pass
 
 
+ ModeRequestError = (HTTPError, JSONDecodeError)
+
+
  @dataclass
  class ModeSourceReport(StaleEntityRemovalSourceReport):
  filtered_spaces: LossyList[str] = dataclasses.field(default_factory=LossyList)
@@ -328,11 +332,11 @@ class ModeSource(StatefulIngestionSourceBase):
  # Test the connection
  try:
  self._get_request_json(f"{self.config.connect_uri}/api/verify")
- except HTTPError as http_error:
+ except ModeRequestError as e:
  self.report.report_failure(
  title="Failed to Connect",
  message="Unable to verify connection to mode.",
- context=f"Error: {str(http_error)}",
+ context=f"Error: {str(e)}",
  )
 
  self.workspace_uri = f"{self.config.connect_uri}/api/{self.config.workspace}"
@@ -521,11 +525,11 @@ class ModeSource(StatefulIngestionSourceBase):
  if self.config.owner_username_instead_of_email
  else user_json.get("email")
  )
- except HTTPError as http_error:
+ except ModeRequestError as e:
  self.report.report_warning(
  title="Failed to retrieve Mode creator",
  message=f"Unable to retrieve user for {href}",
- context=f"Reason: {str(http_error)}",
+ context=f"Reason: {str(e)}",
  )
  return user
 
@@ -571,11 +575,11 @@ class ModeSource(StatefulIngestionSourceBase):
  logging.debug(f"Skipping space {space_name} due to space pattern")
  continue
  space_info[s.get("token", "")] = s.get("name", "")
- except HTTPError as http_error:
+ except ModeRequestError as e:
  self.report.report_failure(
  title="Failed to Retrieve Spaces",
  message="Unable to retrieve spaces / collections for workspace.",
- context=f"Workspace: {self.workspace_uri}, Error: {str(http_error)}",
+ context=f"Workspace: {self.workspace_uri}, Error: {str(e)}",
  )
 
  return space_info
@@ -721,11 +725,11 @@ class ModeSource(StatefulIngestionSourceBase):
  try:
  ds_json = self._get_request_json(f"{self.workspace_uri}/data_sources")
  data_sources = ds_json.get("_embedded", {}).get("data_sources", [])
- except HTTPError as http_error:
+ except ModeRequestError as e:
  self.report.report_failure(
  title="Failed to retrieve Data Sources",
  message="Unable to retrieve data sources from Mode.",
- context=f"Error: {str(http_error)}",
+ context=f"Error: {str(e)}",
  )
 
  return data_sources
@@ -812,11 +816,11 @@ class ModeSource(StatefulIngestionSourceBase):
  if definition.get("name", "") == definition_name:
  return definition.get("source", "")
 
- except HTTPError as http_error:
+ except ModeRequestError as e:
  self.report.report_failure(
  title="Failed to Retrieve Definition",
  message="Unable to retrieve definition from Mode.",
- context=f"Definition Name: {definition_name}, Error: {str(http_error)}",
+ context=f"Definition Name: {definition_name}, Error: {str(e)}",
  )
  return None
 
@@ -1382,11 +1386,11 @@ class ModeSource(StatefulIngestionSourceBase):
  f"{self.workspace_uri}/spaces/{space_token}/reports"
  )
  reports = reports_json.get("_embedded", {}).get("reports", {})
- except HTTPError as http_error:
+ except ModeRequestError as e:
  self.report.report_failure(
  title="Failed to Retrieve Reports for Space",
  message="Unable to retrieve reports for space token.",
- context=f"Space Token: {space_token}, Error: {str(http_error)}",
+ context=f"Space Token: {space_token}, Error: {str(e)}",
  )
  return reports
 
@@ -1400,11 +1404,11 @@ class ModeSource(StatefulIngestionSourceBase):
  url = f"{self.workspace_uri}/spaces/{space_token}/datasets"
  datasets_json = self._get_request_json(url)
  datasets = datasets_json.get("_embedded", {}).get("reports", [])
- except HTTPError as http_error:
+ except ModeRequestError as e:
  self.report.report_failure(
  title="Failed to Retrieve Datasets for Space",
  message=f"Unable to retrieve datasets for space token {space_token}.",
- context=f"Error: {str(http_error)}",
+ context=f"Error: {str(e)}",
  )
  return datasets
 
@@ -1416,11 +1420,11 @@ class ModeSource(StatefulIngestionSourceBase):
  f"{self.workspace_uri}/reports/{report_token}/queries"
  )
  queries = queries_json.get("_embedded", {}).get("queries", {})
- except HTTPError as http_error:
+ except ModeRequestError as e:
  self.report.report_failure(
  title="Failed to Retrieve Queries",
  message="Unable to retrieve queries for report token.",
- context=f"Report Token: {report_token}, Error: {str(http_error)}",
+ context=f"Report Token: {report_token}, Error: {str(e)}",
  )
  return queries
 
@@ -1433,11 +1437,11 @@ class ModeSource(StatefulIngestionSourceBase):
  f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs{query_run_id}"
  )
  queries = queries_json.get("_embedded", {}).get("queries", {})
- except HTTPError as http_error:
+ except ModeRequestError as e:
  self.report.report_failure(
  title="Failed to Retrieve Queries for Report",
  message="Unable to retrieve queries for report token.",
- context=f"Report Token:{report_token}, Error: {str(http_error)}",
+ context=f"Report Token:{report_token}, Error: {str(e)}",
  )
  return {}
  return queries
@@ -1451,13 +1455,13 @@ class ModeSource(StatefulIngestionSourceBase):
  f"/queries/{query_token}/charts"
  )
  charts = charts_json.get("_embedded", {}).get("charts", {})
- except HTTPError as http_error:
+ except ModeRequestError as e:
  self.report.report_failure(
  title="Failed to Retrieve Charts",
  message="Unable to retrieve charts from Mode.",
  context=f"Report Token: {report_token}, "
  f"Query token: {query_token}, "
- f"Error: {str(http_error)}",
+ f"Error: {str(e)}",
  )
  return charts
 
@@ -1477,6 +1481,8 @@ class ModeSource(StatefulIngestionSourceBase):
  response = self.session.get(
  url, timeout=self.config.api_options.timeout
  )
+ if response.status_code == 204: # No content, don't parse json
+ return {}
  return response.json()
  except HTTPError as http_error:
  error_response = http_error.response
datahub/ingestion/source_report/ingestion_stage.py CHANGED
@@ -42,4 +42,5 @@ class IngestionStageReport:
  self._timer = PerfTimer()
 
  self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
+ logger.info(f"Stage started: {self.ingestion_stage}")
  self._timer.start()
datahub/sql_parsing/tool_meta_extractor.py CHANGED
@@ -40,6 +40,7 @@ def _get_last_line(query: str) -> str:
  class ToolMetaExtractorReport(Report):
  num_queries_meta_extracted: Dict[str, int] = field(default_factory=int_top_k_dict)
  failures: List[str] = field(default_factory=list)
+ looker_user_mapping_missing: Optional[bool] = None
 
 
  class ToolMetaExtractor:
@@ -108,7 +109,9 @@ class ToolMetaExtractor:
  PlatformResource.search_by_filters(query=query, graph_client=graph)
  )
 
- if len(platform_resources) > 1:
+ if len(platform_resources) == 0:
+ report.looker_user_mapping_missing = True
+ elif len(platform_resources) > 1:
  report.failures.append(
  "Looker user metadata extraction failed. Found more than one looker user id mappings."
  )