acryl-datahub 0.15.0.1rc8__py3-none-any.whl → 0.15.0.1rc9__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.

Potentially problematic release. This version of acryl-datahub might be problematic.

@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=Ed_HdiA9eGLLG0fhJKPwruUxl4bgAPR8p2MDlRHqts8,576
+ datahub/__init__.py,sha256=2793dupxo-Ov36jB1Z_p3H61xA3Rxhr1VhzHSdVOKhY,576
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -217,7 +217,7 @@ datahub/ingestion/source/abs/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm
  datahub/ingestion/source/abs/source.py,sha256=pzxW-R_cWGKPneEhX8JWdTZiX2k1kAZOPKgMxp9mAEI,24533
  datahub/ingestion/source/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/aws/aws_common.py,sha256=DfdQgkJ_s2isFx8WvqKTlAcBk4KE8SgfpmA5BgC3fgY,17716
- datahub/ingestion/source/aws/glue.py,sha256=fX0dtaVVq174ZS0aBJvZFYK8ligfZX5EU3pdS3j1KQs,56215
+ datahub/ingestion/source/aws/glue.py,sha256=r7y1MPDK__BKX_mrJjVa_CEmSXM3Pa02gt19o0sSLE8,56815
  datahub/ingestion/source/aws/s3_boto_utils.py,sha256=Wyp9k9tapsCuw9dyH4FCXJr_wmeLaYFoCtKvrV6SEDk,3892
  datahub/ingestion/source/aws/s3_util.py,sha256=OFypcgmVC6jnZM90-gjcPpAMtTV1lbnreCaMhCzNlzs,2149
  datahub/ingestion/source/aws/sagemaker.py,sha256=Bl2tkBYnrindgx61VHYgNovUF_Kp_fXNcivQn28vC2w,5254
@@ -302,9 +302,9 @@ datahub/ingestion/source/fivetran/fivetran.py,sha256=uKbM5czPz-6LOseoh1FwavWDIuL
  datahub/ingestion/source/fivetran/fivetran_log_api.py,sha256=EAak3hJpe75WZSgz6wP_CyAT5Cian2N4a-lb8x1NKHk,12776
  datahub/ingestion/source/fivetran/fivetran_query.py,sha256=vLrTj7e-0NxZ2U4bWTB57pih42WirqPlUvwtIRfStlQ,5275
  datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/gc/datahub_gc.py,sha256=AHlKGwDD-E_TEHcJIpRtwk6ikjT-KiyfTo-BXZnMSk0,12114
+ datahub/ingestion/source/gc/datahub_gc.py,sha256=WOg3yIaNmwdbSTwytKeSfIUihsM7FMYBip9u2Dnwk3c,12849
  datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=u90XEmW1vRFbvp4CQ8ujPxTGJUyJqO2U6ApcI6mFrjE,16588
- datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=cHJmxz4NmA7VjTX2iGEo3wZ_SDrjC_rCQcnRxKgfUVI,8713
+ datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=sZbdkg3MuPVGf8eeeRg_2khGMZ01QoH4dgJiTxf7Srg,9813
  datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=wRnRaIVUG483tY4nyDkEn6Xi2RL5MjrVvoCoZimqwSg,7514
  datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/gcs/gcs_source.py,sha256=iwvj4JwjyVWRP1Vq106sUtQhh0GuOYVSu9zCa1wCZN0,6189
@@ -517,7 +517,7 @@ datahub/ingestion/source_config/csv_enricher.py,sha256=IROxxfFJA56dHkmmbjjhb7h1p
  datahub/ingestion/source_config/operation_config.py,sha256=Q0NlqiEh4s4DFIII5NsAp5hxWTVyyJz-ldcQmH-B47s,3504
  datahub/ingestion/source_config/pulsar.py,sha256=sklDkh62CrWV-i7Ifh6R3T3smYVso6gyRJG8HVc6RdA,5533
  datahub/ingestion/source_report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source_report/ingestion_stage.py,sha256=w6qTnJm_-eoTiGxwS7cFnhdIfsv8omC6H5e0qw5t4Jc,1587
+ datahub/ingestion/source_report/ingestion_stage.py,sha256=gbYmnio3fAOsjh_RzU3j_5UGu7bYBwUM4bm7S8ID_IU,1649
  datahub/ingestion/source_report/pulsar.py,sha256=iKhzy644AjoFTV-gxyqBoXKMLwSMPxJFxU-3WDQRww0,1037
  datahub/ingestion/source_report/time_window.py,sha256=9yI5l2S1DcF7ClvUHLeN8m62I5vlhV9k-aQqSZh2l7w,229
  datahub/ingestion/transformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -982,8 +982,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.15.0.1rc8.dist-info/METADATA,sha256=mW2V4Czvd-ZE_mUJX8XkNZxNwnBa-gLJxebl0KWsM2A,173642
- acryl_datahub-0.15.0.1rc8.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- acryl_datahub-0.15.0.1rc8.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
- acryl_datahub-0.15.0.1rc8.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.15.0.1rc8.dist-info/RECORD,,
+ acryl_datahub-0.15.0.1rc9.dist-info/METADATA,sha256=nUI5E0nMS2Ng9RLK_q6N4VmqhzakT3CIw34UEqv8E1E,173642
+ acryl_datahub-0.15.0.1rc9.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ acryl_datahub-0.15.0.1rc9.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+ acryl_datahub-0.15.0.1rc9.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.15.0.1rc9.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
 
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.15.0.1rc8"
+ __version__ = "0.15.0.1rc9"
 
 
  def is_dev_mode() -> bool:
datahub/ingestion/source/aws/glue.py CHANGED
@@ -52,6 +52,7 @@ from datahub.ingestion.api.decorators import (
  platform_name,
  support_status,
  )
+ from datahub.ingestion.api.report import EntityFilterReport
  from datahub.ingestion.api.source import MetadataWorkUnitProcessor
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.aws import s3_util
@@ -115,7 +116,6 @@ from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_col
 
  logger = logging.getLogger(__name__)
 
-
  DEFAULT_PLATFORM = "glue"
  VALID_PLATFORMS = [DEFAULT_PLATFORM, "athena"]
 
@@ -220,6 +220,7 @@ class GlueSourceConfig(
  class GlueSourceReport(StaleEntityRemovalSourceReport):
  tables_scanned = 0
  filtered: List[str] = dataclass_field(default_factory=list)
+ databases: EntityFilterReport = EntityFilterReport.field(type="database")
 
  num_job_script_location_missing: int = 0
  num_job_script_location_invalid: int = 0
@@ -668,6 +669,7 @@ class GlueSource(StatefulIngestionSourceBase):
  return MetadataWorkUnit(id=f'{job_name}-{node["Id"]}', mce=mce)
 
  def get_all_databases(self) -> Iterable[Mapping[str, Any]]:
+ logger.debug("Getting all databases")
  # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetDatabases.html
  paginator = self.glue_client.get_paginator("get_databases")
 
@@ -684,10 +686,18 @@ class GlueSource(StatefulIngestionSourceBase):
  pattern += "[?!TargetDatabase]"
 
  for database in paginator_response.search(pattern):
- if self.source_config.database_pattern.allowed(database["Name"]):
+ if (not self.source_config.database_pattern.allowed(database["Name"])) or (
+ self.source_config.catalog_id
+ and database.get("CatalogId")
+ and database.get("CatalogId") != self.source_config.catalog_id
+ ):
+ self.report.databases.dropped(database["Name"])
+ else:
+ self.report.databases.processed(database["Name"])
  yield database
 
  def get_tables_from_database(self, database: Mapping[str, Any]) -> Iterable[Dict]:
+ logger.debug(f"Getting tables from database {database['Name']}")
  # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetTables.html
  paginator = self.glue_client.get_paginator("get_tables")
  database_name = database["Name"]
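
Taken together, the glue.py changes mean a database is now skipped either when its name fails database_pattern or when a catalog_id is configured and the database reports a different CatalogId, and every decision is tallied on the new databases filter report via processed() / dropped(). A minimal standalone sketch of that predicate (hypothetical helper; the real code uses AllowDenyPattern.allowed() and EntityFilterReport rather than a raw regex):

    import re
    from typing import Any, Mapping, Optional

    def database_is_dropped(
        database: Mapping[str, Any],
        allow_regex: str,                      # stand-in for database_pattern
        configured_catalog_id: Optional[str],  # stand-in for source_config.catalog_id
    ) -> bool:
        # Dropped when the name fails the allow pattern...
        if not re.fullmatch(allow_regex, database["Name"]):
            return True
        # ...or when a catalog_id is configured and the database belongs to a
        # different catalog. A missing CatalogId is not treated as a mismatch.
        catalog = database.get("CatalogId")
        return bool(configured_catalog_id and catalog and catalog != configured_catalog_id)

    # With catalog_id="123", a database from catalog "456" is dropped; same-catalog is kept.
    assert database_is_dropped({"Name": "sales", "CatalogId": "456"}, r".*", "123")
    assert not database_is_dropped({"Name": "sales", "CatalogId": "123"}, r".*", "123")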
datahub/ingestion/source/gc/datahub_gc.py CHANGED
@@ -34,6 +34,7 @@ from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
  SoftDeletedEntitiesCleanupConfig,
  SoftDeletedEntitiesReport,
  )
+ from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 
  logger = logging.getLogger(__name__)
 
@@ -86,6 +87,7 @@ class DataHubGcSourceReport(
  DataProcessCleanupReport,
  SoftDeletedEntitiesReport,
  DatahubExecutionRequestCleanupReport,
+ IngestionStageReport,
  ):
  expired_tokens_revoked: int = 0
 
@@ -139,31 +141,40 @@ class DataHubGcSource(Source):
  ) -> Iterable[MetadataWorkUnit]:
  if self.config.cleanup_expired_tokens:
  try:
+ self.report.report_ingestion_stage_start("Expired Token Cleanup")
  self.revoke_expired_tokens()
  except Exception as e:
  self.report.failure("While trying to cleanup expired token ", exc=e)
  if self.config.truncate_indices:
  try:
+ self.report.report_ingestion_stage_start("Truncate Indices")
  self.truncate_indices()
  except Exception as e:
  self.report.failure("While trying to truncate indices ", exc=e)
  if self.config.soft_deleted_entities_cleanup.enabled:
  try:
+ self.report.report_ingestion_stage_start(
+ "Soft Deleted Entities Cleanup"
+ )
  self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
  except Exception as e:
  self.report.failure(
  "While trying to cleanup soft deleted entities ", exc=e
  )
- if self.config.execution_request_cleanup.enabled:
- try:
- self.execution_request_cleanup.run()
- except Exception as e:
- self.report.failure("While trying to cleanup execution request ", exc=e)
  if self.config.dataprocess_cleanup.enabled:
  try:
+ self.report.report_ingestion_stage_start("Data Process Cleanup")
  yield from self.dataprocess_cleanup.get_workunits_internal()
  except Exception as e:
  self.report.failure("While trying to cleanup data process ", exc=e)
+ if self.config.execution_request_cleanup.enabled:
+ try:
+ self.report.report_ingestion_stage_start("Execution request Cleanup")
+ self.execution_request_cleanup.run()
+ except Exception as e:
+ self.report.failure("While trying to cleanup execution request ", exc=e)
+ # Otherwise last stage's duration does not get calculated.
+ self.report.report_ingestion_stage_start("End")
  yield from []
 
  def truncate_indices(self) -> None:
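
Two things change in the scheduling above: execution-request cleanup now runs after data-process cleanup, and every phase is bracketed by report_ingestion_stage_start(). Because starting a stage is what closes out the timing of the previous one, the trailing "End" stage exists purely so the last phase still gets a duration. A simplified stand-in illustrating that pattern (not the actual IngestionStageReport):

    import time
    from typing import Optional

    class StageTimer:
        # Starting a new stage reports how long the previous one ran.
        def __init__(self) -> None:
            self.stage: Optional[str] = None
            self._started_at: Optional[float] = None

        def start_stage(self, name: str) -> None:
            if self.stage is not None and self._started_at is not None:
                print(f"{self.stage} took {time.monotonic() - self._started_at:.2f}s")
            self.stage = name
            self._started_at = time.monotonic()

    timer = StageTimer()
    timer.start_stage("Expired Token Cleanup")
    # ... cleanup work ...
    timer.start_stage("Execution request Cleanup")
    # ... cleanup work ...
    timer.start_stage("End")  # without this, the final stage's duration is never reported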
@@ -281,6 +292,8 @@ class DataHubGcSource(Source):
  list_access_tokens = expired_tokens_res.get("listAccessTokens", {})
  tokens = list_access_tokens.get("tokens", [])
  total = list_access_tokens.get("total", 0)
+ if tokens == []:
+ break
  for token in tokens:
  self.report.expired_tokens_revoked += 1
  token_id = token["id"]
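
The new empty-page check appears to guard against looping indefinitely when listAccessTokens keeps reporting a total but returns no tokens. A generic sketch of that pagination guard (hypothetical fetch_page callable, not DataHub's GraphQL client):

    from typing import Callable, Dict, Iterator, List

    def iter_token_pages(fetch_page: Callable[[int], Dict]) -> Iterator[List[dict]]:
        start = 0
        while True:
            page = fetch_page(start)
            tokens = page.get("tokens", [])
            if tokens == []:   # empty page: stop even if "total" suggests more remain
                break
            yield tokens
            start += len(tokens)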
datahub/ingestion/source/gc/execution_request_cleanup.py CHANGED
@@ -1,3 +1,4 @@
+ import datetime
  import logging
  import time
  from typing import Any, Dict, Iterator, Optional
@@ -42,16 +43,28 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
  description="Global switch for this cleanup task",
  )
 
+ runtime_limit_seconds: int = Field(
+ default=3600,
+ description="Maximum runtime in seconds for the cleanup task",
+ )
+
+ max_read_errors: int = Field(
+ default=10,
+ description="Maximum number of read errors before aborting",
+ )
+
  def keep_history_max_milliseconds(self):
  return self.keep_history_max_days * 24 * 3600 * 1000
 
 
  class DatahubExecutionRequestCleanupReport(SourceReport):
- execution_request_cleanup_records_read: int = 0
- execution_request_cleanup_records_preserved: int = 0
- execution_request_cleanup_records_deleted: int = 0
- execution_request_cleanup_read_errors: int = 0
- execution_request_cleanup_delete_errors: int = 0
+ ergc_records_read: int = 0
+ ergc_records_preserved: int = 0
+ ergc_records_deleted: int = 0
+ ergc_read_errors: int = 0
+ ergc_delete_errors: int = 0
+ ergc_start_time: Optional[datetime.datetime] = None
+ ergc_end_time: Optional[datetime.datetime] = None
 
 
  class CleanupRecord(BaseModel):
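
The config model gains two knobs: runtime_limit_seconds (default 3600) caps how long one cleanup pass may run, and max_read_errors (default 10) aborts scrolling after repeated fetch failures. A small usage sketch, assuming the remaining fields keep their defaults (import path as in this diff):

    from datahub.ingestion.source.gc.execution_request_cleanup import (
        DatahubExecutionRequestCleanupConfig,
    )

    # Cap a cleanup pass at 30 minutes and give up after 5 failed reads.
    config = DatahubExecutionRequestCleanupConfig(
        runtime_limit_seconds=30 * 60,
        max_read_errors=5,
    )
    print(config.keep_history_max_milliseconds())  # derived from keep_history_max_days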
@@ -124,6 +137,13 @@ class DatahubExecutionRequestCleanup:
  params.update(overrides)
 
  while True:
+ if self._reached_runtime_limit():
+ break
+ if self.report.ergc_read_errors >= self.config.max_read_errors:
+ self.report.failure(
+ f"ergc({self.instance_id}): too many read errors, aborting."
+ )
+ break
  try:
  url = f"{self.graph.config.server}/openapi/v2/entity/{DATAHUB_EXECUTION_REQUEST_ENTITY_NAME}"
  response = self.graph._session.get(url, headers=headers, params=params)
@@ -141,7 +161,7 @@ class DatahubExecutionRequestCleanup:
  logger.error(
  f"ergc({self.instance_id}): failed to fetch next batch of execution requests: {e}"
  )
- self.report.execution_request_cleanup_read_errors += 1
+ self.report.ergc_read_errors += 1
 
  def _scroll_garbage_records(self):
  state: Dict[str, Dict] = {}
@@ -150,7 +170,7 @@ class DatahubExecutionRequestCleanup:
  running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
 
  for entry in self._scroll_execution_requests():
- self.report.execution_request_cleanup_records_read += 1
+ self.report.ergc_records_read += 1
  key = entry.ingestion_source
 
  # Always delete corrupted records
@@ -171,7 +191,7 @@ class DatahubExecutionRequestCleanup:
 
  # Do not delete if number of requests is below minimum
  if state[key]["count"] < self.config.keep_history_min_count:
- self.report.execution_request_cleanup_records_preserved += 1
+ self.report.ergc_records_preserved += 1
  continue
 
  # Do not delete if number of requests do not exceed allowed maximum,
@@ -179,7 +199,7 @@ class DatahubExecutionRequestCleanup:
  if (state[key]["count"] < self.config.keep_history_max_count) and (
  entry.requested_at > state[key]["cutoffTimestamp"]
  ):
- self.report.execution_request_cleanup_records_preserved += 1
+ self.report.ergc_records_preserved += 1
  continue
 
  # Do not delete if status is RUNNING or PENDING and created within last month. If the record is >month old and it did not
@@ -188,7 +208,7 @@ class DatahubExecutionRequestCleanup:
  "RUNNING",
  "PENDING",
  ]:
- self.report.execution_request_cleanup_records_preserved += 1
+ self.report.ergc_records_preserved += 1
  continue
 
  # Otherwise delete current record
@@ -200,7 +220,7 @@ class DatahubExecutionRequestCleanup:
  f"record timestamp: {entry.requested_at}."
  )
  )
- self.report.execution_request_cleanup_records_deleted += 1
+ self.report.ergc_records_deleted += 1
  yield entry
 
  def _delete_entry(self, entry: CleanupRecord) -> None:
@@ -210,17 +230,31 @@ class DatahubExecutionRequestCleanup:
  )
  self.graph.delete_entity(entry.urn, True)
  except Exception as e:
- self.report.execution_request_cleanup_delete_errors += 1
+ self.report.ergc_delete_errors += 1
  logger.error(
  f"ergc({self.instance_id}): failed to delete ExecutionRequest {entry.request_id}: {e}"
  )
 
+ def _reached_runtime_limit(self) -> bool:
+ if (
+ self.config.runtime_limit_seconds
+ and self.report.ergc_start_time
+ and (
+ datetime.datetime.now() - self.report.ergc_start_time
+ >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
+ )
+ ):
+ logger.info(f"ergc({self.instance_id}): max runtime reached.")
+ return True
+ return False
+
  def run(self) -> None:
  if not self.config.enabled:
  logger.info(
  f"ergc({self.instance_id}): ExecutionRequest cleaner is disabled."
  )
  return
+ self.report.ergc_start_time = datetime.datetime.now()
 
  logger.info(
  (
@@ -232,8 +266,11 @@ class DatahubExecutionRequestCleanup:
  )
  )
  for entry in self._scroll_garbage_records():
+ if self._reached_runtime_limit():
+ break
  self._delete_entry(entry)
 
+ self.report.ergc_end_time = datetime.datetime.now()
  logger.info(
  f"ergc({self.instance_id}): Finished cleanup of ExecutionRequest records."
  )
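
Net effect of the run-loop changes: the pass records ergc_start_time, stops scrolling or deleting once the runtime budget is spent, and stamps ergc_end_time for the report. A compact, self-contained sketch of that bounded control flow (simplified; not the DataHub implementation):

    import datetime
    from typing import Iterable

    def bounded_cleanup(entries: Iterable[object], runtime_limit_seconds: int = 3600) -> dict:
        report = {"start": datetime.datetime.now(), "deleted": 0, "end": None}

        def reached_limit() -> bool:
            elapsed = datetime.datetime.now() - report["start"]
            return elapsed >= datetime.timedelta(seconds=runtime_limit_seconds)

        for entry in entries:
            if reached_limit():  # stop deleting once the time budget is exhausted
                break
            # delete_entry(entry) would go here
            report["deleted"] += 1

        report["end"] = datetime.datetime.now()
        return report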
datahub/ingestion/source_report/ingestion_stage.py CHANGED
@@ -42,4 +42,5 @@ class IngestionStageReport:
  self._timer = PerfTimer()
 
  self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
+ logger.info(f"Stage started: {self.ingestion_stage}")
  self._timer.start()
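
The only behavioural addition here is the log line: starting a stage is now visible in the logs as well as in the final report. Roughly what an operator would see (a sketch mirroring the f-string above; logger and PerfTimer as in the module):

    import logging
    from datetime import datetime, timezone

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("datahub.ingestion.source_report.ingestion_stage")

    stage = "Execution request Cleanup"
    ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
    logger.info(f"Stage started: {ingestion_stage}")
    # e.g. INFO ... Stage started: Execution request Cleanup at 2024-.. ..:..:..+00:00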