acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub has been flagged as possibly problematic.
Files changed (120)
  1. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2236 -2240
  2. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
  3. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
  5. datahub/__init__.py +1 -1
  6. datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
  7. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  8. datahub/configuration/common.py +2 -5
  9. datahub/configuration/source_common.py +13 -0
  10. datahub/emitter/mce_builder.py +20 -4
  11. datahub/emitter/mcp_builder.py +2 -7
  12. datahub/emitter/mcp_patch_builder.py +37 -13
  13. datahub/emitter/rest_emitter.py +25 -3
  14. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
  15. datahub/ingestion/api/closeable.py +3 -3
  16. datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
  17. datahub/ingestion/api/report.py +4 -1
  18. datahub/ingestion/api/sink.py +4 -3
  19. datahub/ingestion/api/source.py +4 -0
  20. datahub/ingestion/api/source_helpers.py +2 -6
  21. datahub/ingestion/glossary/classifier.py +2 -3
  22. datahub/ingestion/graph/client.py +6 -3
  23. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
  24. datahub/ingestion/source/aws/aws_common.py +231 -27
  25. datahub/ingestion/source/aws/glue.py +12 -2
  26. datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
  27. datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
  28. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
  29. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
  30. datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
  31. datahub/ingestion/source/datahub/config.py +22 -1
  32. datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
  33. datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
  34. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  35. datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
  36. datahub/ingestion/source/gc/datahub_gc.py +21 -5
  37. datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
  38. datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
  39. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
  40. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  41. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  42. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  43. datahub/ingestion/source/kafka_connect/common.py +202 -0
  44. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  45. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  46. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  47. datahub/ingestion/source/looker/looker_common.py +63 -2
  48. datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
  49. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  50. datahub/ingestion/source/looker/looker_source.py +31 -4
  51. datahub/ingestion/source/looker/looker_usage.py +23 -17
  52. datahub/ingestion/source/mlflow.py +30 -5
  53. datahub/ingestion/source/mode.py +40 -27
  54. datahub/ingestion/source/powerbi/config.py +1 -14
  55. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  56. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  57. datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
  58. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
  59. datahub/ingestion/source/s3/source.py +1 -1
  60. datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
  61. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
  62. datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
  63. datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
  64. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
  65. datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
  66. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
  67. datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
  68. datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
  69. datahub/ingestion/source/sql/hive.py +621 -8
  70. datahub/ingestion/source/sql/hive_metastore.py +7 -0
  71. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  72. datahub/ingestion/source/sql/mssql/source.py +15 -1
  73. datahub/ingestion/source/sql/sql_common.py +41 -102
  74. datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
  75. datahub/ingestion/source/sql/sql_report.py +2 -0
  76. datahub/ingestion/source/state/checkpoint.py +2 -1
  77. datahub/ingestion/source/tableau/tableau.py +122 -45
  78. datahub/ingestion/source/tableau/tableau_common.py +18 -0
  79. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  80. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  81. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  82. datahub/ingestion/source/unity/proxy.py +8 -27
  83. datahub/ingestion/source/usage/usage_common.py +15 -1
  84. datahub/ingestion/source_report/ingestion_stage.py +3 -0
  85. datahub/metadata/_schema_classes.py +256 -3
  86. datahub/metadata/_urns/urn_defs.py +168 -168
  87. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
  88. datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
  89. datahub/metadata/schema.avsc +252 -33
  90. datahub/metadata/schemas/DataJobKey.avsc +2 -1
  91. datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
  92. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  93. datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
  94. datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
  95. datahub/metadata/schemas/MLModelProperties.avsc +62 -2
  96. datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
  97. datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
  98. datahub/specific/aspect_helpers/__init__.py +0 -0
  99. datahub/specific/aspect_helpers/custom_properties.py +79 -0
  100. datahub/specific/aspect_helpers/ownership.py +67 -0
  101. datahub/specific/aspect_helpers/structured_properties.py +72 -0
  102. datahub/specific/aspect_helpers/tags.py +42 -0
  103. datahub/specific/aspect_helpers/terms.py +43 -0
  104. datahub/specific/chart.py +28 -184
  105. datahub/specific/dashboard.py +31 -196
  106. datahub/specific/datajob.py +34 -189
  107. datahub/specific/dataproduct.py +24 -86
  108. datahub/specific/dataset.py +48 -133
  109. datahub/specific/form.py +12 -32
  110. datahub/specific/structured_property.py +9 -9
  111. datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
  112. datahub/sql_parsing/sqlglot_lineage.py +15 -5
  113. datahub/sql_parsing/tool_meta_extractor.py +119 -5
  114. datahub/utilities/time.py +8 -3
  115. datahub/utilities/urns/_urn_base.py +5 -7
  116. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  117. datahub/specific/custom_properties.py +0 -37
  118. datahub/specific/ownership.py +0 -48
  119. datahub/specific/structured_properties.py +0 -53
  120. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/datahub/config.py
@@ -1,6 +1,7 @@
 import os
 from typing import Optional, Set
 
+import pydantic
 from pydantic import Field, root_validator
 
 from datahub.configuration.common import AllowDenyPattern
@@ -14,6 +15,17 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 DEFAULT_DATABASE_TABLE_NAME = "metadata_aspect_v2"
 DEFAULT_KAFKA_TOPIC_NAME = "MetadataChangeLog_Timeseries_v1"
 DEFAULT_DATABASE_BATCH_SIZE = 10_000
+DEFAULT_EXCLUDE_ASPECTS = {
+    "dataHubIngestionSourceKey",
+    "dataHubIngestionSourceInfo",
+    "datahubIngestionRunSummary",
+    "datahubIngestionCheckpoint",
+    "dataHubSecretKey",
+    "dataHubSecretValue",
+    "globalSettingsKey",
+    "globalSettingsInfo",
+    "testResults",
+}
 
 
 class DataHubSourceConfig(StatefulIngestionConfigBase):
@@ -44,7 +56,7 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
     )
 
     exclude_aspects: Set[str] = Field(
-        default_factory=set,
+        default=DEFAULT_EXCLUDE_ASPECTS,
         description="Set of aspect names to exclude from ingestion",
     )
 
@@ -108,3 +120,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
             " Please specify at least one of `database_connection` or `kafka_connection`, ideally both."
         )
         return values
+
+    @pydantic.validator("database_connection")
+    def validate_mysql_scheme(
+        cls, v: SQLAlchemyConnectionConfig
+    ) -> SQLAlchemyConnectionConfig:
+        if "mysql" in v.scheme:
+            if v.scheme != "mysql+pymysql":
+                raise ValueError("For MySQL, the scheme must be mysql+pymysql.")
+        return v
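
Two user-visible effects of the config.py changes above: `exclude_aspects` now defaults to `DEFAULT_EXCLUDE_ASPECTS` instead of an empty set, so any override replaces the default rather than extending it, and MySQL connections must use the `mysql+pymysql` scheme. A hedged sketch of a source config dict that reflects both; the connection field names assume the usual SQLAlchemy connection config shape, the host/credential values are placeholders, and the extra aspect name is only an example:

    from datahub.ingestion.source.datahub.config import DEFAULT_EXCLUDE_ASPECTS

    datahub_source_config = {
        "database_connection": {
            # The new validator rejects any MySQL scheme other than mysql+pymysql.
            "scheme": "mysql+pymysql",
            "host_port": "localhost:3306",  # placeholder
            "username": "datahub",          # placeholder
            "password": "datahub",          # placeholder
            "database": "datahub",          # placeholder
        },
        # Overriding exclude_aspects replaces DEFAULT_EXCLUDE_ASPECTS entirely, so start
        # from the default and add to it if you only want to exclude more aspects.
        "exclude_aspects": sorted(DEFAULT_EXCLUDE_ASPECTS | {"dataHubExecutionRequestResult"}),
    }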

datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -151,8 +151,10 @@
         self, query: str, params: Dict[str, Any]
     ) -> Iterable[Dict[str, Any]]:
         with self.engine.connect() as conn:
-            if self.engine.dialect.name == "postgresql":
+            if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
                 with conn.begin():  # Transaction required for PostgreSQL server-side cursor
+                    # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
+                    # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
                     conn = conn.execution_options(
                         stream_results=True,
                         yield_per=self.config.database_query_batch_size,
@@ -160,22 +162,6 @@
                     result = conn.execute(query, params)
                     for row in result:
                         yield dict(row)
-            elif self.engine.dialect.name == "mysql":  # MySQL
-                import MySQLdb
-
-                with contextlib.closing(
-                    conn.connection.cursor(MySQLdb.cursors.SSCursor)
-                ) as cursor:
-                    logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
-                    cursor.execute(query, params)
-
-                    columns = [desc[0] for desc in cursor.description]
-                    while True:
-                        rows = cursor.fetchmany(self.config.database_query_batch_size)
-                        if not rows:
-                            break  # Use break instead of return in generator
-                        for row in rows:
-                            yield dict(zip(columns, row))
             else:
                 raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
 
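The change above drops the hand-rolled MySQLdb server-side cursor in favor of the same SQLAlchemy streaming path for all three dialects: keep a transaction open and set stream_results/yield_per so rows arrive in server-side batches instead of one large fetch. A minimal sketch of that pattern under SQLAlchemy 1.4; the URL, table, and column names below are illustrative, not the reader's actual query:

    from sqlalchemy import create_engine, text

    engine = create_engine("postgresql+psycopg2://user:pass@localhost/datahub")  # placeholder URL

    with engine.connect() as conn:
        with conn.begin():  # the transaction keeps the server-side cursor alive
            conn = conn.execution_options(stream_results=True, yield_per=10_000)
            result = conn.execute(text("SELECT urn, aspect FROM metadata_aspect_v2"))
            for row in result:
                ...  # rows are fetched in batches of yield_per rather than all at once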

datahub/ingestion/source/datahub/datahub_kafka_reader.py
@@ -12,6 +12,7 @@ from confluent_kafka.schema_registry import SchemaRegistryClient
 from confluent_kafka.schema_registry.avro import AvroDeserializer
 
 from datahub.configuration.kafka import KafkaConsumerConnectionConfig
+from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
@@ -92,7 +93,7 @@ class DataHubKafkaReader(Closeable):
             if mcl.created and mcl.created.time > stop_time.timestamp() * 1000:
                 logger.info(
                     f"Stopped reading from kafka, reached MCL "
-                    f"with audit stamp {datetime.fromtimestamp(mcl.created.time / 1000)}"
+                    f"with audit stamp {parse_ts_millis(mcl.created.time)}"
                 )
                 break
 
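The parse_ts_millis swap is about consistent timestamp handling: datetime.fromtimestamp(ms / 1000) yields a naive local-time value, whereas the shared helper presumably returns a timezone-aware UTC datetime (an assumption based on this diff; the helper's body is not shown here). A quick illustration of the difference:

    from datetime import datetime, timezone

    ts_millis = 1_732_000_000_000  # an arbitrary epoch-milliseconds value

    naive_local = datetime.fromtimestamp(ts_millis / 1000)                 # old style: naive, local timezone
    aware_utc = datetime.fromtimestamp(ts_millis / 1000, tz=timezone.utc)  # roughly what a UTC-aware helper would return

    print(naive_local.tzinfo, aware_utc.tzinfo)  # None vs. UTC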

datahub/ingestion/source/datahub/datahub_source.py
@@ -130,7 +130,7 @@
             self._commit_progress(i)
 
     def _get_kafka_workunits(
-        self, from_offsets: Dict[int, int], soft_deleted_urns: List[str] = []
+        self, from_offsets: Dict[int, int], soft_deleted_urns: List[str]
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.kafka_connection is None:
             return
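
Dropping the "= []" default removes a classic Python pitfall: a mutable default is created once and shared across every call (here the parameter simply becomes required instead). A minimal illustration with generic names, not DataHub code:

    def append_shared(item, acc=[]):       # one list object shared by every call
        acc.append(item)
        return acc

    def append_fresh(item, acc=None):      # a new list per call unless one is passed in
        acc = [] if acc is None else acc
        acc.append(item)
        return acc

    print(append_shared(1), append_shared(2))  # [1, 2] [1, 2] -- state leaks between calls
    print(append_fresh(1), append_fresh(2))    # [1] [2]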

datahub/ingestion/source/dbt/dbt_cloud.py
@@ -1,7 +1,7 @@
 import logging
 from datetime import datetime
 from json import JSONDecodeError
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Literal, Optional, Tuple
 from urllib.parse import urlparse
 
 import dateutil.parser
@@ -62,6 +62,11 @@ class DBTCloudConfig(DBTCommonConfig):
         description="The ID of the run to ingest metadata from. If not specified, we'll default to the latest run.",
     )
 
+    external_url_mode: Literal["explore", "ide"] = Field(
+        default="explore",
+        description='Where should the "View in dbt" link point to - either the "Explore" UI or the dbt Cloud IDE',
+    )
+
     @root_validator(pre=True)
     def set_metadata_endpoint(cls, values: dict) -> dict:
         if values.get("access_url") and not values.get("metadata_endpoint"):
@@ -527,5 +532,7 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
         )
 
     def get_external_url(self, node: DBTNode) -> Optional[str]:
-        # TODO: Once dbt Cloud supports deep linking to specific files, we can use that.
-        return f"{self.config.access_url}/develop/{self.config.account_id}/projects/{self.config.project_id}"
+        if self.config.external_url_mode == "explore":
+            return f"{self.config.access_url}/explore/{self.config.account_id}/projects/{self.config.project_id}/environments/production/details/{node.dbt_name}"
+        else:
+            return f"{self.config.access_url}/develop/{self.config.account_id}/projects/{self.config.project_id}"

datahub/ingestion/source/gc/datahub_gc.py
@@ -34,6 +34,7 @@ from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
     SoftDeletedEntitiesCleanupConfig,
     SoftDeletedEntitiesReport,
 )
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 
 logger = logging.getLogger(__name__)
 
@@ -86,6 +87,7 @@ class DataHubGcSourceReport(
     DataProcessCleanupReport,
     SoftDeletedEntitiesReport,
     DatahubExecutionRequestCleanupReport,
+    IngestionStageReport,
 ):
     expired_tokens_revoked: int = 0
 
@@ -139,31 +141,40 @@ class DataHubGcSource(Source):
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.cleanup_expired_tokens:
             try:
+                self.report.report_ingestion_stage_start("Expired Token Cleanup")
                 self.revoke_expired_tokens()
             except Exception as e:
                 self.report.failure("While trying to cleanup expired token ", exc=e)
         if self.config.truncate_indices:
             try:
+                self.report.report_ingestion_stage_start("Truncate Indices")
                 self.truncate_indices()
             except Exception as e:
                 self.report.failure("While trying to truncate indices ", exc=e)
         if self.config.soft_deleted_entities_cleanup.enabled:
             try:
+                self.report.report_ingestion_stage_start(
+                    "Soft Deleted Entities Cleanup"
+                )
                 self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
             except Exception as e:
                 self.report.failure(
                     "While trying to cleanup soft deleted entities ", exc=e
                 )
-        if self.config.execution_request_cleanup.enabled:
-            try:
-                self.execution_request_cleanup.run()
-            except Exception as e:
-                self.report.failure("While trying to cleanup execution request ", exc=e)
         if self.config.dataprocess_cleanup.enabled:
             try:
+                self.report.report_ingestion_stage_start("Data Process Cleanup")
                 yield from self.dataprocess_cleanup.get_workunits_internal()
             except Exception as e:
                 self.report.failure("While trying to cleanup data process ", exc=e)
+        if self.config.execution_request_cleanup.enabled:
+            try:
+                self.report.report_ingestion_stage_start("Execution request Cleanup")
+                self.execution_request_cleanup.run()
+            except Exception as e:
+                self.report.failure("While trying to cleanup execution request ", exc=e)
+        # Otherwise last stage's duration does not get calculated.
+        self.report.report_ingestion_stage_start("End")
         yield from []
 
     def truncate_indices(self) -> None:
@@ -177,6 +188,9 @@ class DataHubGcSource(Source):
         self._truncate_timeseries_helper(
             aspect_name="dashboardUsageStatistics", entity_type="dashboard"
         )
+        self._truncate_timeseries_helper(
+            aspect_name="queryusagestatistics", entity_type="query"
+        )
 
     def _truncate_timeseries_helper(self, aspect_name: str, entity_type: str) -> None:
         self._truncate_timeseries_with_watch_optional(
@@ -281,6 +295,8 @@ class DataHubGcSource(Source):
             list_access_tokens = expired_tokens_res.get("listAccessTokens", {})
             tokens = list_access_tokens.get("tokens", [])
             total = list_access_tokens.get("total", 0)
+            if tokens == []:
+                break
             for token in tokens:
                 self.report.expired_tokens_revoked += 1
                 token_id = token["id"]
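
The explicit trailing "End" stage makes sense if report_ingestion_stage_start closes out the previous stage each time it is called; without a final marker the last real stage would never get a duration. A minimal sketch of that timing pattern, illustrative only and not the IngestionStageReport implementation:

    import time

    class StageTimer:
        def __init__(self):
            self._current = None   # (stage name, start time)
            self.durations = {}

        def start_stage(self, name):
            now = time.monotonic()
            if self._current is not None:
                prev_name, started = self._current
                self.durations[prev_name] = now - started  # close the previous stage
            self._current = (name, now)

    timer = StageTimer()
    timer.start_stage("Expired Token Cleanup")
    timer.start_stage("Truncate Indices")
    timer.start_stage("End")  # closes "Truncate Indices", mirroring the change above
    print(timer.durations)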

datahub/ingestion/source/gc/dataprocess_cleanup.py
@@ -167,9 +167,11 @@ class DataJobEntity:
 class DataProcessCleanupReport(SourceReport):
     num_aspects_removed: int = 0
     num_aspect_removed_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
-    sample_removed_aspects_by_type: TopKDict[str, LossyList[str]] = field(
+    sample_soft_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field(
         default_factory=TopKDict
     )
+    num_data_flows_found: int = 0
+    num_data_jobs_found: int = 0
 
 
 class DataProcessCleanup:
@@ -265,13 +267,17 @@ class DataProcessCleanup:
                 self.report.report_failure(
                     f"Exception while deleting DPI: {e}", exc=e
                 )
-            if deleted_count_last_n % self.config.batch_size == 0:
+            if (
+                deleted_count_last_n % self.config.batch_size == 0
+                and deleted_count_last_n > 0
+            ):
                 logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
                 if self.config.delay:
                     logger.info(f"Sleeping for {self.config.delay} seconds")
                     time.sleep(self.config.delay)
 
-        logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
+        if deleted_count_last_n > 0:
+            logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
 
     def delete_entity(self, urn: str, type: str) -> None:
         assert self.ctx.graph
@@ -280,9 +286,9 @@ class DataProcessCleanup:
         self.report.num_aspect_removed_by_type[type] = (
            self.report.num_aspect_removed_by_type.get(type, 0) + 1
         )
-        if type not in self.report.sample_removed_aspects_by_type:
-            self.report.sample_removed_aspects_by_type[type] = LossyList()
-        self.report.sample_removed_aspects_by_type[type].append(urn)
+        if type not in self.report.sample_soft_deleted_aspects_by_type:
+            self.report.sample_soft_deleted_aspects_by_type[type] = LossyList()
+        self.report.sample_soft_deleted_aspects_by_type[type].append(urn)
 
         if self.dry_run:
             logger.info(
@@ -351,7 +357,10 @@ class DataProcessCleanup:
             except Exception as e:
                 self.report.report_failure(f"Exception while deleting DPI: {e}", exc=e)
 
-            if deleted_count_retention % self.config.batch_size == 0:
+            if (
+                deleted_count_retention % self.config.batch_size == 0
+                and deleted_count_retention > 0
+            ):
                 logger.info(
                     f"Deleted {deleted_count_retention} DPIs from {job.urn} due to retention"
                 )
@@ -393,6 +402,7 @@ class DataProcessCleanup:
             scrollAcrossEntities = result.get("scrollAcrossEntities")
             if not scrollAcrossEntities:
                 raise ValueError("Missing scrollAcrossEntities in response")
+            self.report.num_data_flows_found += scrollAcrossEntities.get("count")
             logger.info(f"Got {scrollAcrossEntities.get('count')} DataFlow entities")
 
             scroll_id = scrollAcrossEntities.get("nextScrollId")
@@ -415,8 +425,9 @@ class DataProcessCleanup:
         assert self.ctx.graph
 
         dataFlows: Dict[str, DataFlowEntity] = {}
-        for flow in self.get_data_flows():
-            dataFlows[flow.urn] = flow
+        if self.config.delete_empty_data_flows:
+            for flow in self.get_data_flows():
+                dataFlows[flow.urn] = flow
 
         scroll_id: Optional[str] = None
         previous_scroll_id: Optional[str] = None
@@ -443,6 +454,7 @@ class DataProcessCleanup:
             if not scrollAcrossEntities:
                 raise ValueError("Missing scrollAcrossEntities in response")
 
+            self.report.num_data_jobs_found += scrollAcrossEntities.get("count")
             logger.info(f"Got {scrollAcrossEntities.get('count')} DataJob entities")
 
             scroll_id = scrollAcrossEntities.get("nextScrollId")
@@ -481,7 +493,8 @@ class DataProcessCleanup:
 
             previous_scroll_id = scroll_id
 
-        logger.info(f"Deleted {deleted_jobs} DataJobs")
+        if deleted_jobs > 0:
+            logger.info(f"Deleted {deleted_jobs} DataJobs")
         # Delete empty dataflows if needed
         if self.config.delete_empty_data_flows:
             deleted_data_flows: int = 0
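
The extra "and deleted_count > 0" guards matter because 0 % batch_size == 0, so the old checks logged "Deleted 0 DPIs" (and could sleep) before anything had actually been deleted. A minimal illustration:

    batch_size = 1000
    for deleted_count in (0, 1000, 2000):
        old_check = deleted_count % batch_size == 0                       # True even at 0
        new_check = deleted_count % batch_size == 0 and deleted_count > 0
        print(deleted_count, old_check, new_check)
    # 0 True False
    # 1000 True True
    # 2000 True True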

datahub/ingestion/source/gc/execution_request_cleanup.py
@@ -1,3 +1,4 @@
+import datetime
 import logging
 import time
 from typing import Any, Dict, Iterator, Optional
@@ -42,16 +43,28 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
         description="Global switch for this cleanup task",
     )
 
+    runtime_limit_seconds: int = Field(
+        default=3600,
+        description="Maximum runtime in seconds for the cleanup task",
+    )
+
+    max_read_errors: int = Field(
+        default=10,
+        description="Maximum number of read errors before aborting",
+    )
+
     def keep_history_max_milliseconds(self):
         return self.keep_history_max_days * 24 * 3600 * 1000
 
 
 class DatahubExecutionRequestCleanupReport(SourceReport):
-    execution_request_cleanup_records_read: int = 0
-    execution_request_cleanup_records_preserved: int = 0
-    execution_request_cleanup_records_deleted: int = 0
-    execution_request_cleanup_read_errors: int = 0
-    execution_request_cleanup_delete_errors: int = 0
+    ergc_records_read: int = 0
+    ergc_records_preserved: int = 0
+    ergc_records_deleted: int = 0
+    ergc_read_errors: int = 0
+    ergc_delete_errors: int = 0
+    ergc_start_time: Optional[datetime.datetime] = None
+    ergc_end_time: Optional[datetime.datetime] = None
 
 
 class CleanupRecord(BaseModel):
@@ -124,6 +137,15 @@ class DatahubExecutionRequestCleanup:
         params.update(overrides)
 
         while True:
+            if self._reached_runtime_limit():
+                break
+            if self.report.ergc_read_errors >= self.config.max_read_errors:
+                self.report.failure(
+                    title="Too many read errors, aborting",
+                    message="Too many read errors, aborting",
+                    context=str(self.instance_id),
+                )
+                break
             try:
                 url = f"{self.graph.config.server}/openapi/v2/entity/{DATAHUB_EXECUTION_REQUEST_ENTITY_NAME}"
                 response = self.graph._session.get(url, headers=headers, params=params)
@@ -138,10 +160,13 @@
                     break
                 params["scrollId"] = document["scrollId"]
             except Exception as e:
-                logger.error(
-                    f"ergc({self.instance_id}): failed to fetch next batch of execution requests: {e}"
+                self.report.failure(
+                    title="Failed to fetch next batch of execution requests",
+                    message="Failed to fetch next batch of execution requests",
+                    context=str(self.instance_id),
+                    exc=e,
                 )
-                self.report.execution_request_cleanup_read_errors += 1
+                self.report.ergc_read_errors += 1
 
     def _scroll_garbage_records(self):
         state: Dict[str, Dict] = {}
@@ -150,7 +175,7 @@
         running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
 
         for entry in self._scroll_execution_requests():
-            self.report.execution_request_cleanup_records_read += 1
+            self.report.ergc_records_read += 1
             key = entry.ingestion_source
 
             # Always delete corrupted records
@@ -171,7 +196,7 @@
 
             # Do not delete if number of requests is below minimum
             if state[key]["count"] < self.config.keep_history_min_count:
-                self.report.execution_request_cleanup_records_preserved += 1
+                self.report.ergc_records_preserved += 1
                 continue
 
             # Do not delete if number of requests do not exceed allowed maximum,
@@ -179,7 +204,7 @@
             if (state[key]["count"] < self.config.keep_history_max_count) and (
                 entry.requested_at > state[key]["cutoffTimestamp"]
             ):
-                self.report.execution_request_cleanup_records_preserved += 1
+                self.report.ergc_records_preserved += 1
                 continue
 
             # Do not delete if status is RUNNING or PENDING and created within last month. If the record is >month old and it did not
@@ -188,7 +213,7 @@
                 "RUNNING",
                 "PENDING",
             ]:
-                self.report.execution_request_cleanup_records_preserved += 1
+                self.report.ergc_records_preserved += 1
                 continue
 
             # Otherwise delete current record
@@ -200,7 +225,7 @@
                     f"record timestamp: {entry.requested_at}."
                 )
             )
-            self.report.execution_request_cleanup_records_deleted += 1
+            self.report.ergc_records_deleted += 1
             yield entry
 
     def _delete_entry(self, entry: CleanupRecord) -> None:
@@ -210,10 +235,26 @@
             )
             self.graph.delete_entity(entry.urn, True)
         except Exception as e:
-            self.report.execution_request_cleanup_delete_errors += 1
-            logger.error(
-                f"ergc({self.instance_id}): failed to delete ExecutionRequest {entry.request_id}: {e}"
+            self.report.ergc_delete_errors += 1
+            self.report.failure(
+                title="Failed to delete ExecutionRequest",
+                message="Failed to delete ExecutionRequest",
+                context=str(self.instance_id),
+                exc=e,
+            )
+
+    def _reached_runtime_limit(self) -> bool:
+        if (
+            self.config.runtime_limit_seconds
+            and self.report.ergc_start_time
+            and (
+                datetime.datetime.now() - self.report.ergc_start_time
+                >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
             )
+        ):
+            logger.info(f"ergc({self.instance_id}): max runtime reached.")
+            return True
+        return False
 
     def run(self) -> None:
         if not self.config.enabled:
@@ -221,6 +262,7 @@
                 f"ergc({self.instance_id}): ExecutionRequest cleaner is disabled."
             )
             return
+        self.report.ergc_start_time = datetime.datetime.now()
 
         logger.info(
             (
@@ -232,8 +274,11 @@
         )
 
         for entry in self._scroll_garbage_records():
+            if self._reached_runtime_limit():
+                break
             self._delete_entry(entry)
 
+        self.report.ergc_end_time = datetime.datetime.now()
         logger.info(
             f"ergc({self.instance_id}): Finished cleanup of ExecutionRequest records."
        )
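
A hedged sketch of exercising the two new safety limits on the execution-request cleanup; the field names come from this diff and the import path from the file list above, but neither is re-verified here, and parse_obj assumes a pydantic-v1-style config model:

    from datahub.ingestion.source.gc.execution_request_cleanup import (
        DatahubExecutionRequestCleanupConfig,
    )

    config = DatahubExecutionRequestCleanupConfig.parse_obj(
        {
            "runtime_limit_seconds": 1800,  # stop scrolling/deleting after 30 minutes
            "max_read_errors": 5,           # abort after 5 failed scroll requests
        }
    )
    print(config.runtime_limit_seconds, config.max_read_errors)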