acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub has been flagged as possibly problematic.
Files changed (120)
  1. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2236 -2240
  2. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
  3. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
  5. datahub/__init__.py +1 -1
  6. datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
  7. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  8. datahub/configuration/common.py +2 -5
  9. datahub/configuration/source_common.py +13 -0
  10. datahub/emitter/mce_builder.py +20 -4
  11. datahub/emitter/mcp_builder.py +2 -7
  12. datahub/emitter/mcp_patch_builder.py +37 -13
  13. datahub/emitter/rest_emitter.py +25 -3
  14. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
  15. datahub/ingestion/api/closeable.py +3 -3
  16. datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
  17. datahub/ingestion/api/report.py +4 -1
  18. datahub/ingestion/api/sink.py +4 -3
  19. datahub/ingestion/api/source.py +4 -0
  20. datahub/ingestion/api/source_helpers.py +2 -6
  21. datahub/ingestion/glossary/classifier.py +2 -3
  22. datahub/ingestion/graph/client.py +6 -3
  23. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
  24. datahub/ingestion/source/aws/aws_common.py +231 -27
  25. datahub/ingestion/source/aws/glue.py +12 -2
  26. datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
  27. datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
  28. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
  29. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
  30. datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
  31. datahub/ingestion/source/datahub/config.py +22 -1
  32. datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
  33. datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
  34. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  35. datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
  36. datahub/ingestion/source/gc/datahub_gc.py +21 -5
  37. datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
  38. datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
  39. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
  40. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  41. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  42. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  43. datahub/ingestion/source/kafka_connect/common.py +202 -0
  44. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  45. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  46. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  47. datahub/ingestion/source/looker/looker_common.py +63 -2
  48. datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
  49. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  50. datahub/ingestion/source/looker/looker_source.py +31 -4
  51. datahub/ingestion/source/looker/looker_usage.py +23 -17
  52. datahub/ingestion/source/mlflow.py +30 -5
  53. datahub/ingestion/source/mode.py +40 -27
  54. datahub/ingestion/source/powerbi/config.py +1 -14
  55. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  56. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  57. datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
  58. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
  59. datahub/ingestion/source/s3/source.py +1 -1
  60. datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
  61. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
  62. datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
  63. datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
  64. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
  65. datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
  66. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
  67. datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
  68. datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
  69. datahub/ingestion/source/sql/hive.py +621 -8
  70. datahub/ingestion/source/sql/hive_metastore.py +7 -0
  71. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  72. datahub/ingestion/source/sql/mssql/source.py +15 -1
  73. datahub/ingestion/source/sql/sql_common.py +41 -102
  74. datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
  75. datahub/ingestion/source/sql/sql_report.py +2 -0
  76. datahub/ingestion/source/state/checkpoint.py +2 -1
  77. datahub/ingestion/source/tableau/tableau.py +122 -45
  78. datahub/ingestion/source/tableau/tableau_common.py +18 -0
  79. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  80. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  81. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  82. datahub/ingestion/source/unity/proxy.py +8 -27
  83. datahub/ingestion/source/usage/usage_common.py +15 -1
  84. datahub/ingestion/source_report/ingestion_stage.py +3 -0
  85. datahub/metadata/_schema_classes.py +256 -3
  86. datahub/metadata/_urns/urn_defs.py +168 -168
  87. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
  88. datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
  89. datahub/metadata/schema.avsc +252 -33
  90. datahub/metadata/schemas/DataJobKey.avsc +2 -1
  91. datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
  92. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  93. datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
  94. datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
  95. datahub/metadata/schemas/MLModelProperties.avsc +62 -2
  96. datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
  97. datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
  98. datahub/specific/aspect_helpers/__init__.py +0 -0
  99. datahub/specific/aspect_helpers/custom_properties.py +79 -0
  100. datahub/specific/aspect_helpers/ownership.py +67 -0
  101. datahub/specific/aspect_helpers/structured_properties.py +72 -0
  102. datahub/specific/aspect_helpers/tags.py +42 -0
  103. datahub/specific/aspect_helpers/terms.py +43 -0
  104. datahub/specific/chart.py +28 -184
  105. datahub/specific/dashboard.py +31 -196
  106. datahub/specific/datajob.py +34 -189
  107. datahub/specific/dataproduct.py +24 -86
  108. datahub/specific/dataset.py +48 -133
  109. datahub/specific/form.py +12 -32
  110. datahub/specific/structured_property.py +9 -9
  111. datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
  112. datahub/sql_parsing/sqlglot_lineage.py +15 -5
  113. datahub/sql_parsing/tool_meta_extractor.py +119 -5
  114. datahub/utilities/time.py +8 -3
  115. datahub/utilities/urns/_urn_base.py +5 -7
  116. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  117. datahub/specific/custom_properties.py +0 -37
  118. datahub/specific/ownership.py +0 -48
  119. datahub/specific/structured_properties.py +0 -53
  120. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/datahub/config.py
@@ -1,6 +1,7 @@
 import os
 from typing import Optional, Set
 
+import pydantic
 from pydantic import Field, root_validator
 
 from datahub.configuration.common import AllowDenyPattern
@@ -14,6 +15,17 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 DEFAULT_DATABASE_TABLE_NAME = "metadata_aspect_v2"
 DEFAULT_KAFKA_TOPIC_NAME = "MetadataChangeLog_Timeseries_v1"
 DEFAULT_DATABASE_BATCH_SIZE = 10_000
+DEFAULT_EXCLUDE_ASPECTS = {
+    "dataHubIngestionSourceKey",
+    "dataHubIngestionSourceInfo",
+    "datahubIngestionRunSummary",
+    "datahubIngestionCheckpoint",
+    "dataHubSecretKey",
+    "dataHubSecretValue",
+    "globalSettingsKey",
+    "globalSettingsInfo",
+    "testResults",
+}
 
 
 class DataHubSourceConfig(StatefulIngestionConfigBase):
@@ -44,7 +56,7 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
     )
 
     exclude_aspects: Set[str] = Field(
-        default_factory=set,
+        default=DEFAULT_EXCLUDE_ASPECTS,
         description="Set of aspect names to exclude from ingestion",
     )
 
@@ -108,3 +120,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
             " Please specify at least one of `database_connection` or `kafka_connection`, ideally both."
         )
         return values
+
+    @pydantic.validator("database_connection")
+    def validate_mysql_scheme(
+        cls, v: SQLAlchemyConnectionConfig
+    ) -> SQLAlchemyConnectionConfig:
+        if "mysql" in v.scheme:
+            if v.scheme != "mysql+pymysql":
+                raise ValueError("For MySQL, the scheme must be mysql+pymysql.")
+        return v
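
Two user-visible effects of the config.py changes above: `exclude_aspects` now defaults to `DEFAULT_EXCLUDE_ASPECTS` instead of an empty set, so any override replaces the default rather than extending it, and MySQL connections must use the `mysql+pymysql` scheme. A hedged sketch of a source config dict that reflects both; the connection field names assume the usual SQLAlchemy connection config shape, the host/credential values are placeholders, and the extra aspect name is only an example:

    from datahub.ingestion.source.datahub.config import DEFAULT_EXCLUDE_ASPECTS

    datahub_source_config = {
        "database_connection": {
            # The new validator rejects any MySQL scheme other than mysql+pymysql.
            "scheme": "mysql+pymysql",
            "host_port": "localhost:3306",  # placeholder
            "username": "datahub",          # placeholder
            "password": "datahub",          # placeholder
            "database": "datahub",          # placeholder
        },
        # Overriding exclude_aspects replaces DEFAULT_EXCLUDE_ASPECTS entirely, so start
        # from the default and add to it if you only want to exclude more aspects.
        "exclude_aspects": sorted(DEFAULT_EXCLUDE_ASPECTS | {"dataHubExecutionRequestResult"}),
    }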

datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -151,8 +151,10 @@
         self, query: str, params: Dict[str, Any]
     ) -> Iterable[Dict[str, Any]]:
         with self.engine.connect() as conn:
-            if self.engine.dialect.name == "postgresql":
+            if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
                 with conn.begin():  # Transaction required for PostgreSQL server-side cursor
+                    # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
+                    # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
                     conn = conn.execution_options(
                         stream_results=True,
                         yield_per=self.config.database_query_batch_size,
@@ -160,22 +162,6 @@
                     result = conn.execute(query, params)
                     for row in result:
                         yield dict(row)
-            elif self.engine.dialect.name == "mysql":  # MySQL
-                import MySQLdb
-
-                with contextlib.closing(
-                    conn.connection.cursor(MySQLdb.cursors.SSCursor)
-                ) as cursor:
-                    logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
-                    cursor.execute(query, params)
-
-                    columns = [desc[0] for desc in cursor.description]
-                    while True:
-                        rows = cursor.fetchmany(self.config.database_query_batch_size)
-                        if not rows:
-                            break  # Use break instead of return in generator
-                        for row in rows:
-                            yield dict(zip(columns, row))
             else:
                 raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
 
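The change above drops the hand-rolled MySQLdb server-side cursor in favor of the same SQLAlchemy streaming path for all three dialects: keep a transaction open and set stream_results/yield_per so rows arrive in server-side batches instead of one large fetch. A minimal sketch of that pattern under SQLAlchemy 1.4; the URL, table, and column names below are illustrative, not the reader's actual query:

    from sqlalchemy import create_engine, text

    engine = create_engine("postgresql+psycopg2://user:pass@localhost/datahub")  # placeholder URL

    with engine.connect() as conn:
        with conn.begin():  # the transaction keeps the server-side cursor alive
            conn = conn.execution_options(stream_results=True, yield_per=10_000)
            result = conn.execute(text("SELECT urn, aspect FROM metadata_aspect_v2"))
            for row in result:
                ...  # rows are fetched in batches of yield_per rather than all at once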

datahub/ingestion/source/datahub/datahub_kafka_reader.py
@@ -12,6 +12,7 @@ from confluent_kafka.schema_registry import SchemaRegistryClient
 from confluent_kafka.schema_registry.avro import AvroDeserializer
 
 from datahub.configuration.kafka import KafkaConsumerConnectionConfig
+from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
@@ -92,7 +93,7 @@ class DataHubKafkaReader(Closeable):
             if mcl.created and mcl.created.time > stop_time.timestamp() * 1000:
                 logger.info(
                     f"Stopped reading from kafka, reached MCL "
-                    f"with audit stamp {datetime.fromtimestamp(mcl.created.time / 1000)}"
+                    f"with audit stamp {parse_ts_millis(mcl.created.time)}"
                 )
                 break
 
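The parse_ts_millis swap is about consistent timestamp handling: datetime.fromtimestamp(ms / 1000) yields a naive local-time value, whereas the shared helper presumably returns a timezone-aware UTC datetime (an assumption based on this diff; the helper's body is not shown here). A quick illustration of the difference:

    from datetime import datetime, timezone

    ts_millis = 1_732_000_000_000  # an arbitrary epoch-milliseconds value

    naive_local = datetime.fromtimestamp(ts_millis / 1000)                 # old style: naive, local timezone
    aware_utc = datetime.fromtimestamp(ts_millis / 1000, tz=timezone.utc)  # roughly what a UTC-aware helper would return

    print(naive_local.tzinfo, aware_utc.tzinfo)  # None vs. UTC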

datahub/ingestion/source/datahub/datahub_source.py
@@ -130,7 +130,7 @@
             self._commit_progress(i)
 
     def _get_kafka_workunits(
-        self, from_offsets: Dict[int, int], soft_deleted_urns: List[str] = []
+        self, from_offsets: Dict[int, int], soft_deleted_urns: List[str]
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.kafka_connection is None:
             return
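
Dropping the "= []" default removes a classic Python pitfall: a mutable default is created once and shared across every call (here the parameter simply becomes required instead). A minimal illustration with generic names, not DataHub code:

    def append_shared(item, acc=[]):       # one list object shared by every call
        acc.append(item)
        return acc

    def append_fresh(item, acc=None):      # a new list per call unless one is passed in
        acc = [] if acc is None else acc
        acc.append(item)
        return acc

    print(append_shared(1), append_shared(2))  # [1, 2] [1, 2] -- state leaks between calls
    print(append_fresh(1), append_fresh(2))    # [1] [2]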

datahub/ingestion/source/dbt/dbt_cloud.py
@@ -1,7 +1,7 @@
 import logging
 from datetime import datetime
 from json import JSONDecodeError
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Literal, Optional, Tuple
 from urllib.parse import urlparse
 
 import dateutil.parser
@@ -62,6 +62,11 @@ class DBTCloudConfig(DBTCommonConfig):
         description="The ID of the run to ingest metadata from. If not specified, we'll default to the latest run.",
     )
 
+    external_url_mode: Literal["explore", "ide"] = Field(
+        default="explore",
+        description='Where should the "View in dbt" link point to - either the "Explore" UI or the dbt Cloud IDE',
+    )
+
     @root_validator(pre=True)
     def set_metadata_endpoint(cls, values: dict) -> dict:
         if values.get("access_url") and not values.get("metadata_endpoint"):
@@ -527,5 +532,7 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
         )
 
     def get_external_url(self, node: DBTNode) -> Optional[str]:
-        # TODO: Once dbt Cloud supports deep linking to specific files, we can use that.
-        return f"{self.config.access_url}/develop/{self.config.account_id}/projects/{self.config.project_id}"
+        if self.config.external_url_mode == "explore":
+            return f"{self.config.access_url}/explore/{self.config.account_id}/projects/{self.config.project_id}/environments/production/details/{node.dbt_name}"
+        else:
+            return f"{self.config.access_url}/develop/{self.config.account_id}/projects/{self.config.project_id}"

datahub/ingestion/source/gc/datahub_gc.py
@@ -34,6 +34,7 @@ from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
     SoftDeletedEntitiesCleanupConfig,
     SoftDeletedEntitiesReport,
 )
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 
 logger = logging.getLogger(__name__)
 
@@ -86,6 +87,7 @@ class DataHubGcSourceReport(
     DataProcessCleanupReport,
     SoftDeletedEntitiesReport,
     DatahubExecutionRequestCleanupReport,
+    IngestionStageReport,
 ):
     expired_tokens_revoked: int = 0
 
@@ -139,31 +141,40 @@ class DataHubGcSource(Source):
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.cleanup_expired_tokens:
             try:
+                self.report.report_ingestion_stage_start("Expired Token Cleanup")
                 self.revoke_expired_tokens()
             except Exception as e:
                 self.report.failure("While trying to cleanup expired token ", exc=e)
         if self.config.truncate_indices:
             try:
+                self.report.report_ingestion_stage_start("Truncate Indices")
                 self.truncate_indices()
             except Exception as e:
                 self.report.failure("While trying to truncate indices ", exc=e)
         if self.config.soft_deleted_entities_cleanup.enabled:
             try:
+                self.report.report_ingestion_stage_start(
+                    "Soft Deleted Entities Cleanup"
+                )
                 self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
             except Exception as e:
                 self.report.failure(
                     "While trying to cleanup soft deleted entities ", exc=e
                 )
-        if self.config.execution_request_cleanup.enabled:
-            try:
-                self.execution_request_cleanup.run()
-            except Exception as e:
-                self.report.failure("While trying to cleanup execution request ", exc=e)
         if self.config.dataprocess_cleanup.enabled:
             try:
+                self.report.report_ingestion_stage_start("Data Process Cleanup")
                 yield from self.dataprocess_cleanup.get_workunits_internal()
             except Exception as e:
                 self.report.failure("While trying to cleanup data process ", exc=e)
+        if self.config.execution_request_cleanup.enabled:
+            try:
+                self.report.report_ingestion_stage_start("Execution request Cleanup")
+                self.execution_request_cleanup.run()
+            except Exception as e:
+                self.report.failure("While trying to cleanup execution request ", exc=e)
+        # Otherwise last stage's duration does not get calculated.
+        self.report.report_ingestion_stage_start("End")
         yield from []
 
     def truncate_indices(self) -> None:
@@ -177,6 +188,9 @@ class DataHubGcSource(Source):
         self._truncate_timeseries_helper(
             aspect_name="dashboardUsageStatistics", entity_type="dashboard"
         )
+        self._truncate_timeseries_helper(
+            aspect_name="queryusagestatistics", entity_type="query"
+        )
 
     def _truncate_timeseries_helper(self, aspect_name: str, entity_type: str) -> None:
         self._truncate_timeseries_with_watch_optional(
@@ -281,6 +295,8 @@ class DataHubGcSource(Source):
             list_access_tokens = expired_tokens_res.get("listAccessTokens", {})
             tokens = list_access_tokens.get("tokens", [])
             total = list_access_tokens.get("total", 0)
+            if tokens == []:
+                break
             for token in tokens:
                 self.report.expired_tokens_revoked += 1
                 token_id = token["id"]
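
The explicit trailing "End" stage makes sense if report_ingestion_stage_start closes out the previous stage each time it is called; without a final marker the last real stage would never get a duration. A minimal sketch of that timing pattern, illustrative only and not the IngestionStageReport implementation:

    import time

    class StageTimer:
        def __init__(self):
            self._current = None   # (stage name, start time)
            self.durations = {}

        def start_stage(self, name):
            now = time.monotonic()
            if self._current is not None:
                prev_name, started = self._current
                self.durations[prev_name] = now - started  # close the previous stage
            self._current = (name, now)

    timer = StageTimer()
    timer.start_stage("Expired Token Cleanup")
    timer.start_stage("Truncate Indices")
    timer.start_stage("End")  # closes "Truncate Indices", mirroring the change above
    print(timer.durations)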

datahub/ingestion/source/gc/dataprocess_cleanup.py
@@ -167,9 +167,11 @@ class DataJobEntity:
 class DataProcessCleanupReport(SourceReport):
     num_aspects_removed: int = 0
     num_aspect_removed_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
-    sample_removed_aspects_by_type: TopKDict[str, LossyList[str]] = field(
+    sample_soft_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field(
         default_factory=TopKDict
     )
+    num_data_flows_found: int = 0
+    num_data_jobs_found: int = 0
 
 
 class DataProcessCleanup:
@@ -265,13 +267,17 @@ class DataProcessCleanup:
                 self.report.report_failure(
                     f"Exception while deleting DPI: {e}", exc=e
                 )
-            if deleted_count_last_n % self.config.batch_size == 0:
+            if (
+                deleted_count_last_n % self.config.batch_size == 0
+                and deleted_count_last_n > 0
+            ):
                 logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
                 if self.config.delay:
                     logger.info(f"Sleeping for {self.config.delay} seconds")
                     time.sleep(self.config.delay)
 
-        logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
+        if deleted_count_last_n > 0:
+            logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
 
     def delete_entity(self, urn: str, type: str) -> None:
         assert self.ctx.graph
@@ -280,9 +286,9 @@ class DataProcessCleanup:
         self.report.num_aspect_removed_by_type[type] = (
            self.report.num_aspect_removed_by_type.get(type, 0) + 1
         )
-        if type not in self.report.sample_removed_aspects_by_type:
-            self.report.sample_removed_aspects_by_type[type] = LossyList()
-        self.report.sample_removed_aspects_by_type[type].append(urn)
+        if type not in self.report.sample_soft_deleted_aspects_by_type:
+            self.report.sample_soft_deleted_aspects_by_type[type] = LossyList()
+        self.report.sample_soft_deleted_aspects_by_type[type].append(urn)
 
         if self.dry_run:
             logger.info(
@@ -351,7 +357,10 @@ class DataProcessCleanup:
             except Exception as e:
                 self.report.report_failure(f"Exception while deleting DPI: {e}", exc=e)
 
-            if deleted_count_retention % self.config.batch_size == 0:
+            if (
+                deleted_count_retention % self.config.batch_size == 0
+                and deleted_count_retention > 0
+            ):
                 logger.info(
                     f"Deleted {deleted_count_retention} DPIs from {job.urn} due to retention"
                 )
@@ -393,6 +402,7 @@ class DataProcessCleanup:
             scrollAcrossEntities = result.get("scrollAcrossEntities")
             if not scrollAcrossEntities:
                 raise ValueError("Missing scrollAcrossEntities in response")
+            self.report.num_data_flows_found += scrollAcrossEntities.get("count")
             logger.info(f"Got {scrollAcrossEntities.get('count')} DataFlow entities")
 
             scroll_id = scrollAcrossEntities.get("nextScrollId")
@@ -415,8 +425,9 @@ class DataProcessCleanup:
         assert self.ctx.graph
 
         dataFlows: Dict[str, DataFlowEntity] = {}
-        for flow in self.get_data_flows():
-            dataFlows[flow.urn] = flow
+        if self.config.delete_empty_data_flows:
+            for flow in self.get_data_flows():
+                dataFlows[flow.urn] = flow
 
         scroll_id: Optional[str] = None
         previous_scroll_id: Optional[str] = None
@@ -443,6 +454,7 @@ class DataProcessCleanup:
             if not scrollAcrossEntities:
                 raise ValueError("Missing scrollAcrossEntities in response")
 
+            self.report.num_data_jobs_found += scrollAcrossEntities.get("count")
             logger.info(f"Got {scrollAcrossEntities.get('count')} DataJob entities")
 
             scroll_id = scrollAcrossEntities.get("nextScrollId")
@@ -481,7 +493,8 @@ class DataProcessCleanup:
 
             previous_scroll_id = scroll_id
 
-        logger.info(f"Deleted {deleted_jobs} DataJobs")
+        if deleted_jobs > 0:
+            logger.info(f"Deleted {deleted_jobs} DataJobs")
         # Delete empty dataflows if needed
         if self.config.delete_empty_data_flows:
             deleted_data_flows: int = 0
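
The extra "and deleted_count > 0" guards matter because 0 % batch_size == 0, so the old checks logged "Deleted 0 DPIs" (and could sleep) before anything had actually been deleted. A minimal illustration:

    batch_size = 1000
    for deleted_count in (0, 1000, 2000):
        old_check = deleted_count % batch_size == 0                       # True even at 0
        new_check = deleted_count % batch_size == 0 and deleted_count > 0
        print(deleted_count, old_check, new_check)
    # 0 True False
    # 1000 True True
    # 2000 True True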

datahub/ingestion/source/gc/execution_request_cleanup.py
@@ -1,3 +1,4 @@
+import datetime
 import logging
 import time
 from typing import Any, Dict, Iterator, Optional
@@ -42,16 +43,28 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
         description="Global switch for this cleanup task",
     )
 
+    runtime_limit_seconds: int = Field(
+        default=3600,
+        description="Maximum runtime in seconds for the cleanup task",
+    )
+
+    max_read_errors: int = Field(
+        default=10,
+        description="Maximum number of read errors before aborting",
+    )
+
     def keep_history_max_milliseconds(self):
         return self.keep_history_max_days * 24 * 3600 * 1000
 
 
 class DatahubExecutionRequestCleanupReport(SourceReport):
-    execution_request_cleanup_records_read: int = 0
-    execution_request_cleanup_records_preserved: int = 0
-    execution_request_cleanup_records_deleted: int = 0
-    execution_request_cleanup_read_errors: int = 0
-    execution_request_cleanup_delete_errors: int = 0
+    ergc_records_read: int = 0
+    ergc_records_preserved: int = 0
+    ergc_records_deleted: int = 0
+    ergc_read_errors: int = 0
+    ergc_delete_errors: int = 0
+    ergc_start_time: Optional[datetime.datetime] = None
+    ergc_end_time: Optional[datetime.datetime] = None
 
 
 class CleanupRecord(BaseModel):
@@ -124,6 +137,15 @@ class DatahubExecutionRequestCleanup:
         params.update(overrides)
 
         while True:
+            if self._reached_runtime_limit():
+                break
+            if self.report.ergc_read_errors >= self.config.max_read_errors:
+                self.report.failure(
+                    title="Too many read errors, aborting",
+                    message="Too many read errors, aborting",
+                    context=str(self.instance_id),
+                )
+                break
             try:
                 url = f"{self.graph.config.server}/openapi/v2/entity/{DATAHUB_EXECUTION_REQUEST_ENTITY_NAME}"
                 response = self.graph._session.get(url, headers=headers, params=params)
@@ -138,10 +160,13 @@
                     break
                 params["scrollId"] = document["scrollId"]
             except Exception as e:
-                logger.error(
-                    f"ergc({self.instance_id}): failed to fetch next batch of execution requests: {e}"
+                self.report.failure(
+                    title="Failed to fetch next batch of execution requests",
+                    message="Failed to fetch next batch of execution requests",
+                    context=str(self.instance_id),
+                    exc=e,
                 )
-                self.report.execution_request_cleanup_read_errors += 1
+                self.report.ergc_read_errors += 1
 
     def _scroll_garbage_records(self):
         state: Dict[str, Dict] = {}
@@ -150,7 +175,7 @@
         running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
 
         for entry in self._scroll_execution_requests():
-            self.report.execution_request_cleanup_records_read += 1
+            self.report.ergc_records_read += 1
             key = entry.ingestion_source
 
             # Always delete corrupted records
@@ -171,7 +196,7 @@
 
             # Do not delete if number of requests is below minimum
             if state[key]["count"] < self.config.keep_history_min_count:
-                self.report.execution_request_cleanup_records_preserved += 1
+                self.report.ergc_records_preserved += 1
                 continue
 
             # Do not delete if number of requests do not exceed allowed maximum,
@@ -179,7 +204,7 @@
             if (state[key]["count"] < self.config.keep_history_max_count) and (
                 entry.requested_at > state[key]["cutoffTimestamp"]
             ):
-                self.report.execution_request_cleanup_records_preserved += 1
+                self.report.ergc_records_preserved += 1
                 continue
 
             # Do not delete if status is RUNNING or PENDING and created within last month. If the record is >month old and it did not
@@ -188,7 +213,7 @@
                 "RUNNING",
                 "PENDING",
             ]:
-                self.report.execution_request_cleanup_records_preserved += 1
+                self.report.ergc_records_preserved += 1
                 continue
 
             # Otherwise delete current record
@@ -200,7 +225,7 @@
                     f"record timestamp: {entry.requested_at}."
                 )
             )
-            self.report.execution_request_cleanup_records_deleted += 1
+            self.report.ergc_records_deleted += 1
             yield entry
 
     def _delete_entry(self, entry: CleanupRecord) -> None:
@@ -210,10 +235,26 @@
             )
             self.graph.delete_entity(entry.urn, True)
         except Exception as e:
-            self.report.execution_request_cleanup_delete_errors += 1
-            logger.error(
-                f"ergc({self.instance_id}): failed to delete ExecutionRequest {entry.request_id}: {e}"
+            self.report.ergc_delete_errors += 1
+            self.report.failure(
+                title="Failed to delete ExecutionRequest",
+                message="Failed to delete ExecutionRequest",
+                context=str(self.instance_id),
+                exc=e,
+            )
+
+    def _reached_runtime_limit(self) -> bool:
+        if (
+            self.config.runtime_limit_seconds
+            and self.report.ergc_start_time
+            and (
+                datetime.datetime.now() - self.report.ergc_start_time
+                >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
             )
+        ):
+            logger.info(f"ergc({self.instance_id}): max runtime reached.")
+            return True
+        return False
 
     def run(self) -> None:
         if not self.config.enabled:
@@ -221,6 +262,7 @@
                 f"ergc({self.instance_id}): ExecutionRequest cleaner is disabled."
             )
             return
+        self.report.ergc_start_time = datetime.datetime.now()
 
         logger.info(
             (
@@ -232,8 +274,11 @@
         )
 
         for entry in self._scroll_garbage_records():
+            if self._reached_runtime_limit():
+                break
             self._delete_entry(entry)
 
+        self.report.ergc_end_time = datetime.datetime.now()
         logger.info(
             f"ergc({self.instance_id}): Finished cleanup of ExecutionRequest records."
        )
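
A hedged sketch of exercising the two new safety limits on the execution-request cleanup; the field names come from this diff and the import path from the file list above, but neither is re-verified here, and parse_obj assumes a pydantic-v1-style config model:

    from datahub.ingestion.source.gc.execution_request_cleanup import (
        DatahubExecutionRequestCleanupConfig,
    )

    config = DatahubExecutionRequestCleanupConfig.parse_obj(
        {
            "runtime_limit_seconds": 1800,  # stop scrolling/deleting after 30 minutes
            "max_read_errors": 5,           # abort after 5 failed scroll requests
        }
    )
    print(config.runtime_limit_seconds, config.max_read_errors)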