brawny 0.1.13 → 0.1.22 (py3-none-any.whl)

This diff compares two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (135)
  1. brawny/__init__.py +2 -0
  2. brawny/_context.py +5 -5
  3. brawny/_rpc/__init__.py +36 -12
  4. brawny/_rpc/broadcast.py +14 -13
  5. brawny/_rpc/caller.py +243 -0
  6. brawny/_rpc/client.py +539 -0
  7. brawny/_rpc/clients.py +11 -11
  8. brawny/_rpc/context.py +23 -0
  9. brawny/_rpc/errors.py +465 -31
  10. brawny/_rpc/gas.py +7 -6
  11. brawny/_rpc/pool.py +18 -0
  12. brawny/_rpc/retry.py +266 -0
  13. brawny/_rpc/retry_policy.py +81 -0
  14. brawny/accounts.py +28 -9
  15. brawny/alerts/__init__.py +15 -18
  16. brawny/alerts/abi_resolver.py +212 -36
  17. brawny/alerts/base.py +2 -2
  18. brawny/alerts/contracts.py +77 -10
  19. brawny/alerts/errors.py +30 -3
  20. brawny/alerts/events.py +38 -5
  21. brawny/alerts/health.py +19 -13
  22. brawny/alerts/send.py +513 -55
  23. brawny/api.py +39 -11
  24. brawny/assets/AGENTS.md +325 -0
  25. brawny/async_runtime.py +48 -0
  26. brawny/chain.py +3 -3
  27. brawny/cli/commands/__init__.py +2 -0
  28. brawny/cli/commands/console.py +69 -19
  29. brawny/cli/commands/contract.py +2 -2
  30. brawny/cli/commands/controls.py +121 -0
  31. brawny/cli/commands/health.py +2 -2
  32. brawny/cli/commands/job_dev.py +6 -5
  33. brawny/cli/commands/jobs.py +99 -2
  34. brawny/cli/commands/maintenance.py +13 -29
  35. brawny/cli/commands/migrate.py +1 -0
  36. brawny/cli/commands/run.py +10 -3
  37. brawny/cli/commands/script.py +8 -3
  38. brawny/cli/commands/signer.py +143 -26
  39. brawny/cli/helpers.py +0 -3
  40. brawny/cli_templates.py +25 -349
  41. brawny/config/__init__.py +4 -1
  42. brawny/config/models.py +43 -57
  43. brawny/config/parser.py +268 -57
  44. brawny/config/validation.py +52 -15
  45. brawny/daemon/context.py +4 -2
  46. brawny/daemon/core.py +185 -63
  47. brawny/daemon/loops.py +166 -98
  48. brawny/daemon/supervisor.py +261 -0
  49. brawny/db/__init__.py +14 -26
  50. brawny/db/base.py +248 -151
  51. brawny/db/global_cache.py +11 -1
  52. brawny/db/migrate.py +175 -28
  53. brawny/db/migrations/001_init.sql +4 -3
  54. brawny/db/migrations/010_add_nonce_gap_index.sql +1 -1
  55. brawny/db/migrations/011_add_job_logs.sql +1 -2
  56. brawny/db/migrations/012_add_claimed_by.sql +2 -2
  57. brawny/db/migrations/013_attempt_unique.sql +10 -0
  58. brawny/db/migrations/014_add_lease_expires_at.sql +5 -0
  59. brawny/db/migrations/015_add_signer_alias.sql +14 -0
  60. brawny/db/migrations/016_runtime_controls_and_quarantine.sql +32 -0
  61. brawny/db/migrations/017_add_job_drain.sql +6 -0
  62. brawny/db/migrations/018_add_nonce_reset_audit.sql +20 -0
  63. brawny/db/migrations/019_add_job_cooldowns.sql +8 -0
  64. brawny/db/migrations/020_attempt_unique_initial.sql +7 -0
  65. brawny/db/ops/__init__.py +3 -25
  66. brawny/db/ops/logs.py +1 -2
  67. brawny/db/queries.py +47 -91
  68. brawny/db/serialized.py +65 -0
  69. brawny/db/sqlite/__init__.py +1001 -0
  70. brawny/db/sqlite/connection.py +231 -0
  71. brawny/db/sqlite/execute.py +116 -0
  72. brawny/db/sqlite/mappers.py +190 -0
  73. brawny/db/sqlite/repos/attempts.py +372 -0
  74. brawny/db/sqlite/repos/block_state.py +102 -0
  75. brawny/db/sqlite/repos/cache.py +104 -0
  76. brawny/db/sqlite/repos/intents.py +1021 -0
  77. brawny/db/sqlite/repos/jobs.py +200 -0
  78. brawny/db/sqlite/repos/maintenance.py +182 -0
  79. brawny/db/sqlite/repos/signers_nonces.py +566 -0
  80. brawny/db/sqlite/tx.py +119 -0
  81. brawny/http.py +194 -0
  82. brawny/invariants.py +11 -24
  83. brawny/jobs/base.py +8 -0
  84. brawny/jobs/job_validation.py +2 -1
  85. brawny/keystore.py +83 -7
  86. brawny/lifecycle.py +64 -12
  87. brawny/logging.py +0 -2
  88. brawny/metrics.py +84 -12
  89. brawny/model/contexts.py +111 -9
  90. brawny/model/enums.py +1 -0
  91. brawny/model/errors.py +18 -0
  92. brawny/model/types.py +47 -131
  93. brawny/network_guard.py +133 -0
  94. brawny/networks/__init__.py +5 -5
  95. brawny/networks/config.py +1 -7
  96. brawny/networks/manager.py +14 -11
  97. brawny/runtime_controls.py +74 -0
  98. brawny/scheduler/poller.py +11 -7
  99. brawny/scheduler/reorg.py +95 -39
  100. brawny/scheduler/runner.py +442 -168
  101. brawny/scheduler/shutdown.py +3 -3
  102. brawny/script_tx.py +3 -3
  103. brawny/telegram.py +53 -7
  104. brawny/testing.py +1 -0
  105. brawny/timeout.py +38 -0
  106. brawny/tx/executor.py +922 -308
  107. brawny/tx/intent.py +54 -16
  108. brawny/tx/monitor.py +31 -12
  109. brawny/tx/nonce.py +212 -90
  110. brawny/tx/replacement.py +69 -18
  111. brawny/tx/retry_policy.py +24 -0
  112. brawny/tx/stages/types.py +75 -0
  113. brawny/types.py +18 -0
  114. brawny/utils.py +41 -0
  115. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/METADATA +3 -3
  116. brawny-0.1.22.dist-info/RECORD +163 -0
  117. brawny/_rpc/manager.py +0 -982
  118. brawny/_rpc/selector.py +0 -156
  119. brawny/db/base_new.py +0 -165
  120. brawny/db/mappers.py +0 -182
  121. brawny/db/migrations/008_add_transactions.sql +0 -72
  122. brawny/db/ops/attempts.py +0 -108
  123. brawny/db/ops/blocks.py +0 -83
  124. brawny/db/ops/cache.py +0 -93
  125. brawny/db/ops/intents.py +0 -296
  126. brawny/db/ops/jobs.py +0 -110
  127. brawny/db/ops/nonces.py +0 -322
  128. brawny/db/postgres.py +0 -2535
  129. brawny/db/postgres_new.py +0 -196
  130. brawny/db/sqlite.py +0 -2733
  131. brawny/db/sqlite_new.py +0 -191
  132. brawny-0.1.13.dist-info/RECORD +0 -141
  133. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/WHEEL +0 -0
  134. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/entry_points.txt +0 -0
  135. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/top_level.txt +0 -0
brawny/daemon/loops.py CHANGED
@@ -12,14 +12,15 @@ from typing import TYPE_CHECKING
 
 from brawny.metrics import (
     ACTIVE_WORKERS,
+    BACKGROUND_TASK_ERRORS,
     INTENT_CLAIMED,
     INTENT_RELEASED,
     INTENT_SENDING_STUCK,
     INTENTS_BACKING_OFF,
+    CLAIM_RECLAIM_SKIPPED,
     get_metrics,
 )
 from brawny.model.enums import AttemptStatus, IntentStatus
-from brawny.tx.intent import transition_intent
 
 if TYPE_CHECKING:
     from threading import Thread
@@ -46,36 +47,40 @@ def run_worker(
         state: Daemon state with callbacks
        dry_run: If True, claim and release without executing
     """
-    assert ctx.executor is not None or dry_run, "run_worker requires executor unless dry_run"
+    if ctx.executor is None and not dry_run:
+        raise RuntimeError("run_worker requires executor unless dry_run")
 
     ctx.log.debug("worker.started", worker_id=worker_id)
 
     while not stop_event.is_set():
-        released = ctx.db.release_stale_intent_claims(
-            max_age_seconds=ctx.config.claim_timeout_seconds
-        )
-        if released > 0:
-            ctx.log.info(
-                "worker.stale_claims_released",
-                worker_id=worker_id,
-                released=released,
-            )
-            metrics = get_metrics()
-            metrics.counter(INTENT_RELEASED).inc(
-                released,
-                chain_id=ctx.chain_id,
-                reason="stale_claim",
-            )
-
+        if ctx.controls and ctx.controls.is_active("drain_workers"):
+            ctx.log.warning("runtime.control.drain_workers", worker_id=worker_id)
+            time.sleep(1.0)
+            continue
         claim_token = state.make_claim_token(worker_id)
         claimed_by = state.make_claimed_by(worker_id)
-        intent = ctx.db.claim_next_intent(claim_token, claimed_by=claimed_by)
+        claimed = ctx.db.claim_next_intent(
+            claim_token,
+            claimed_by=claimed_by,
+            lease_seconds=ctx.config.claim_timeout_seconds,
+        )
 
-        if intent is None:
+        if claimed is None:
             wakeup_hint.wait(timeout=1.0)
             wakeup_hint.clear()
             continue
 
+        intent = ctx.db.get_intent(claimed.intent_id)
+        if intent is None:
+            ctx.log.error(
+                "worker.claimed_intent_missing",
+                intent_id=str(claimed.intent_id),
+                claim_token=claimed.claim_token,
+                claimed_by=claimed.claimed_by,
+                worker_id=worker_id,
+            )
+            continue
+
         ctx.log.info(
             "intent.claimed",
             intent_id=str(intent.intent_id),
@@ -91,7 +96,10 @@ def run_worker(
 
         if dry_run:
             ctx.log.info("worker.dry_run", intent_id=str(intent.intent_id))
-            released = ctx.db.release_intent_claim(intent.intent_id)
+            released = ctx.db.release_claim_if_token_and_no_attempts(
+                intent_id=claimed.intent_id,
+                claim_token=claimed.claim_token,
+            )
             if not released:
                 ctx.log.warning(
                     "worker.dry_run_release_failed",
@@ -107,7 +115,7 @@
 
         state.inflight_inc()
         try:
-            outcome = ctx.executor.execute(intent)
+            outcome = ctx.executor.process_claimed_intent(claimed, intent=intent)
             ctx.log.info(
                 "worker.executed",
                 intent_id=str(intent.intent_id),
@@ -128,7 +136,7 @@
                 error=e,
                 job_id=intent.job_id,
                 intent_id=str(intent.intent_id),
-                claim_token=intent.claim_token,
+                claim_token=claimed.claim_token,
                 status=intent.status.value if hasattr(intent.status, "value") else str(intent.status),
                 action="Check logs; intent will retry or timeout",
                 db_dialect=ctx.db.dialect,
@@ -136,53 +144,6 @@
                 health_chat_id=ctx.health_chat_id,
                 cooldown_seconds=ctx.health_cooldown,
             )
-
-            try:
-                attempts = ctx.db.get_attempts_for_intent(intent.intent_id)
-            except Exception as query_err:
-                ctx.log.warning(
-                    "worker.exception_attempts_lookup_failed",
-                    intent_id=str(intent.intent_id),
-                    job_id=intent.job_id,
-                    error=str(query_err)[:200],
-                )
-                attempts = None
-
-            if attempts == []:
-                if not intent.claim_token:
-                    ctx.log.warning(
-                        "worker.claim_token_missing",
-                        intent_id=str(intent.intent_id),
-                        job_id=intent.job_id,
-                    )
-                else:
-                    try:
-                        released = ctx.db.release_intent_claim_if_token(
-                            intent.intent_id,
-                            intent.claim_token,
-                        )
-                        if released:
-                            ctx.log.info(
-                                "worker.claim_released_on_error",
-                                intent_id=str(intent.intent_id),
-                            )
-                            metrics = get_metrics()
-                            metrics.counter(INTENT_RELEASED).inc(
-                                chain_id=ctx.chain_id,
-                                reason="pre_attempt_exception",
-                            )
-                    except Exception:
-                        ctx.log.exception(
-                            "worker.claim_release_failed",
-                            intent_id=str(intent.intent_id),
-                        )
-            else:
-                ctx.log.warning(
-                    "worker.exception_with_attempts",
-                    intent_id=str(intent.intent_id),
-                    attempt_count=(len(attempts) if attempts is not None else None),
-                    hint="Not releasing claim; monitor/replacer should handle",
-                )
         finally:
             state.inflight_dec()
 
@@ -201,9 +162,12 @@ def run_monitor(
         ctx: Daemon context with shared components
         worker_threads: List of worker threads for gauge reporting
     """
-    assert ctx.monitor is not None, "run_monitor requires monitor"
-    assert ctx.replacer is not None, "run_monitor requires replacer"
-    assert ctx.nonce_manager is not None, "run_monitor requires nonce_manager"
+    if ctx.monitor is None:
+        raise RuntimeError("run_monitor requires monitor")
+    if ctx.replacer is None:
+        raise RuntimeError("run_monitor requires replacer")
+    if ctx.nonce_manager is None:
+        raise RuntimeError("run_monitor requires nonce_manager")
 
     ctx.log.debug("monitor.started")
     last_reconcile = time.time()
@@ -211,6 +175,8 @@
     last_worker_gauge = 0.0
     last_sending_recover = 0.0
     last_log_cleanup = 0.0
+    last_claim_reap = 0.0
+    last_lease_reclaim = 0.0
 
     while not stop_event.is_set():
         try:
@@ -244,6 +210,14 @@
                 _recover_stuck_sending(ctx)
                 last_sending_recover = now
 
+            if now - last_lease_reclaim >= 30:
+                _requeue_expired_claims(ctx)
+                last_lease_reclaim = now
+
+            if now - last_claim_reap >= 30:
+                _reap_stale_claims(ctx)
+                last_claim_reap = now
+
             # Job log cleanup (hourly)
             if now - last_log_cleanup >= 3600:
                 try:
@@ -266,6 +240,8 @@
                 last_log_cleanup = now
         except Exception as e:
             ctx.log.error("monitor.error", error=str(e)[:200])
+            metrics = get_metrics()
+            metrics.counter(BACKGROUND_TASK_ERRORS).inc(task="monitor")
             health_alert(
                 component="brawny.tx.monitor",
                 chain_id=ctx.chain_id,
@@ -288,7 +264,8 @@ def _recover_stuck_sending(ctx: "DaemonContext") -> None:
     Args:
         ctx: Daemon context with shared components
     """
-    assert ctx.nonce_manager is not None, "_recover_stuck_sending requires nonce_manager"
+    if ctx.nonce_manager is None:
+        raise RuntimeError("_recover_stuck_sending requires nonce_manager")
 
     stuck_sending = ctx.db.list_sending_intents_older_than(
         max_age_seconds=ctx.config.claim_timeout_seconds,
@@ -296,32 +273,123 @@
     )
     for intent in stuck_sending:
         attempt = ctx.db.get_latest_attempt_for_intent(intent.intent_id)
-        if attempt and attempt.tx_hash:
-            transition_intent(
-                ctx.db,
-                intent.intent_id,
-                IntentStatus.PENDING,
-                "sending_recover",
-                chain_id=ctx.chain_id,
-            )
-        else:
-            if attempt:
-                ctx.db.update_attempt_status(
-                    attempt.attempt_id,
-                    AttemptStatus.FAILED.value,
-                    error_code="sending_stuck",
-                    error_detail="Intent stuck in sending without broadcast",
-                )
-                ctx.nonce_manager.release(intent.signer_address, attempt.nonce)
-            transition_intent(
-                ctx.db,
-                intent.intent_id,
-                IntentStatus.CREATED,
-                "sending_stuck",
-                chain_id=ctx.chain_id,
-            )
+        ctx.db.set_signer_quarantined(
+            ctx.chain_id,
+            intent.signer_address,
+            reason="stuck_sending",
+            source="recover_stuck_sending",
+        )
+        ctx.log.warning(
+            "intent.sending_quarantined",
+            intent_id=str(intent.intent_id),
+            job_id=intent.job_id,
+            attempt_id=str(attempt.attempt_id) if attempt else None,
+        )
         metrics = get_metrics()
         metrics.counter(INTENT_SENDING_STUCK).inc(
             chain_id=ctx.chain_id,
             age_bucket=">claim_timeout",
         )
+
+
+def _requeue_expired_claims(ctx: "DaemonContext") -> None:
+    grace_seconds = 15
+    limit = 50
+    requeued = ctx.db.requeue_expired_claims_no_attempts(
+        limit=limit,
+        grace_seconds=grace_seconds,
+        chain_id=ctx.chain_id,
+    )
+    skipped = ctx.db.count_expired_claims_with_attempts(
+        limit=limit,
+        grace_seconds=grace_seconds,
+        chain_id=ctx.chain_id,
+    )
+    if requeued == 0 and skipped == 0:
+        return
+    if requeued > 0:
+        ctx.log.info(
+            "claim.lease_requeued",
+            count=requeued,
+        )
+        metrics = get_metrics()
+        metrics.counter(INTENT_RELEASED).inc(
+            requeued,
+            chain_id=ctx.chain_id,
+            reason="lease_expired",
+        )
+    if skipped > 0:
+        ctx.log.warning(
+            "claim.lease_requeue_skipped_with_attempts",
+            count=skipped,
+        )
+        metrics = get_metrics()
+        metrics.counter(CLAIM_RECLAIM_SKIPPED).inc(
+            skipped,
+            chain_id=ctx.chain_id,
+        )
+
+    if ctx.config.debug.enable_null_lease_reclaim:
+        cutoff_seconds = 15 * 60
+        requeued_null = ctx.db.requeue_missing_lease_claims_no_attempts(
+            limit=limit,
+            cutoff_seconds=cutoff_seconds,
+            chain_id=ctx.chain_id,
+        )
+        skipped_null = ctx.db.count_missing_lease_claims_with_attempts(
+            limit=limit,
+            cutoff_seconds=cutoff_seconds,
+            chain_id=ctx.chain_id,
+        )
+        if requeued_null > 0:
+            ctx.log.warning(
+                "claim.null_lease_requeued",
+                count=requeued_null,
+            )
+            metrics = get_metrics()
+            metrics.counter(INTENT_RELEASED).inc(
+                requeued_null,
+                chain_id=ctx.chain_id,
+                reason="missing_lease",
+            )
+        if skipped_null > 0:
+            ctx.log.warning(
+                "claim.null_lease_requeue_skipped_with_attempts",
+                count=skipped_null,
+            )
+            metrics = get_metrics()
+            metrics.counter(CLAIM_RECLAIM_SKIPPED).inc(
+                skipped_null,
+                chain_id=ctx.chain_id,
+            )
+
+
+def _reap_stale_claims(ctx: "DaemonContext") -> None:
+    """Reap stale claimed intents with attempts.
+
+    If a broadcast attempt exists, move to PENDING for monitor/reconcile.
+    Otherwise, release claim back to CREATED and mark attempts failed.
+    """
+    if ctx.nonce_manager is None:
+        raise RuntimeError("_reap_stale_claims requires nonce_manager")
+
+    stale = ctx.db.list_claimed_intents_older_than(
+        max_age_seconds=ctx.config.claim_timeout_seconds,
+        chain_id=ctx.chain_id,
+    )
+    if not stale:
+        return
+
+    ctx.log.warning(
+        "claim.reap_detected",
+        count=len(stale),
+        action="containment_only",
+    )
+    ctx.db.set_runtime_control(
+        control="pause_new_intents",
+        active=True,
+        expires_at=datetime.utcnow() + timedelta(seconds=300),
+        reason="stale_claims_detected",
+        actor="reaper",
+        mode="auto",
+    )
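
Note on the loops.py changes above: taken together, they replace the old worker-side stale-claim sweep with lease-based claims. claim_next_intent now attaches a lease (lease_seconds), the monitor requeues an expired lease only when the claim has no broadcast attempts, and ambiguous cases are contained (signer quarantine, a temporary pause_new_intents control) rather than force-released. Below is a toy in-memory sketch of that lease invariant; Claim and InMemoryQueue are hypothetical stand-ins, not brawny's actual ctx.db interface.

    from __future__ import annotations

    import time
    import uuid
    from dataclasses import dataclass, field


    @dataclass
    class Claim:
        intent_id: str
        claim_token: str
        lease_expires_at: float
        has_attempts: bool = False  # set once a broadcast attempt exists


    @dataclass
    class InMemoryQueue:
        pending: list[str] = field(default_factory=list)
        claims: dict[str, Claim] = field(default_factory=dict)

        def claim_next_intent(self, claim_token: str, lease_seconds: float) -> Claim | None:
            """Pop the next intent and attach a lease to the claim."""
            if not self.pending:
                return None
            intent_id = self.pending.pop(0)
            claim = Claim(intent_id, claim_token, time.time() + lease_seconds)
            self.claims[intent_id] = claim
            return claim

        def requeue_expired_claims_no_attempts(self, grace_seconds: float) -> int:
            """Requeue leases that expired and never produced an attempt.

            Claims with attempts are deliberately skipped: a transaction may
            already be in flight, so releasing them risks a double-send. The
            daemon counts those separately and contains instead of releasing.
            """
            cutoff = time.time() - grace_seconds
            requeued = 0
            for intent_id, claim in list(self.claims.items()):
                if claim.lease_expires_at < cutoff and not claim.has_attempts:
                    del self.claims[intent_id]
                    self.pending.append(intent_id)
                    requeued += 1
            return requeued


    queue = InMemoryQueue(pending=["intent-1"])
    claim = queue.claim_next_intent(claim_token=uuid.uuid4().hex, lease_seconds=0.01)
    assert claim is not None and not claim.has_attempts
    time.sleep(0.05)  # let the lease lapse
    print(queue.requeue_expired_claims_no_attempts(grace_seconds=0.0))  # 1: expired, no attempts
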
brawny/daemon/supervisor.py ADDED
@@ -0,0 +1,261 @@
+"""Worker thread supervision with health tracking and failure handling.
+
+Provides fail-fast supervision for daemon worker threads. When a worker fails
+(exception or silent return), the supervisor signals shutdown so the daemon
+can exit cleanly with a non-zero exit code.
+"""
+
+from __future__ import annotations
+
+import threading
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Any, Callable
+
+from brawny.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+class WorkerStatus(Enum):
+    """Status of a supervised worker thread."""
+
+    STARTING = "starting"
+    RUNNING = "running"
+    FAILED = "failed"
+    STOPPED = "stopped"  # Exited without exception (still a failure in daemon)
+
+
+@dataclass
+class WorkerState:
+    """State for a supervised worker thread."""
+
+    name: str
+    target: Callable[[], None]
+    daemon: bool
+    status: WorkerStatus = WorkerStatus.STARTING
+    thread: threading.Thread | None = None
+    started_at: datetime | None = None
+    failed_at: datetime | None = None
+    failure_count: int = 0
+    last_error: str | None = None
+
+
+class WorkerSupervisor:
+    """Supervises worker threads with health tracking and failure handling.
+
+    Responsibilities:
+    - Start workers with exception-catching wrapper
+    - Record status + last error
+    - Signal shutdown on failure (fail-fast mode)
+    - Provide snapshot for health checks
+
+    Does NOT:
+    - Call sys.exit() (daemon decides exit)
+    - Auto-restart workers (V1 - keeps it simple)
+    """
+
+    def __init__(
+        self,
+        *,
+        fail_fast: bool = True,
+        liveness_check_interval: float = 5.0,
+    ) -> None:
+        """Initialize the supervisor.
+
+        Args:
+            fail_fast: If True, trigger shutdown on any worker failure (default for tx systems)
+            liveness_check_interval: How often to check thread liveness (seconds)
+        """
+        self._workers: dict[str, WorkerState] = {}
+        self._lock = threading.Lock()
+        self._shutdown_event = threading.Event()
+        self._fatal_reason: str | None = None
+        self._fail_fast = fail_fast
+        self._liveness_interval = liveness_check_interval
+        self._liveness_thread: threading.Thread | None = None
+
+    def add(
+        self,
+        name: str,
+        target: Callable[[], None],
+        *,
+        daemon: bool = True,
+    ) -> None:
+        """Register a worker to be supervised (does not start it).
+
+        Args:
+            name: Unique name for the worker
+            target: The function to run in the worker thread
+            daemon: Whether the thread should be a daemon thread
+        """
+        with self._lock:
+            if name in self._workers:
+                raise ValueError(f"Worker {name!r} already registered")
+            self._workers[name] = WorkerState(
+                name=name,
+                target=target,
+                daemon=daemon,
+            )
+
+    def start_all(self) -> None:
+        """Start all registered workers and the liveness monitor."""
+        with self._lock:
+            for state in self._workers.values():
+                self._start_worker(state)
+
+        # Start liveness monitor thread
+        self._liveness_thread = threading.Thread(
+            target=self._liveness_monitor,
+            name="supervisor-liveness",
+            daemon=True,
+        )
+        self._liveness_thread.start()
+
+    def _start_worker(self, state: WorkerState) -> None:
+        """Start a single worker thread with supervision wrapper."""
+        name = state.name
+        target = state.target
+
+        def supervised_target() -> None:
+            # Update state to RUNNING
+            with self._lock:
+                state.status = WorkerStatus.RUNNING
+                state.started_at = datetime.now(timezone.utc)
+
+            logger.info("worker.started", worker=name)
+
+            try:
+                target()
+                # If we get here, worker returned normally - that's a bug in a daemon
+                self._handle_worker_exit(name, reason="returned normally (bug)")
+            except Exception as e:
+                self._handle_worker_failure(name, e)
+
+        thread = threading.Thread(
+            target=supervised_target,
+            name=f"worker-{name}",
+            daemon=state.daemon,
+        )
+        state.thread = thread
+        thread.start()
+
+    def _handle_worker_failure(self, name: str, error: Exception) -> None:
+        """Handle worker thread failure (exception)."""
+        # Capture fields under lock, then release before logging
+        with self._lock:
+            worker = self._workers[name]
+            worker.status = WorkerStatus.FAILED
+            worker.failed_at = datetime.now(timezone.utc)
+            worker.failure_count += 1
+            worker.last_error = str(error)
+            failure_count = worker.failure_count
+
+        # Log after releasing lock
+        logger.error(
+            "worker.failed",
+            worker=name,
+            error=str(error),
+            failure_count=failure_count,
+            exc_info=True,
+        )
+
+        self._trigger_shutdown(f"worker {name!r} failed: {error}")
+
+    def _handle_worker_exit(self, name: str, reason: str) -> None:
+        """Handle worker thread exiting (no exception, but still a failure)."""
+        with self._lock:
+            worker = self._workers[name]
+            worker.status = WorkerStatus.STOPPED
+            worker.failed_at = datetime.now(timezone.utc)
+            worker.last_error = reason
+
+        logger.error("worker.exited", worker=name, reason=reason)
+        self._trigger_shutdown(f"worker {name!r} exited: {reason}")
+
+    def _trigger_shutdown(self, reason: str) -> None:
+        """Trigger shutdown with reason.
+
+        When fail_fast=True: Sets shutdown_event, daemon should exit.
+        When fail_fast=False: Does NOT set shutdown_event. Daemon keeps running
+        but all_healthy() returns False. Health checks should use all_healthy()
+        to report degraded status even if process continues.
+        """
+        # Always record the reason (useful for debugging even if not shutting down)
+        with self._lock:
+            if self._fatal_reason is None:
+                self._fatal_reason = reason
+
+        if self._fail_fast:
+            logger.critical("supervisor.shutdown", reason=reason)
+            self._shutdown_event.set()
+        else:
+            # Log but don't trigger shutdown - daemon continues in degraded state
+            logger.error("supervisor.worker_failed_no_shutdown", reason=reason)
+
+    def _liveness_monitor(self) -> None:
+        """Periodically check that all workers are still alive."""
+        while not self._shutdown_event.wait(self._liveness_interval):
+            dead_name: str | None = None
+
+            with self._lock:
+                for name, state in self._workers.items():
+                    if state.status == WorkerStatus.RUNNING:
+                        if state.thread is not None and not state.thread.is_alive():
+                            # Thread died without us catching it (shouldn't happen, but defensive)
+                            state.status = WorkerStatus.STOPPED
+                            state.failed_at = datetime.now(timezone.utc)
+                            state.last_error = "thread died unexpectedly"
+                            dead_name = name  # Capture before releasing lock
+                            break
+
+            # Handle dead worker outside lock
+            if dead_name is not None:
+                logger.error("worker.dead", worker=dead_name)
+                self._trigger_shutdown(f"worker {dead_name!r} died unexpectedly")
+
+    def snapshot(self) -> dict[str, dict[str, Any]]:
+        """Return snapshot of all worker states for health checks."""
+        with self._lock:
+            return {
+                name: {
+                    "status": state.status.value,
+                    "started_at": state.started_at.isoformat() if state.started_at else None,
+                    "failed_at": state.failed_at.isoformat() if state.failed_at else None,
+                    "failure_count": state.failure_count,
+                    "last_error": state.last_error,
+                    "alive": state.thread.is_alive() if state.thread else False,
+                }
+                for name, state in self._workers.items()
+            }
+
+    def all_healthy(self) -> bool:
+        """Check if all workers are healthy (running and alive)."""
+        with self._lock:
+            return all(
+                state.status == WorkerStatus.RUNNING
+                and state.thread is not None
+                and state.thread.is_alive()
+                for state in self._workers.values()
+            )
+
+    def shutdown_requested(self) -> bool:
+        """Check if shutdown has been triggered."""
+        return self._shutdown_event.is_set()
+
+    def fatal_reason(self) -> str | None:
+        """Return the reason for fatal shutdown, if any."""
+        with self._lock:
+            return self._fatal_reason
+
+    def wait_for_shutdown(self, timeout: float | None = None) -> bool:
+        """Wait for shutdown signal. Returns True if shutdown was signaled."""
+        return self._shutdown_event.wait(timeout)
+
+    def request_shutdown(self, reason: str = "requested") -> None:
+        """Request supervisor shutdown (e.g., from signal handler)."""
+        with self._lock:
+            if self._fatal_reason is None:
+                self._fatal_reason = reason
+            self._shutdown_event.set()
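
Since brawny/daemon/supervisor.py is new in this release, a minimal wiring sketch may help. This is a hypothetical usage example based only on the public methods shown above; the daemon's actual startup lives in brawny/daemon/core.py and differs.

    import signal
    import time

    from brawny.daemon.supervisor import WorkerSupervisor


    def worker_loop() -> None:
        # A supervised target is expected to run until shutdown; returning
        # normally is recorded as WorkerStatus.STOPPED and, with fail_fast=True,
        # still triggers a daemon-wide shutdown.
        while True:
            time.sleep(1.0)


    def main() -> int:
        supervisor = WorkerSupervisor(fail_fast=True)
        supervisor.add("worker-0", worker_loop)
        supervisor.add("monitor", worker_loop)
        supervisor.start_all()

        # Route Ctrl-C through the supervisor so shutdown has a recorded reason.
        signal.signal(signal.SIGINT, lambda *_: supervisor.request_shutdown("SIGINT"))

        supervisor.wait_for_shutdown()  # blocks until a worker fails or shutdown is requested
        reason = supervisor.fatal_reason()
        print(f"daemon exiting: {reason}")
        return 0 if reason == "SIGINT" else 1


    if __name__ == "__main__":
        raise SystemExit(main())

With fail_fast=False the process instead keeps running in a degraded state: per the docstrings above, all_healthy() flips to False and health checks are expected to surface that rather than the shutdown event.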