brawny 0.1.13 → 0.1.22 (py3-none-any.whl)

This diff compares two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (135)
  1. brawny/__init__.py +2 -0
  2. brawny/_context.py +5 -5
  3. brawny/_rpc/__init__.py +36 -12
  4. brawny/_rpc/broadcast.py +14 -13
  5. brawny/_rpc/caller.py +243 -0
  6. brawny/_rpc/client.py +539 -0
  7. brawny/_rpc/clients.py +11 -11
  8. brawny/_rpc/context.py +23 -0
  9. brawny/_rpc/errors.py +465 -31
  10. brawny/_rpc/gas.py +7 -6
  11. brawny/_rpc/pool.py +18 -0
  12. brawny/_rpc/retry.py +266 -0
  13. brawny/_rpc/retry_policy.py +81 -0
  14. brawny/accounts.py +28 -9
  15. brawny/alerts/__init__.py +15 -18
  16. brawny/alerts/abi_resolver.py +212 -36
  17. brawny/alerts/base.py +2 -2
  18. brawny/alerts/contracts.py +77 -10
  19. brawny/alerts/errors.py +30 -3
  20. brawny/alerts/events.py +38 -5
  21. brawny/alerts/health.py +19 -13
  22. brawny/alerts/send.py +513 -55
  23. brawny/api.py +39 -11
  24. brawny/assets/AGENTS.md +325 -0
  25. brawny/async_runtime.py +48 -0
  26. brawny/chain.py +3 -3
  27. brawny/cli/commands/__init__.py +2 -0
  28. brawny/cli/commands/console.py +69 -19
  29. brawny/cli/commands/contract.py +2 -2
  30. brawny/cli/commands/controls.py +121 -0
  31. brawny/cli/commands/health.py +2 -2
  32. brawny/cli/commands/job_dev.py +6 -5
  33. brawny/cli/commands/jobs.py +99 -2
  34. brawny/cli/commands/maintenance.py +13 -29
  35. brawny/cli/commands/migrate.py +1 -0
  36. brawny/cli/commands/run.py +10 -3
  37. brawny/cli/commands/script.py +8 -3
  38. brawny/cli/commands/signer.py +143 -26
  39. brawny/cli/helpers.py +0 -3
  40. brawny/cli_templates.py +25 -349
  41. brawny/config/__init__.py +4 -1
  42. brawny/config/models.py +43 -57
  43. brawny/config/parser.py +268 -57
  44. brawny/config/validation.py +52 -15
  45. brawny/daemon/context.py +4 -2
  46. brawny/daemon/core.py +185 -63
  47. brawny/daemon/loops.py +166 -98
  48. brawny/daemon/supervisor.py +261 -0
  49. brawny/db/__init__.py +14 -26
  50. brawny/db/base.py +248 -151
  51. brawny/db/global_cache.py +11 -1
  52. brawny/db/migrate.py +175 -28
  53. brawny/db/migrations/001_init.sql +4 -3
  54. brawny/db/migrations/010_add_nonce_gap_index.sql +1 -1
  55. brawny/db/migrations/011_add_job_logs.sql +1 -2
  56. brawny/db/migrations/012_add_claimed_by.sql +2 -2
  57. brawny/db/migrations/013_attempt_unique.sql +10 -0
  58. brawny/db/migrations/014_add_lease_expires_at.sql +5 -0
  59. brawny/db/migrations/015_add_signer_alias.sql +14 -0
  60. brawny/db/migrations/016_runtime_controls_and_quarantine.sql +32 -0
  61. brawny/db/migrations/017_add_job_drain.sql +6 -0
  62. brawny/db/migrations/018_add_nonce_reset_audit.sql +20 -0
  63. brawny/db/migrations/019_add_job_cooldowns.sql +8 -0
  64. brawny/db/migrations/020_attempt_unique_initial.sql +7 -0
  65. brawny/db/ops/__init__.py +3 -25
  66. brawny/db/ops/logs.py +1 -2
  67. brawny/db/queries.py +47 -91
  68. brawny/db/serialized.py +65 -0
  69. brawny/db/sqlite/__init__.py +1001 -0
  70. brawny/db/sqlite/connection.py +231 -0
  71. brawny/db/sqlite/execute.py +116 -0
  72. brawny/db/sqlite/mappers.py +190 -0
  73. brawny/db/sqlite/repos/attempts.py +372 -0
  74. brawny/db/sqlite/repos/block_state.py +102 -0
  75. brawny/db/sqlite/repos/cache.py +104 -0
  76. brawny/db/sqlite/repos/intents.py +1021 -0
  77. brawny/db/sqlite/repos/jobs.py +200 -0
  78. brawny/db/sqlite/repos/maintenance.py +182 -0
  79. brawny/db/sqlite/repos/signers_nonces.py +566 -0
  80. brawny/db/sqlite/tx.py +119 -0
  81. brawny/http.py +194 -0
  82. brawny/invariants.py +11 -24
  83. brawny/jobs/base.py +8 -0
  84. brawny/jobs/job_validation.py +2 -1
  85. brawny/keystore.py +83 -7
  86. brawny/lifecycle.py +64 -12
  87. brawny/logging.py +0 -2
  88. brawny/metrics.py +84 -12
  89. brawny/model/contexts.py +111 -9
  90. brawny/model/enums.py +1 -0
  91. brawny/model/errors.py +18 -0
  92. brawny/model/types.py +47 -131
  93. brawny/network_guard.py +133 -0
  94. brawny/networks/__init__.py +5 -5
  95. brawny/networks/config.py +1 -7
  96. brawny/networks/manager.py +14 -11
  97. brawny/runtime_controls.py +74 -0
  98. brawny/scheduler/poller.py +11 -7
  99. brawny/scheduler/reorg.py +95 -39
  100. brawny/scheduler/runner.py +442 -168
  101. brawny/scheduler/shutdown.py +3 -3
  102. brawny/script_tx.py +3 -3
  103. brawny/telegram.py +53 -7
  104. brawny/testing.py +1 -0
  105. brawny/timeout.py +38 -0
  106. brawny/tx/executor.py +922 -308
  107. brawny/tx/intent.py +54 -16
  108. brawny/tx/monitor.py +31 -12
  109. brawny/tx/nonce.py +212 -90
  110. brawny/tx/replacement.py +69 -18
  111. brawny/tx/retry_policy.py +24 -0
  112. brawny/tx/stages/types.py +75 -0
  113. brawny/types.py +18 -0
  114. brawny/utils.py +41 -0
  115. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/METADATA +3 -3
  116. brawny-0.1.22.dist-info/RECORD +163 -0
  117. brawny/_rpc/manager.py +0 -982
  118. brawny/_rpc/selector.py +0 -156
  119. brawny/db/base_new.py +0 -165
  120. brawny/db/mappers.py +0 -182
  121. brawny/db/migrations/008_add_transactions.sql +0 -72
  122. brawny/db/ops/attempts.py +0 -108
  123. brawny/db/ops/blocks.py +0 -83
  124. brawny/db/ops/cache.py +0 -93
  125. brawny/db/ops/intents.py +0 -296
  126. brawny/db/ops/jobs.py +0 -110
  127. brawny/db/ops/nonces.py +0 -322
  128. brawny/db/postgres.py +0 -2535
  129. brawny/db/postgres_new.py +0 -196
  130. brawny/db/sqlite.py +0 -2733
  131. brawny/db/sqlite_new.py +0 -191
  132. brawny-0.1.13.dist-info/RECORD +0 -141
  133. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/WHEEL +0 -0
  134. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/entry_points.txt +0 -0
  135. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/top_level.txt +0 -0
brawny/daemon/loops.py CHANGED
@@ -12,14 +12,15 @@ from typing import TYPE_CHECKING
 
 from brawny.metrics import (
     ACTIVE_WORKERS,
+    BACKGROUND_TASK_ERRORS,
     INTENT_CLAIMED,
     INTENT_RELEASED,
     INTENT_SENDING_STUCK,
     INTENTS_BACKING_OFF,
+    CLAIM_RECLAIM_SKIPPED,
     get_metrics,
 )
 from brawny.model.enums import AttemptStatus, IntentStatus
-from brawny.tx.intent import transition_intent
 
 if TYPE_CHECKING:
     from threading import Thread
@@ -46,36 +47,40 @@ def run_worker(
         state: Daemon state with callbacks
        dry_run: If True, claim and release without executing
     """
-    assert ctx.executor is not None or dry_run, "run_worker requires executor unless dry_run"
+    if ctx.executor is None and not dry_run:
+        raise RuntimeError("run_worker requires executor unless dry_run")
 
     ctx.log.debug("worker.started", worker_id=worker_id)
 
     while not stop_event.is_set():
-        released = ctx.db.release_stale_intent_claims(
-            max_age_seconds=ctx.config.claim_timeout_seconds
-        )
-        if released > 0:
-            ctx.log.info(
-                "worker.stale_claims_released",
-                worker_id=worker_id,
-                released=released,
-            )
-            metrics = get_metrics()
-            metrics.counter(INTENT_RELEASED).inc(
-                released,
-                chain_id=ctx.chain_id,
-                reason="stale_claim",
-            )
-
+        if ctx.controls and ctx.controls.is_active("drain_workers"):
+            ctx.log.warning("runtime.control.drain_workers", worker_id=worker_id)
+            time.sleep(1.0)
+            continue
         claim_token = state.make_claim_token(worker_id)
         claimed_by = state.make_claimed_by(worker_id)
-        intent = ctx.db.claim_next_intent(claim_token, claimed_by=claimed_by)
+        claimed = ctx.db.claim_next_intent(
+            claim_token,
+            claimed_by=claimed_by,
+            lease_seconds=ctx.config.claim_timeout_seconds,
+        )
 
-        if intent is None:
+        if claimed is None:
             wakeup_hint.wait(timeout=1.0)
             wakeup_hint.clear()
             continue
 
+        intent = ctx.db.get_intent(claimed.intent_id)
+        if intent is None:
+            ctx.log.error(
+                "worker.claimed_intent_missing",
+                intent_id=str(claimed.intent_id),
+                claim_token=claimed.claim_token,
+                claimed_by=claimed.claimed_by,
+                worker_id=worker_id,
+            )
+            continue
+
         ctx.log.info(
             "intent.claimed",
             intent_id=str(intent.intent_id),
@@ -91,7 +96,10 @@ def run_worker(
 
         if dry_run:
             ctx.log.info("worker.dry_run", intent_id=str(intent.intent_id))
-            released = ctx.db.release_intent_claim(intent.intent_id)
+            released = ctx.db.release_claim_if_token_and_no_attempts(
+                intent_id=claimed.intent_id,
+                claim_token=claimed.claim_token,
+            )
             if not released:
                 ctx.log.warning(
                     "worker.dry_run_release_failed",
@@ -107,7 +115,7 @@
 
         state.inflight_inc()
         try:
-            outcome = ctx.executor.execute(intent)
+            outcome = ctx.executor.process_claimed_intent(claimed, intent=intent)
             ctx.log.info(
                 "worker.executed",
                 intent_id=str(intent.intent_id),
@@ -128,7 +136,7 @@
                 error=e,
                 job_id=intent.job_id,
                 intent_id=str(intent.intent_id),
-                claim_token=intent.claim_token,
+                claim_token=claimed.claim_token,
                 status=intent.status.value if hasattr(intent.status, "value") else str(intent.status),
                 action="Check logs; intent will retry or timeout",
                 db_dialect=ctx.db.dialect,
@@ -136,53 +144,6 @@
                 health_chat_id=ctx.health_chat_id,
                 cooldown_seconds=ctx.health_cooldown,
             )
-
-            try:
-                attempts = ctx.db.get_attempts_for_intent(intent.intent_id)
-            except Exception as query_err:
-                ctx.log.warning(
-                    "worker.exception_attempts_lookup_failed",
-                    intent_id=str(intent.intent_id),
-                    job_id=intent.job_id,
-                    error=str(query_err)[:200],
-                )
-                attempts = None
-
-            if attempts == []:
-                if not intent.claim_token:
-                    ctx.log.warning(
-                        "worker.claim_token_missing",
-                        intent_id=str(intent.intent_id),
-                        job_id=intent.job_id,
-                    )
-                else:
-                    try:
-                        released = ctx.db.release_intent_claim_if_token(
-                            intent.intent_id,
-                            intent.claim_token,
-                        )
-                        if released:
-                            ctx.log.info(
-                                "worker.claim_released_on_error",
-                                intent_id=str(intent.intent_id),
-                            )
-                            metrics = get_metrics()
-                            metrics.counter(INTENT_RELEASED).inc(
-                                chain_id=ctx.chain_id,
-                                reason="pre_attempt_exception",
-                            )
-                    except Exception:
-                        ctx.log.exception(
-                            "worker.claim_release_failed",
-                            intent_id=str(intent.intent_id),
-                        )
-            else:
-                ctx.log.warning(
-                    "worker.exception_with_attempts",
-                    intent_id=str(intent.intent_id),
-                    attempt_count=(len(attempts) if attempts is not None else None),
-                    hint="Not releasing claim; monitor/replacer should handle",
-                )
         finally:
             state.inflight_dec()
 
@@ -201,9 +162,12 @@ def run_monitor(
         ctx: Daemon context with shared components
         worker_threads: List of worker threads for gauge reporting
     """
-    assert ctx.monitor is not None, "run_monitor requires monitor"
-    assert ctx.replacer is not None, "run_monitor requires replacer"
-    assert ctx.nonce_manager is not None, "run_monitor requires nonce_manager"
+    if ctx.monitor is None:
+        raise RuntimeError("run_monitor requires monitor")
+    if ctx.replacer is None:
+        raise RuntimeError("run_monitor requires replacer")
+    if ctx.nonce_manager is None:
+        raise RuntimeError("run_monitor requires nonce_manager")
 
     ctx.log.debug("monitor.started")
     last_reconcile = time.time()
@@ -211,6 +175,8 @@
     last_worker_gauge = 0.0
     last_sending_recover = 0.0
     last_log_cleanup = 0.0
+    last_claim_reap = 0.0
+    last_lease_reclaim = 0.0
 
     while not stop_event.is_set():
         try:
@@ -244,6 +210,14 @@
                 _recover_stuck_sending(ctx)
                 last_sending_recover = now
 
+            if now - last_lease_reclaim >= 30:
+                _requeue_expired_claims(ctx)
+                last_lease_reclaim = now
+
+            if now - last_claim_reap >= 30:
+                _reap_stale_claims(ctx)
+                last_claim_reap = now
+
             # Job log cleanup (hourly)
             if now - last_log_cleanup >= 3600:
                 try:
@@ -266,6 +240,8 @@
                 last_log_cleanup = now
         except Exception as e:
             ctx.log.error("monitor.error", error=str(e)[:200])
+            metrics = get_metrics()
+            metrics.counter(BACKGROUND_TASK_ERRORS).inc(task="monitor")
             health_alert(
                 component="brawny.tx.monitor",
                 chain_id=ctx.chain_id,
@@ -288,7 +264,8 @@ def _recover_stuck_sending(ctx: "DaemonContext") -> None:
     Args:
         ctx: Daemon context with shared components
     """
-    assert ctx.nonce_manager is not None, "_recover_stuck_sending requires nonce_manager"
+    if ctx.nonce_manager is None:
+        raise RuntimeError("_recover_stuck_sending requires nonce_manager")
 
     stuck_sending = ctx.db.list_sending_intents_older_than(
         max_age_seconds=ctx.config.claim_timeout_seconds,
@@ -296,32 +273,123 @@
     )
     for intent in stuck_sending:
         attempt = ctx.db.get_latest_attempt_for_intent(intent.intent_id)
-        if attempt and attempt.tx_hash:
-            transition_intent(
-                ctx.db,
-                intent.intent_id,
-                IntentStatus.PENDING,
-                "sending_recover",
-                chain_id=ctx.chain_id,
-            )
-        else:
-            if attempt:
-                ctx.db.update_attempt_status(
-                    attempt.attempt_id,
-                    AttemptStatus.FAILED.value,
-                    error_code="sending_stuck",
-                    error_detail="Intent stuck in sending without broadcast",
-                )
-                ctx.nonce_manager.release(intent.signer_address, attempt.nonce)
-            transition_intent(
-                ctx.db,
-                intent.intent_id,
-                IntentStatus.CREATED,
-                "sending_stuck",
-                chain_id=ctx.chain_id,
-            )
+        ctx.db.set_signer_quarantined(
+            ctx.chain_id,
+            intent.signer_address,
+            reason="stuck_sending",
+            source="recover_stuck_sending",
+        )
+        ctx.log.warning(
+            "intent.sending_quarantined",
+            intent_id=str(intent.intent_id),
+            job_id=intent.job_id,
+            attempt_id=str(attempt.attempt_id) if attempt else None,
+        )
         metrics = get_metrics()
         metrics.counter(INTENT_SENDING_STUCK).inc(
             chain_id=ctx.chain_id,
             age_bucket=">claim_timeout",
         )
+
+
+def _requeue_expired_claims(ctx: "DaemonContext") -> None:
+    grace_seconds = 15
+    limit = 50
+    requeued = ctx.db.requeue_expired_claims_no_attempts(
+        limit=limit,
+        grace_seconds=grace_seconds,
+        chain_id=ctx.chain_id,
+    )
+    skipped = ctx.db.count_expired_claims_with_attempts(
+        limit=limit,
+        grace_seconds=grace_seconds,
+        chain_id=ctx.chain_id,
+    )
+    if requeued == 0 and skipped == 0:
+        return
+    if requeued > 0:
+        ctx.log.info(
+            "claim.lease_requeued",
+            count=requeued,
+        )
+        metrics = get_metrics()
+        metrics.counter(INTENT_RELEASED).inc(
+            requeued,
+            chain_id=ctx.chain_id,
+            reason="lease_expired",
+        )
+    if skipped > 0:
+        ctx.log.warning(
+            "claim.lease_requeue_skipped_with_attempts",
+            count=skipped,
+        )
+        metrics = get_metrics()
+        metrics.counter(CLAIM_RECLAIM_SKIPPED).inc(
+            skipped,
+            chain_id=ctx.chain_id,
+        )
+
+    if ctx.config.debug.enable_null_lease_reclaim:
+        cutoff_seconds = 15 * 60
+        requeued_null = ctx.db.requeue_missing_lease_claims_no_attempts(
+            limit=limit,
+            cutoff_seconds=cutoff_seconds,
+            chain_id=ctx.chain_id,
+        )
+        skipped_null = ctx.db.count_missing_lease_claims_with_attempts(
+            limit=limit,
+            cutoff_seconds=cutoff_seconds,
+            chain_id=ctx.chain_id,
+        )
+        if requeued_null > 0:
+            ctx.log.warning(
+                "claim.null_lease_requeued",
+                count=requeued_null,
+            )
+            metrics = get_metrics()
+            metrics.counter(INTENT_RELEASED).inc(
+                requeued_null,
+                chain_id=ctx.chain_id,
+                reason="missing_lease",
+            )
+        if skipped_null > 0:
+            ctx.log.warning(
+                "claim.null_lease_requeue_skipped_with_attempts",
+                count=skipped_null,
+            )
+            metrics = get_metrics()
+            metrics.counter(CLAIM_RECLAIM_SKIPPED).inc(
+                skipped_null,
+                chain_id=ctx.chain_id,
+            )
+
+
+def _reap_stale_claims(ctx: "DaemonContext") -> None:
+    """Reap stale claimed intents with attempts.
+
+    If a broadcast attempt exists, move to PENDING for monitor/reconcile.
+    Otherwise, release claim back to CREATED and mark attempts failed.
+    """
+    if ctx.nonce_manager is None:
+        raise RuntimeError("_reap_stale_claims requires nonce_manager")
+
+    stale = ctx.db.list_claimed_intents_older_than(
+        max_age_seconds=ctx.config.claim_timeout_seconds,
+        chain_id=ctx.chain_id,
+    )
+    if not stale:
+        return
+
+    ctx.log.warning(
+        "claim.reap_detected",
+        count=len(stale),
+        action="containment_only",
+    )
+    ctx.db.set_runtime_control(
+        control="pause_new_intents",
+        active=True,
+        expires_at=datetime.utcnow() + timedelta(seconds=300),
+        reason="stale_claims_detected",
+        actor="reaper",
+        mode="auto",
+    )
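
Note on the loops.py changes above: taken together, they replace the old worker-side stale-claim sweep with lease-based claims. claim_next_intent now attaches a lease (lease_seconds), the monitor requeues an expired lease only when the claim has no broadcast attempts, and ambiguous cases are contained (signer quarantine, a temporary pause_new_intents control) rather than force-released. Below is a toy in-memory sketch of that lease invariant; Claim and InMemoryQueue are hypothetical stand-ins, not brawny's actual ctx.db interface.

    from __future__ import annotations

    import time
    import uuid
    from dataclasses import dataclass, field


    @dataclass
    class Claim:
        intent_id: str
        claim_token: str
        lease_expires_at: float
        has_attempts: bool = False  # set once a broadcast attempt exists


    @dataclass
    class InMemoryQueue:
        pending: list[str] = field(default_factory=list)
        claims: dict[str, Claim] = field(default_factory=dict)

        def claim_next_intent(self, claim_token: str, lease_seconds: float) -> Claim | None:
            """Pop the next intent and attach a lease to the claim."""
            if not self.pending:
                return None
            intent_id = self.pending.pop(0)
            claim = Claim(intent_id, claim_token, time.time() + lease_seconds)
            self.claims[intent_id] = claim
            return claim

        def requeue_expired_claims_no_attempts(self, grace_seconds: float) -> int:
            """Requeue leases that expired and never produced an attempt.

            Claims with attempts are deliberately skipped: a transaction may
            already be in flight, so releasing them risks a double-send. The
            daemon counts those separately and contains instead of releasing.
            """
            cutoff = time.time() - grace_seconds
            requeued = 0
            for intent_id, claim in list(self.claims.items()):
                if claim.lease_expires_at < cutoff and not claim.has_attempts:
                    del self.claims[intent_id]
                    self.pending.append(intent_id)
                    requeued += 1
            return requeued


    queue = InMemoryQueue(pending=["intent-1"])
    claim = queue.claim_next_intent(claim_token=uuid.uuid4().hex, lease_seconds=0.01)
    assert claim is not None and not claim.has_attempts
    time.sleep(0.05)  # let the lease lapse
    print(queue.requeue_expired_claims_no_attempts(grace_seconds=0.0))  # 1: expired, no attempts
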
brawny/daemon/supervisor.py ADDED
@@ -0,0 +1,261 @@
+"""Worker thread supervision with health tracking and failure handling.
+
+Provides fail-fast supervision for daemon worker threads. When a worker fails
+(exception or silent return), the supervisor signals shutdown so the daemon
+can exit cleanly with a non-zero exit code.
+"""
+
+from __future__ import annotations
+
+import threading
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Any, Callable
+
+from brawny.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+class WorkerStatus(Enum):
+    """Status of a supervised worker thread."""
+
+    STARTING = "starting"
+    RUNNING = "running"
+    FAILED = "failed"
+    STOPPED = "stopped"  # Exited without exception (still a failure in daemon)
+
+
+@dataclass
+class WorkerState:
+    """State for a supervised worker thread."""
+
+    name: str
+    target: Callable[[], None]
+    daemon: bool
+    status: WorkerStatus = WorkerStatus.STARTING
+    thread: threading.Thread | None = None
+    started_at: datetime | None = None
+    failed_at: datetime | None = None
+    failure_count: int = 0
+    last_error: str | None = None
+
+
+class WorkerSupervisor:
+    """Supervises worker threads with health tracking and failure handling.
+
+    Responsibilities:
+    - Start workers with exception-catching wrapper
+    - Record status + last error
+    - Signal shutdown on failure (fail-fast mode)
+    - Provide snapshot for health checks
+
+    Does NOT:
+    - Call sys.exit() (daemon decides exit)
+    - Auto-restart workers (V1 - keeps it simple)
+    """
+
+    def __init__(
+        self,
+        *,
+        fail_fast: bool = True,
+        liveness_check_interval: float = 5.0,
+    ) -> None:
+        """Initialize the supervisor.
+
+        Args:
+            fail_fast: If True, trigger shutdown on any worker failure (default for tx systems)
+            liveness_check_interval: How often to check thread liveness (seconds)
+        """
+        self._workers: dict[str, WorkerState] = {}
+        self._lock = threading.Lock()
+        self._shutdown_event = threading.Event()
+        self._fatal_reason: str | None = None
+        self._fail_fast = fail_fast
+        self._liveness_interval = liveness_check_interval
+        self._liveness_thread: threading.Thread | None = None
+
+    def add(
+        self,
+        name: str,
+        target: Callable[[], None],
+        *,
+        daemon: bool = True,
+    ) -> None:
+        """Register a worker to be supervised (does not start it).
+
+        Args:
+            name: Unique name for the worker
+            target: The function to run in the worker thread
+            daemon: Whether the thread should be a daemon thread
+        """
+        with self._lock:
+            if name in self._workers:
+                raise ValueError(f"Worker {name!r} already registered")
+            self._workers[name] = WorkerState(
+                name=name,
+                target=target,
+                daemon=daemon,
+            )
+
+    def start_all(self) -> None:
+        """Start all registered workers and the liveness monitor."""
+        with self._lock:
+            for state in self._workers.values():
+                self._start_worker(state)
+
+        # Start liveness monitor thread
+        self._liveness_thread = threading.Thread(
+            target=self._liveness_monitor,
+            name="supervisor-liveness",
+            daemon=True,
+        )
+        self._liveness_thread.start()
+
+    def _start_worker(self, state: WorkerState) -> None:
+        """Start a single worker thread with supervision wrapper."""
+        name = state.name
+        target = state.target
+
+        def supervised_target() -> None:
+            # Update state to RUNNING
+            with self._lock:
+                state.status = WorkerStatus.RUNNING
+                state.started_at = datetime.now(timezone.utc)
+
+            logger.info("worker.started", worker=name)
+
+            try:
+                target()
+                # If we get here, worker returned normally - that's a bug in a daemon
+                self._handle_worker_exit(name, reason="returned normally (bug)")
+            except Exception as e:
+                self._handle_worker_failure(name, e)
+
+        thread = threading.Thread(
+            target=supervised_target,
+            name=f"worker-{name}",
+            daemon=state.daemon,
+        )
+        state.thread = thread
+        thread.start()
+
+    def _handle_worker_failure(self, name: str, error: Exception) -> None:
+        """Handle worker thread failure (exception)."""
+        # Capture fields under lock, then release before logging
+        with self._lock:
+            worker = self._workers[name]
+            worker.status = WorkerStatus.FAILED
+            worker.failed_at = datetime.now(timezone.utc)
+            worker.failure_count += 1
+            worker.last_error = str(error)
+            failure_count = worker.failure_count
+
+        # Log after releasing lock
+        logger.error(
+            "worker.failed",
+            worker=name,
+            error=str(error),
+            failure_count=failure_count,
+            exc_info=True,
+        )
+
+        self._trigger_shutdown(f"worker {name!r} failed: {error}")
+
+    def _handle_worker_exit(self, name: str, reason: str) -> None:
+        """Handle worker thread exiting (no exception, but still a failure)."""
+        with self._lock:
+            worker = self._workers[name]
+            worker.status = WorkerStatus.STOPPED
+            worker.failed_at = datetime.now(timezone.utc)
+            worker.last_error = reason
+
+        logger.error("worker.exited", worker=name, reason=reason)
+        self._trigger_shutdown(f"worker {name!r} exited: {reason}")
+
+    def _trigger_shutdown(self, reason: str) -> None:
+        """Trigger shutdown with reason.
+
+        When fail_fast=True: Sets shutdown_event, daemon should exit.
+        When fail_fast=False: Does NOT set shutdown_event. Daemon keeps running
+        but all_healthy() returns False. Health checks should use all_healthy()
+        to report degraded status even if process continues.
+        """
+        # Always record the reason (useful for debugging even if not shutting down)
+        with self._lock:
+            if self._fatal_reason is None:
+                self._fatal_reason = reason
+
+        if self._fail_fast:
+            logger.critical("supervisor.shutdown", reason=reason)
+            self._shutdown_event.set()
+        else:
+            # Log but don't trigger shutdown - daemon continues in degraded state
+            logger.error("supervisor.worker_failed_no_shutdown", reason=reason)
+
+    def _liveness_monitor(self) -> None:
+        """Periodically check that all workers are still alive."""
+        while not self._shutdown_event.wait(self._liveness_interval):
+            dead_name: str | None = None
+
+            with self._lock:
+                for name, state in self._workers.items():
+                    if state.status == WorkerStatus.RUNNING:
+                        if state.thread is not None and not state.thread.is_alive():
+                            # Thread died without us catching it (shouldn't happen, but defensive)
+                            state.status = WorkerStatus.STOPPED
+                            state.failed_at = datetime.now(timezone.utc)
+                            state.last_error = "thread died unexpectedly"
+                            dead_name = name  # Capture before releasing lock
+                            break
+
+            # Handle dead worker outside lock
+            if dead_name is not None:
+                logger.error("worker.dead", worker=dead_name)
+                self._trigger_shutdown(f"worker {dead_name!r} died unexpectedly")
+
+    def snapshot(self) -> dict[str, dict[str, Any]]:
+        """Return snapshot of all worker states for health checks."""
+        with self._lock:
+            return {
+                name: {
+                    "status": state.status.value,
+                    "started_at": state.started_at.isoformat() if state.started_at else None,
+                    "failed_at": state.failed_at.isoformat() if state.failed_at else None,
+                    "failure_count": state.failure_count,
+                    "last_error": state.last_error,
+                    "alive": state.thread.is_alive() if state.thread else False,
+                }
+                for name, state in self._workers.items()
+            }
+
+    def all_healthy(self) -> bool:
+        """Check if all workers are healthy (running and alive)."""
+        with self._lock:
+            return all(
+                state.status == WorkerStatus.RUNNING
+                and state.thread is not None
+                and state.thread.is_alive()
+                for state in self._workers.values()
+            )
+
+    def shutdown_requested(self) -> bool:
+        """Check if shutdown has been triggered."""
+        return self._shutdown_event.is_set()
+
+    def fatal_reason(self) -> str | None:
+        """Return the reason for fatal shutdown, if any."""
+        with self._lock:
+            return self._fatal_reason
+
+    def wait_for_shutdown(self, timeout: float | None = None) -> bool:
+        """Wait for shutdown signal. Returns True if shutdown was signaled."""
+        return self._shutdown_event.wait(timeout)
+
+    def request_shutdown(self, reason: str = "requested") -> None:
+        """Request supervisor shutdown (e.g., from signal handler)."""
+        with self._lock:
+            if self._fatal_reason is None:
+                self._fatal_reason = reason
+            self._shutdown_event.set()
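
Since brawny/daemon/supervisor.py is new in this release, a minimal wiring sketch may help. This is a hypothetical usage example based only on the public methods shown above; the daemon's actual startup lives in brawny/daemon/core.py and differs.

    import signal
    import time

    from brawny.daemon.supervisor import WorkerSupervisor


    def worker_loop() -> None:
        # A supervised target is expected to run until shutdown; returning
        # normally is recorded as WorkerStatus.STOPPED and, with fail_fast=True,
        # still triggers a daemon-wide shutdown.
        while True:
            time.sleep(1.0)


    def main() -> int:
        supervisor = WorkerSupervisor(fail_fast=True)
        supervisor.add("worker-0", worker_loop)
        supervisor.add("monitor", worker_loop)
        supervisor.start_all()

        # Route Ctrl-C through the supervisor so shutdown has a recorded reason.
        signal.signal(signal.SIGINT, lambda *_: supervisor.request_shutdown("SIGINT"))

        supervisor.wait_for_shutdown()  # blocks until a worker fails or shutdown is requested
        reason = supervisor.fatal_reason()
        print(f"daemon exiting: {reason}")
        return 0 if reason == "SIGINT" else 1


    if __name__ == "__main__":
        raise SystemExit(main())

With fail_fast=False the process instead keeps running in a degraded state: per the docstrings above, all_healthy() flips to False and health checks are expected to surface that rather than the shutdown event.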