brawny 0.1.13__py3-none-any.whl → 0.1.22__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.
- brawny/__init__.py +2 -0
- brawny/_context.py +5 -5
- brawny/_rpc/__init__.py +36 -12
- brawny/_rpc/broadcast.py +14 -13
- brawny/_rpc/caller.py +243 -0
- brawny/_rpc/client.py +539 -0
- brawny/_rpc/clients.py +11 -11
- brawny/_rpc/context.py +23 -0
- brawny/_rpc/errors.py +465 -31
- brawny/_rpc/gas.py +7 -6
- brawny/_rpc/pool.py +18 -0
- brawny/_rpc/retry.py +266 -0
- brawny/_rpc/retry_policy.py +81 -0
- brawny/accounts.py +28 -9
- brawny/alerts/__init__.py +15 -18
- brawny/alerts/abi_resolver.py +212 -36
- brawny/alerts/base.py +2 -2
- brawny/alerts/contracts.py +77 -10
- brawny/alerts/errors.py +30 -3
- brawny/alerts/events.py +38 -5
- brawny/alerts/health.py +19 -13
- brawny/alerts/send.py +513 -55
- brawny/api.py +39 -11
- brawny/assets/AGENTS.md +325 -0
- brawny/async_runtime.py +48 -0
- brawny/chain.py +3 -3
- brawny/cli/commands/__init__.py +2 -0
- brawny/cli/commands/console.py +69 -19
- brawny/cli/commands/contract.py +2 -2
- brawny/cli/commands/controls.py +121 -0
- brawny/cli/commands/health.py +2 -2
- brawny/cli/commands/job_dev.py +6 -5
- brawny/cli/commands/jobs.py +99 -2
- brawny/cli/commands/maintenance.py +13 -29
- brawny/cli/commands/migrate.py +1 -0
- brawny/cli/commands/run.py +10 -3
- brawny/cli/commands/script.py +8 -3
- brawny/cli/commands/signer.py +143 -26
- brawny/cli/helpers.py +0 -3
- brawny/cli_templates.py +25 -349
- brawny/config/__init__.py +4 -1
- brawny/config/models.py +43 -57
- brawny/config/parser.py +268 -57
- brawny/config/validation.py +52 -15
- brawny/daemon/context.py +4 -2
- brawny/daemon/core.py +185 -63
- brawny/daemon/loops.py +166 -98
- brawny/daemon/supervisor.py +261 -0
- brawny/db/__init__.py +14 -26
- brawny/db/base.py +248 -151
- brawny/db/global_cache.py +11 -1
- brawny/db/migrate.py +175 -28
- brawny/db/migrations/001_init.sql +4 -3
- brawny/db/migrations/010_add_nonce_gap_index.sql +1 -1
- brawny/db/migrations/011_add_job_logs.sql +1 -2
- brawny/db/migrations/012_add_claimed_by.sql +2 -2
- brawny/db/migrations/013_attempt_unique.sql +10 -0
- brawny/db/migrations/014_add_lease_expires_at.sql +5 -0
- brawny/db/migrations/015_add_signer_alias.sql +14 -0
- brawny/db/migrations/016_runtime_controls_and_quarantine.sql +32 -0
- brawny/db/migrations/017_add_job_drain.sql +6 -0
- brawny/db/migrations/018_add_nonce_reset_audit.sql +20 -0
- brawny/db/migrations/019_add_job_cooldowns.sql +8 -0
- brawny/db/migrations/020_attempt_unique_initial.sql +7 -0
- brawny/db/ops/__init__.py +3 -25
- brawny/db/ops/logs.py +1 -2
- brawny/db/queries.py +47 -91
- brawny/db/serialized.py +65 -0
- brawny/db/sqlite/__init__.py +1001 -0
- brawny/db/sqlite/connection.py +231 -0
- brawny/db/sqlite/execute.py +116 -0
- brawny/db/sqlite/mappers.py +190 -0
- brawny/db/sqlite/repos/attempts.py +372 -0
- brawny/db/sqlite/repos/block_state.py +102 -0
- brawny/db/sqlite/repos/cache.py +104 -0
- brawny/db/sqlite/repos/intents.py +1021 -0
- brawny/db/sqlite/repos/jobs.py +200 -0
- brawny/db/sqlite/repos/maintenance.py +182 -0
- brawny/db/sqlite/repos/signers_nonces.py +566 -0
- brawny/db/sqlite/tx.py +119 -0
- brawny/http.py +194 -0
- brawny/invariants.py +11 -24
- brawny/jobs/base.py +8 -0
- brawny/jobs/job_validation.py +2 -1
- brawny/keystore.py +83 -7
- brawny/lifecycle.py +64 -12
- brawny/logging.py +0 -2
- brawny/metrics.py +84 -12
- brawny/model/contexts.py +111 -9
- brawny/model/enums.py +1 -0
- brawny/model/errors.py +18 -0
- brawny/model/types.py +47 -131
- brawny/network_guard.py +133 -0
- brawny/networks/__init__.py +5 -5
- brawny/networks/config.py +1 -7
- brawny/networks/manager.py +14 -11
- brawny/runtime_controls.py +74 -0
- brawny/scheduler/poller.py +11 -7
- brawny/scheduler/reorg.py +95 -39
- brawny/scheduler/runner.py +442 -168
- brawny/scheduler/shutdown.py +3 -3
- brawny/script_tx.py +3 -3
- brawny/telegram.py +53 -7
- brawny/testing.py +1 -0
- brawny/timeout.py +38 -0
- brawny/tx/executor.py +922 -308
- brawny/tx/intent.py +54 -16
- brawny/tx/monitor.py +31 -12
- brawny/tx/nonce.py +212 -90
- brawny/tx/replacement.py +69 -18
- brawny/tx/retry_policy.py +24 -0
- brawny/tx/stages/types.py +75 -0
- brawny/types.py +18 -0
- brawny/utils.py +41 -0
- {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/METADATA +3 -3
- brawny-0.1.22.dist-info/RECORD +163 -0
- brawny/_rpc/manager.py +0 -982
- brawny/_rpc/selector.py +0 -156
- brawny/db/base_new.py +0 -165
- brawny/db/mappers.py +0 -182
- brawny/db/migrations/008_add_transactions.sql +0 -72
- brawny/db/ops/attempts.py +0 -108
- brawny/db/ops/blocks.py +0 -83
- brawny/db/ops/cache.py +0 -93
- brawny/db/ops/intents.py +0 -296
- brawny/db/ops/jobs.py +0 -110
- brawny/db/ops/nonces.py +0 -322
- brawny/db/postgres.py +0 -2535
- brawny/db/postgres_new.py +0 -196
- brawny/db/sqlite.py +0 -2733
- brawny/db/sqlite_new.py +0 -191
- brawny-0.1.13.dist-info/RECORD +0 -141
- {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/WHEEL +0 -0
- {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/entry_points.txt +0 -0
- {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/top_level.txt +0 -0
brawny/daemon/loops.py
CHANGED
@@ -12,14 +12,15 @@ from typing import TYPE_CHECKING
 
 from brawny.metrics import (
     ACTIVE_WORKERS,
+    BACKGROUND_TASK_ERRORS,
     INTENT_CLAIMED,
     INTENT_RELEASED,
     INTENT_SENDING_STUCK,
     INTENTS_BACKING_OFF,
+    CLAIM_RECLAIM_SKIPPED,
     get_metrics,
 )
 from brawny.model.enums import AttemptStatus, IntentStatus
-from brawny.tx.intent import transition_intent
 
 if TYPE_CHECKING:
     from threading import Thread
@@ -46,36 +47,40 @@ def run_worker(
         state: Daemon state with callbacks
         dry_run: If True, claim and release without executing
     """
-
+    if ctx.executor is None and not dry_run:
+        raise RuntimeError("run_worker requires executor unless dry_run")
 
     ctx.log.debug("worker.started", worker_id=worker_id)
 
     while not stop_event.is_set():
-
-
-
-
-            ctx.log.info(
-                "worker.stale_claims_released",
-                worker_id=worker_id,
-                released=released,
-            )
-            metrics = get_metrics()
-            metrics.counter(INTENT_RELEASED).inc(
-                released,
-                chain_id=ctx.chain_id,
-                reason="stale_claim",
-            )
-
+        if ctx.controls and ctx.controls.is_active("drain_workers"):
+            ctx.log.warning("runtime.control.drain_workers", worker_id=worker_id)
+            time.sleep(1.0)
+            continue
         claim_token = state.make_claim_token(worker_id)
         claimed_by = state.make_claimed_by(worker_id)
-
+        claimed = ctx.db.claim_next_intent(
+            claim_token,
+            claimed_by=claimed_by,
+            lease_seconds=ctx.config.claim_timeout_seconds,
+        )
 
-        if
+        if claimed is None:
             wakeup_hint.wait(timeout=1.0)
             wakeup_hint.clear()
             continue
 
+        intent = ctx.db.get_intent(claimed.intent_id)
+        if intent is None:
+            ctx.log.error(
+                "worker.claimed_intent_missing",
+                intent_id=str(claimed.intent_id),
+                claim_token=claimed.claim_token,
+                claimed_by=claimed.claimed_by,
+                worker_id=worker_id,
+            )
+            continue
+
         ctx.log.info(
             "intent.claimed",
             intent_id=str(intent.intent_id),
@@ -91,7 +96,10 @@
 
         if dry_run:
             ctx.log.info("worker.dry_run", intent_id=str(intent.intent_id))
-            released = ctx.db.
+            released = ctx.db.release_claim_if_token_and_no_attempts(
+                intent_id=claimed.intent_id,
+                claim_token=claimed.claim_token,
+            )
             if not released:
                 ctx.log.warning(
                     "worker.dry_run_release_failed",
@@ -107,7 +115,7 @@
 
         state.inflight_inc()
         try:
-            outcome = ctx.executor.
+            outcome = ctx.executor.process_claimed_intent(claimed, intent=intent)
             ctx.log.info(
                 "worker.executed",
                 intent_id=str(intent.intent_id),
@@ -128,7 +136,7 @@
                 error=e,
                 job_id=intent.job_id,
                 intent_id=str(intent.intent_id),
-                claim_token=
+                claim_token=claimed.claim_token,
                 status=intent.status.value if hasattr(intent.status, "value") else str(intent.status),
                 action="Check logs; intent will retry or timeout",
                 db_dialect=ctx.db.dialect,
@@ -136,53 +144,6 @@
                 health_chat_id=ctx.health_chat_id,
                 cooldown_seconds=ctx.health_cooldown,
             )
-
-            try:
-                attempts = ctx.db.get_attempts_for_intent(intent.intent_id)
-            except Exception as query_err:
-                ctx.log.warning(
-                    "worker.exception_attempts_lookup_failed",
-                    intent_id=str(intent.intent_id),
-                    job_id=intent.job_id,
-                    error=str(query_err)[:200],
-                )
-                attempts = None
-
-            if attempts == []:
-                if not intent.claim_token:
-                    ctx.log.warning(
-                        "worker.claim_token_missing",
-                        intent_id=str(intent.intent_id),
-                        job_id=intent.job_id,
-                    )
-                else:
-                    try:
-                        released = ctx.db.release_intent_claim_if_token(
-                            intent.intent_id,
-                            intent.claim_token,
-                        )
-                        if released:
-                            ctx.log.info(
-                                "worker.claim_released_on_error",
-                                intent_id=str(intent.intent_id),
-                            )
-                            metrics = get_metrics()
-                            metrics.counter(INTENT_RELEASED).inc(
-                                chain_id=ctx.chain_id,
-                                reason="pre_attempt_exception",
-                            )
-                    except Exception:
-                        ctx.log.exception(
-                            "worker.claim_release_failed",
-                            intent_id=str(intent.intent_id),
-                        )
-            else:
-                ctx.log.warning(
-                    "worker.exception_with_attempts",
-                    intent_id=str(intent.intent_id),
-                    attempt_count=(len(attempts) if attempts is not None else None),
-                    hint="Not releasing claim; monitor/replacer should handle",
-                )
         finally:
             state.inflight_dec()
 
@@ -201,9 +162,12 @@ def run_monitor(
         ctx: Daemon context with shared components
         worker_threads: List of worker threads for gauge reporting
     """
-
-
-
+    if ctx.monitor is None:
+        raise RuntimeError("run_monitor requires monitor")
+    if ctx.replacer is None:
+        raise RuntimeError("run_monitor requires replacer")
+    if ctx.nonce_manager is None:
+        raise RuntimeError("run_monitor requires nonce_manager")
 
     ctx.log.debug("monitor.started")
     last_reconcile = time.time()
@@ -211,6 +175,8 @@
     last_worker_gauge = 0.0
     last_sending_recover = 0.0
     last_log_cleanup = 0.0
+    last_claim_reap = 0.0
+    last_lease_reclaim = 0.0
 
     while not stop_event.is_set():
         try:
@@ -244,6 +210,14 @@
                 _recover_stuck_sending(ctx)
                 last_sending_recover = now
 
+            if now - last_lease_reclaim >= 30:
+                _requeue_expired_claims(ctx)
+                last_lease_reclaim = now
+
+            if now - last_claim_reap >= 30:
+                _reap_stale_claims(ctx)
+                last_claim_reap = now
+
             # Job log cleanup (hourly)
             if now - last_log_cleanup >= 3600:
                 try:
@@ -266,6 +240,8 @@
                 last_log_cleanup = now
         except Exception as e:
             ctx.log.error("monitor.error", error=str(e)[:200])
+            metrics = get_metrics()
+            metrics.counter(BACKGROUND_TASK_ERRORS).inc(task="monitor")
             health_alert(
                 component="brawny.tx.monitor",
                 chain_id=ctx.chain_id,
@@ -288,7 +264,8 @@ def _recover_stuck_sending(ctx: "DaemonContext") -> None:
     Args:
         ctx: Daemon context with shared components
     """
-
+    if ctx.nonce_manager is None:
+        raise RuntimeError("_recover_stuck_sending requires nonce_manager")
 
     stuck_sending = ctx.db.list_sending_intents_older_than(
         max_age_seconds=ctx.config.claim_timeout_seconds,
@@ -296,32 +273,123 @@
     )
     for intent in stuck_sending:
         attempt = ctx.db.get_latest_attempt_for_intent(intent.intent_id)
-
-
-
-
-
-
-
-
-
-
-
-
-                AttemptStatus.FAILED.value,
-                error_code="sending_stuck",
-                error_detail="Intent stuck in sending without broadcast",
-            )
-            ctx.nonce_manager.release(intent.signer_address, attempt.nonce)
-            transition_intent(
-                ctx.db,
-                intent.intent_id,
-                IntentStatus.CREATED,
-                "sending_stuck",
-                chain_id=ctx.chain_id,
-            )
+        ctx.db.set_signer_quarantined(
+            ctx.chain_id,
+            intent.signer_address,
+            reason="stuck_sending",
+            source="recover_stuck_sending",
+        )
+        ctx.log.warning(
+            "intent.sending_quarantined",
+            intent_id=str(intent.intent_id),
+            job_id=intent.job_id,
+            attempt_id=str(attempt.attempt_id) if attempt else None,
+        )
         metrics = get_metrics()
         metrics.counter(INTENT_SENDING_STUCK).inc(
             chain_id=ctx.chain_id,
             age_bucket=">claim_timeout",
         )
+
+
+def _requeue_expired_claims(ctx: "DaemonContext") -> None:
+    grace_seconds = 15
+    limit = 50
+    requeued = ctx.db.requeue_expired_claims_no_attempts(
+        limit=limit,
+        grace_seconds=grace_seconds,
+        chain_id=ctx.chain_id,
+    )
+    skipped = ctx.db.count_expired_claims_with_attempts(
+        limit=limit,
+        grace_seconds=grace_seconds,
+        chain_id=ctx.chain_id,
+    )
+    if requeued == 0 and skipped == 0:
+        return
+    if requeued > 0:
+        ctx.log.info(
+            "claim.lease_requeued",
+            count=requeued,
+        )
+        metrics = get_metrics()
+        metrics.counter(INTENT_RELEASED).inc(
+            requeued,
+            chain_id=ctx.chain_id,
+            reason="lease_expired",
+        )
+    if skipped > 0:
+        ctx.log.warning(
+            "claim.lease_requeue_skipped_with_attempts",
+            count=skipped,
+        )
+        metrics = get_metrics()
+        metrics.counter(CLAIM_RECLAIM_SKIPPED).inc(
+            skipped,
+            chain_id=ctx.chain_id,
+        )
+
+    if ctx.config.debug.enable_null_lease_reclaim:
+        cutoff_seconds = 15 * 60
+        requeued_null = ctx.db.requeue_missing_lease_claims_no_attempts(
+            limit=limit,
+            cutoff_seconds=cutoff_seconds,
+            chain_id=ctx.chain_id,
+        )
+        skipped_null = ctx.db.count_missing_lease_claims_with_attempts(
+            limit=limit,
+            cutoff_seconds=cutoff_seconds,
+            chain_id=ctx.chain_id,
+        )
+        if requeued_null > 0:
+            ctx.log.warning(
+                "claim.null_lease_requeued",
+                count=requeued_null,
+            )
+            metrics = get_metrics()
+            metrics.counter(INTENT_RELEASED).inc(
+                requeued_null,
+                chain_id=ctx.chain_id,
+                reason="missing_lease",
+            )
+        if skipped_null > 0:
+            ctx.log.warning(
+                "claim.null_lease_requeue_skipped_with_attempts",
+                count=skipped_null,
+            )
+            metrics = get_metrics()
+            metrics.counter(CLAIM_RECLAIM_SKIPPED).inc(
+                skipped_null,
+                chain_id=ctx.chain_id,
+            )
+
+
+def _reap_stale_claims(ctx: "DaemonContext") -> None:
+    """Reap stale claimed intents with attempts.
+
+    If a broadcast attempt exists, move to PENDING for monitor/reconcile.
+    Otherwise, release claim back to CREATED and mark attempts failed.
+    """
+    if ctx.nonce_manager is None:
+        raise RuntimeError("_reap_stale_claims requires nonce_manager")
+
+    stale = ctx.db.list_claimed_intents_older_than(
+        max_age_seconds=ctx.config.claim_timeout_seconds,
+        chain_id=ctx.chain_id,
+    )
+    if not stale:
+        return
+
+    ctx.log.warning(
+        "claim.reap_detected",
+        count=len(stale),
+        action="containment_only",
+    )
+    ctx.db.set_runtime_control(
+        control="pause_new_intents",
+        active=True,
+        expires_at=datetime.utcnow() + timedelta(seconds=300),
+        reason="stale_claims_detected",
+        actor="reaper",
+        mode="auto",
+    )
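The worker and monitor changes above replace the old stale-claim sweep with a lease model: claims are taken with a lease_seconds TTL, expired leases with no broadcast attempt are requeued by _requeue_expired_claims, and expired leases that already have attempts are only counted (CLAIM_RECLAIM_SKIPPED) and left to _reap_stale_claims, which applies containment instead of forcing a release. The sketch below illustrates that decision rule over in-memory records; ExpiredClaim and split_expired_claims are hypothetical names used only for illustration, while brawny performs the equivalent split inside the database.

```python
# Illustrative only: the lease-expiry split applied by the monitor loop,
# sketched over in-memory records rather than brawny's SQL queries
# (requeue_expired_claims_no_attempts / count_expired_claims_with_attempts).
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone


@dataclass
class ExpiredClaim:  # hypothetical record type for the sketch
    intent_id: str
    lease_expires_at: datetime
    attempt_count: int


def split_expired_claims(
    claims: list[ExpiredClaim],
    *,
    grace_seconds: int = 15,
    now: datetime | None = None,
) -> tuple[list[str], int]:
    """Return (intent_ids safe to requeue, count skipped because attempts exist)."""
    now = now or datetime.now(timezone.utc)
    cutoff = now - timedelta(seconds=grace_seconds)
    requeue: list[str] = []
    skipped = 0
    for claim in claims:
        if claim.lease_expires_at > cutoff:
            continue  # lease not yet expired past the grace window
        if claim.attempt_count == 0:
            requeue.append(claim.intent_id)  # nothing broadcast: safe to retry
        else:
            skipped += 1  # a tx may be in flight: leave for monitor/reaper
    return requeue, skipped
```

Requeued claims increment INTENT_RELEASED with reason="lease_expired"; skipped ones increment CLAIM_RECLAIM_SKIPPED, matching the metrics emitted in the hunk above.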
brawny/daemon/supervisor.py
ADDED
@@ -0,0 +1,261 @@
+"""Worker thread supervision with health tracking and failure handling.
+
+Provides fail-fast supervision for daemon worker threads. When a worker fails
+(exception or silent return), the supervisor signals shutdown so the daemon
+can exit cleanly with a non-zero exit code.
+"""
+
+from __future__ import annotations
+
+import threading
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Any, Callable
+
+from brawny.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+class WorkerStatus(Enum):
+    """Status of a supervised worker thread."""
+
+    STARTING = "starting"
+    RUNNING = "running"
+    FAILED = "failed"
+    STOPPED = "stopped"  # Exited without exception (still a failure in daemon)
+
+
+@dataclass
+class WorkerState:
+    """State for a supervised worker thread."""
+
+    name: str
+    target: Callable[[], None]
+    daemon: bool
+    status: WorkerStatus = WorkerStatus.STARTING
+    thread: threading.Thread | None = None
+    started_at: datetime | None = None
+    failed_at: datetime | None = None
+    failure_count: int = 0
+    last_error: str | None = None
+
+
+class WorkerSupervisor:
+    """Supervises worker threads with health tracking and failure handling.
+
+    Responsibilities:
+    - Start workers with exception-catching wrapper
+    - Record status + last error
+    - Signal shutdown on failure (fail-fast mode)
+    - Provide snapshot for health checks
+
+    Does NOT:
+    - Call sys.exit() (daemon decides exit)
+    - Auto-restart workers (V1 - keeps it simple)
+    """
+
+    def __init__(
+        self,
+        *,
+        fail_fast: bool = True,
+        liveness_check_interval: float = 5.0,
+    ) -> None:
+        """Initialize the supervisor.
+
+        Args:
+            fail_fast: If True, trigger shutdown on any worker failure (default for tx systems)
+            liveness_check_interval: How often to check thread liveness (seconds)
+        """
+        self._workers: dict[str, WorkerState] = {}
+        self._lock = threading.Lock()
+        self._shutdown_event = threading.Event()
+        self._fatal_reason: str | None = None
+        self._fail_fast = fail_fast
+        self._liveness_interval = liveness_check_interval
+        self._liveness_thread: threading.Thread | None = None
+
+    def add(
+        self,
+        name: str,
+        target: Callable[[], None],
+        *,
+        daemon: bool = True,
+    ) -> None:
+        """Register a worker to be supervised (does not start it).
+
+        Args:
+            name: Unique name for the worker
+            target: The function to run in the worker thread
+            daemon: Whether the thread should be a daemon thread
+        """
+        with self._lock:
+            if name in self._workers:
+                raise ValueError(f"Worker {name!r} already registered")
+            self._workers[name] = WorkerState(
+                name=name,
+                target=target,
+                daemon=daemon,
+            )
+
+    def start_all(self) -> None:
+        """Start all registered workers and the liveness monitor."""
+        with self._lock:
+            for state in self._workers.values():
+                self._start_worker(state)
+
+        # Start liveness monitor thread
+        self._liveness_thread = threading.Thread(
+            target=self._liveness_monitor,
+            name="supervisor-liveness",
+            daemon=True,
+        )
+        self._liveness_thread.start()
+
+    def _start_worker(self, state: WorkerState) -> None:
+        """Start a single worker thread with supervision wrapper."""
+        name = state.name
+        target = state.target
+
+        def supervised_target() -> None:
+            # Update state to RUNNING
+            with self._lock:
+                state.status = WorkerStatus.RUNNING
+                state.started_at = datetime.now(timezone.utc)
+
+            logger.info("worker.started", worker=name)
+
+            try:
+                target()
+                # If we get here, worker returned normally - that's a bug in a daemon
+                self._handle_worker_exit(name, reason="returned normally (bug)")
+            except Exception as e:
+                self._handle_worker_failure(name, e)
+
+        thread = threading.Thread(
+            target=supervised_target,
+            name=f"worker-{name}",
+            daemon=state.daemon,
+        )
+        state.thread = thread
+        thread.start()
+
+    def _handle_worker_failure(self, name: str, error: Exception) -> None:
+        """Handle worker thread failure (exception)."""
+        # Capture fields under lock, then release before logging
+        with self._lock:
+            worker = self._workers[name]
+            worker.status = WorkerStatus.FAILED
+            worker.failed_at = datetime.now(timezone.utc)
+            worker.failure_count += 1
+            worker.last_error = str(error)
+            failure_count = worker.failure_count
+
+        # Log after releasing lock
+        logger.error(
+            "worker.failed",
+            worker=name,
+            error=str(error),
+            failure_count=failure_count,
+            exc_info=True,
+        )
+
+        self._trigger_shutdown(f"worker {name!r} failed: {error}")
+
+    def _handle_worker_exit(self, name: str, reason: str) -> None:
+        """Handle worker thread exiting (no exception, but still a failure)."""
+        with self._lock:
+            worker = self._workers[name]
+            worker.status = WorkerStatus.STOPPED
+            worker.failed_at = datetime.now(timezone.utc)
+            worker.last_error = reason
+
+        logger.error("worker.exited", worker=name, reason=reason)
+        self._trigger_shutdown(f"worker {name!r} exited: {reason}")
+
+    def _trigger_shutdown(self, reason: str) -> None:
+        """Trigger shutdown with reason.
+
+        When fail_fast=True: Sets shutdown_event, daemon should exit.
+        When fail_fast=False: Does NOT set shutdown_event. Daemon keeps running
+        but all_healthy() returns False. Health checks should use all_healthy()
+        to report degraded status even if process continues.
+        """
+        # Always record the reason (useful for debugging even if not shutting down)
+        with self._lock:
+            if self._fatal_reason is None:
+                self._fatal_reason = reason
+
+        if self._fail_fast:
+            logger.critical("supervisor.shutdown", reason=reason)
+            self._shutdown_event.set()
+        else:
+            # Log but don't trigger shutdown - daemon continues in degraded state
+            logger.error("supervisor.worker_failed_no_shutdown", reason=reason)
+
+    def _liveness_monitor(self) -> None:
+        """Periodically check that all workers are still alive."""
+        while not self._shutdown_event.wait(self._liveness_interval):
+            dead_name: str | None = None
+
+            with self._lock:
+                for name, state in self._workers.items():
+                    if state.status == WorkerStatus.RUNNING:
+                        if state.thread is not None and not state.thread.is_alive():
+                            # Thread died without us catching it (shouldn't happen, but defensive)
+                            state.status = WorkerStatus.STOPPED
+                            state.failed_at = datetime.now(timezone.utc)
+                            state.last_error = "thread died unexpectedly"
+                            dead_name = name  # Capture before releasing lock
+                            break
+
+            # Handle dead worker outside lock
+            if dead_name is not None:
+                logger.error("worker.dead", worker=dead_name)
+                self._trigger_shutdown(f"worker {dead_name!r} died unexpectedly")
+
+    def snapshot(self) -> dict[str, dict[str, Any]]:
+        """Return snapshot of all worker states for health checks."""
+        with self._lock:
+            return {
+                name: {
+                    "status": state.status.value,
+                    "started_at": state.started_at.isoformat() if state.started_at else None,
+                    "failed_at": state.failed_at.isoformat() if state.failed_at else None,
+                    "failure_count": state.failure_count,
+                    "last_error": state.last_error,
+                    "alive": state.thread.is_alive() if state.thread else False,
+                }
+                for name, state in self._workers.items()
+            }
+
+    def all_healthy(self) -> bool:
+        """Check if all workers are healthy (running and alive)."""
+        with self._lock:
+            return all(
+                state.status == WorkerStatus.RUNNING
+                and state.thread is not None
+                and state.thread.is_alive()
+                for state in self._workers.values()
+            )
+
+    def shutdown_requested(self) -> bool:
+        """Check if shutdown has been triggered."""
+        return self._shutdown_event.is_set()
+
+    def fatal_reason(self) -> str | None:
+        """Return the reason for fatal shutdown, if any."""
+        with self._lock:
+            return self._fatal_reason
+
+    def wait_for_shutdown(self, timeout: float | None = None) -> bool:
+        """Wait for shutdown signal. Returns True if shutdown was signaled."""
+        return self._shutdown_event.wait(timeout)
+
+    def request_shutdown(self, reason: str = "requested") -> None:
+        """Request supervisor shutdown (e.g., from signal handler)."""
+        with self._lock:
+            if self._fatal_reason is None:
+                self._fatal_reason = reason
+        self._shutdown_event.set()
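The supervisor added above is driven by the daemon core; the sketch below shows how its public API (add, start_all, wait_for_shutdown, fatal_reason, snapshot) fits together. The worker body and the exit handling here are illustrative only, not brawny's actual daemon wiring.

```python
# Minimal usage sketch of WorkerSupervisor with fail_fast=True.
# flaky_worker is a hypothetical worker body; only the supervisor API comes from this release.
import sys
import time

from brawny.daemon.supervisor import WorkerSupervisor


def flaky_worker() -> None:
    """Stand-in worker loop that eventually fails."""
    time.sleep(2.0)
    raise RuntimeError("rpc connection lost")


supervisor = WorkerSupervisor(fail_fast=True)
supervisor.add("flaky", flaky_worker)
supervisor.start_all()

# With fail_fast=True the first worker failure sets the shutdown event.
supervisor.wait_for_shutdown()
print("shutting down:", supervisor.fatal_reason(), file=sys.stderr)
print(supervisor.snapshot()["flaky"]["last_error"])  # -> "rpc connection lost"
sys.exit(1)
```

With fail_fast=False the shutdown event is never set on failure; the daemon keeps running in a degraded state and health checks are expected to poll all_healthy() instead.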