brawny-0.1.13-py3-none-any.whl → brawny-0.1.22-py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
Files changed (135)
  1. brawny/__init__.py +2 -0
  2. brawny/_context.py +5 -5
  3. brawny/_rpc/__init__.py +36 -12
  4. brawny/_rpc/broadcast.py +14 -13
  5. brawny/_rpc/caller.py +243 -0
  6. brawny/_rpc/client.py +539 -0
  7. brawny/_rpc/clients.py +11 -11
  8. brawny/_rpc/context.py +23 -0
  9. brawny/_rpc/errors.py +465 -31
  10. brawny/_rpc/gas.py +7 -6
  11. brawny/_rpc/pool.py +18 -0
  12. brawny/_rpc/retry.py +266 -0
  13. brawny/_rpc/retry_policy.py +81 -0
  14. brawny/accounts.py +28 -9
  15. brawny/alerts/__init__.py +15 -18
  16. brawny/alerts/abi_resolver.py +212 -36
  17. brawny/alerts/base.py +2 -2
  18. brawny/alerts/contracts.py +77 -10
  19. brawny/alerts/errors.py +30 -3
  20. brawny/alerts/events.py +38 -5
  21. brawny/alerts/health.py +19 -13
  22. brawny/alerts/send.py +513 -55
  23. brawny/api.py +39 -11
  24. brawny/assets/AGENTS.md +325 -0
  25. brawny/async_runtime.py +48 -0
  26. brawny/chain.py +3 -3
  27. brawny/cli/commands/__init__.py +2 -0
  28. brawny/cli/commands/console.py +69 -19
  29. brawny/cli/commands/contract.py +2 -2
  30. brawny/cli/commands/controls.py +121 -0
  31. brawny/cli/commands/health.py +2 -2
  32. brawny/cli/commands/job_dev.py +6 -5
  33. brawny/cli/commands/jobs.py +99 -2
  34. brawny/cli/commands/maintenance.py +13 -29
  35. brawny/cli/commands/migrate.py +1 -0
  36. brawny/cli/commands/run.py +10 -3
  37. brawny/cli/commands/script.py +8 -3
  38. brawny/cli/commands/signer.py +143 -26
  39. brawny/cli/helpers.py +0 -3
  40. brawny/cli_templates.py +25 -349
  41. brawny/config/__init__.py +4 -1
  42. brawny/config/models.py +43 -57
  43. brawny/config/parser.py +268 -57
  44. brawny/config/validation.py +52 -15
  45. brawny/daemon/context.py +4 -2
  46. brawny/daemon/core.py +185 -63
  47. brawny/daemon/loops.py +166 -98
  48. brawny/daemon/supervisor.py +261 -0
  49. brawny/db/__init__.py +14 -26
  50. brawny/db/base.py +248 -151
  51. brawny/db/global_cache.py +11 -1
  52. brawny/db/migrate.py +175 -28
  53. brawny/db/migrations/001_init.sql +4 -3
  54. brawny/db/migrations/010_add_nonce_gap_index.sql +1 -1
  55. brawny/db/migrations/011_add_job_logs.sql +1 -2
  56. brawny/db/migrations/012_add_claimed_by.sql +2 -2
  57. brawny/db/migrations/013_attempt_unique.sql +10 -0
  58. brawny/db/migrations/014_add_lease_expires_at.sql +5 -0
  59. brawny/db/migrations/015_add_signer_alias.sql +14 -0
  60. brawny/db/migrations/016_runtime_controls_and_quarantine.sql +32 -0
  61. brawny/db/migrations/017_add_job_drain.sql +6 -0
  62. brawny/db/migrations/018_add_nonce_reset_audit.sql +20 -0
  63. brawny/db/migrations/019_add_job_cooldowns.sql +8 -0
  64. brawny/db/migrations/020_attempt_unique_initial.sql +7 -0
  65. brawny/db/ops/__init__.py +3 -25
  66. brawny/db/ops/logs.py +1 -2
  67. brawny/db/queries.py +47 -91
  68. brawny/db/serialized.py +65 -0
  69. brawny/db/sqlite/__init__.py +1001 -0
  70. brawny/db/sqlite/connection.py +231 -0
  71. brawny/db/sqlite/execute.py +116 -0
  72. brawny/db/sqlite/mappers.py +190 -0
  73. brawny/db/sqlite/repos/attempts.py +372 -0
  74. brawny/db/sqlite/repos/block_state.py +102 -0
  75. brawny/db/sqlite/repos/cache.py +104 -0
  76. brawny/db/sqlite/repos/intents.py +1021 -0
  77. brawny/db/sqlite/repos/jobs.py +200 -0
  78. brawny/db/sqlite/repos/maintenance.py +182 -0
  79. brawny/db/sqlite/repos/signers_nonces.py +566 -0
  80. brawny/db/sqlite/tx.py +119 -0
  81. brawny/http.py +194 -0
  82. brawny/invariants.py +11 -24
  83. brawny/jobs/base.py +8 -0
  84. brawny/jobs/job_validation.py +2 -1
  85. brawny/keystore.py +83 -7
  86. brawny/lifecycle.py +64 -12
  87. brawny/logging.py +0 -2
  88. brawny/metrics.py +84 -12
  89. brawny/model/contexts.py +111 -9
  90. brawny/model/enums.py +1 -0
  91. brawny/model/errors.py +18 -0
  92. brawny/model/types.py +47 -131
  93. brawny/network_guard.py +133 -0
  94. brawny/networks/__init__.py +5 -5
  95. brawny/networks/config.py +1 -7
  96. brawny/networks/manager.py +14 -11
  97. brawny/runtime_controls.py +74 -0
  98. brawny/scheduler/poller.py +11 -7
  99. brawny/scheduler/reorg.py +95 -39
  100. brawny/scheduler/runner.py +442 -168
  101. brawny/scheduler/shutdown.py +3 -3
  102. brawny/script_tx.py +3 -3
  103. brawny/telegram.py +53 -7
  104. brawny/testing.py +1 -0
  105. brawny/timeout.py +38 -0
  106. brawny/tx/executor.py +922 -308
  107. brawny/tx/intent.py +54 -16
  108. brawny/tx/monitor.py +31 -12
  109. brawny/tx/nonce.py +212 -90
  110. brawny/tx/replacement.py +69 -18
  111. brawny/tx/retry_policy.py +24 -0
  112. brawny/tx/stages/types.py +75 -0
  113. brawny/types.py +18 -0
  114. brawny/utils.py +41 -0
  115. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/METADATA +3 -3
  116. brawny-0.1.22.dist-info/RECORD +163 -0
  117. brawny/_rpc/manager.py +0 -982
  118. brawny/_rpc/selector.py +0 -156
  119. brawny/db/base_new.py +0 -165
  120. brawny/db/mappers.py +0 -182
  121. brawny/db/migrations/008_add_transactions.sql +0 -72
  122. brawny/db/ops/attempts.py +0 -108
  123. brawny/db/ops/blocks.py +0 -83
  124. brawny/db/ops/cache.py +0 -93
  125. brawny/db/ops/intents.py +0 -296
  126. brawny/db/ops/jobs.py +0 -110
  127. brawny/db/ops/nonces.py +0 -322
  128. brawny/db/postgres.py +0 -2535
  129. brawny/db/postgres_new.py +0 -196
  130. brawny/db/sqlite.py +0 -2733
  131. brawny/db/sqlite_new.py +0 -191
  132. brawny-0.1.13.dist-info/RECORD +0 -141
  133. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/WHEEL +0 -0
  134. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/entry_points.txt +0 -0
  135. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/top_level.txt +0 -0
@@ -81,6 +81,16 @@ def dedupe_preserve_order(endpoints: list[str]) -> list[str]:
81
81
  return result
82
82
 
83
83
 
84
+ def canonicalize_endpoints(endpoints: list[str]) -> list[str]:
85
+ """Canonicalize endpoint list for stable comparison.
86
+
87
+ Applies canonicalize_endpoint to each entry, then sorts for deterministic
88
+ equality checks.
89
+ """
90
+ canonical = [canonicalize_endpoint(ep) for ep in endpoints]
91
+ return sorted(set(canonical))
92
+
93
+
84
94
  REMOVED_FIELDS = {
85
95
  "alerts_dx_enabled",
86
96
  "allowed_signers",
@@ -151,20 +161,39 @@ def validate_config(config: "Config") -> None:
151
161
  # Required fields
152
162
  if not config.database_url:
153
163
  errors.append("database_url is required")
154
- elif not (
155
- config.database_url.startswith("postgresql://")
156
- or config.database_url.startswith("postgres://")
157
- or config.database_url.startswith("sqlite:///")
158
- ):
159
- errors.append(
160
- "database_url must start with postgresql://, postgres://, or sqlite:///"
161
- )
162
- elif config.database_url.startswith("sqlite:///") and config.worker_count > 1:
163
- errors.append("SQLite does not support worker_count > 1. Use Postgres for production.")
164
+ elif not config.database_url.startswith("sqlite:///"):
165
+ errors.append("database_url must start with sqlite:///")
166
+ elif config.worker_count > 1:
167
+ errors.append("SQLite does not support worker_count > 1.")
164
168
 
165
169
  if not config.rpc_groups:
166
170
  errors.append("rpc_groups is required (at least one group)")
167
171
 
172
+ if config.guardrails:
173
+ lint_paths = config.guardrails.lint_paths
174
+ if not isinstance(lint_paths, list):
175
+ errors.append("guardrails.lint_paths must be a list")
176
+ else:
177
+ for idx, value in enumerate(lint_paths):
178
+ if not isinstance(value, str) or not value.strip():
179
+ errors.append(f"guardrails.lint_paths[{idx}] must be a non-empty string")
180
+
181
+ if config.debug and not isinstance(config.debug.allow_console, bool):
182
+ errors.append("debug.allow_console must be a boolean")
183
+
184
+ if config.intent_cooldown:
185
+ cd = config.intent_cooldown
186
+ if not isinstance(cd.enabled, bool):
187
+ errors.append("intent_cooldown.enabled must be a boolean")
188
+ if cd.default_seconds < 0:
189
+ errors.append("intent_cooldown.default_seconds cannot be negative")
190
+ if cd.max_seconds < 0:
191
+ errors.append("intent_cooldown.max_seconds cannot be negative")
192
+ if cd.max_seconds < cd.default_seconds:
193
+ errors.append("intent_cooldown.max_seconds must be >= default_seconds")
194
+ if cd.prune_older_than_days < 0:
195
+ errors.append("intent_cooldown.prune_older_than_days cannot be negative")
196
+
168
197
  if config.chain_id <= 0:
169
198
  errors.append("chain_id must be positive")
170
199
 
@@ -192,6 +221,19 @@ def validate_config(config: "Config") -> None:
192
221
  if config.keystore_type == KeystoreType.FILE and not config.keystore_path:
193
222
  errors.append("keystore_path is required when keystore_type is 'file'")
194
223
 
224
+ # HTTP allowlist validation
225
+ for domain in config.http.allowed_domains:
226
+ if "://" in domain:
227
+ errors.append(f"http.allowed_domains entries must be hostnames, got: {domain}")
228
+ if "/" in domain:
229
+ errors.append(f"http.allowed_domains entries must not include paths: {domain}")
230
+ if config.http.connect_timeout_seconds <= 0:
231
+ errors.append("http.connect_timeout_seconds must be positive")
232
+ if config.http.read_timeout_seconds <= 0:
233
+ errors.append("http.read_timeout_seconds must be positive")
234
+ if config.http.max_retries < 0:
235
+ errors.append("http.max_retries cannot be negative")
236
+
195
237
  if errors:
196
238
  raise ConfigError(
197
239
  "Configuration validation failed:\n" + "\n".join(f" - {e}" for e in errors)
@@ -234,11 +276,6 @@ def validate_advanced_config(advanced: "AdvancedConfig") -> None:
234
276
  if advanced.rpc_max_retries < 0:
235
277
  errors.append("rpc_max_retries cannot be negative")
236
278
 
237
- if advanced.database_pool_size <= 0:
238
- errors.append("database_pool_size must be positive")
239
- if advanced.database_pool_max_overflow < 0:
240
- errors.append("database_pool_max_overflow cannot be negative")
241
-
242
279
  if errors:
243
280
  raise ConfigError(
244
281
  "Advanced configuration validation failed:\n"
brawny/daemon/context.py CHANGED
@@ -12,11 +12,12 @@ from typing import TYPE_CHECKING, Callable
12
12
  if TYPE_CHECKING:
13
13
  from brawny.config import Config
14
14
  from brawny.db.base import Database
15
- from brawny._rpc.manager import RPCManager
15
+ from brawny._rpc.clients import ReadClient
16
16
  from brawny.tx.executor import TxExecutor
17
17
  from brawny.tx.monitor import TxMonitor
18
18
  from brawny.tx.replacement import TxReplacer
19
19
  from brawny.tx.nonce import NonceManager
20
+ from brawny.runtime_controls import RuntimeControls
20
21
 
21
22
 
22
23
  @dataclass
@@ -29,12 +30,13 @@ class DaemonContext:
29
30
  config: "Config"
30
31
  log: Logger
31
32
  db: "Database"
32
- rpc: "RPCManager"
33
+ rpc: "ReadClient"
33
34
  executor: "TxExecutor | None"
34
35
  monitor: "TxMonitor | None"
35
36
  replacer: "TxReplacer | None"
36
37
  nonce_manager: "NonceManager | None"
37
38
  chain_id: int
39
+ controls: "RuntimeControls | None" = None
38
40
 
39
41
  # Health alerts (optional - None means disabled)
40
42
  health_send_fn: Callable[..., None] | None = None
brawny/daemon/core.py CHANGED
@@ -12,13 +12,15 @@ import socket
12
12
  import threading
13
13
  import time
14
14
  from threading import Event, Lock, Thread
15
- from typing import TYPE_CHECKING, Callable
15
+ from typing import TYPE_CHECKING, Any, Callable
16
16
 
17
17
  from brawny.alerts.contracts import ContractSystem
18
18
  from brawny.alerts.health import health_alert
19
- from brawny.alerts.send import create_send_health
19
+ from brawny.alerts.send import AlertService, create_send_health, set_alert_service
20
+ from brawny.async_runtime import clear_loop, register_loop, run_sync
20
21
  from brawny.daemon.context import DaemonContext, DaemonState, RuntimeOverrides
21
22
  from brawny.daemon.loops import run_monitor, run_worker
23
+ from brawny.daemon.supervisor import WorkerSupervisor
22
24
  from brawny.db import create_database
23
25
  from brawny.db.migrate import Migrator, verify_critical_schema
24
26
  from brawny.jobs.discovery import (
@@ -36,7 +38,7 @@ from brawny.metrics import ACTIVE_WORKERS, get_metrics
36
38
  from brawny.model.enums import IntentStatus
37
39
  from brawny.model.startup import StartupMessage
38
40
  from brawny.model.types import BlockInfo
39
- from brawny._rpc import RPCManager
41
+ from brawny._rpc.clients import ReadClient
40
42
  from brawny.scheduler.poller import BlockPoller
41
43
  from brawny.scheduler.reorg import ReorgDetector
42
44
  from brawny.scheduler.runner import JobRunner
@@ -45,6 +47,7 @@ from brawny.tx.executor import TxExecutor
45
47
  from brawny.tx.intent import transition_intent
46
48
  from brawny.tx.monitor import TxMonitor
47
49
  from brawny.tx.replacement import TxReplacer
50
+ from brawny.runtime_controls import RuntimeControls
48
51
  from brawny.validation import validate_job_routing
49
52
  from brawny.telegram import TelegramBot
50
53
 
@@ -82,13 +85,14 @@ class BrawnyDaemon:
82
85
 
83
86
  # Components (initialized in start())
84
87
  self._db: Database | None = None
85
- self._rpc: RPCManager | None = None
88
+ self._rpc: ReadClient | None = None
86
89
  self._keystore: Keystore | None = None
87
90
  self._contract_system: ContractSystem | None = None
88
91
  self._lifecycle: LifecycleDispatcher | None = None
89
92
  self._executor: TxExecutor | None = None
90
93
  self._monitor: TxMonitor | None = None
91
94
  self._replacer: TxReplacer | None = None
95
+ self._controls: RuntimeControls | None = None
92
96
  self._job_runner: JobRunner | None = None
93
97
  self._reorg_detector: ReorgDetector | None = None
94
98
  self._poller: BlockPoller | None = None
@@ -111,6 +115,9 @@ class BrawnyDaemon:
111
115
  self._monitor_thread: Thread | None = None
112
116
  self._monitor_stop = Event()
113
117
 
118
+ # Worker supervision (fail-fast on worker thread failures)
119
+ self._supervisor = WorkerSupervisor(fail_fast=True)
120
+
114
121
  # Inflight tracking
115
122
  self._inflight_lock = Lock()
116
123
  self._inflight_count = 0
@@ -124,19 +131,23 @@ class BrawnyDaemon:
124
131
 
125
132
  # Async event loop (owned by daemon, used by runner for async job.check())
126
133
  self._loop: asyncio.AbstractEventLoop = asyncio.new_event_loop()
127
- asyncio.set_event_loop(self._loop) # Make it the current loop for this thread
128
- self._loop_thread_id: int = threading.get_ident() # Assert ownership
134
+ self._loop_thread: Thread | None = None
135
+ self._loop_started = Event()
136
+ self._loop_thread_id: int | None = None
137
+ self._alert_service: AlertService | None = None
129
138
 
130
139
  @property
131
140
  def db(self) -> "Database":
132
141
  """Get database connection."""
133
- assert self._db is not None, "Daemon not started"
142
+ if self._db is None:
143
+ raise RuntimeError("Daemon not started")
134
144
  return self._db
135
145
 
136
146
  @property
137
- def rpc(self) -> RPCManager:
147
+ def rpc(self) -> ReadClient:
138
148
  """Get RPC manager."""
139
- assert self._rpc is not None, "Daemon not started"
149
+ if self._rpc is None:
150
+ raise RuntimeError("Daemon not started")
140
151
  return self._rpc
141
152
 
142
153
  @property
@@ -151,7 +162,8 @@ class BrawnyDaemon:
151
162
 
152
163
  def _check_schema(self) -> None:
153
164
  """Verify critical DB schema columns exist. Hard-fail if not."""
154
- assert self._db is not None
165
+ if self._db is None:
166
+ raise RuntimeError("Database not initialized")
155
167
 
156
168
  try:
157
169
  verify_critical_schema(self._db)
@@ -160,20 +172,46 @@ class BrawnyDaemon:
160
172
  self._log.critical(
161
173
  "schema.validation_failed",
162
174
  error=error_msg,
163
- table="tx_intents",
175
+ table="critical_schema",
164
176
  )
165
177
  health_alert(
166
178
  component="brawny.startup.schema",
167
179
  chain_id=self.config.chain_id,
168
180
  error=error_msg,
169
181
  level="critical",
170
- action="Run: brawny migrate",
182
+ action="See error for remediation",
171
183
  db_dialect=self._db.dialect,
172
184
  force_send=True,
173
185
  send_fn=self._health_send_fn,
174
186
  health_chat_id=self._health_chat_id,
175
187
  )
176
- raise SystemExit(f"DB schema mismatch: {error_msg}. Run: brawny migrate") from exc
188
+ raise SystemExit(f"DB schema mismatch: {error_msg}") from exc
189
+
190
+ def _start_async_loop(self) -> None:
191
+ if self._loop_thread and self._loop_thread.is_alive():
192
+ return
193
+
194
+ def _run_loop() -> None:
195
+ asyncio.set_event_loop(self._loop)
196
+ self._loop_thread_id = threading.get_ident()
197
+ register_loop(self._loop, self._loop_thread_id)
198
+ self._loop_started.set()
199
+ self._loop.run_forever()
200
+ self._loop.close()
201
+
202
+ self._loop_started.clear()
203
+ self._loop_thread = Thread(target=_run_loop, name="brawny-async-loop", daemon=True)
204
+ self._loop_thread.start()
205
+ self._loop_started.wait(timeout=5.0)
206
+ if not self._loop_started.is_set():
207
+ raise RuntimeError("Async loop failed to start")
208
+
209
+ def _stop_async_loop(self) -> None:
210
+ if self._loop and not self._loop.is_closed():
211
+ self._loop.call_soon_threadsafe(self._loop.stop)
212
+ if self._loop_thread:
213
+ self._loop_thread.join(timeout=5.0)
214
+ clear_loop()
177
215
 
178
216
  def _make_claim_token(self, worker_id: int) -> str:
179
217
  """Generate a unique claim token for a worker."""
@@ -202,7 +240,8 @@ class BrawnyDaemon:
202
240
 
203
241
  def _process_block(self, block: BlockInfo) -> None:
204
242
  """Process a single block."""
205
- assert self._job_runner is not None
243
+ if self._job_runner is None:
244
+ raise RuntimeError("Job runner not initialized")
206
245
 
207
246
  self._log.info(
208
247
  "block.ingest.start",
@@ -320,8 +359,10 @@ class BrawnyDaemon:
320
359
 
321
360
  def _reconcile_startup(self) -> None:
322
361
  """Reconcile state on startup."""
323
- assert self._db is not None
324
- assert self._monitor is not None or self.overrides.dry_run
362
+ if self._db is None:
363
+ raise RuntimeError("Database not initialized")
364
+ if self._monitor is None and not self.overrides.dry_run:
365
+ raise RuntimeError("Monitor not initialized")
325
366
 
326
367
  # Reconcile nonces
327
368
  if self._executor and self._executor.nonce_manager:
@@ -343,23 +384,19 @@ class BrawnyDaemon:
343
384
  "startup_recover_sending",
344
385
  chain_id=self.config.chain_id,
345
386
  )
346
- else:
347
- # No tx_hash means intent never got broadcast - reset to CREATED
348
- if attempt and self._executor and self._executor.nonce_manager:
349
- from brawny.model.enums import AttemptStatus
350
- self._db.update_attempt_status(
351
- attempt.attempt_id,
352
- AttemptStatus.FAILED.value,
353
- error_code="startup_stuck",
354
- error_detail="Stuck in SENDING without broadcast",
355
- )
356
- self._executor.nonce_manager.release(intent.signer_address, attempt.nonce)
357
- transition_intent(
358
- self._db,
359
- intent.intent_id,
360
- IntentStatus.CREATED,
361
- "startup_recover_sending",
362
- chain_id=self.config.chain_id,
387
+ continue
388
+ if not attempt or not attempt.tx_hash:
389
+ self._db.set_signer_quarantined(
390
+ self.config.chain_id,
391
+ intent.signer_address,
392
+ reason="startup_sending_no_tx_hash",
393
+ source="startup_reconcile",
394
+ )
395
+ self._log.warning(
396
+ "startup.quarantine_sending_no_tx",
397
+ intent_id=str(intent.intent_id),
398
+ job_id=intent.job_id,
399
+ attempt_id=str(attempt.attempt_id) if attempt else None,
363
400
  )
364
401
 
365
402
  if stuck_sending:
@@ -378,7 +415,7 @@ class BrawnyDaemon:
378
415
  )
379
416
 
380
417
  def _start_workers(self) -> None:
381
- """Start worker threads."""
418
+ """Start worker threads with supervision."""
382
419
  if self.overrides.dry_run:
383
420
  return
384
421
 
@@ -397,6 +434,7 @@ class BrawnyDaemon:
397
434
  monitor=self._monitor,
398
435
  replacer=self._replacer,
399
436
  nonce_manager=self._executor.nonce_manager if self._executor else None,
437
+ controls=self._controls,
400
438
  chain_id=self.config.chain_id,
401
439
  health_send_fn=self._health_send_fn,
402
440
  health_chat_id=self._health_chat_id,
@@ -409,22 +447,48 @@ class BrawnyDaemon:
409
447
  inflight_dec=self._inflight_done,
410
448
  )
411
449
 
450
+ # Register workers with supervisor
412
451
  for i in range(worker_count):
413
- t = Thread(
414
- target=run_worker,
415
- args=(i, self._stop, self._wakeup_hint, ctx, state, self.overrides.dry_run),
416
- daemon=True,
452
+ self._supervisor.add(
453
+ f"tx_worker_{i}",
454
+ lambda worker_id=i: run_worker(
455
+ worker_id, self._stop, self._wakeup_hint, ctx, state, self.overrides.dry_run
456
+ ),
417
457
  )
418
- t.start()
419
- self._worker_threads.append(t)
420
-
421
- # Start monitor thread
422
- self._monitor_thread = Thread(
423
- target=run_monitor,
424
- args=(self._monitor_stop, ctx, self._worker_threads),
425
- daemon=True,
458
+
459
+ # Register monitor as supervised worker
460
+ self._supervisor.add(
461
+ "tx_monitor",
462
+ lambda: run_monitor(self._monitor_stop, ctx, self._worker_threads),
426
463
  )
427
- self._monitor_thread.start()
464
+
465
+ # Start all supervised workers
466
+ self._supervisor.start_all()
467
+
468
+ # Track worker threads for backward compatibility (used in monitor and shutdown)
469
+ # The supervisor owns the actual threads, but we need references for metrics
470
+ with self._supervisor._lock:
471
+ for name, worker_state in self._supervisor._workers.items():
472
+ if name.startswith("tx_worker_") and worker_state.thread:
473
+ self._worker_threads.append(worker_state.thread)
474
+ elif name == "tx_monitor" and worker_state.thread:
475
+ self._monitor_thread = worker_state.thread
476
+
477
+ # Start supervisor watcher - signals daemon stop when supervisor triggers shutdown
478
+ def _watch_supervisor() -> None:
479
+ self._supervisor.wait_for_shutdown()
480
+ if not self._stop.is_set():
481
+ self._log.critical(
482
+ "daemon.supervisor_shutdown",
483
+ reason=self._supervisor.fatal_reason(),
484
+ )
485
+ self._stop.set()
486
+ self._wakeup_hint.set()
487
+ if self._poller:
488
+ self._poller.stop(timeout=0.1)
489
+
490
+ watcher = Thread(target=_watch_supervisor, name="supervisor-watcher", daemon=True)
491
+ watcher.start()
428
492
 
429
493
  # Initial gauge
430
494
  metrics = get_metrics()
@@ -467,9 +531,18 @@ class BrawnyDaemon:
467
531
  if alive:
468
532
  self._log.warning("shutdown.threads_still_alive", count=len(alive))
469
533
 
470
- # Close event loop
471
- if self._loop and not self._loop.is_closed():
472
- self._loop.close()
534
+ # Close HTTP clients to avoid leaked connections
535
+ # Keep calls qualified to avoid name collision (both modules export close_http_client)
536
+ from brawny.alerts import abi_resolver
537
+ from brawny.telegram import close_http_client as close_telegram_http_client
538
+
539
+ if self._alert_service is not None:
540
+ run_sync(self._alert_service.stop(flush_timeout=self.config.shutdown_grace_seconds))
541
+ set_alert_service(None)
542
+ abi_resolver.close_http_client()
543
+ close_telegram_http_client()
544
+
545
+ self._stop_async_loop()
473
546
 
474
547
  self._log.info("daemon.shutdown.complete")
475
548
 
@@ -486,13 +559,11 @@ class BrawnyDaemon:
486
559
  # Database
487
560
  self._db = create_database(
488
561
  self.config.database_url,
489
- pool_size=self.config.database_pool_size,
490
- pool_max_overflow=self.config.database_pool_max_overflow,
491
- pool_timeout=self.config.database_pool_timeout_seconds,
492
562
  circuit_breaker_failures=self.config.db_circuit_breaker_failures,
493
563
  circuit_breaker_seconds=self.config.db_circuit_breaker_seconds,
494
564
  )
495
565
  self._db.connect()
566
+ self._controls = RuntimeControls(self._db)
496
567
 
497
568
  # Migrations
498
569
  migrator = Migrator(self._db)
@@ -502,7 +573,7 @@ class BrawnyDaemon:
502
573
  migrator.migrate()
503
574
 
504
575
  # RPC
505
- self._rpc = RPCManager.from_config(self.config)
576
+ self._rpc = ReadClient.from_config(self.config)
506
577
 
507
578
  self._log.info(
508
579
  "startup.finality_policy",
@@ -554,7 +625,10 @@ class BrawnyDaemon:
554
625
 
555
626
  # Cache TelegramBot instance (if configured)
556
627
  if self.config.telegram.bot_token:
557
- self._telegram_bot = TelegramBot(token=self.config.telegram.bot_token)
628
+ self._telegram_bot = TelegramBot(
629
+ token=self.config.telegram.bot_token,
630
+ default_parse_mode=self.config.telegram.parse_mode or "Markdown",
631
+ )
558
632
 
559
633
  # Initialize health alerting
560
634
  tg = self.config.telegram
@@ -575,6 +649,16 @@ class BrawnyDaemon:
575
649
  if tg:
576
650
  self._health_cooldown = tg.health_cooldown_seconds
577
651
 
652
+ from brawny.alerts import send as alerts_send
653
+ self._alert_service = AlertService(
654
+ maxsize=alerts_send.ALERT_QUEUE_MAXSIZE,
655
+ max_attempts=alerts_send.ALERT_SEND_MAX_ATTEMPTS,
656
+ backoff_base_seconds=alerts_send.ALERT_SEND_BACKOFF_BASE_SECONDS,
657
+ backoff_max_seconds=alerts_send.ALERT_SEND_BACKOFF_MAX_SECONDS,
658
+ health_max_oldest_age_seconds=self.config._advanced_or_default().alerts_health_max_oldest_age_seconds,
659
+ )
660
+ set_alert_service(self._alert_service)
661
+
578
662
  # Validate schema (after health is set up so we can alert on failure)
579
663
  self._check_schema()
580
664
 
@@ -604,7 +688,8 @@ class BrawnyDaemon:
604
688
  )
605
689
  self._replacer = TxReplacer(
606
690
  self._db, self._rpc, self._keystore, self._executor.nonce_manager, self.config,
607
- lifecycle=self._lifecycle
691
+ lifecycle=self._lifecycle,
692
+ controls=self._controls,
608
693
  )
609
694
 
610
695
  # Job runner
@@ -616,7 +701,7 @@ class BrawnyDaemon:
616
701
  lifecycle=self._lifecycle,
617
702
  contract_system=self._contract_system,
618
703
  loop=self._loop,
619
- loop_thread_id=self._loop_thread_id,
704
+ controls=self._controls,
620
705
  )
621
706
  self._job_runner._on_intent_created = self._on_intent_created
622
707
 
@@ -649,22 +734,31 @@ class BrawnyDaemon:
649
734
 
650
735
  return validation_errors, routing_errors, startup_messages
651
736
 
652
- def run(self, blocking: bool = True) -> None:
653
- """Run the daemon.
737
+ def run(self, blocking: bool = True) -> int:
738
+ """Run the daemon. Returns exit code (0=clean, 1=failure).
739
+
740
+ Caller should: sys.exit(daemon.run())
654
741
 
655
742
  Args:
656
743
  blocking: If True, block until shutdown. If False, return immediately.
744
+
745
+ Returns:
746
+ Exit code: 0 for clean shutdown, 1 for worker failure
657
747
  """
658
- assert self._poller is not None, "Daemon not initialized"
748
+ if self._poller is None:
749
+ raise RuntimeError("Daemon not initialized")
750
+
751
+ # Start async loop and services
752
+ self._start_async_loop()
753
+ if self._alert_service is not None:
754
+ run_sync(self._alert_service.start())
659
755
 
660
756
  # Startup reconciliation
661
757
  self._reconcile_startup()
662
758
 
663
759
  # Warm gas cache before workers start (eliminates cold-start race)
664
760
  try:
665
- self._loop.run_until_complete(
666
- asyncio.wait_for(self._rpc.gas_quote(), timeout=5.0)
667
- )
761
+ run_sync(asyncio.wait_for(self._rpc.gas_quote(), timeout=5.0))
668
762
  self._log.debug("startup.gas_cache_warmed")
669
763
  except Exception as e:
670
764
  self._log.warning("startup.gas_cache_warm_failed", error=str(e))
@@ -685,6 +779,34 @@ class BrawnyDaemon:
685
779
  finally:
686
780
  self._shutdown()
687
781
 
782
+ # Return non-zero exit code if supervisor triggered shutdown due to worker failure
783
+ if self._supervisor.fatal_reason():
784
+ return 1
785
+ return 0
786
+
787
+ def health_check(self) -> dict[str, Any]:
788
+ """Return daemon health status.
789
+
790
+ Uses all_healthy() as primary health indicator. This ensures:
791
+ - fail_fast=True + worker fails → shutdown_requested()=True → healthy=False
792
+ - fail_fast=False + worker fails → all_healthy()=False → healthy=False
793
+
794
+ Either way, health checks report unhealthy when workers fail.
795
+ """
796
+ worker_snapshot = self._supervisor.snapshot()
797
+ workers_ok = self._supervisor.all_healthy()
798
+
799
+ from brawny.alerts import send as alerts_send
800
+ alert_health = alerts_send.get_alert_worker_health()
801
+ alerts_ok = bool(alert_health.get("healthy", True))
802
+
803
+ return {
804
+ "healthy": workers_ok and alerts_ok and not self._supervisor.shutdown_requested(),
805
+ "workers": worker_snapshot,
806
+ "fatal_reason": self._supervisor.fatal_reason(),
807
+ "alerts": alert_health,
808
+ }
809
+
688
810
  def stop(self, timeout: float = 5.0) -> None:
689
811
  """Stop the daemon.
690
812