brawny 0.1.13__py3-none-any.whl → 0.1.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- brawny/__init__.py +2 -0
- brawny/_context.py +5 -5
- brawny/_rpc/__init__.py +36 -12
- brawny/_rpc/broadcast.py +14 -13
- brawny/_rpc/caller.py +243 -0
- brawny/_rpc/client.py +539 -0
- brawny/_rpc/clients.py +11 -11
- brawny/_rpc/context.py +23 -0
- brawny/_rpc/errors.py +465 -31
- brawny/_rpc/gas.py +7 -6
- brawny/_rpc/pool.py +18 -0
- brawny/_rpc/retry.py +266 -0
- brawny/_rpc/retry_policy.py +81 -0
- brawny/accounts.py +28 -9
- brawny/alerts/__init__.py +15 -18
- brawny/alerts/abi_resolver.py +212 -36
- brawny/alerts/base.py +2 -2
- brawny/alerts/contracts.py +77 -10
- brawny/alerts/errors.py +30 -3
- brawny/alerts/events.py +38 -5
- brawny/alerts/health.py +19 -13
- brawny/alerts/send.py +513 -55
- brawny/api.py +39 -11
- brawny/assets/AGENTS.md +325 -0
- brawny/async_runtime.py +48 -0
- brawny/chain.py +3 -3
- brawny/cli/commands/__init__.py +2 -0
- brawny/cli/commands/console.py +69 -19
- brawny/cli/commands/contract.py +2 -2
- brawny/cli/commands/controls.py +121 -0
- brawny/cli/commands/health.py +2 -2
- brawny/cli/commands/job_dev.py +6 -5
- brawny/cli/commands/jobs.py +99 -2
- brawny/cli/commands/maintenance.py +13 -29
- brawny/cli/commands/migrate.py +1 -0
- brawny/cli/commands/run.py +10 -3
- brawny/cli/commands/script.py +8 -3
- brawny/cli/commands/signer.py +143 -26
- brawny/cli/helpers.py +0 -3
- brawny/cli_templates.py +25 -349
- brawny/config/__init__.py +4 -1
- brawny/config/models.py +43 -57
- brawny/config/parser.py +268 -57
- brawny/config/validation.py +52 -15
- brawny/daemon/context.py +4 -2
- brawny/daemon/core.py +185 -63
- brawny/daemon/loops.py +166 -98
- brawny/daemon/supervisor.py +261 -0
- brawny/db/__init__.py +14 -26
- brawny/db/base.py +248 -151
- brawny/db/global_cache.py +11 -1
- brawny/db/migrate.py +175 -28
- brawny/db/migrations/001_init.sql +4 -3
- brawny/db/migrations/010_add_nonce_gap_index.sql +1 -1
- brawny/db/migrations/011_add_job_logs.sql +1 -2
- brawny/db/migrations/012_add_claimed_by.sql +2 -2
- brawny/db/migrations/013_attempt_unique.sql +10 -0
- brawny/db/migrations/014_add_lease_expires_at.sql +5 -0
- brawny/db/migrations/015_add_signer_alias.sql +14 -0
- brawny/db/migrations/016_runtime_controls_and_quarantine.sql +32 -0
- brawny/db/migrations/017_add_job_drain.sql +6 -0
- brawny/db/migrations/018_add_nonce_reset_audit.sql +20 -0
- brawny/db/migrations/019_add_job_cooldowns.sql +8 -0
- brawny/db/migrations/020_attempt_unique_initial.sql +7 -0
- brawny/db/ops/__init__.py +3 -25
- brawny/db/ops/logs.py +1 -2
- brawny/db/queries.py +47 -91
- brawny/db/serialized.py +65 -0
- brawny/db/sqlite/__init__.py +1001 -0
- brawny/db/sqlite/connection.py +231 -0
- brawny/db/sqlite/execute.py +116 -0
- brawny/db/sqlite/mappers.py +190 -0
- brawny/db/sqlite/repos/attempts.py +372 -0
- brawny/db/sqlite/repos/block_state.py +102 -0
- brawny/db/sqlite/repos/cache.py +104 -0
- brawny/db/sqlite/repos/intents.py +1021 -0
- brawny/db/sqlite/repos/jobs.py +200 -0
- brawny/db/sqlite/repos/maintenance.py +182 -0
- brawny/db/sqlite/repos/signers_nonces.py +566 -0
- brawny/db/sqlite/tx.py +119 -0
- brawny/http.py +194 -0
- brawny/invariants.py +11 -24
- brawny/jobs/base.py +8 -0
- brawny/jobs/job_validation.py +2 -1
- brawny/keystore.py +83 -7
- brawny/lifecycle.py +64 -12
- brawny/logging.py +0 -2
- brawny/metrics.py +84 -12
- brawny/model/contexts.py +111 -9
- brawny/model/enums.py +1 -0
- brawny/model/errors.py +18 -0
- brawny/model/types.py +47 -131
- brawny/network_guard.py +133 -0
- brawny/networks/__init__.py +5 -5
- brawny/networks/config.py +1 -7
- brawny/networks/manager.py +14 -11
- brawny/runtime_controls.py +74 -0
- brawny/scheduler/poller.py +11 -7
- brawny/scheduler/reorg.py +95 -39
- brawny/scheduler/runner.py +442 -168
- brawny/scheduler/shutdown.py +3 -3
- brawny/script_tx.py +3 -3
- brawny/telegram.py +53 -7
- brawny/testing.py +1 -0
- brawny/timeout.py +38 -0
- brawny/tx/executor.py +922 -308
- brawny/tx/intent.py +54 -16
- brawny/tx/monitor.py +31 -12
- brawny/tx/nonce.py +212 -90
- brawny/tx/replacement.py +69 -18
- brawny/tx/retry_policy.py +24 -0
- brawny/tx/stages/types.py +75 -0
- brawny/types.py +18 -0
- brawny/utils.py +41 -0
- {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/METADATA +3 -3
- brawny-0.1.22.dist-info/RECORD +163 -0
- brawny/_rpc/manager.py +0 -982
- brawny/_rpc/selector.py +0 -156
- brawny/db/base_new.py +0 -165
- brawny/db/mappers.py +0 -182
- brawny/db/migrations/008_add_transactions.sql +0 -72
- brawny/db/ops/attempts.py +0 -108
- brawny/db/ops/blocks.py +0 -83
- brawny/db/ops/cache.py +0 -93
- brawny/db/ops/intents.py +0 -296
- brawny/db/ops/jobs.py +0 -110
- brawny/db/ops/nonces.py +0 -322
- brawny/db/postgres.py +0 -2535
- brawny/db/postgres_new.py +0 -196
- brawny/db/sqlite.py +0 -2733
- brawny/db/sqlite_new.py +0 -191
- brawny-0.1.13.dist-info/RECORD +0 -141
- {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/WHEEL +0 -0
- {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/entry_points.txt +0 -0
- {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/top_level.txt +0 -0
brawny/config/validation.py
CHANGED
|
@@ -81,6 +81,16 @@ def dedupe_preserve_order(endpoints: list[str]) -> list[str]:
|
|
|
81
81
|
return result
|
|
82
82
|
|
|
83
83
|
|
|
84
|
+
def canonicalize_endpoints(endpoints: list[str]) -> list[str]:
|
|
85
|
+
"""Canonicalize endpoint list for stable comparison.
|
|
86
|
+
|
|
87
|
+
Applies canonicalize_endpoint to each entry, then sorts for deterministic
|
|
88
|
+
equality checks.
|
|
89
|
+
"""
|
|
90
|
+
canonical = [canonicalize_endpoint(ep) for ep in endpoints]
|
|
91
|
+
return sorted(set(canonical))
|
|
92
|
+
|
|
93
|
+
|
|
84
94
|
REMOVED_FIELDS = {
|
|
85
95
|
"alerts_dx_enabled",
|
|
86
96
|
"allowed_signers",
|
|
@@ -151,20 +161,39 @@ def validate_config(config: "Config") -> None:
|
|
|
151
161
|
# Required fields
|
|
152
162
|
if not config.database_url:
|
|
153
163
|
errors.append("database_url is required")
|
|
154
|
-
elif not (
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
):
|
|
159
|
-
errors.append(
|
|
160
|
-
"database_url must start with postgresql://, postgres://, or sqlite:///"
|
|
161
|
-
)
|
|
162
|
-
elif config.database_url.startswith("sqlite:///") and config.worker_count > 1:
|
|
163
|
-
errors.append("SQLite does not support worker_count > 1. Use Postgres for production.")
|
|
164
|
+
elif not config.database_url.startswith("sqlite:///"):
|
|
165
|
+
errors.append("database_url must start with sqlite:///")
|
|
166
|
+
elif config.worker_count > 1:
|
|
167
|
+
errors.append("SQLite does not support worker_count > 1.")
|
|
164
168
|
|
|
165
169
|
if not config.rpc_groups:
|
|
166
170
|
errors.append("rpc_groups is required (at least one group)")
|
|
167
171
|
|
|
172
|
+
if config.guardrails:
|
|
173
|
+
lint_paths = config.guardrails.lint_paths
|
|
174
|
+
if not isinstance(lint_paths, list):
|
|
175
|
+
errors.append("guardrails.lint_paths must be a list")
|
|
176
|
+
else:
|
|
177
|
+
for idx, value in enumerate(lint_paths):
|
|
178
|
+
if not isinstance(value, str) or not value.strip():
|
|
179
|
+
errors.append(f"guardrails.lint_paths[{idx}] must be a non-empty string")
|
|
180
|
+
|
|
181
|
+
if config.debug and not isinstance(config.debug.allow_console, bool):
|
|
182
|
+
errors.append("debug.allow_console must be a boolean")
|
|
183
|
+
|
|
184
|
+
if config.intent_cooldown:
|
|
185
|
+
cd = config.intent_cooldown
|
|
186
|
+
if not isinstance(cd.enabled, bool):
|
|
187
|
+
errors.append("intent_cooldown.enabled must be a boolean")
|
|
188
|
+
if cd.default_seconds < 0:
|
|
189
|
+
errors.append("intent_cooldown.default_seconds cannot be negative")
|
|
190
|
+
if cd.max_seconds < 0:
|
|
191
|
+
errors.append("intent_cooldown.max_seconds cannot be negative")
|
|
192
|
+
if cd.max_seconds < cd.default_seconds:
|
|
193
|
+
errors.append("intent_cooldown.max_seconds must be >= default_seconds")
|
|
194
|
+
if cd.prune_older_than_days < 0:
|
|
195
|
+
errors.append("intent_cooldown.prune_older_than_days cannot be negative")
|
|
196
|
+
|
|
168
197
|
if config.chain_id <= 0:
|
|
169
198
|
errors.append("chain_id must be positive")
|
|
170
199
|
|
|
@@ -192,6 +221,19 @@ def validate_config(config: "Config") -> None:
|
|
|
192
221
|
if config.keystore_type == KeystoreType.FILE and not config.keystore_path:
|
|
193
222
|
errors.append("keystore_path is required when keystore_type is 'file'")
|
|
194
223
|
|
|
224
|
+
# HTTP allowlist validation
|
|
225
|
+
for domain in config.http.allowed_domains:
|
|
226
|
+
if "://" in domain:
|
|
227
|
+
errors.append(f"http.allowed_domains entries must be hostnames, got: {domain}")
|
|
228
|
+
if "/" in domain:
|
|
229
|
+
errors.append(f"http.allowed_domains entries must not include paths: {domain}")
|
|
230
|
+
if config.http.connect_timeout_seconds <= 0:
|
|
231
|
+
errors.append("http.connect_timeout_seconds must be positive")
|
|
232
|
+
if config.http.read_timeout_seconds <= 0:
|
|
233
|
+
errors.append("http.read_timeout_seconds must be positive")
|
|
234
|
+
if config.http.max_retries < 0:
|
|
235
|
+
errors.append("http.max_retries cannot be negative")
|
|
236
|
+
|
|
195
237
|
if errors:
|
|
196
238
|
raise ConfigError(
|
|
197
239
|
"Configuration validation failed:\n" + "\n".join(f" - {e}" for e in errors)
|
|
@@ -234,11 +276,6 @@ def validate_advanced_config(advanced: "AdvancedConfig") -> None:
|
|
|
234
276
|
if advanced.rpc_max_retries < 0:
|
|
235
277
|
errors.append("rpc_max_retries cannot be negative")
|
|
236
278
|
|
|
237
|
-
if advanced.database_pool_size <= 0:
|
|
238
|
-
errors.append("database_pool_size must be positive")
|
|
239
|
-
if advanced.database_pool_max_overflow < 0:
|
|
240
|
-
errors.append("database_pool_max_overflow cannot be negative")
|
|
241
|
-
|
|
242
279
|
if errors:
|
|
243
280
|
raise ConfigError(
|
|
244
281
|
"Advanced configuration validation failed:\n"
|
brawny/daemon/context.py
CHANGED
|
@@ -12,11 +12,12 @@ from typing import TYPE_CHECKING, Callable
|
|
|
12
12
|
if TYPE_CHECKING:
|
|
13
13
|
from brawny.config import Config
|
|
14
14
|
from brawny.db.base import Database
|
|
15
|
-
from brawny._rpc.
|
|
15
|
+
from brawny._rpc.clients import ReadClient
|
|
16
16
|
from brawny.tx.executor import TxExecutor
|
|
17
17
|
from brawny.tx.monitor import TxMonitor
|
|
18
18
|
from brawny.tx.replacement import TxReplacer
|
|
19
19
|
from brawny.tx.nonce import NonceManager
|
|
20
|
+
from brawny.runtime_controls import RuntimeControls
|
|
20
21
|
|
|
21
22
|
|
|
22
23
|
@dataclass
|
|
@@ -29,12 +30,13 @@ class DaemonContext:
|
|
|
29
30
|
config: "Config"
|
|
30
31
|
log: Logger
|
|
31
32
|
db: "Database"
|
|
32
|
-
rpc: "
|
|
33
|
+
rpc: "ReadClient"
|
|
33
34
|
executor: "TxExecutor | None"
|
|
34
35
|
monitor: "TxMonitor | None"
|
|
35
36
|
replacer: "TxReplacer | None"
|
|
36
37
|
nonce_manager: "NonceManager | None"
|
|
37
38
|
chain_id: int
|
|
39
|
+
controls: "RuntimeControls | None" = None
|
|
38
40
|
|
|
39
41
|
# Health alerts (optional - None means disabled)
|
|
40
42
|
health_send_fn: Callable[..., None] | None = None
|
brawny/daemon/core.py
CHANGED
|
@@ -12,13 +12,15 @@ import socket
|
|
|
12
12
|
import threading
|
|
13
13
|
import time
|
|
14
14
|
from threading import Event, Lock, Thread
|
|
15
|
-
from typing import TYPE_CHECKING, Callable
|
|
15
|
+
from typing import TYPE_CHECKING, Any, Callable
|
|
16
16
|
|
|
17
17
|
from brawny.alerts.contracts import ContractSystem
|
|
18
18
|
from brawny.alerts.health import health_alert
|
|
19
|
-
from brawny.alerts.send import create_send_health
|
|
19
|
+
from brawny.alerts.send import AlertService, create_send_health, set_alert_service
|
|
20
|
+
from brawny.async_runtime import clear_loop, register_loop, run_sync
|
|
20
21
|
from brawny.daemon.context import DaemonContext, DaemonState, RuntimeOverrides
|
|
21
22
|
from brawny.daemon.loops import run_monitor, run_worker
|
|
23
|
+
from brawny.daemon.supervisor import WorkerSupervisor
|
|
22
24
|
from brawny.db import create_database
|
|
23
25
|
from brawny.db.migrate import Migrator, verify_critical_schema
|
|
24
26
|
from brawny.jobs.discovery import (
|
|
@@ -36,7 +38,7 @@ from brawny.metrics import ACTIVE_WORKERS, get_metrics
|
|
|
36
38
|
from brawny.model.enums import IntentStatus
|
|
37
39
|
from brawny.model.startup import StartupMessage
|
|
38
40
|
from brawny.model.types import BlockInfo
|
|
39
|
-
from brawny._rpc import
|
|
41
|
+
from brawny._rpc.clients import ReadClient
|
|
40
42
|
from brawny.scheduler.poller import BlockPoller
|
|
41
43
|
from brawny.scheduler.reorg import ReorgDetector
|
|
42
44
|
from brawny.scheduler.runner import JobRunner
|
|
@@ -45,6 +47,7 @@ from brawny.tx.executor import TxExecutor
|
|
|
45
47
|
from brawny.tx.intent import transition_intent
|
|
46
48
|
from brawny.tx.monitor import TxMonitor
|
|
47
49
|
from brawny.tx.replacement import TxReplacer
|
|
50
|
+
from brawny.runtime_controls import RuntimeControls
|
|
48
51
|
from brawny.validation import validate_job_routing
|
|
49
52
|
from brawny.telegram import TelegramBot
|
|
50
53
|
|
|
@@ -82,13 +85,14 @@ class BrawnyDaemon:
|
|
|
82
85
|
|
|
83
86
|
# Components (initialized in start())
|
|
84
87
|
self._db: Database | None = None
|
|
85
|
-
self._rpc:
|
|
88
|
+
self._rpc: ReadClient | None = None
|
|
86
89
|
self._keystore: Keystore | None = None
|
|
87
90
|
self._contract_system: ContractSystem | None = None
|
|
88
91
|
self._lifecycle: LifecycleDispatcher | None = None
|
|
89
92
|
self._executor: TxExecutor | None = None
|
|
90
93
|
self._monitor: TxMonitor | None = None
|
|
91
94
|
self._replacer: TxReplacer | None = None
|
|
95
|
+
self._controls: RuntimeControls | None = None
|
|
92
96
|
self._job_runner: JobRunner | None = None
|
|
93
97
|
self._reorg_detector: ReorgDetector | None = None
|
|
94
98
|
self._poller: BlockPoller | None = None
|
|
@@ -111,6 +115,9 @@ class BrawnyDaemon:
|
|
|
111
115
|
self._monitor_thread: Thread | None = None
|
|
112
116
|
self._monitor_stop = Event()
|
|
113
117
|
|
|
118
|
+
# Worker supervision (fail-fast on worker thread failures)
|
|
119
|
+
self._supervisor = WorkerSupervisor(fail_fast=True)
|
|
120
|
+
|
|
114
121
|
# Inflight tracking
|
|
115
122
|
self._inflight_lock = Lock()
|
|
116
123
|
self._inflight_count = 0
|
|
@@ -124,19 +131,23 @@ class BrawnyDaemon:
|
|
|
124
131
|
|
|
125
132
|
# Async event loop (owned by daemon, used by runner for async job.check())
|
|
126
133
|
self._loop: asyncio.AbstractEventLoop = asyncio.new_event_loop()
|
|
127
|
-
|
|
128
|
-
self.
|
|
134
|
+
self._loop_thread: Thread | None = None
|
|
135
|
+
self._loop_started = Event()
|
|
136
|
+
self._loop_thread_id: int | None = None
|
|
137
|
+
self._alert_service: AlertService | None = None
|
|
129
138
|
|
|
130
139
|
@property
|
|
131
140
|
def db(self) -> "Database":
|
|
132
141
|
"""Get database connection."""
|
|
133
|
-
|
|
142
|
+
if self._db is None:
|
|
143
|
+
raise RuntimeError("Daemon not started")
|
|
134
144
|
return self._db
|
|
135
145
|
|
|
136
146
|
@property
|
|
137
|
-
def rpc(self) ->
|
|
147
|
+
def rpc(self) -> ReadClient:
|
|
138
148
|
"""Get RPC manager."""
|
|
139
|
-
|
|
149
|
+
if self._rpc is None:
|
|
150
|
+
raise RuntimeError("Daemon not started")
|
|
140
151
|
return self._rpc
|
|
141
152
|
|
|
142
153
|
@property
|
|
@@ -151,7 +162,8 @@ class BrawnyDaemon:
|
|
|
151
162
|
|
|
152
163
|
def _check_schema(self) -> None:
|
|
153
164
|
"""Verify critical DB schema columns exist. Hard-fail if not."""
|
|
154
|
-
|
|
165
|
+
if self._db is None:
|
|
166
|
+
raise RuntimeError("Database not initialized")
|
|
155
167
|
|
|
156
168
|
try:
|
|
157
169
|
verify_critical_schema(self._db)
|
|
@@ -160,20 +172,46 @@ class BrawnyDaemon:
|
|
|
160
172
|
self._log.critical(
|
|
161
173
|
"schema.validation_failed",
|
|
162
174
|
error=error_msg,
|
|
163
|
-
table="
|
|
175
|
+
table="critical_schema",
|
|
164
176
|
)
|
|
165
177
|
health_alert(
|
|
166
178
|
component="brawny.startup.schema",
|
|
167
179
|
chain_id=self.config.chain_id,
|
|
168
180
|
error=error_msg,
|
|
169
181
|
level="critical",
|
|
170
|
-
action="
|
|
182
|
+
action="See error for remediation",
|
|
171
183
|
db_dialect=self._db.dialect,
|
|
172
184
|
force_send=True,
|
|
173
185
|
send_fn=self._health_send_fn,
|
|
174
186
|
health_chat_id=self._health_chat_id,
|
|
175
187
|
)
|
|
176
|
-
raise SystemExit(f"DB schema mismatch: {error_msg}
|
|
188
|
+
raise SystemExit(f"DB schema mismatch: {error_msg}") from exc
|
|
189
|
+
|
|
190
|
+
def _start_async_loop(self) -> None:
|
|
191
|
+
if self._loop_thread and self._loop_thread.is_alive():
|
|
192
|
+
return
|
|
193
|
+
|
|
194
|
+
def _run_loop() -> None:
|
|
195
|
+
asyncio.set_event_loop(self._loop)
|
|
196
|
+
self._loop_thread_id = threading.get_ident()
|
|
197
|
+
register_loop(self._loop, self._loop_thread_id)
|
|
198
|
+
self._loop_started.set()
|
|
199
|
+
self._loop.run_forever()
|
|
200
|
+
self._loop.close()
|
|
201
|
+
|
|
202
|
+
self._loop_started.clear()
|
|
203
|
+
self._loop_thread = Thread(target=_run_loop, name="brawny-async-loop", daemon=True)
|
|
204
|
+
self._loop_thread.start()
|
|
205
|
+
self._loop_started.wait(timeout=5.0)
|
|
206
|
+
if not self._loop_started.is_set():
|
|
207
|
+
raise RuntimeError("Async loop failed to start")
|
|
208
|
+
|
|
209
|
+
def _stop_async_loop(self) -> None:
|
|
210
|
+
if self._loop and not self._loop.is_closed():
|
|
211
|
+
self._loop.call_soon_threadsafe(self._loop.stop)
|
|
212
|
+
if self._loop_thread:
|
|
213
|
+
self._loop_thread.join(timeout=5.0)
|
|
214
|
+
clear_loop()
|
|
177
215
|
|
|
178
216
|
def _make_claim_token(self, worker_id: int) -> str:
|
|
179
217
|
"""Generate a unique claim token for a worker."""
|
|
@@ -202,7 +240,8 @@ class BrawnyDaemon:
|
|
|
202
240
|
|
|
203
241
|
def _process_block(self, block: BlockInfo) -> None:
|
|
204
242
|
"""Process a single block."""
|
|
205
|
-
|
|
243
|
+
if self._job_runner is None:
|
|
244
|
+
raise RuntimeError("Job runner not initialized")
|
|
206
245
|
|
|
207
246
|
self._log.info(
|
|
208
247
|
"block.ingest.start",
|
|
@@ -320,8 +359,10 @@ class BrawnyDaemon:
|
|
|
320
359
|
|
|
321
360
|
def _reconcile_startup(self) -> None:
|
|
322
361
|
"""Reconcile state on startup."""
|
|
323
|
-
|
|
324
|
-
|
|
362
|
+
if self._db is None:
|
|
363
|
+
raise RuntimeError("Database not initialized")
|
|
364
|
+
if self._monitor is None and not self.overrides.dry_run:
|
|
365
|
+
raise RuntimeError("Monitor not initialized")
|
|
325
366
|
|
|
326
367
|
# Reconcile nonces
|
|
327
368
|
if self._executor and self._executor.nonce_manager:
|
|
@@ -343,23 +384,19 @@ class BrawnyDaemon:
|
|
|
343
384
|
"startup_recover_sending",
|
|
344
385
|
chain_id=self.config.chain_id,
|
|
345
386
|
)
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
intent.intent_id,
|
|
360
|
-
IntentStatus.CREATED,
|
|
361
|
-
"startup_recover_sending",
|
|
362
|
-
chain_id=self.config.chain_id,
|
|
387
|
+
continue
|
|
388
|
+
if not attempt or not attempt.tx_hash:
|
|
389
|
+
self._db.set_signer_quarantined(
|
|
390
|
+
self.config.chain_id,
|
|
391
|
+
intent.signer_address,
|
|
392
|
+
reason="startup_sending_no_tx_hash",
|
|
393
|
+
source="startup_reconcile",
|
|
394
|
+
)
|
|
395
|
+
self._log.warning(
|
|
396
|
+
"startup.quarantine_sending_no_tx",
|
|
397
|
+
intent_id=str(intent.intent_id),
|
|
398
|
+
job_id=intent.job_id,
|
|
399
|
+
attempt_id=str(attempt.attempt_id) if attempt else None,
|
|
363
400
|
)
|
|
364
401
|
|
|
365
402
|
if stuck_sending:
|
|
@@ -378,7 +415,7 @@ class BrawnyDaemon:
|
|
|
378
415
|
)
|
|
379
416
|
|
|
380
417
|
def _start_workers(self) -> None:
|
|
381
|
-
"""Start worker threads."""
|
|
418
|
+
"""Start worker threads with supervision."""
|
|
382
419
|
if self.overrides.dry_run:
|
|
383
420
|
return
|
|
384
421
|
|
|
@@ -397,6 +434,7 @@ class BrawnyDaemon:
|
|
|
397
434
|
monitor=self._monitor,
|
|
398
435
|
replacer=self._replacer,
|
|
399
436
|
nonce_manager=self._executor.nonce_manager if self._executor else None,
|
|
437
|
+
controls=self._controls,
|
|
400
438
|
chain_id=self.config.chain_id,
|
|
401
439
|
health_send_fn=self._health_send_fn,
|
|
402
440
|
health_chat_id=self._health_chat_id,
|
|
@@ -409,22 +447,48 @@ class BrawnyDaemon:
|
|
|
409
447
|
inflight_dec=self._inflight_done,
|
|
410
448
|
)
|
|
411
449
|
|
|
450
|
+
# Register workers with supervisor
|
|
412
451
|
for i in range(worker_count):
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
452
|
+
self._supervisor.add(
|
|
453
|
+
f"tx_worker_{i}",
|
|
454
|
+
lambda worker_id=i: run_worker(
|
|
455
|
+
worker_id, self._stop, self._wakeup_hint, ctx, state, self.overrides.dry_run
|
|
456
|
+
),
|
|
417
457
|
)
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
target=run_monitor,
|
|
424
|
-
args=(self._monitor_stop, ctx, self._worker_threads),
|
|
425
|
-
daemon=True,
|
|
458
|
+
|
|
459
|
+
# Register monitor as supervised worker
|
|
460
|
+
self._supervisor.add(
|
|
461
|
+
"tx_monitor",
|
|
462
|
+
lambda: run_monitor(self._monitor_stop, ctx, self._worker_threads),
|
|
426
463
|
)
|
|
427
|
-
|
|
464
|
+
|
|
465
|
+
# Start all supervised workers
|
|
466
|
+
self._supervisor.start_all()
|
|
467
|
+
|
|
468
|
+
# Track worker threads for backward compatibility (used in monitor and shutdown)
|
|
469
|
+
# The supervisor owns the actual threads, but we need references for metrics
|
|
470
|
+
with self._supervisor._lock:
|
|
471
|
+
for name, worker_state in self._supervisor._workers.items():
|
|
472
|
+
if name.startswith("tx_worker_") and worker_state.thread:
|
|
473
|
+
self._worker_threads.append(worker_state.thread)
|
|
474
|
+
elif name == "tx_monitor" and worker_state.thread:
|
|
475
|
+
self._monitor_thread = worker_state.thread
|
|
476
|
+
|
|
477
|
+
# Start supervisor watcher - signals daemon stop when supervisor triggers shutdown
|
|
478
|
+
def _watch_supervisor() -> None:
|
|
479
|
+
self._supervisor.wait_for_shutdown()
|
|
480
|
+
if not self._stop.is_set():
|
|
481
|
+
self._log.critical(
|
|
482
|
+
"daemon.supervisor_shutdown",
|
|
483
|
+
reason=self._supervisor.fatal_reason(),
|
|
484
|
+
)
|
|
485
|
+
self._stop.set()
|
|
486
|
+
self._wakeup_hint.set()
|
|
487
|
+
if self._poller:
|
|
488
|
+
self._poller.stop(timeout=0.1)
|
|
489
|
+
|
|
490
|
+
watcher = Thread(target=_watch_supervisor, name="supervisor-watcher", daemon=True)
|
|
491
|
+
watcher.start()
|
|
428
492
|
|
|
429
493
|
# Initial gauge
|
|
430
494
|
metrics = get_metrics()
|
|
@@ -467,9 +531,18 @@ class BrawnyDaemon:
|
|
|
467
531
|
if alive:
|
|
468
532
|
self._log.warning("shutdown.threads_still_alive", count=len(alive))
|
|
469
533
|
|
|
470
|
-
# Close
|
|
471
|
-
|
|
472
|
-
|
|
534
|
+
# Close HTTP clients to avoid leaked connections
|
|
535
|
+
# Keep calls qualified to avoid name collision (both modules export close_http_client)
|
|
536
|
+
from brawny.alerts import abi_resolver
|
|
537
|
+
from brawny.telegram import close_http_client as close_telegram_http_client
|
|
538
|
+
|
|
539
|
+
if self._alert_service is not None:
|
|
540
|
+
run_sync(self._alert_service.stop(flush_timeout=self.config.shutdown_grace_seconds))
|
|
541
|
+
set_alert_service(None)
|
|
542
|
+
abi_resolver.close_http_client()
|
|
543
|
+
close_telegram_http_client()
|
|
544
|
+
|
|
545
|
+
self._stop_async_loop()
|
|
473
546
|
|
|
474
547
|
self._log.info("daemon.shutdown.complete")
|
|
475
548
|
|
|
@@ -486,13 +559,11 @@ class BrawnyDaemon:
|
|
|
486
559
|
# Database
|
|
487
560
|
self._db = create_database(
|
|
488
561
|
self.config.database_url,
|
|
489
|
-
pool_size=self.config.database_pool_size,
|
|
490
|
-
pool_max_overflow=self.config.database_pool_max_overflow,
|
|
491
|
-
pool_timeout=self.config.database_pool_timeout_seconds,
|
|
492
562
|
circuit_breaker_failures=self.config.db_circuit_breaker_failures,
|
|
493
563
|
circuit_breaker_seconds=self.config.db_circuit_breaker_seconds,
|
|
494
564
|
)
|
|
495
565
|
self._db.connect()
|
|
566
|
+
self._controls = RuntimeControls(self._db)
|
|
496
567
|
|
|
497
568
|
# Migrations
|
|
498
569
|
migrator = Migrator(self._db)
|
|
@@ -502,7 +573,7 @@ class BrawnyDaemon:
|
|
|
502
573
|
migrator.migrate()
|
|
503
574
|
|
|
504
575
|
# RPC
|
|
505
|
-
self._rpc =
|
|
576
|
+
self._rpc = ReadClient.from_config(self.config)
|
|
506
577
|
|
|
507
578
|
self._log.info(
|
|
508
579
|
"startup.finality_policy",
|
|
@@ -554,7 +625,10 @@ class BrawnyDaemon:
|
|
|
554
625
|
|
|
555
626
|
# Cache TelegramBot instance (if configured)
|
|
556
627
|
if self.config.telegram.bot_token:
|
|
557
|
-
self._telegram_bot = TelegramBot(
|
|
628
|
+
self._telegram_bot = TelegramBot(
|
|
629
|
+
token=self.config.telegram.bot_token,
|
|
630
|
+
default_parse_mode=self.config.telegram.parse_mode or "Markdown",
|
|
631
|
+
)
|
|
558
632
|
|
|
559
633
|
# Initialize health alerting
|
|
560
634
|
tg = self.config.telegram
|
|
@@ -575,6 +649,16 @@ class BrawnyDaemon:
|
|
|
575
649
|
if tg:
|
|
576
650
|
self._health_cooldown = tg.health_cooldown_seconds
|
|
577
651
|
|
|
652
|
+
from brawny.alerts import send as alerts_send
|
|
653
|
+
self._alert_service = AlertService(
|
|
654
|
+
maxsize=alerts_send.ALERT_QUEUE_MAXSIZE,
|
|
655
|
+
max_attempts=alerts_send.ALERT_SEND_MAX_ATTEMPTS,
|
|
656
|
+
backoff_base_seconds=alerts_send.ALERT_SEND_BACKOFF_BASE_SECONDS,
|
|
657
|
+
backoff_max_seconds=alerts_send.ALERT_SEND_BACKOFF_MAX_SECONDS,
|
|
658
|
+
health_max_oldest_age_seconds=self.config._advanced_or_default().alerts_health_max_oldest_age_seconds,
|
|
659
|
+
)
|
|
660
|
+
set_alert_service(self._alert_service)
|
|
661
|
+
|
|
578
662
|
# Validate schema (after health is set up so we can alert on failure)
|
|
579
663
|
self._check_schema()
|
|
580
664
|
|
|
@@ -604,7 +688,8 @@ class BrawnyDaemon:
|
|
|
604
688
|
)
|
|
605
689
|
self._replacer = TxReplacer(
|
|
606
690
|
self._db, self._rpc, self._keystore, self._executor.nonce_manager, self.config,
|
|
607
|
-
lifecycle=self._lifecycle
|
|
691
|
+
lifecycle=self._lifecycle,
|
|
692
|
+
controls=self._controls,
|
|
608
693
|
)
|
|
609
694
|
|
|
610
695
|
# Job runner
|
|
@@ -616,7 +701,7 @@ class BrawnyDaemon:
|
|
|
616
701
|
lifecycle=self._lifecycle,
|
|
617
702
|
contract_system=self._contract_system,
|
|
618
703
|
loop=self._loop,
|
|
619
|
-
|
|
704
|
+
controls=self._controls,
|
|
620
705
|
)
|
|
621
706
|
self._job_runner._on_intent_created = self._on_intent_created
|
|
622
707
|
|
|
@@ -649,22 +734,31 @@ class BrawnyDaemon:
|
|
|
649
734
|
|
|
650
735
|
return validation_errors, routing_errors, startup_messages
|
|
651
736
|
|
|
652
|
-
def run(self, blocking: bool = True) ->
|
|
653
|
-
"""Run the daemon.
|
|
737
|
+
def run(self, blocking: bool = True) -> int:
|
|
738
|
+
"""Run the daemon. Returns exit code (0=clean, 1=failure).
|
|
739
|
+
|
|
740
|
+
Caller should: sys.exit(daemon.run())
|
|
654
741
|
|
|
655
742
|
Args:
|
|
656
743
|
blocking: If True, block until shutdown. If False, return immediately.
|
|
744
|
+
|
|
745
|
+
Returns:
|
|
746
|
+
Exit code: 0 for clean shutdown, 1 for worker failure
|
|
657
747
|
"""
|
|
658
|
-
|
|
748
|
+
if self._poller is None:
|
|
749
|
+
raise RuntimeError("Daemon not initialized")
|
|
750
|
+
|
|
751
|
+
# Start async loop and services
|
|
752
|
+
self._start_async_loop()
|
|
753
|
+
if self._alert_service is not None:
|
|
754
|
+
run_sync(self._alert_service.start())
|
|
659
755
|
|
|
660
756
|
# Startup reconciliation
|
|
661
757
|
self._reconcile_startup()
|
|
662
758
|
|
|
663
759
|
# Warm gas cache before workers start (eliminates cold-start race)
|
|
664
760
|
try:
|
|
665
|
-
self.
|
|
666
|
-
asyncio.wait_for(self._rpc.gas_quote(), timeout=5.0)
|
|
667
|
-
)
|
|
761
|
+
run_sync(asyncio.wait_for(self._rpc.gas_quote(), timeout=5.0))
|
|
668
762
|
self._log.debug("startup.gas_cache_warmed")
|
|
669
763
|
except Exception as e:
|
|
670
764
|
self._log.warning("startup.gas_cache_warm_failed", error=str(e))
|
|
@@ -685,6 +779,34 @@ class BrawnyDaemon:
|
|
|
685
779
|
finally:
|
|
686
780
|
self._shutdown()
|
|
687
781
|
|
|
782
|
+
# Return non-zero exit code if supervisor triggered shutdown due to worker failure
|
|
783
|
+
if self._supervisor.fatal_reason():
|
|
784
|
+
return 1
|
|
785
|
+
return 0
|
|
786
|
+
|
|
787
|
+
def health_check(self) -> dict[str, Any]:
|
|
788
|
+
"""Return daemon health status.
|
|
789
|
+
|
|
790
|
+
Uses all_healthy() as primary health indicator. This ensures:
|
|
791
|
+
- fail_fast=True + worker fails → shutdown_requested()=True → healthy=False
|
|
792
|
+
- fail_fast=False + worker fails → all_healthy()=False → healthy=False
|
|
793
|
+
|
|
794
|
+
Either way, health checks report unhealthy when workers fail.
|
|
795
|
+
"""
|
|
796
|
+
worker_snapshot = self._supervisor.snapshot()
|
|
797
|
+
workers_ok = self._supervisor.all_healthy()
|
|
798
|
+
|
|
799
|
+
from brawny.alerts import send as alerts_send
|
|
800
|
+
alert_health = alerts_send.get_alert_worker_health()
|
|
801
|
+
alerts_ok = bool(alert_health.get("healthy", True))
|
|
802
|
+
|
|
803
|
+
return {
|
|
804
|
+
"healthy": workers_ok and alerts_ok and not self._supervisor.shutdown_requested(),
|
|
805
|
+
"workers": worker_snapshot,
|
|
806
|
+
"fatal_reason": self._supervisor.fatal_reason(),
|
|
807
|
+
"alerts": alert_health,
|
|
808
|
+
}
|
|
809
|
+
|
|
688
810
|
def stop(self, timeout: float = 5.0) -> None:
|
|
689
811
|
"""Stop the daemon.
|
|
690
812
|
|