brawny 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- brawny/__init__.py +106 -0
- brawny/_context.py +232 -0
- brawny/_rpc/__init__.py +38 -0
- brawny/_rpc/broadcast.py +172 -0
- brawny/_rpc/clients.py +98 -0
- brawny/_rpc/context.py +49 -0
- brawny/_rpc/errors.py +252 -0
- brawny/_rpc/gas.py +158 -0
- brawny/_rpc/manager.py +982 -0
- brawny/_rpc/selector.py +156 -0
- brawny/accounts.py +534 -0
- brawny/alerts/__init__.py +132 -0
- brawny/alerts/abi_resolver.py +530 -0
- brawny/alerts/base.py +152 -0
- brawny/alerts/context.py +271 -0
- brawny/alerts/contracts.py +635 -0
- brawny/alerts/encoded_call.py +201 -0
- brawny/alerts/errors.py +267 -0
- brawny/alerts/events.py +680 -0
- brawny/alerts/function_caller.py +364 -0
- brawny/alerts/health.py +185 -0
- brawny/alerts/routing.py +118 -0
- brawny/alerts/send.py +364 -0
- brawny/api.py +660 -0
- brawny/chain.py +93 -0
- brawny/cli/__init__.py +16 -0
- brawny/cli/app.py +17 -0
- brawny/cli/bootstrap.py +37 -0
- brawny/cli/commands/__init__.py +41 -0
- brawny/cli/commands/abi.py +93 -0
- brawny/cli/commands/accounts.py +632 -0
- brawny/cli/commands/console.py +495 -0
- brawny/cli/commands/contract.py +139 -0
- brawny/cli/commands/health.py +112 -0
- brawny/cli/commands/init_project.py +86 -0
- brawny/cli/commands/intents.py +130 -0
- brawny/cli/commands/job_dev.py +254 -0
- brawny/cli/commands/jobs.py +308 -0
- brawny/cli/commands/logs.py +87 -0
- brawny/cli/commands/maintenance.py +182 -0
- brawny/cli/commands/migrate.py +51 -0
- brawny/cli/commands/networks.py +253 -0
- brawny/cli/commands/run.py +249 -0
- brawny/cli/commands/script.py +209 -0
- brawny/cli/commands/signer.py +248 -0
- brawny/cli/helpers.py +265 -0
- brawny/cli_templates.py +1445 -0
- brawny/config/__init__.py +74 -0
- brawny/config/models.py +404 -0
- brawny/config/parser.py +633 -0
- brawny/config/routing.py +55 -0
- brawny/config/validation.py +246 -0
- brawny/daemon/__init__.py +14 -0
- brawny/daemon/context.py +69 -0
- brawny/daemon/core.py +702 -0
- brawny/daemon/loops.py +327 -0
- brawny/db/__init__.py +78 -0
- brawny/db/base.py +986 -0
- brawny/db/base_new.py +165 -0
- brawny/db/circuit_breaker.py +97 -0
- brawny/db/global_cache.py +298 -0
- brawny/db/mappers.py +182 -0
- brawny/db/migrate.py +349 -0
- brawny/db/migrations/001_init.sql +186 -0
- brawny/db/migrations/002_add_included_block.sql +7 -0
- brawny/db/migrations/003_add_broadcast_at.sql +10 -0
- brawny/db/migrations/004_broadcast_binding.sql +20 -0
- brawny/db/migrations/005_add_retry_after.sql +9 -0
- brawny/db/migrations/006_add_retry_count_column.sql +11 -0
- brawny/db/migrations/007_add_gap_tracking.sql +18 -0
- brawny/db/migrations/008_add_transactions.sql +72 -0
- brawny/db/migrations/009_add_intent_metadata.sql +5 -0
- brawny/db/migrations/010_add_nonce_gap_index.sql +9 -0
- brawny/db/migrations/011_add_job_logs.sql +24 -0
- brawny/db/migrations/012_add_claimed_by.sql +5 -0
- brawny/db/ops/__init__.py +29 -0
- brawny/db/ops/attempts.py +108 -0
- brawny/db/ops/blocks.py +83 -0
- brawny/db/ops/cache.py +93 -0
- brawny/db/ops/intents.py +296 -0
- brawny/db/ops/jobs.py +110 -0
- brawny/db/ops/logs.py +97 -0
- brawny/db/ops/nonces.py +322 -0
- brawny/db/postgres.py +2535 -0
- brawny/db/postgres_new.py +196 -0
- brawny/db/queries.py +584 -0
- brawny/db/sqlite.py +2733 -0
- brawny/db/sqlite_new.py +191 -0
- brawny/history.py +126 -0
- brawny/interfaces.py +136 -0
- brawny/invariants.py +155 -0
- brawny/jobs/__init__.py +26 -0
- brawny/jobs/base.py +287 -0
- brawny/jobs/discovery.py +233 -0
- brawny/jobs/job_validation.py +111 -0
- brawny/jobs/kv.py +125 -0
- brawny/jobs/registry.py +283 -0
- brawny/keystore.py +484 -0
- brawny/lifecycle.py +551 -0
- brawny/logging.py +290 -0
- brawny/metrics.py +594 -0
- brawny/model/__init__.py +53 -0
- brawny/model/contexts.py +319 -0
- brawny/model/enums.py +70 -0
- brawny/model/errors.py +194 -0
- brawny/model/events.py +93 -0
- brawny/model/startup.py +20 -0
- brawny/model/types.py +483 -0
- brawny/networks/__init__.py +96 -0
- brawny/networks/config.py +269 -0
- brawny/networks/manager.py +423 -0
- brawny/obs/__init__.py +67 -0
- brawny/obs/emit.py +158 -0
- brawny/obs/health.py +175 -0
- brawny/obs/heartbeat.py +133 -0
- brawny/reconciliation.py +108 -0
- brawny/scheduler/__init__.py +19 -0
- brawny/scheduler/poller.py +472 -0
- brawny/scheduler/reorg.py +632 -0
- brawny/scheduler/runner.py +708 -0
- brawny/scheduler/shutdown.py +371 -0
- brawny/script_tx.py +297 -0
- brawny/scripting.py +251 -0
- brawny/startup.py +76 -0
- brawny/telegram.py +393 -0
- brawny/testing.py +108 -0
- brawny/tx/__init__.py +41 -0
- brawny/tx/executor.py +1071 -0
- brawny/tx/fees.py +50 -0
- brawny/tx/intent.py +423 -0
- brawny/tx/monitor.py +628 -0
- brawny/tx/nonce.py +498 -0
- brawny/tx/replacement.py +456 -0
- brawny/tx/utils.py +26 -0
- brawny/utils.py +205 -0
- brawny/validation.py +69 -0
- brawny-0.1.13.dist-info/METADATA +156 -0
- brawny-0.1.13.dist-info/RECORD +141 -0
- brawny-0.1.13.dist-info/WHEEL +5 -0
- brawny-0.1.13.dist-info/entry_points.txt +2 -0
- brawny-0.1.13.dist-info/top_level.txt +1 -0
brawny/obs/__init__.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Observability module for brawny.
|
|
2
|
+
|
|
3
|
+
Provides structured logging, liveness heartbeats, and readiness health checks.
|
|
4
|
+
|
|
5
|
+
See LOGGING_METRICS_PLAN.md for design rationale and usage patterns.
|
|
6
|
+
|
|
7
|
+
Quick Reference:
|
|
8
|
+
# Logging via emit() gateway
|
|
9
|
+
from brawny.obs import emit, get_logger, bind_intent
|
|
10
|
+
|
|
11
|
+
log = get_logger(worker_id=1, chain_id=1)
|
|
12
|
+
log = bind_intent(log, intent_id=str(intent.intent_id), job_id=intent.job_id)
|
|
13
|
+
emit(log, level="info", event="tx", result="broadcast", tx_hash=hash)
|
|
14
|
+
|
|
15
|
+
# Heartbeat for liveness
|
|
16
|
+
from brawny.obs import get_heartbeat
|
|
17
|
+
|
|
18
|
+
heartbeat = get_heartbeat("block_poller")
|
|
19
|
+
heartbeat.beat() # Call in loop
|
|
20
|
+
|
|
21
|
+
# Health state for readiness
|
|
22
|
+
from brawny.obs import get_health_state
|
|
23
|
+
|
|
24
|
+
health = get_health_state()
|
|
25
|
+
health.update_db(db.ping())
|
|
26
|
+
if not health.is_ready():
|
|
27
|
+
return 503
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
from brawny.obs.emit import (
|
|
31
|
+
ALLOWED,
|
|
32
|
+
RUN_ID,
|
|
33
|
+
bind_attempt,
|
|
34
|
+
bind_intent,
|
|
35
|
+
emit,
|
|
36
|
+
get_logger,
|
|
37
|
+
)
|
|
38
|
+
from brawny.obs.health import (
|
|
39
|
+
HealthState,
|
|
40
|
+
get_health_state,
|
|
41
|
+
reset_health_state,
|
|
42
|
+
)
|
|
43
|
+
from brawny.obs.heartbeat import (
|
|
44
|
+
Heartbeat,
|
|
45
|
+
all_heartbeat_ages,
|
|
46
|
+
any_stale,
|
|
47
|
+
get_heartbeat,
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
__all__ = [
|
|
51
|
+
# emit.py
|
|
52
|
+
"ALLOWED",
|
|
53
|
+
"RUN_ID",
|
|
54
|
+
"bind_attempt",
|
|
55
|
+
"bind_intent",
|
|
56
|
+
"emit",
|
|
57
|
+
"get_logger",
|
|
58
|
+
# health.py
|
|
59
|
+
"HealthState",
|
|
60
|
+
"get_health_state",
|
|
61
|
+
"reset_health_state",
|
|
62
|
+
# heartbeat.py
|
|
63
|
+
"Heartbeat",
|
|
64
|
+
"all_heartbeat_ages",
|
|
65
|
+
"any_stale",
|
|
66
|
+
"get_heartbeat",
|
|
67
|
+
]
|
brawny/obs/emit.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""Structured logging gateway for brawny.
|
|
2
|
+
|
|
3
|
+
The emit() function is the single enforcement choke point for all logging.
|
|
4
|
+
It validates event/result pairs, normalizes error fields, and controls trace inclusion.
|
|
5
|
+
|
|
6
|
+
See LOGGING_METRICS_PLAN.md for design rationale.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
import uuid
|
|
13
|
+
from typing import TYPE_CHECKING
|
|
14
|
+
|
|
15
|
+
import structlog
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
# Run ID for correlating logs across restarts
RUN_ID = os.environ.get("BRAWNY_RUN_ID") or f"run_{uuid.uuid4().hex[:12]}"

# Allowed event families and their valid results
# Any (event, result) pair not in this dict will raise ValueError
ALLOWED: dict[str, set[str]] = {
    "job.check": {"triggered", "skipped", "timeout", "error"},
    "intent": {"created", "claimed", "executed", "failed", "status"},
    "tx": {"signed", "broadcast", "confirmed", "failed", "replaced"},
    "rpc": {"ok", "error", "timeout"},
    "nonce": {"reserved", "released", "reconciled"},
    "block": {"processed", "reorg"},
    "system": {"started", "draining", "shutdown"},
}

# Error message truncation limit
ERROR_MESSAGE_MAX_LENGTH = 500


def emit(
    log: structlog.stdlib.BoundLogger,
    *,
    level: str,
    event: str,
    result: str,
    err: Exception | None = None,
    is_terminal: bool = False,
    **fields: Any,
) -> None:
    """Emit a structured log event.

    This is the single enforcement choke point for all logging in brawny.
    It validates event/result pairs, normalizes error fields, and controls
    stack trace inclusion.

    Args:
        log: Bound logger instance
        level: Log level ("debug", "info", "warning", "error"); matched
            case-insensitively against the logger's method names
        event: Event family (e.g., "tx", "intent", "job.check")
        result: Event result (e.g., "confirmed", "failed", "triggered")
        err: Optional exception for error events. When given, "error_type"
            and "error" fields are set (overwriting same-named fields)
        is_terminal: If True and err is provided, include stack trace
        **fields: Additional context fields

    Raises:
        ValueError: If (event, result) pair is not in ALLOWED
        AttributeError: If the logger has no method named after ``level``

    Example:
        emit(log, level="info", event="tx", result="broadcast", tx_hash=hash)
        emit(log, level="error", event="tx", result="failed", err=e, is_terminal=True)
    """
    # Validate event/result pair
    if event not in ALLOWED:
        raise ValueError(f"Invalid event family: {event!r}. Must be one of: {sorted(ALLOWED.keys())}")
    if result not in ALLOWED[event]:
        raise ValueError(
            f"Invalid result {result!r} for event {event!r}. Must be one of: {sorted(ALLOWED[event])}"
        )

    # Normalize error fields
    if err is not None:
        msg = str(err)
        fields["error_type"] = type(err).__name__
        # Truncate long error messages (the "..." marker is appended on top
        # of the ERROR_MESSAGE_MAX_LENGTH characters that are kept)
        if len(msg) > ERROR_MESSAGE_MAX_LENGTH:
            fields["error"] = msg[:ERROR_MESSAGE_MAX_LENGTH] + "..."
        else:
            fields["error"] = msg

        if is_terminal:
            # Hand over the exception object itself instead of True: True
            # makes the logging backend consult sys.exc_info(), which is
            # empty when emit() runs outside an active ``except`` block, so
            # the stack trace would silently be dropped. Assigning into
            # ``fields`` also avoids a duplicate-keyword TypeError if the
            # caller happened to pass exc_info themselves.
            fields["exc_info"] = err

    # Get the logging function for this level
    log_fn = getattr(log, level.lower())

    # Dispatch - exc_info travels as a keyword argument to the logger call
    log_fn(event, result=result, **fields)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def get_logger(**bind: Any) -> structlog.stdlib.BoundLogger:
    """Return the "brawny" structlog logger with ``run_id`` pre-bound.

    Intended as the entry point for obtaining a base logger at component
    boundaries.

    Args:
        **bind: Extra key/value pairs to bind alongside run_id
            (e.g., worker_id, chain_id)

    Returns:
        Bound logger carrying run_id plus the supplied fields

    Example:
        log = get_logger(worker_id=1, chain_id=1)
    """
    base_logger = structlog.get_logger("brawny")
    return base_logger.bind(run_id=RUN_ID, **bind)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def bind_intent(
    log: structlog.stdlib.BoundLogger,
    *,
    intent_id: str,
    job_id: str,
) -> structlog.stdlib.BoundLogger:
    """Attach intent identifiers to a logger.

    Args:
        log: Logger to derive from
        intent_id: Intent UUID as string
        job_id: Job identifier

    Returns:
        Result of ``log.bind`` with intent_id and job_id bound
    """
    intent_context = {"intent_id": intent_id, "job_id": job_id}
    return log.bind(**intent_context)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def bind_attempt(
    log: structlog.stdlib.BoundLogger,
    *,
    attempt_id: str,
    nonce: int | None = None,
) -> structlog.stdlib.BoundLogger:
    """Attach transaction-attempt identifiers to a logger.

    Args:
        log: Logger to derive from (typically already carrying intent context)
        attempt_id: Attempt UUID as string
        nonce: Transaction nonce; bound only when it is not None

    Returns:
        Result of ``log.bind`` with attempt_id (and nonce, if given) bound
    """
    attempt_context: dict[str, object] = {"attempt_id": attempt_id}
    # Compare against None explicitly so that a nonce of 0 is still bound
    if nonce is not None:
        attempt_context["nonce"] = nonce
    return log.bind(**attempt_context)
|
brawny/obs/health.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"""Cached health state for readiness probes.
|
|
2
|
+
|
|
3
|
+
Readiness probes (/readyz) must be fast and never block on slow checks.
|
|
4
|
+
This module provides cached health state that's updated by background loops.
|
|
5
|
+
|
|
6
|
+
See LOGGING_METRICS_PLAN.md Section 4.1.3 for design rationale.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import time
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from threading import Lock
|
|
14
|
+
from typing import TYPE_CHECKING
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from threading import Thread
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class HealthState:
    """Lock-protected cache of component health used for readiness checks.

    Writers call the update_* methods; readers call is_ready(),
    readiness_reasons() or to_dict(). Every method takes the instance lock
    before touching the fields.

    NOT READY when any of the following holds:
        - shutdown_requested is True (draining)
        - db_ok is False
        - rpc_ok is False
        - workers_ok is False

    Usage:
        # Background loop updates state
        health_state.update_db(db.ping())
        health_state.update_rpc(rpc.any_healthy())
        health_state.update_workers(worker_threads)

        # /readyz reads cached state
        if not health_state.is_ready():
            return Response(status_code=503)
        return Response(status_code=200)
    """

    # Cached component health (all start out healthy)
    db_ok: bool = True
    rpc_ok: bool = True
    workers_ok: bool = True

    # Draining state
    shutdown_requested: bool = False

    # time.time() of the most recent update per component; 0.0 = never updated
    last_db_check: float = 0.0
    last_rpc_check: float = 0.0
    last_workers_check: float = 0.0

    # Thread safety
    _lock: Lock = field(default_factory=Lock, repr=False)

    def update_db(self, ok: bool) -> None:
        """Record the latest database health result and its timestamp."""
        with self._lock:
            self.db_ok, self.last_db_check = ok, time.time()

    def update_rpc(self, ok: bool) -> None:
        """Record the latest RPC health result and its timestamp."""
        with self._lock:
            self.rpc_ok, self.last_rpc_check = ok, time.time()

    def update_workers(self, threads: list["Thread"]) -> None:
        """Record worker health: healthy when at least one thread is alive.

        Args:
            threads: Worker threads to inspect; an empty (or falsy) value
                counts as "no workers alive"
        """
        with self._lock:
            self.workers_ok = bool(threads) and any(
                worker.is_alive() for worker in threads
            )
            self.last_workers_check = time.time()

    def request_shutdown(self) -> None:
        """Flag the system as draining.

        After this call is_ready() returns False.
        """
        with self._lock:
            self.shutdown_requested = True

    def is_ready(self) -> bool:
        """Report whether the system is ready to accept work.

        Returns:
            True only when no shutdown was requested and the db, rpc and
            workers flags are all truthy; False otherwise
        """
        with self._lock:
            return not (
                self.shutdown_requested
                or not self.db_ok
                or not self.rpc_ok
                or not self.workers_ok
            )

    def readiness_reasons(self) -> list[str]:
        """List the reasons the system is currently not ready.

        Returns:
            Reason codes in a fixed order (shutdown, db, rpc, workers);
            empty when the system is ready
        """
        with self._lock:
            checks = (
                (self.shutdown_requested, "shutdown_requested"),
                (not self.db_ok, "db_unhealthy"),
                (not self.rpc_ok, "rpc_unhealthy"),
                (not self.workers_ok, "no_workers_alive"),
            )
            return [reason for failing, reason in checks if failing]

    def to_dict(self) -> dict[str, object]:
        """Return a snapshot of the full health state as a dictionary."""
        with self._lock:
            # Readiness is computed here rather than via is_ready(): that
            # method takes the same non-reentrant lock and would deadlock.
            ready = (
                not self.shutdown_requested
                and self.db_ok
                and self.rpc_ok
                and self.workers_ok
            )
            snapshot: dict[str, object] = {"ready": ready}
            for key in (
                "shutdown_requested",
                "db_ok",
                "rpc_ok",
                "workers_ok",
                "last_db_check",
                "last_rpc_check",
                "last_workers_check",
            ):
                snapshot[key] = getattr(self, key)
            return snapshot
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# Global health state singleton
|
|
155
|
+
_health_state: HealthState | None = None
|
|
156
|
+
_health_state_lock = Lock()
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def get_health_state() -> HealthState:
    """Return the process-wide HealthState, creating it on first use.

    The lookup and lazy creation both happen under _health_state_lock.
    """
    global _health_state
    with _health_state_lock:
        current = _health_state
        if current is None:
            current = _health_state = HealthState()
        return current
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def reset_health_state() -> None:
    """Discard the global health state singleton (for testing).

    The module-level reference is set back to None under
    _health_state_lock.
    """
    global _health_state
    with _health_state_lock:
        _health_state = None
|
brawny/obs/heartbeat.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""Heartbeat-based liveness for brawny.
|
|
2
|
+
|
|
3
|
+
Liveness is NOT "can we respond to HTTP?" - it's "is the core loop making progress?"
|
|
4
|
+
|
|
5
|
+
The Heartbeat class tracks when critical loops last made progress.
|
|
6
|
+
/livez returns 503 when the heartbeat is stale (no progress in 30s).
|
|
7
|
+
|
|
8
|
+
See LOGGING_METRICS_PLAN.md Section 4.1.2 for design rationale.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import time
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from threading import Lock
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class Heartbeat:
    """Record when a loop last reported progress.

    The heartbeat counts as stale when beat() has never been called, or when
    more than max_age_seconds have elapsed since the last call. All methods
    take the instance lock, so beat() and the readers may run on different
    threads.

    Usage:
        heartbeat = Heartbeat()

        # In the critical loop
        while not stop_event.is_set():
            heartbeat.beat()
            # ... do work ...

        # In /livez endpoint
        if heartbeat.is_stale():
            return Response(status_code=503)
        return Response(status_code=200)
    """

    # time.time() of the most recent beat(); 0.0 means "never beat"
    last_beat_ts: float = 0.0
    _lock: Lock = field(default_factory=Lock, repr=False)

    def beat(self) -> None:
        """Stamp the current time as the latest sign of progress."""
        with self._lock:
            self.last_beat_ts = time.time()

    def is_stale(self, max_age_seconds: float = 30.0) -> bool:
        """Tell whether the heartbeat is stale.

        Args:
            max_age_seconds: Maximum allowed time since the last beat
                (defaults to 30 seconds)

        Returns:
            True if beat() was never called (last_beat_ts == 0.0) or if more
            than max_age_seconds have passed since the last beat
        """
        with self._lock:
            never_beat = self.last_beat_ts == 0.0
            if never_beat:
                return True
            elapsed = time.time() - self.last_beat_ts
            return elapsed > max_age_seconds

    def age_seconds(self) -> float:
        """Return how long ago the last beat happened.

        Returns:
            Seconds since the last beat, or float('inf') if never beat
        """
        with self._lock:
            if self.last_beat_ts != 0.0:
                return time.time() - self.last_beat_ts
            return float("inf")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# Global heartbeats for critical loops
|
|
83
|
+
_heartbeats: dict[str, Heartbeat] = {}
|
|
84
|
+
_heartbeats_lock = Lock()
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def get_heartbeat(name: str) -> Heartbeat:
    """Look up a named heartbeat, registering a new one if absent.

    Example names:
        - "block_poller" for the block processing loop
        - "monitor" for the transaction monitor loop

    Args:
        name: Identifier for the heartbeat

    Returns:
        The Heartbeat instance registered under this name
    """
    with _heartbeats_lock:
        try:
            return _heartbeats[name]
        except KeyError:
            created = _heartbeats[name] = Heartbeat()
            return created
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def any_stale(max_age_seconds: float = 30.0) -> bool:
    """Report whether at least one registered heartbeat is stale.

    Args:
        max_age_seconds: Maximum allowed time since last beat

    Returns:
        True if any heartbeat is stale; False if none is, including when
        no heartbeat has been registered yet
    """
    with _heartbeats_lock:
        for heartbeat in _heartbeats.values():
            if heartbeat.is_stale(max_age_seconds):
                return True
        # Reached for an empty registry as well as an all-fresh one
        return False
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def all_heartbeat_ages() -> dict[str, float]:
    """Collect the age of every registered heartbeat.

    Returns:
        Dict mapping heartbeat name to the value of its age_seconds()
    """
    ages: dict[str, float] = {}
    with _heartbeats_lock:
        for name, heartbeat in _heartbeats.items():
            ages[name] = heartbeat.age_seconds()
        return ages
|
brawny/reconciliation.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Startup reconciliation for detecting and repairing inconsistent state.
|
|
2
|
+
|
|
3
|
+
Phase 1 implementation: runs at startup only.
|
|
4
|
+
Phase 2 will add periodic reconciliation after metrics prove stability.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import asdict, dataclass
|
|
10
|
+
from typing import TYPE_CHECKING
|
|
11
|
+
|
|
12
|
+
from brawny.logging import get_logger
|
|
13
|
+
from brawny.metrics import get_metrics
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from brawny.db.base import Database
|
|
17
|
+
|
|
18
|
+
logger = get_logger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class ReconciliationStats:
    """Counters collected during one reconciliation run (all default to 0)."""

    # Repairs performed
    orphaned_claims_cleared: int = 0
    orphaned_nonces_released: int = 0
    # Anomalies detected but not repaired
    pending_without_attempts: int = 0
    stale_claims: int = 0
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def reconcile_startup(db: Database, chain_id: int) -> ReconciliationStats:
    """Run the startup reconciliation pass for one chain.

    Repairs:
        - Orphaned claims (status != claimed but claim_token set, stale)
        - Orphaned nonces (reserved but intent is terminal and stale)

    Detects (logs only, no repair):
        - Pending intents without attempts (data integrity issue)
        - Stale claimed intents (worker may have crashed)

    Each non-zero count is logged, all four counts are published as gauges,
    and a final "reconciliation.completed" event carries the full stats.

    Args:
        db: Database connection
        chain_id: Chain ID to reconcile

    Returns:
        Statistics from the reconciliation run
    """
    # Repair: clear orphaned claims (with time guard)
    claims_cleared = db.clear_orphaned_claims(chain_id, older_than_minutes=2)
    if claims_cleared > 0:
        logger.warning(
            "reconciliation.orphaned_claims_cleared",
            count=claims_cleared,
            chain_id=chain_id,
        )

    # Repair: release orphaned nonces (with time guard)
    nonces_released = db.release_orphaned_nonces(chain_id, older_than_minutes=5)
    if nonces_released > 0:
        logger.warning(
            "reconciliation.orphaned_nonces_released",
            count=nonces_released,
            chain_id=chain_id,
        )

    # Detect: pending without attempts (log only - needs investigation)
    pending_no_attempts = db.count_pending_without_attempts(chain_id)
    if pending_no_attempts > 0:
        logger.error(
            "reconciliation.pending_without_attempts",
            count=pending_no_attempts,
            chain_id=chain_id,
            action="manual_investigation_required",
        )

    # Detect: stale claims (log only - may self-recover or need intervention)
    stale = db.count_stale_claims(chain_id, older_than_minutes=10)
    if stale > 0:
        logger.warning(
            "reconciliation.stale_claims",
            count=stale,
            chain_id=chain_id,
        )

    stats = ReconciliationStats(
        orphaned_claims_cleared=claims_cleared,
        orphaned_nonces_released=nonces_released,
        pending_without_attempts=pending_no_attempts,
        stale_claims=stale,
    )

    # Emit metrics: one gauge per counter, labelled with the chain
    metrics = get_metrics()
    gauge_values = (
        ("brawny_reconciliation_orphaned_claims", stats.orphaned_claims_cleared),
        ("brawny_reconciliation_orphaned_nonces", stats.orphaned_nonces_released),
        ("brawny_reconciliation_pending_no_attempts", stats.pending_without_attempts),
        ("brawny_reconciliation_stale_claims", stats.stale_claims),
    )
    for gauge_name, value in gauge_values:
        metrics.gauge(gauge_name).set(value, chain_id=chain_id)

    logger.info("reconciliation.completed", **asdict(stats), chain_id=chain_id)
    return stats
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Block poller, reorg detection, and job scheduler."""
|
|
2
|
+
|
|
3
|
+
from brawny.scheduler.poller import BlockPoller, PollResult
|
|
4
|
+
from brawny.scheduler.reorg import ReorgDetector, ReorgResult
|
|
5
|
+
from brawny.scheduler.runner import BlockResult, JobResult, JobRunner
|
|
6
|
+
from brawny.scheduler.shutdown import ShutdownContext, ShutdownHandler, ShutdownStats
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"BlockPoller",
|
|
10
|
+
"PollResult",
|
|
11
|
+
"ReorgDetector",
|
|
12
|
+
"ReorgResult",
|
|
13
|
+
"JobRunner",
|
|
14
|
+
"JobResult",
|
|
15
|
+
"BlockResult",
|
|
16
|
+
"ShutdownHandler",
|
|
17
|
+
"ShutdownContext",
|
|
18
|
+
"ShutdownStats",
|
|
19
|
+
]
|