brawny 0.1.13__py3-none-any.whl → 0.1.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. brawny/__init__.py +2 -0
  2. brawny/_context.py +5 -5
  3. brawny/_rpc/__init__.py +36 -12
  4. brawny/_rpc/broadcast.py +14 -13
  5. brawny/_rpc/caller.py +243 -0
  6. brawny/_rpc/client.py +539 -0
  7. brawny/_rpc/clients.py +11 -11
  8. brawny/_rpc/context.py +23 -0
  9. brawny/_rpc/errors.py +465 -31
  10. brawny/_rpc/gas.py +7 -6
  11. brawny/_rpc/pool.py +18 -0
  12. brawny/_rpc/retry.py +266 -0
  13. brawny/_rpc/retry_policy.py +81 -0
  14. brawny/accounts.py +28 -9
  15. brawny/alerts/__init__.py +15 -18
  16. brawny/alerts/abi_resolver.py +212 -36
  17. brawny/alerts/base.py +2 -2
  18. brawny/alerts/contracts.py +77 -10
  19. brawny/alerts/errors.py +30 -3
  20. brawny/alerts/events.py +38 -5
  21. brawny/alerts/health.py +19 -13
  22. brawny/alerts/send.py +513 -55
  23. brawny/api.py +39 -11
  24. brawny/assets/AGENTS.md +325 -0
  25. brawny/async_runtime.py +48 -0
  26. brawny/chain.py +3 -3
  27. brawny/cli/commands/__init__.py +2 -0
  28. brawny/cli/commands/console.py +69 -19
  29. brawny/cli/commands/contract.py +2 -2
  30. brawny/cli/commands/controls.py +121 -0
  31. brawny/cli/commands/health.py +2 -2
  32. brawny/cli/commands/job_dev.py +6 -5
  33. brawny/cli/commands/jobs.py +99 -2
  34. brawny/cli/commands/maintenance.py +13 -29
  35. brawny/cli/commands/migrate.py +1 -0
  36. brawny/cli/commands/run.py +10 -3
  37. brawny/cli/commands/script.py +8 -3
  38. brawny/cli/commands/signer.py +143 -26
  39. brawny/cli/helpers.py +0 -3
  40. brawny/cli_templates.py +25 -349
  41. brawny/config/__init__.py +4 -1
  42. brawny/config/models.py +43 -57
  43. brawny/config/parser.py +268 -57
  44. brawny/config/validation.py +52 -15
  45. brawny/daemon/context.py +4 -2
  46. brawny/daemon/core.py +185 -63
  47. brawny/daemon/loops.py +166 -98
  48. brawny/daemon/supervisor.py +261 -0
  49. brawny/db/__init__.py +14 -26
  50. brawny/db/base.py +248 -151
  51. brawny/db/global_cache.py +11 -1
  52. brawny/db/migrate.py +175 -28
  53. brawny/db/migrations/001_init.sql +4 -3
  54. brawny/db/migrations/010_add_nonce_gap_index.sql +1 -1
  55. brawny/db/migrations/011_add_job_logs.sql +1 -2
  56. brawny/db/migrations/012_add_claimed_by.sql +2 -2
  57. brawny/db/migrations/013_attempt_unique.sql +10 -0
  58. brawny/db/migrations/014_add_lease_expires_at.sql +5 -0
  59. brawny/db/migrations/015_add_signer_alias.sql +14 -0
  60. brawny/db/migrations/016_runtime_controls_and_quarantine.sql +32 -0
  61. brawny/db/migrations/017_add_job_drain.sql +6 -0
  62. brawny/db/migrations/018_add_nonce_reset_audit.sql +20 -0
  63. brawny/db/migrations/019_add_job_cooldowns.sql +8 -0
  64. brawny/db/migrations/020_attempt_unique_initial.sql +7 -0
  65. brawny/db/ops/__init__.py +3 -25
  66. brawny/db/ops/logs.py +1 -2
  67. brawny/db/queries.py +47 -91
  68. brawny/db/serialized.py +65 -0
  69. brawny/db/sqlite/__init__.py +1001 -0
  70. brawny/db/sqlite/connection.py +231 -0
  71. brawny/db/sqlite/execute.py +116 -0
  72. brawny/db/sqlite/mappers.py +190 -0
  73. brawny/db/sqlite/repos/attempts.py +372 -0
  74. brawny/db/sqlite/repos/block_state.py +102 -0
  75. brawny/db/sqlite/repos/cache.py +104 -0
  76. brawny/db/sqlite/repos/intents.py +1021 -0
  77. brawny/db/sqlite/repos/jobs.py +200 -0
  78. brawny/db/sqlite/repos/maintenance.py +182 -0
  79. brawny/db/sqlite/repos/signers_nonces.py +566 -0
  80. brawny/db/sqlite/tx.py +119 -0
  81. brawny/http.py +194 -0
  82. brawny/invariants.py +11 -24
  83. brawny/jobs/base.py +8 -0
  84. brawny/jobs/job_validation.py +2 -1
  85. brawny/keystore.py +83 -7
  86. brawny/lifecycle.py +64 -12
  87. brawny/logging.py +0 -2
  88. brawny/metrics.py +84 -12
  89. brawny/model/contexts.py +111 -9
  90. brawny/model/enums.py +1 -0
  91. brawny/model/errors.py +18 -0
  92. brawny/model/types.py +47 -131
  93. brawny/network_guard.py +133 -0
  94. brawny/networks/__init__.py +5 -5
  95. brawny/networks/config.py +1 -7
  96. brawny/networks/manager.py +14 -11
  97. brawny/runtime_controls.py +74 -0
  98. brawny/scheduler/poller.py +11 -7
  99. brawny/scheduler/reorg.py +95 -39
  100. brawny/scheduler/runner.py +442 -168
  101. brawny/scheduler/shutdown.py +3 -3
  102. brawny/script_tx.py +3 -3
  103. brawny/telegram.py +53 -7
  104. brawny/testing.py +1 -0
  105. brawny/timeout.py +38 -0
  106. brawny/tx/executor.py +922 -308
  107. brawny/tx/intent.py +54 -16
  108. brawny/tx/monitor.py +31 -12
  109. brawny/tx/nonce.py +212 -90
  110. brawny/tx/replacement.py +69 -18
  111. brawny/tx/retry_policy.py +24 -0
  112. brawny/tx/stages/types.py +75 -0
  113. brawny/types.py +18 -0
  114. brawny/utils.py +41 -0
  115. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/METADATA +3 -3
  116. brawny-0.1.22.dist-info/RECORD +163 -0
  117. brawny/_rpc/manager.py +0 -982
  118. brawny/_rpc/selector.py +0 -156
  119. brawny/db/base_new.py +0 -165
  120. brawny/db/mappers.py +0 -182
  121. brawny/db/migrations/008_add_transactions.sql +0 -72
  122. brawny/db/ops/attempts.py +0 -108
  123. brawny/db/ops/blocks.py +0 -83
  124. brawny/db/ops/cache.py +0 -93
  125. brawny/db/ops/intents.py +0 -296
  126. brawny/db/ops/jobs.py +0 -110
  127. brawny/db/ops/nonces.py +0 -322
  128. brawny/db/postgres.py +0 -2535
  129. brawny/db/postgres_new.py +0 -196
  130. brawny/db/sqlite.py +0 -2733
  131. brawny/db/sqlite_new.py +0 -191
  132. brawny-0.1.13.dist-info/RECORD +0 -141
  133. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/WHEEL +0 -0
  134. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/entry_points.txt +0 -0
  135. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/top_level.txt +0 -0
brawny/_rpc/manager.py DELETED
@@ -1,982 +0,0 @@
1
- """RPC Manager with multi-endpoint failover and health tracking.
2
-
3
- Implements OE6 simplification:
4
- - Uses EndpointSelector for health-aware endpoint ordering
5
- - Explicit failover gate: only failover on RPCRetryableError
6
- - Per-attempt metrics (requests, latency, errors, failovers)
7
- - Selector health updates only on transport failures
8
-
9
- OE6 Invariants (LOCKED):
10
- 1. Failover occurs ONLY on RPCRetryableError (explicit issubclass check)
11
- 2. Fatal + Recoverable errors NEVER trigger failover (raise immediately)
12
- 3. Per-attempt metrics: requests, latency (success AND failure), errors, failovers
13
- 4. Selector health updates ONLY on retryable failures (not Fatal/Recoverable)
14
- 5. Selector returns ALL endpoints (unhealthy moved to end, not removed)
15
- """
16
-
17
- from __future__ import annotations
18
-
19
- import re
20
- import time
21
- from urllib.parse import urlsplit, urlunsplit
22
- from typing import TYPE_CHECKING, Any
23
-
24
- from requests.auth import HTTPBasicAuth
25
- from web3 import Web3
26
- from web3.exceptions import TransactionNotFound
27
-
28
- from brawny.logging import LogEvents, get_logger
29
- from brawny.metrics import (
30
- RPC_ENDPOINT_HEALTH,
31
- RPC_ERRORS,
32
- RPC_FAILOVERS,
33
- RPC_REQUESTS,
34
- RPC_REQUESTS_BY_JOB,
35
- RPC_REQUEST_SECONDS,
36
- get_metrics,
37
- )
38
- from brawny._rpc.context import get_job_context
39
- from brawny.model.errors import (
40
- SimulationNetworkError,
41
- SimulationReverted,
42
- )
43
- from brawny._rpc.errors import (
44
- RPCError,
45
- RPCFatalError,
46
- RPCPoolExhaustedError,
47
- RPCRecoverableError,
48
- RPCRetryableError,
49
- classify_error,
50
- normalize_error_code,
51
- )
52
- from brawny._rpc.selector import EndpointSelector
53
-
54
- if TYPE_CHECKING:
55
- from brawny.config import Config
56
- from brawny._rpc.gas import GasQuote, GasQuoteCache
57
-
58
- logger = get_logger(__name__)
59
-
60
- # RPC methods that broadcast transactions (vs read-only)
61
- RPC_BROADCAST_METHODS = {"eth_sendRawTransaction", "eth_sendTransaction"}
62
-
63
-
64
def _rpc_category(method: str) -> str:
    """Classify an RPC method name as 'broadcast' or 'read'.

    Broadcast methods are those in RPC_BROADCAST_METHODS; everything
    else is treated as a read.
    """
    if method in RPC_BROADCAST_METHODS:
        return "broadcast"
    return "read"
67
-
68
-
69
- def _rpc_host(url: str, allowed_hosts: frozenset[str] | None = None) -> str:
70
- """Extract hostname from URL, stripping credentials/path/query.
71
-
72
- Returns 'unknown' if parse fails, 'other' if host not in allowed set.
73
- This provides cardinality protection for Prometheus metrics.
74
- """
75
- try:
76
- parsed = urlsplit(url)
77
- host = parsed.hostname or "unknown"
78
- if parsed.port and parsed.port not in (80, 443):
79
- host = f"{host}:{parsed.port}"
80
- # Cardinality guardrail: coerce unknown hosts
81
- if allowed_hosts and host not in allowed_hosts:
82
- return "other"
83
- return host
84
- except Exception:
85
- return "unknown"
86
-
87
-
88
def _extract_url_auth(url: str) -> tuple[str, HTTPBasicAuth | None]:
    """Split embedded Basic Auth credentials out of *url*.

    Args:
        url: URL that may embed credentials (https://user:pass@host)

    Returns:
        Tuple of (clean_url, auth). When the URL carries a username the
        returned URL has the credentials removed and *auth* is an
        HTTPBasicAuth built from them; otherwise the URL is returned
        unchanged with auth=None.

    Example:
        >>> _extract_url_auth("https://guest:secret@eth.example.com/rpc")
        ("https://eth.example.com/rpc", HTTPBasicAuth("guest", "secret"))
    """
    parts = urlsplit(url)
    if not parts.username:
        return url, None
    # Rebuild the netloc without the user:pass@ prefix.
    netloc = parts.hostname or ""
    if parts.port:
        netloc = f"{netloc}:{parts.port}"
    stripped = urlunsplit((parts.scheme, netloc, parts.path, parts.query, parts.fragment))
    return stripped, HTTPBasicAuth(parts.username, parts.password or "")
112
-
113
-
114
- class RPCManager:
115
- """RPC manager with failover and health tracking.
116
-
117
- Provides a high-level interface for RPC calls with:
118
- - Automatic retry with exponential backoff
119
- - Endpoint health tracking via EndpointSelector
120
- - Explicit failover gate (only on RPCRetryableError)
121
-
122
- OE6 Simplification:
123
- - Removed circuit breaker (logging-only, no blocking)
124
- - Removed rate limiter (RPC providers handle this)
125
- - Uses EndpointSelector for health-aware ordering
126
- """
127
-
128
- def __init__(
129
- self,
130
- endpoints: list[str],
131
- timeout_seconds: float = 30.0,
132
- max_retries: int = 3,
133
- retry_backoff_base: float = 1.0,
134
- circuit_breaker_seconds: int = 300,
135
- rate_limit_per_second: float | None = None,
136
- rate_limit_burst: int | None = None,
137
- rate_limits: dict[str, dict[str, float | int]] | None = None,
138
- chain_id: int | None = None,
139
- gas_refresh_seconds: int = 15,
140
- log_init: bool = True,
141
- ) -> None:
142
- """Initialize RPC manager.
143
-
144
- Args:
145
- endpoints: List of RPC endpoint URLs
146
- timeout_seconds: Request timeout
147
- max_retries: Maximum retry attempts (try up to N different endpoints)
148
- retry_backoff_base: Base for exponential backoff
149
- circuit_breaker_seconds: Ignored (kept for backwards compatibility)
150
- rate_limit_per_second: Ignored (kept for backwards compatibility)
151
- rate_limit_burst: Ignored (kept for backwards compatibility)
152
- rate_limits: Ignored (kept for backwards compatibility)
153
- chain_id: Chain ID for validation
154
- gas_refresh_seconds: TTL for gas quote cache
155
- log_init: Whether to log initialization (False for ephemeral broadcast managers)
156
- """
157
- if not endpoints:
158
- raise ValueError("At least one RPC endpoint is required")
159
-
160
- # Use EndpointSelector for health-aware ordering (OE6)
161
- self._selector = EndpointSelector(endpoints, failure_threshold=3)
162
- self._timeout = timeout_seconds
163
- self._max_retries = max_retries
164
- self._backoff_base = retry_backoff_base
165
- self._chain_id = chain_id
166
- self._gas_refresh_seconds = gas_refresh_seconds
167
- self._gas_cache: "GasQuoteCache | None" = None
168
- self._failure_debug_last_ts: dict[tuple[int | None, str, str], float] = {}
169
-
170
- # Create Web3 instances for each endpoint
171
- # Extract Basic Auth credentials from URLs if present (e.g., https://user:pass@host)
172
- self._web3_instances: dict[str, Web3] = {}
173
- for ep in self._selector.endpoints:
174
- clean_url, auth = _extract_url_auth(ep.url)
175
- request_kwargs: dict[str, Any] = {"timeout": timeout_seconds}
176
- if auth:
177
- request_kwargs["auth"] = auth
178
- self._web3_instances[ep.url] = Web3(Web3.HTTPProvider(clean_url, request_kwargs=request_kwargs))
179
-
180
- # Build allowed hosts set for metrics cardinality protection
181
- hosts = []
182
- for ep in self._selector.endpoints:
183
- h = _rpc_host(ep.url) # no allowed_hosts passed - get raw host
184
- if h not in ("unknown", "other"):
185
- hosts.append(h)
186
- self._allowed_hosts = frozenset(hosts)
187
-
188
- if log_init:
189
- logger.info(
190
- "rpc.manager.initialized",
191
- endpoints=len(endpoints),
192
- timeout=timeout_seconds,
193
- max_retries=max_retries,
194
- )
195
-
196
- @classmethod
197
- def from_config(cls, config: Config) -> RPCManager:
198
- """Create RPC manager from config.
199
-
200
- Args:
201
- config: Application configuration
202
-
203
- Returns:
204
- Configured RPC manager
205
- """
206
- from brawny.config.routing import resolve_default_group
207
-
208
- default_group = resolve_default_group(config)
209
- endpoints = config.rpc_groups[default_group].endpoints
210
- return cls(
211
- endpoints=endpoints,
212
- timeout_seconds=config.rpc_timeout_seconds,
213
- max_retries=config.rpc_max_retries,
214
- retry_backoff_base=config.rpc_retry_backoff_base,
215
- circuit_breaker_seconds=config.rpc_circuit_breaker_seconds,
216
- rate_limit_per_second=config.rpc_rate_limit_per_second,
217
- rate_limit_burst=config.rpc_rate_limit_burst,
218
- rate_limits=config.rpc_rate_limits,
219
- chain_id=config.chain_id,
220
- gas_refresh_seconds=config.gas_refresh_seconds,
221
- )
222
-
223
- @property
224
- def web3(self) -> Web3:
225
- """Get Web3 instance for the active (healthiest) endpoint.
226
-
227
- Returns:
228
- Web3 instance configured for the current best endpoint
229
-
230
- Note:
231
- This provides direct web3-py API access. For operations with
232
- automatic retry/failover, use RPCManager methods instead.
233
- """
234
- endpoint = self._selector.get_active_endpoint()
235
- return self._web3_instances[endpoint.url]
236
-
237
- @property
238
- def gas(self) -> "GasQuoteCache":
239
- """Gas quote cache (lazy init)."""
240
- if self._gas_cache is None:
241
- from brawny._rpc.gas import GasQuoteCache
242
-
243
- self._gas_cache = GasQuoteCache(
244
- self,
245
- ttl_seconds=self._gas_refresh_seconds,
246
- )
247
- return self._gas_cache
248
-
249
- async def gas_quote(self) -> "GasQuote":
250
- """Get gas quote (async)."""
251
- return await self.gas.get_quote()
252
-
253
- def gas_quote_sync(self) -> "GasQuote | None":
254
- """Get cached gas quote (sync, for executor)."""
255
- return self.gas.get_quote_sync()
256
-
257
- @staticmethod
258
- def _safe_endpoint_label(url: str) -> str:
259
- split = urlsplit(url)
260
- netloc = split.hostname or ""
261
- if split.port:
262
- netloc = f"{netloc}:{split.port}"
263
- return urlunsplit((split.scheme, netloc, split.path, "", ""))
264
-
265
- def _should_log_failure_debug(self, method: str, endpoint: str) -> bool:
266
- safe_endpoint = self._safe_endpoint_label(endpoint)
267
- key = (self._chain_id, method, safe_endpoint)
268
- now = time.time()
269
- last = self._failure_debug_last_ts.get(key)
270
- if last is None or (now - last) >= 60:
271
- self._failure_debug_last_ts[key] = now
272
- return True
273
- return False
274
-
275
- def call(
276
- self,
277
- method: str,
278
- *args: Any,
279
- timeout: float | None = None,
280
- block_identifier: int | str = "latest",
281
- ) -> Any:
282
- """Execute an RPC call with retry and failover.
283
-
284
- OE6 Invariants:
285
- 1. Failover occurs ONLY on RPCRetryableError (explicit issubclass check)
286
- 2. Fatal + Recoverable errors NEVER trigger failover (raise immediately)
287
- 3. Per-attempt metrics: requests, latency (success AND failure), errors, failovers
288
- 4. Selector health updates ONLY on retryable failures
289
-
290
- Args:
291
- method: RPC method name (e.g., "eth_blockNumber")
292
- *args: Method arguments
293
- timeout: Optional timeout override
294
- block_identifier: Block identifier for state queries
295
-
296
- Returns:
297
- RPC response
298
-
299
- Raises:
300
- RPCFatalError: For non-retryable errors (nonce too low, reverted)
301
- RPCRecoverableError: For errors that need param changes (underpriced)
302
- RPCRetryableError: If all retries exhausted
303
- """
304
- timeout = timeout or self._timeout
305
- last_error: Exception | None = None
306
-
307
- # Get metrics provider once outside retry loop
308
- metrics = get_metrics()
309
- category = _rpc_category(method)
310
-
311
- # Get ordered endpoints from selector (healthiest first, unhealthy at end)
312
- ordered_endpoints = self._selector.order_endpoints()
313
- attempts_to_try = min(self._max_retries, len(ordered_endpoints))
314
-
315
- for attempt, endpoint in enumerate(ordered_endpoints[:attempts_to_try]):
316
- w3 = self._web3_instances[endpoint.url]
317
- rpc_host = _rpc_host(endpoint.url, self._allowed_hosts)
318
-
319
- # Count every attempt (OE6: per-attempt metrics)
320
- metrics.counter(RPC_REQUESTS).inc(
321
- chain_id=self._chain_id,
322
- method=method,
323
- rpc_category=category,
324
- rpc_host=rpc_host,
325
- )
326
-
327
- # Job attribution (if context exists) - no rpc_host to avoid cardinality explosion
328
- job_id = get_job_context()
329
- if job_id:
330
- metrics.counter(RPC_REQUESTS_BY_JOB).inc(
331
- chain_id=self._chain_id,
332
- job_id=job_id,
333
- rpc_category=category,
334
- )
335
-
336
- # Per-attempt timing starts right before execute
337
- start_time = time.time()
338
- try:
339
- result = self._execute_method(w3, method, args, block_identifier)
340
- latency = time.time() - start_time
341
-
342
- # Record success with selector
343
- self._selector.record_success(endpoint.url, latency * 1000)
344
-
345
- # Record latency on success
346
- metrics.histogram(RPC_REQUEST_SECONDS).observe(
347
- latency,
348
- chain_id=self._chain_id,
349
- method=method,
350
- rpc_category=category,
351
- rpc_host=rpc_host,
352
- )
353
-
354
- logger.debug(
355
- LogEvents.RPC_REQUEST,
356
- method=method,
357
- endpoint=self._safe_endpoint_label(endpoint.url),
358
- latency_ms=round(latency * 1000, 1),
359
- )
360
-
361
- return result
362
-
363
- except Exception as e:
364
- latency = time.time() - start_time
365
-
366
- # Record latency on failure too (don't hide slow failures)
367
- metrics.histogram(RPC_REQUEST_SECONDS).observe(
368
- latency,
369
- chain_id=self._chain_id,
370
- method=method,
371
- rpc_category=category,
372
- rpc_host=rpc_host,
373
- )
374
-
375
- # If already an RPCError subclass, preserve it but ensure context
376
- if isinstance(e, RPCError):
377
- if getattr(e, "method", None) is None or getattr(e, "endpoint", None) is None:
378
- raise type(e)(str(e), method=method, endpoint=endpoint.url) from e
379
- raise
380
-
381
- # Classify using existing infrastructure
382
- error_class = classify_error(e)
383
- error_code = normalize_error_code(e)
384
- include_trace = error_code == "unknown_error" or attempt == attempts_to_try - 1
385
-
386
- if self._should_log_failure_debug(method, endpoint.url):
387
- logger.info(
388
- "rpc.failure_debug",
389
- method=method,
390
- endpoint=self._safe_endpoint_label(endpoint.url),
391
- timeout_seconds=timeout,
392
- attempt=attempt + 1,
393
- max_retries=attempts_to_try,
394
- elapsed_ms=round(latency * 1000, 1),
395
- error_type=type(e).__name__,
396
- error_code=error_code,
397
- )
398
-
399
- # EXPLICIT FAILOVER GATE (OE6 Invariant #1):
400
- # Only failover on RPCRetryableError. This prevents future error
401
- # classes from silently becoming failover triggers.
402
- if not issubclass(error_class, RPCRetryableError):
403
- # Fatal, Recoverable, or any new class: raise immediately, no failover
404
- # Don't record failure with selector (not a transport failure)
405
- logger.warning(
406
- LogEvents.RPC_ERROR,
407
- method=method,
408
- endpoint=self._safe_endpoint_label(endpoint.url),
409
- error=str(e)[:200],
410
- error_code=error_code,
411
- attempt=attempt + 1,
412
- max_retries=attempts_to_try,
413
- classified_as=error_class.__name__,
414
- exc_info=include_trace,
415
- )
416
- raise error_class(
417
- str(e),
418
- code=error_code,
419
- endpoint=endpoint.url,
420
- method=method,
421
- ) from e
422
-
423
- # === RPCRetryableError path: record and maybe failover ===
424
- # Only record failure with selector for transport errors (OE6 Invariant #4)
425
- self._selector.record_failure(endpoint.url)
426
-
427
- # Count transport error
428
- metrics.counter(RPC_ERRORS).inc(
429
- chain_id=self._chain_id,
430
- method=method,
431
- rpc_category=category,
432
- rpc_host=rpc_host,
433
- )
434
-
435
- last_error = e
436
- is_last = (attempt == attempts_to_try - 1)
437
-
438
- if is_last:
439
- logger.warning(
440
- LogEvents.RPC_ERROR,
441
- method=method,
442
- endpoint=self._safe_endpoint_label(endpoint.url),
443
- error=str(e)[:200],
444
- error_code=error_code,
445
- attempt=attempt + 1,
446
- max_retries=attempts_to_try,
447
- classified_as="RPCRetryableError",
448
- exc_info=include_trace,
449
- )
450
- else:
451
- # Failover: log and count
452
- metrics.counter(RPC_FAILOVERS).inc(
453
- chain_id=self._chain_id,
454
- method=method,
455
- )
456
- logger.warning(
457
- "rpc.failover",
458
- method=method,
459
- endpoint=self._safe_endpoint_label(endpoint.url),
460
- error=str(e)[:200],
461
- error_code=error_code,
462
- attempt=attempt + 1,
463
- attempts_to_try=attempts_to_try,
464
- classified_as="RPCRetryableError",
465
- )
466
-
467
- # Exponential backoff before failover
468
- backoff = self._backoff_base * (2 ** attempt)
469
- time.sleep(backoff)
470
-
471
- # All retries exhausted
472
- if not self._selector.has_healthy_endpoint():
473
- logger.error(LogEvents.RPC_ALL_ENDPOINTS_FAILED)
474
-
475
- raise RPCRetryableError(
476
- f"All {attempts_to_try} attempts failed: {last_error}",
477
- code="retries_exhausted",
478
- method=method,
479
- )
480
-
481
- def _execute_method(
482
- self,
483
- w3: Web3,
484
- method: str,
485
- args: tuple,
486
- block_identifier: int | str,
487
- ) -> Any:
488
- """Execute an RPC method on a Web3 instance.
489
-
490
- Args:
491
- w3: Web3 instance
492
- method: Method name
493
- args: Method arguments
494
- block_identifier: Block for state queries
495
-
496
- Returns:
497
- Method result
498
- """
499
- # Map common method names to Web3 calls
500
- if method == "eth_blockNumber":
501
- return w3.eth.block_number
502
- elif method == "eth_getBlockByNumber":
503
- block_num = args[0] if args else "latest"
504
- full_tx = args[1] if len(args) > 1 else False
505
- return w3.eth.get_block(block_num, full_transactions=full_tx)
506
- elif method == "eth_getTransactionCount":
507
- address = args[0]
508
- block = args[1] if len(args) > 1 else "pending"
509
- return w3.eth.get_transaction_count(address, block)
510
- elif method == "eth_getTransactionReceipt":
511
- tx_hash = args[0]
512
- try:
513
- return w3.eth.get_transaction_receipt(tx_hash)
514
- except TransactionNotFound:
515
- return None
516
- elif method == "eth_sendRawTransaction":
517
- return w3.eth.send_raw_transaction(args[0])
518
- elif method == "eth_estimateGas":
519
- return w3.eth.estimate_gas(args[0], block_identifier=block_identifier)
520
- elif method == "eth_call":
521
- tx = args[0]
522
- block = args[1] if len(args) > 1 else block_identifier
523
- return w3.eth.call(tx, block_identifier=block)
524
- elif method == "eth_getStorageAt":
525
- address = args[0]
526
- slot = args[1]
527
- block = args[2] if len(args) > 2 else block_identifier
528
- return w3.eth.get_storage_at(address, slot, block_identifier=block)
529
- elif method == "eth_chainId":
530
- return w3.eth.chain_id
531
- elif method == "eth_gasPrice":
532
- return w3.eth.gas_price
533
- elif method == "eth_getBalance":
534
- address = args[0]
535
- block = args[1] if len(args) > 1 else block_identifier
536
- return w3.eth.get_balance(address, block_identifier=block)
537
- else:
538
- # Generic RPC call
539
- return w3.provider.make_request(method, list(args))
540
-
541
- # =========================================================================
542
- # High-level convenience methods
543
- # =========================================================================
544
-
545
- def with_retry(self, fn: callable) -> Any:
546
- """Execute arbitrary web3 operation with retry and failover.
547
-
548
- Use this when you need a web3-py method that isn't wrapped by RPCManager,
549
- but still want automatic retry and endpoint failover.
550
-
551
- Args:
552
- fn: Callable that takes a Web3 instance and returns a result.
553
- Will be called with the healthiest endpoint's Web3 instance.
554
-
555
- Returns:
556
- Result from fn(web3)
557
-
558
- Raises:
559
- RPCRetryableError: If all retries exhausted
560
-
561
- Example:
562
- # Get storage with retry
563
- storage = rpc.with_retry(lambda w3: w3.eth.get_storage_at(addr, 0))
564
-
565
- # Complex operation with retry
566
- def get_logs(w3):
567
- return w3.eth.get_logs({"address": addr, "fromBlock": 0})
568
- logs = rpc.with_retry(get_logs)
569
- """
570
- last_error: Exception | None = None
571
-
572
- # Get ordered endpoints from selector
573
- ordered_endpoints = self._selector.order_endpoints()
574
- attempts_to_try = min(self._max_retries, len(ordered_endpoints))
575
-
576
- for attempt, endpoint in enumerate(ordered_endpoints[:attempts_to_try]):
577
- w3 = self._web3_instances[endpoint.url]
578
-
579
- try:
580
- start_time = time.time()
581
- result = fn(w3)
582
- latency_ms = (time.time() - start_time) * 1000
583
-
584
- self._selector.record_success(endpoint.url, latency_ms)
585
-
586
- logger.debug(
587
- "rpc.with_retry.success",
588
- endpoint=self._safe_endpoint_label(endpoint.url),
589
- latency_ms=round(latency_ms, 1),
590
- attempt=attempt + 1,
591
- )
592
-
593
- return result
594
-
595
- except Exception as e:
596
- self._selector.record_failure(endpoint.url)
597
-
598
- logger.warning(
599
- "rpc.with_retry.error",
600
- endpoint=self._safe_endpoint_label(endpoint.url),
601
- error=str(e)[:200],
602
- attempt=attempt + 1,
603
- max_retries=attempts_to_try,
604
- )
605
-
606
- last_error = e
607
-
608
- # Exponential backoff
609
- if attempt < attempts_to_try - 1:
610
- backoff = self._backoff_base * (2 ** attempt)
611
- time.sleep(backoff)
612
-
613
- # All retries exhausted
614
- raise RPCRetryableError(
615
- f"with_retry: all {attempts_to_try} attempts failed: {last_error}",
616
- code="retries_exhausted",
617
- method="with_retry",
618
- )
619
-
620
- def get_block_number(self, timeout: float | None = None) -> int:
621
- """Get current block number."""
622
- return self.call("eth_blockNumber", timeout=timeout)
623
-
624
- def get_block(
625
- self,
626
- block_identifier: int | str = "latest",
627
- full_transactions: bool = False,
628
- timeout: float | None = None,
629
- ) -> dict[str, Any]:
630
- """Get block by number or hash."""
631
- return self.call(
632
- "eth_getBlockByNumber",
633
- block_identifier,
634
- full_transactions,
635
- timeout=timeout,
636
- )
637
-
638
- def get_transaction_count(
639
- self,
640
- address: str,
641
- block_identifier: str = "pending",
642
- ) -> int:
643
- """Get transaction count (nonce) for address."""
644
- return self.call("eth_getTransactionCount", address, block_identifier)
645
-
646
- def get_transaction_receipt(self, tx_hash: str) -> dict[str, Any] | None:
647
- """Get transaction receipt."""
648
- return self.call("eth_getTransactionReceipt", tx_hash)
649
-
650
- def send_raw_transaction(self, raw_tx: bytes) -> tuple[str, str]:
651
- """Broadcast a signed transaction.
652
-
653
- Routes through call() to ensure single instrumentation point.
654
- Metrics are recorded per-attempt in call().
655
-
656
- Returns:
657
- Tuple of (tx_hash, endpoint_url) — endpoint is best approximation
658
- (actual endpoint may differ if retry occurred)
659
-
660
- Raises:
661
- RPCRetryableError: All retries failed
662
- RPCFatalError: TX rejected (nonce, funds, revert)
663
- RPCRecoverableError: TX may succeed with different params
664
-
665
- NOTE: RPCManager is group-agnostic. It doesn't know about broadcast groups.
666
- The broadcast layer wraps errors with group context.
667
- """
668
- try:
669
- result = self.call("eth_sendRawTransaction", raw_tx)
670
-
671
- # Normalize tx_hash
672
- if hasattr(result, "hex"):
673
- tx_hash = f"0x{result.hex()}"
674
- else:
675
- tx_hash = result if str(result).startswith("0x") else f"0x{result}"
676
-
677
- # Return healthiest endpoint URL (best approximation - actual may differ if retry)
678
- endpoint = self._selector.get_active_endpoint()
679
- return tx_hash, endpoint.url
680
-
681
- except RPCRetryableError as e:
682
- # Convert to RPCPoolExhaustedError for broadcast.py compatibility
683
- raise RPCPoolExhaustedError(
684
- f"All {self._max_retries} retries failed",
685
- endpoints=[ep.url for ep in self._selector.endpoints],
686
- last_error=e,
687
- ) from e
688
-
689
- def estimate_gas(
690
- self,
691
- tx_params: dict[str, Any],
692
- block_identifier: int | str = "latest",
693
- ) -> int:
694
- """Estimate gas for transaction."""
695
- return self.call("eth_estimateGas", tx_params, block_identifier=block_identifier)
696
-
697
- def eth_call(
698
- self,
699
- tx_params: dict[str, Any],
700
- block_identifier: int | str = "latest",
701
- ) -> bytes:
702
- """Execute eth_call."""
703
- return self.call("eth_call", tx_params, block_identifier=block_identifier)
704
-
705
- def get_storage_at(
706
- self,
707
- address: str,
708
- slot: str | int,
709
- block_identifier: int | str = "latest",
710
- ) -> bytes:
711
- """Get storage at slot."""
712
- return self.call("eth_getStorageAt", address, slot, block_identifier=block_identifier)
713
-
714
- def get_chain_id(self) -> int:
715
- """Get chain ID."""
716
- return self.call("eth_chainId")
717
-
718
- def get_gas_price(self) -> int:
719
- """Get current gas price."""
720
- return self.call("eth_gasPrice")
721
-
722
- def get_base_fee(self, block_identifier: int | str = "latest") -> int:
723
- """Get base fee from block.
724
-
725
- Returns base fee in wei.
726
- """
727
- block = self.get_block(block_identifier)
728
- base_fee = block.get("baseFeePerGas", 0)
729
- return int(base_fee) if base_fee else 0
730
-
731
- def get_balance(
732
- self,
733
- address: str,
734
- block_identifier: int | str = "latest",
735
- ) -> int:
736
- """Get account balance in wei."""
737
- return self.call("eth_getBalance", address, block_identifier=block_identifier)
738
-
739
- # =========================================================================
740
- # Simulation
741
- # =========================================================================
742
-
743
- def simulate_transaction(
744
- self,
745
- tx: dict[str, Any],
746
- rpc_url: str | None = None,
747
- ) -> str:
748
- """Simulate a transaction using eth_call at latest block.
749
-
750
- Args:
751
- tx: Transaction dict with from, to, data, and optionally value, gas
752
- rpc_url: Optional override RPC URL. If provided, bypasses the
753
- RPCManager's failover machinery and calls this URL directly.
754
- Used for per-job RPC configuration.
755
-
756
- Returns:
757
- Hex-encoded return data (0x...) on success
758
-
759
- Raises:
760
- SimulationReverted: Transaction would revert (permanent, don't retry)
761
- SimulationNetworkError: Network/RPC error (transient, may retry)
762
- """
763
- call_params: dict[str, Any] = {
764
- "from": tx["from"],
765
- "to": tx["to"],
766
- }
767
- if "data" in tx:
768
- call_params["data"] = tx["data"]
769
- if "value" in tx:
770
- call_params["value"] = hex(tx["value"]) if isinstance(tx["value"], int) else tx["value"]
771
- if "gas" in tx:
772
- call_params["gas"] = hex(tx["gas"]) if isinstance(tx["gas"], int) else tx["gas"]
773
-
774
- try:
775
- if rpc_url:
776
- # Direct call to override RPC (bypasses failover machinery)
777
- clean_url, auth = _extract_url_auth(rpc_url)
778
- request_kwargs: dict[str, Any] = {"timeout": 30}
779
- if auth:
780
- request_kwargs["auth"] = auth
781
- w3 = Web3(Web3.HTTPProvider(clean_url, request_kwargs=request_kwargs))
782
- result = w3.eth.call(call_params, block_identifier="latest")
783
- else:
784
- result = self.eth_call(call_params, block_identifier="latest")
785
- return result.hex() if isinstance(result, bytes) else result
786
- except Exception as e:
787
- revert_reason = self._parse_revert_reason(e)
788
- if revert_reason:
789
- raise SimulationReverted(revert_reason) from e
790
- else:
791
- raise SimulationNetworkError(str(e)) from e
792
-
793
- def _parse_revert_reason(self, error: Exception) -> str | None:
794
- """Parse revert reason from RPC error.
795
-
796
- Returns revert reason string if this is a revert, None if network error.
797
-
798
- The key distinction:
799
- - Reverts are permanent (tx would fail on-chain) -> return reason string
800
- - Network errors are transient (RPC issues) -> return None
801
-
802
- Detection approach:
803
- 1. Check error codes that indicate execution failure
804
- 2. Look for revert keywords in error message
805
- 3. Try to extract revert data from structured error payloads
806
- """
807
- error_str = str(error).lower()
808
-
809
- # Extract error code if present
810
- error_code = None
811
- if hasattr(error, "args"):
812
- for arg in error.args:
813
- if isinstance(arg, dict):
814
- error_code = arg.get("code")
815
- if error_code is None:
816
- error_code = arg.get("error", {}).get("code")
817
- if error_code is not None:
818
- break
819
-
820
- # Error codes that indicate execution failure (not network issues)
821
- # -32000: Geth execution error
822
- # -32015: Parity execution error
823
- # 3: Geth revert
824
- revert_error_codes = {-32000, -32015, 3}
825
- if error_code in revert_error_codes:
826
- return self._extract_revert_message(error)
827
-
828
- # Keywords that indicate a revert (case-insensitive check already done)
829
- revert_keywords = [
830
- "execution reverted",
831
- "revert",
832
- "out of gas",
833
- "insufficient funds",
834
- "invalid opcode",
835
- "stack underflow",
836
- "stack overflow",
837
- ]
838
- if any(kw in error_str for kw in revert_keywords):
839
- return self._extract_revert_message(error)
840
-
841
- # No revert indicators - treat as network error
842
- return None
843
-
844
- def _extract_revert_message(self, error: Exception) -> str:
845
- """Extract a human-readable revert message from an error.
846
-
847
- Tries multiple strategies to get the best message possible.
848
- Returns generic message if no specific reason found.
849
- """
850
- error_str = str(error)
851
-
852
- # Strategy 1: "execution reverted: <reason>" pattern
853
- if "execution reverted:" in error_str.lower():
854
- idx = error_str.lower().find("execution reverted:")
855
- return error_str[idx + len("execution reverted:"):].strip() or "execution reverted"
856
-
857
- # Strategy 2: Extract revert data from structured payload
858
- revert_data = self._extract_revert_data(error)
859
- if revert_data:
860
- decoded = self._decode_revert_data(revert_data)
861
- if decoded:
862
- return decoded
863
-
864
- # Fallback: truncate error message
865
- clean_msg = error_str
866
-
867
- # Return first 200 chars of error as fallback
868
- if len(clean_msg) > 200:
869
- clean_msg = clean_msg[:200] + "..."
870
- return clean_msg or "Transaction reverted"
871
-
872
- def _extract_revert_data(self, error: Exception) -> str | None:
873
- """Extract hex revert data from error if present."""
874
- if hasattr(error, "args"):
875
- for arg in error.args:
876
- if isinstance(arg, dict):
877
- # Try common locations for revert data
878
- data = arg.get("data")
879
- if data is None:
880
- data = arg.get("error", {}).get("data")
881
- if isinstance(data, dict):
882
- data = data.get("data") or data.get("result")
883
- if isinstance(data, str) and data.startswith("0x"):
884
- return data
885
-
886
- # Also check error string for hex data
887
- error_str = str(error)
888
- hex_match = re.search(r"0x[0-9a-fA-F]{8,}", error_str)
889
- if hex_match:
890
- return hex_match.group()
891
-
892
- return None
893
-
894
- def _decode_revert_data(self, data: str) -> str | None:
895
- """Attempt to decode revert data into human-readable format.
896
-
897
- Handles standard Error(string) and Panic(uint256) selectors.
898
- Returns None if decoding fails (data will be shown as-is).
899
- """
900
- if len(data) < 10:
901
- return None
902
-
903
- selector = data[:10]
904
-
905
- # Error(string) - 0x08c379a0
906
- if selector == "0x08c379a0" and len(data) >= 138:
907
- try:
908
- from eth_abi import decode
909
- decoded = decode(["string"], bytes.fromhex(data[10:]))
910
- return decoded[0]
911
- except Exception:
912
- pass
913
-
914
- # Panic(uint256) - 0x4e487b71
915
- if selector == "0x4e487b71" and len(data) >= 74:
916
- try:
917
- from eth_abi import decode
918
- decoded = decode(["uint256"], bytes.fromhex(data[10:]))
919
- panic_code = decoded[0]
920
- panic_names = {
921
- 0x00: "generic panic",
922
- 0x01: "assertion failed",
923
- 0x11: "arithmetic overflow",
924
- 0x12: "division by zero",
925
- 0x21: "invalid enum value",
926
- 0x22: "storage encoding error",
927
- 0x31: "pop on empty array",
928
- 0x32: "array out of bounds",
929
- 0x41: "memory allocation error",
930
- 0x51: "zero function pointer",
931
- }
932
- return f"Panic({panic_code:#x}): {panic_names.get(panic_code, 'unknown')}"
933
- except Exception:
934
- pass
935
-
936
- # Custom error - return selector + truncated data for debugging
937
- if len(data) > 74:
938
- return f"Custom error {selector} ({len(data)//2 - 4} bytes)"
939
- elif len(data) > 10:
940
- return f"Custom error {selector}"
941
-
942
- return None
943
-
944
- # =========================================================================
945
- # Health and diagnostics
946
- # =========================================================================
947
-
948
def get_health(self) -> dict[str, Any]:
    """Return a health summary for all configured RPC endpoints.

    Also publishes a per-endpoint health gauge, using a sanitized URL
    (credentials and query string stripped) as the metric label.
    """
    selector = self._selector
    endpoints = selector.endpoints
    metrics = get_metrics()

    healthy_count = 0
    details: list[dict[str, Any]] = []
    for endpoint in endpoints:
        healthy = selector.is_healthy(endpoint)
        if healthy:
            healthy_count += 1

        # Strip userinfo/query/fragment from the URL before labeling.
        parts = urlsplit(endpoint.url)
        host = parts.hostname or ""
        if parts.port:
            host = f"{host}:{parts.port}"
        safe_url = urlunsplit((parts.scheme, host, parts.path, "", ""))
        metrics.gauge(RPC_ENDPOINT_HEALTH).set(
            1.0 if healthy else 0.0,
            endpoint=safe_url or "unknown",
        )

        display_url = endpoint.url if len(endpoint.url) <= 50 else endpoint.url[:50] + "..."
        details.append(
            {
                "url": display_url,
                "healthy": healthy,
                "latency_ms": round(endpoint.latency_ewma_ms, 1),
                "consecutive_failures": endpoint.consecutive_failures,
            }
        )

    return {
        "healthy_endpoints": healthy_count,
        "total_endpoints": len(endpoints),
        "all_unhealthy": not selector.has_healthy_endpoint(),
        "endpoints": details,
    }
-
978
def close(self) -> None:
    """Release all RPC connections.

    Web3 exposes no explicit close; dropping the cached instances lets
    their underlying connections be garbage-collected.
    """
    self._web3_instances.clear()
    logger.info("rpc.manager.closed")