brawny 0.1.13__py3-none-any.whl → 0.1.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. brawny/__init__.py +2 -0
  2. brawny/_context.py +5 -5
  3. brawny/_rpc/__init__.py +36 -12
  4. brawny/_rpc/broadcast.py +14 -13
  5. brawny/_rpc/caller.py +243 -0
  6. brawny/_rpc/client.py +539 -0
  7. brawny/_rpc/clients.py +11 -11
  8. brawny/_rpc/context.py +23 -0
  9. brawny/_rpc/errors.py +465 -31
  10. brawny/_rpc/gas.py +7 -6
  11. brawny/_rpc/pool.py +18 -0
  12. brawny/_rpc/retry.py +266 -0
  13. brawny/_rpc/retry_policy.py +81 -0
  14. brawny/accounts.py +28 -9
  15. brawny/alerts/__init__.py +15 -18
  16. brawny/alerts/abi_resolver.py +212 -36
  17. brawny/alerts/base.py +2 -2
  18. brawny/alerts/contracts.py +77 -10
  19. brawny/alerts/errors.py +30 -3
  20. brawny/alerts/events.py +38 -5
  21. brawny/alerts/health.py +19 -13
  22. brawny/alerts/send.py +513 -55
  23. brawny/api.py +39 -11
  24. brawny/assets/AGENTS.md +325 -0
  25. brawny/async_runtime.py +48 -0
  26. brawny/chain.py +3 -3
  27. brawny/cli/commands/__init__.py +2 -0
  28. brawny/cli/commands/console.py +69 -19
  29. brawny/cli/commands/contract.py +2 -2
  30. brawny/cli/commands/controls.py +121 -0
  31. brawny/cli/commands/health.py +2 -2
  32. brawny/cli/commands/job_dev.py +6 -5
  33. brawny/cli/commands/jobs.py +99 -2
  34. brawny/cli/commands/maintenance.py +13 -29
  35. brawny/cli/commands/migrate.py +1 -0
  36. brawny/cli/commands/run.py +10 -3
  37. brawny/cli/commands/script.py +8 -3
  38. brawny/cli/commands/signer.py +143 -26
  39. brawny/cli/helpers.py +0 -3
  40. brawny/cli_templates.py +25 -349
  41. brawny/config/__init__.py +4 -1
  42. brawny/config/models.py +43 -57
  43. brawny/config/parser.py +268 -57
  44. brawny/config/validation.py +52 -15
  45. brawny/daemon/context.py +4 -2
  46. brawny/daemon/core.py +185 -63
  47. brawny/daemon/loops.py +166 -98
  48. brawny/daemon/supervisor.py +261 -0
  49. brawny/db/__init__.py +14 -26
  50. brawny/db/base.py +248 -151
  51. brawny/db/global_cache.py +11 -1
  52. brawny/db/migrate.py +175 -28
  53. brawny/db/migrations/001_init.sql +4 -3
  54. brawny/db/migrations/010_add_nonce_gap_index.sql +1 -1
  55. brawny/db/migrations/011_add_job_logs.sql +1 -2
  56. brawny/db/migrations/012_add_claimed_by.sql +2 -2
  57. brawny/db/migrations/013_attempt_unique.sql +10 -0
  58. brawny/db/migrations/014_add_lease_expires_at.sql +5 -0
  59. brawny/db/migrations/015_add_signer_alias.sql +14 -0
  60. brawny/db/migrations/016_runtime_controls_and_quarantine.sql +32 -0
  61. brawny/db/migrations/017_add_job_drain.sql +6 -0
  62. brawny/db/migrations/018_add_nonce_reset_audit.sql +20 -0
  63. brawny/db/migrations/019_add_job_cooldowns.sql +8 -0
  64. brawny/db/migrations/020_attempt_unique_initial.sql +7 -0
  65. brawny/db/ops/__init__.py +3 -25
  66. brawny/db/ops/logs.py +1 -2
  67. brawny/db/queries.py +47 -91
  68. brawny/db/serialized.py +65 -0
  69. brawny/db/sqlite/__init__.py +1001 -0
  70. brawny/db/sqlite/connection.py +231 -0
  71. brawny/db/sqlite/execute.py +116 -0
  72. brawny/db/sqlite/mappers.py +190 -0
  73. brawny/db/sqlite/repos/attempts.py +372 -0
  74. brawny/db/sqlite/repos/block_state.py +102 -0
  75. brawny/db/sqlite/repos/cache.py +104 -0
  76. brawny/db/sqlite/repos/intents.py +1021 -0
  77. brawny/db/sqlite/repos/jobs.py +200 -0
  78. brawny/db/sqlite/repos/maintenance.py +182 -0
  79. brawny/db/sqlite/repos/signers_nonces.py +566 -0
  80. brawny/db/sqlite/tx.py +119 -0
  81. brawny/http.py +194 -0
  82. brawny/invariants.py +11 -24
  83. brawny/jobs/base.py +8 -0
  84. brawny/jobs/job_validation.py +2 -1
  85. brawny/keystore.py +83 -7
  86. brawny/lifecycle.py +64 -12
  87. brawny/logging.py +0 -2
  88. brawny/metrics.py +84 -12
  89. brawny/model/contexts.py +111 -9
  90. brawny/model/enums.py +1 -0
  91. brawny/model/errors.py +18 -0
  92. brawny/model/types.py +47 -131
  93. brawny/network_guard.py +133 -0
  94. brawny/networks/__init__.py +5 -5
  95. brawny/networks/config.py +1 -7
  96. brawny/networks/manager.py +14 -11
  97. brawny/runtime_controls.py +74 -0
  98. brawny/scheduler/poller.py +11 -7
  99. brawny/scheduler/reorg.py +95 -39
  100. brawny/scheduler/runner.py +442 -168
  101. brawny/scheduler/shutdown.py +3 -3
  102. brawny/script_tx.py +3 -3
  103. brawny/telegram.py +53 -7
  104. brawny/testing.py +1 -0
  105. brawny/timeout.py +38 -0
  106. brawny/tx/executor.py +922 -308
  107. brawny/tx/intent.py +54 -16
  108. brawny/tx/monitor.py +31 -12
  109. brawny/tx/nonce.py +212 -90
  110. brawny/tx/replacement.py +69 -18
  111. brawny/tx/retry_policy.py +24 -0
  112. brawny/tx/stages/types.py +75 -0
  113. brawny/types.py +18 -0
  114. brawny/utils.py +41 -0
  115. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/METADATA +3 -3
  116. brawny-0.1.22.dist-info/RECORD +163 -0
  117. brawny/_rpc/manager.py +0 -982
  118. brawny/_rpc/selector.py +0 -156
  119. brawny/db/base_new.py +0 -165
  120. brawny/db/mappers.py +0 -182
  121. brawny/db/migrations/008_add_transactions.sql +0 -72
  122. brawny/db/ops/attempts.py +0 -108
  123. brawny/db/ops/blocks.py +0 -83
  124. brawny/db/ops/cache.py +0 -93
  125. brawny/db/ops/intents.py +0 -296
  126. brawny/db/ops/jobs.py +0 -110
  127. brawny/db/ops/nonces.py +0 -322
  128. brawny/db/postgres.py +0 -2535
  129. brawny/db/postgres_new.py +0 -196
  130. brawny/db/sqlite.py +0 -2733
  131. brawny/db/sqlite_new.py +0 -191
  132. brawny-0.1.13.dist-info/RECORD +0 -141
  133. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/WHEEL +0 -0
  134. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/entry_points.txt +0 -0
  135. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/top_level.txt +0 -0
@@ -10,10 +10,11 @@ Implements the job evaluation logic from SPEC 5.3:
10
10
  from __future__ import annotations
11
11
 
12
12
  import asyncio
13
+ import hashlib
13
14
  import inspect
14
- import threading
15
15
  import time
16
16
  from uuid import UUID
17
+ from datetime import datetime, timedelta
17
18
  from collections.abc import Callable
18
19
  from concurrent.futures import ThreadPoolExecutor
19
20
  from concurrent.futures import TimeoutError as FuturesTimeout
@@ -27,6 +28,7 @@ from brawny._rpc.context import set_job_context as set_rpc_job_context
27
28
  from brawny.jobs.base import Job # Runtime import for legacy API detection
28
29
  from brawny.jobs.kv import DatabaseJobKVStore
29
30
  from brawny.logging import LogEvents, get_logger
31
+ from brawny._rpc.errors import RPCError
30
32
  from brawny.metrics import (
31
33
  INTENTS_CREATED,
32
34
  JOB_BUILD_TIMEOUTS,
@@ -34,21 +36,30 @@ from brawny.metrics import (
34
36
  JOB_CHECK_TIMEOUTS,
35
37
  JOBS_TRIGGERED,
36
38
  LAST_INTENT_CREATED_TIMESTAMP,
39
+ INTENT_COOLDOWN_SKIPPED,
40
+ INTENT_COOLDOWN_ERRORS,
37
41
  get_metrics,
38
42
  )
39
- from brawny.model.contexts import BlockContext, BuildContext, CheckContext
43
+ from brawny.model.contexts import BlockContext, BuildContext, CheckContext, CancellationToken
40
44
  from brawny.model.types import BlockInfo, Trigger
45
+ from brawny.model.errors import DatabaseError
46
+ from brawny.http import ApprovedHttpClient
47
+ from brawny.alerts.health import health_alert
48
+ from brawny.async_runtime import run_sync
49
+ from brawny.network_guard import job_network_guard
41
50
 
42
51
  if TYPE_CHECKING:
43
52
  from brawny._rpc.clients import RPCClients
44
- from brawny._rpc.manager import RPCManager
53
+ from brawny._rpc.clients import ReadClient
45
54
  from brawny.alerts.contracts import ContractSystem
46
55
  from brawny.config import Config
47
56
  from brawny.db.base import Database
48
57
  from brawny.lifecycle import LifecycleDispatcher
49
58
  from brawny.model.types import TxIntent, TxIntentSpec
59
+ from brawny.runtime_controls import RuntimeControls
50
60
 
51
61
  logger = get_logger(__name__)
62
+ _MAX_ABANDONED_EXECUTORS = 3
52
63
 
53
64
 
54
65
  @lru_cache(maxsize=1024)
@@ -97,6 +108,7 @@ class JobResult:
97
108
  intent_created: bool = False
98
109
  skipped: bool = False
99
110
  error: Exception | None = None
111
+ check_token: CancellationToken | None = None
100
112
 
101
113
 
102
114
  @dataclass
@@ -120,14 +132,14 @@ class JobRunner:
120
132
  def __init__(
121
133
  self,
122
134
  db: Database,
123
- rpc: RPCManager,
135
+ rpc: ReadClient,
124
136
  config: Config,
125
137
  jobs: dict[str, Job],
126
138
  on_intent_created: Callable[[str], None] | None = None,
127
139
  lifecycle: LifecycleDispatcher | None = None,
128
140
  contract_system: ContractSystem | None = None,
129
141
  loop: asyncio.AbstractEventLoop | None = None,
130
- loop_thread_id: int | None = None,
142
+ controls: "RuntimeControls | None" = None,
131
143
  ) -> None:
132
144
  """Initialize job runner.
133
145
 
@@ -138,7 +150,6 @@ class JobRunner:
138
150
  jobs: Dictionary of job_id -> Job instances
139
151
  on_intent_created: Callback when intent is created (for worker scheduling)
140
152
  loop: Event loop for async job.check() support
141
- loop_thread_id: Thread ID that owns the loop (for assertion)
142
153
  """
143
154
  self._db = db
144
155
  self._rpc = rpc
@@ -149,39 +160,113 @@ class JobRunner:
149
160
  self._lifecycle = lifecycle
150
161
  self._contract_system = contract_system
151
162
  self._loop = loop
152
- self._loop_thread_id = loop_thread_id
163
+ self._http_client = ApprovedHttpClient(config.http)
164
+ self._controls = controls
153
165
 
154
166
  # RPC clients cache for per-job read routing
155
167
  from brawny._rpc.clients import RPCClients
156
168
  self._rpc_clients: RPCClients = RPCClients(config)
157
169
 
158
170
  # Thread pool for job check timeouts (used for sync jobs only)
159
- self._executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="job_check")
171
+ self._executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="job_check")
160
172
  self._abandoned_executors = 0
173
+ self._last_cooldown_prune = 0.0
174
+
175
+ cooldown_cfg = config.intent_cooldown
176
+ logger.info(
177
+ "intent.cooldown.config",
178
+ enabled=cooldown_cfg.enabled,
179
+ default_seconds=cooldown_cfg.default_seconds,
180
+ max_seconds=cooldown_cfg.max_seconds,
181
+ prune_older_than_days=cooldown_cfg.prune_older_than_days,
182
+ )
161
183
 
162
184
  def _recreate_executor_after_timeout(self, operation: str, job_id: str) -> None:
163
185
  """Recreate the executor after a timeout to prevent deadlock.
164
186
 
165
187
  When a job times out, the worker thread continues running but the future
166
- is cancelled. With max_workers=1, this blocks all subsequent job operations.
167
- We recreate the executor to abandon the stuck thread and continue processing.
188
+ is cancelled. A stuck worker can starve the pool; recreating the executor
189
+ abandons the thread and allows new work to proceed.
168
190
 
169
191
  Args:
170
192
  operation: The operation that timed out ("check" or "build")
171
193
  job_id: The job that caused the timeout
172
194
  """
195
+ next_count = self._abandoned_executors + 1
173
196
  logger.warning(
174
197
  "runner.executor_recreated",
175
198
  operation=operation,
176
199
  job_id=job_id,
177
200
  reason="Abandoning stuck thread after timeout",
178
- abandoned_executors=self._abandoned_executors + 1,
201
+ abandoned_executors=next_count,
179
202
  )
180
203
  # Don't wait for the stuck thread - just abandon it
181
204
  self._executor.shutdown(wait=False, cancel_futures=True)
182
- self._abandoned_executors += 1
205
+ self._abandoned_executors = next_count
206
+ if self._abandoned_executors > _MAX_ABANDONED_EXECUTORS:
207
+ error = RuntimeError(
208
+ f"Exceeded abandoned executor cap ({_MAX_ABANDONED_EXECUTORS}); "
209
+ "possible stuck job execution threads."
210
+ )
211
+ health_alert(
212
+ component="brawny.scheduler.runner",
213
+ chain_id=self._chain_id,
214
+ job_id=job_id,
215
+ error=error,
216
+ level="critical",
217
+ action="Investigate stuck job checks/builds; restart daemon.",
218
+ )
219
+ raise error
183
220
  self._executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="job_check")
184
221
 
222
+ def _safe_get_attempts_for_intent(
223
+ self,
224
+ *,
225
+ intent_id: str | UUID,
226
+ job_id: str,
227
+ signer: str | None,
228
+ to_address: str | None,
229
+ existing_status: str | None,
230
+ existing_claimed_at: datetime | None,
231
+ ) -> int:
232
+ """Return attempts count, or -1 if unknown due to DB read failure."""
233
+ try:
234
+ intent_uuid = intent_id if isinstance(intent_id, UUID) else UUID(str(intent_id))
235
+ return len(self._db.get_attempts_for_intent(intent_uuid))
236
+ except asyncio.CancelledError:
237
+ raise
238
+ except DatabaseError:
239
+ logger.warning(
240
+ "intent.create.skipped_inflight",
241
+ job_id=job_id,
242
+ signer=signer,
243
+ to_address=to_address,
244
+ intent_id=str(intent_id),
245
+ existing_intent_id=str(intent_id),
246
+ existing_status=existing_status,
247
+ existing_claimed_at=existing_claimed_at,
248
+ existing_attempt_count=-1,
249
+ attempts_read_error=True,
250
+ chain_id=self._chain_id,
251
+ exc_info=True,
252
+ )
253
+ return -1
254
+ except Exception as e:
255
+ logger.error(
256
+ "intent.create.skipped_inflight_failed",
257
+ job_id=job_id,
258
+ signer=signer,
259
+ to_address=to_address,
260
+ intent_id=str(intent_id),
261
+ existing_intent_id=str(intent_id),
262
+ existing_status=existing_status,
263
+ existing_claimed_at=existing_claimed_at,
264
+ chain_id=self._chain_id,
265
+ error=str(e),
266
+ exc_info=True,
267
+ )
268
+ raise
269
+
185
270
  def process_block(self, block: BlockInfo) -> BlockResult:
186
271
  """Process a block by evaluating all enabled jobs.
187
272
 
@@ -194,18 +279,61 @@ class JobRunner:
194
279
  BlockResult with processing stats
195
280
  """
196
281
  result = BlockResult(block_number=block.block_number)
282
+ now_epoch = time.time()
283
+
284
+ cooldown_cfg = self._config.intent_cooldown
285
+ if cooldown_cfg.enabled and cooldown_cfg.prune_older_than_days > 0:
286
+ if now_epoch - self._last_cooldown_prune >= 24 * 3600:
287
+ try:
288
+ deleted = self._db.prune_job_cooldowns(cooldown_cfg.prune_older_than_days)
289
+ if deleted > 0:
290
+ logger.info("intent.cooldown_prune", deleted=deleted)
291
+ except asyncio.CancelledError:
292
+ raise
293
+ except DatabaseError as e:
294
+ logger.warning(
295
+ "intent.cooldown_prune_failed",
296
+ chain_id=self._chain_id,
297
+ error=str(e)[:200],
298
+ )
299
+ except Exception as e:
300
+ logger.error(
301
+ "intent.cooldown_prune_failed",
302
+ chain_id=self._chain_id,
303
+ error=str(e)[:200],
304
+ exc_info=True,
305
+ )
306
+ raise
307
+ self._last_cooldown_prune = now_epoch
197
308
 
198
309
  # Warm gas quote cache at start of block (for executor)
199
310
  if self._loop is not None:
200
311
  try:
201
- self._loop.run_until_complete(
202
- asyncio.wait_for(self._rpc.gas_quote(), timeout=5.0)
312
+ run_sync(asyncio.wait_for(self._rpc.gas_quote(), timeout=5.0))
313
+ except asyncio.CancelledError:
314
+ raise
315
+ except (asyncio.TimeoutError, RPCError) as e:
316
+ logger.warning(
317
+ "gas.cache_warm_failed",
318
+ chain_id=self._chain_id,
319
+ error=str(e),
203
320
  )
204
321
  except Exception as e:
205
- logger.warning("gas.cache_warm_failed", error=str(e))
322
+ logger.error(
323
+ "gas.cache_warm_failed",
324
+ chain_id=self._chain_id,
325
+ error=str(e),
326
+ exc_info=True,
327
+ )
328
+ raise
206
329
 
207
330
  # Get enabled jobs sorted by job_id
208
331
  enabled_jobs = self._db.get_enabled_jobs()
332
+ pause_new_intents = (
333
+ self._controls.is_active("pause_new_intents")
334
+ if self._controls is not None
335
+ else False
336
+ )
209
337
 
210
338
  for job_config in enabled_jobs:
211
339
  job_id = job_config.job_id
@@ -216,117 +344,144 @@ class JobRunner:
216
344
  # Job in DB but not discovered - skip silently
217
345
  # (orphaned jobs are warned about once at startup)
218
346
  continue
347
+ try:
219
348
 
220
- # Check interval
221
- last_checked = job_config.last_checked_block_number
222
- if last_checked is not None and (block.block_number - last_checked) < job.check_interval_blocks:
223
- logger.debug(
224
- LogEvents.JOB_CHECK_SKIP,
225
- job_id=job_id,
226
- block_number=block.block_number,
227
- last_checked=last_checked,
228
- interval=job.check_interval_blocks,
229
- )
230
- continue
231
- backoff_until = self._db.get_job_kv(job_id, "backoff_until_block")
232
- if isinstance(backoff_until, int) and block.block_number <= backoff_until:
233
- logger.debug(
234
- "job.check_backoff",
235
- job_id=job_id,
236
- block_number=block.block_number,
237
- backoff_until=backoff_until,
238
- )
239
- continue
240
-
241
- if job.max_in_flight_intents is not None:
242
- active_count = self._db.get_active_intent_count(
243
- job_id,
244
- chain_id=self._chain_id,
245
- )
246
- if active_count >= job.max_in_flight_intents:
247
- logger.warning(
248
- "job.check.backpressure",
349
+ # Check interval
350
+ last_checked = job_config.last_checked_block_number
351
+ if last_checked is not None and (block.block_number - last_checked) < job.check_interval_blocks:
352
+ logger.debug(
353
+ LogEvents.JOB_CHECK_SKIP,
249
354
  job_id=job_id,
250
355
  block_number=block.block_number,
251
- active_intents=active_count,
252
- limit=job.max_in_flight_intents,
356
+ last_checked=last_checked,
357
+ interval=job.check_interval_blocks,
253
358
  )
254
- self._db.update_job_checked(
255
- job_id,
256
- block.block_number,
257
- triggered=False,
359
+ continue
360
+ backoff_until = self._db.get_job_kv(job_id, "backoff_until_block")
361
+ if isinstance(backoff_until, int) and block.block_number <= backoff_until:
362
+ logger.debug(
363
+ "job.check_backoff",
364
+ job_id=job_id,
365
+ block_number=block.block_number,
366
+ backoff_until=backoff_until,
258
367
  )
259
368
  continue
260
369
 
261
- # Run job check
262
- job_result = self._run_job_check(job, block)
263
- result.jobs_checked += 1
264
-
265
- if job_result.error:
266
- if self._config.job_error_backoff_blocks > 0:
267
- self._db.set_job_kv(
370
+ if job.max_in_flight_intents is not None:
371
+ active_count = self._db.get_active_intent_count(
268
372
  job_id,
269
- "backoff_until_block",
270
- block.block_number + self._config.job_error_backoff_blocks,
373
+ chain_id=self._chain_id,
271
374
  )
272
- result.errors.append(f"{job_id}: {job_result.error}")
273
- continue
375
+ if active_count >= job.max_in_flight_intents:
376
+ logger.warning(
377
+ "job.check.backpressure",
378
+ job_id=job_id,
379
+ block_number=block.block_number,
380
+ active_intents=active_count,
381
+ limit=job.max_in_flight_intents,
382
+ )
383
+ self._db.update_job_checked(
384
+ job_id,
385
+ block.block_number,
386
+ triggered=False,
387
+ )
388
+ continue
389
+
390
+ # Run job check
391
+ job_result = self._run_job_check(job, block)
392
+ result.jobs_checked += 1
393
+
394
+ if job_result.error:
395
+ if self._config.job_error_backoff_blocks > 0:
396
+ self._db.set_job_kv(
397
+ job_id,
398
+ "backoff_until_block",
399
+ block.block_number + self._config.job_error_backoff_blocks,
400
+ )
401
+ result.errors.append(f"{job_id}: {job_result.error}")
402
+ continue
274
403
 
275
- # Update last checked
276
- self._db.update_job_checked(
277
- job_id,
278
- block.block_number,
279
- triggered=job_result.triggered,
280
- )
404
+ # Update last checked
405
+ self._db.update_job_checked(
406
+ job_id,
407
+ block.block_number,
408
+ triggered=job_result.triggered,
409
+ )
281
410
 
282
- if job_result.triggered and job_result.trigger:
283
- result.jobs_triggered += 1
411
+ if job_result.triggered and job_result.trigger:
412
+ result.jobs_triggered += 1
284
413
 
285
- # Create intent if tx required
286
- if job_result.trigger.tx_required:
287
- try:
288
- intent, is_new = self._create_intent_for_trigger(
289
- job, block, job_result.trigger
290
- )
291
- if is_new:
292
- result.intents_created += 1
293
- metrics = get_metrics()
294
- metrics.counter(INTENTS_CREATED).inc(
295
- chain_id=self._chain_id,
414
+ # Create intent if tx required
415
+ if job_result.trigger.tx_required:
416
+ if pause_new_intents:
417
+ logger.warning(
418
+ "runtime.control.pause_new_intents",
296
419
  job_id=job_id,
420
+ block_number=block.block_number,
297
421
  )
298
- metrics.gauge(LAST_INTENT_CREATED_TIMESTAMP).set(
299
- time.time(),
422
+ continue
423
+ try:
424
+ intent, is_new = self._create_intent_for_trigger(
425
+ job,
426
+ block,
427
+ job_result.trigger,
428
+ cancellation_token=job_result.check_token,
429
+ )
430
+ if is_new:
431
+ result.intents_created += 1
432
+ metrics = get_metrics()
433
+ metrics.counter(INTENTS_CREATED).inc(
434
+ chain_id=self._chain_id,
435
+ job_id=job_id,
436
+ )
437
+ metrics.gauge(LAST_INTENT_CREATED_TIMESTAMP).set(
438
+ time.time(),
439
+ chain_id=self._chain_id,
440
+ )
441
+ if self._lifecycle:
442
+ self._lifecycle.on_triggered(
443
+ job,
444
+ job_result.trigger,
445
+ block,
446
+ intent.intent_id,
447
+ )
448
+ except asyncio.CancelledError:
449
+ raise
450
+ except DatabaseError as e:
451
+ logger.warning(
452
+ "intent.creation_failed",
300
453
  chain_id=self._chain_id,
454
+ job_id=job_id,
455
+ error=str(e),
301
456
  )
302
- if self._lifecycle:
303
- self._lifecycle.on_triggered(
304
- job,
305
- job_result.trigger,
306
- block,
307
- intent.intent_id,
457
+ if self._config.job_error_backoff_blocks > 0:
458
+ self._db.set_job_kv(
459
+ job_id,
460
+ "backoff_until_block",
461
+ block.block_number + self._config.job_error_backoff_blocks,
308
462
  )
309
- except Exception as e:
310
- logger.error(
311
- "intent.creation_failed",
312
- job_id=job_id,
313
- error=str(e),
314
- )
315
- if self._config.job_error_backoff_blocks > 0:
316
- self._db.set_job_kv(
317
- job_id,
318
- "backoff_until_block",
319
- block.block_number + self._config.job_error_backoff_blocks,
463
+ result.errors.append(f"{job_id} intent: {e}")
464
+ except Exception as e:
465
+ logger.error(
466
+ "intent.creation_failed",
467
+ chain_id=self._chain_id,
468
+ job_id=job_id,
469
+ error=str(e),
470
+ exc_info=True,
320
471
  )
321
- result.errors.append(f"{job_id} intent: {e}")
322
- else:
323
- if self._lifecycle:
324
- self._lifecycle.on_triggered(
325
- job,
326
- job_result.trigger,
327
- block,
328
- None,
329
- )
472
+ raise
473
+ else:
474
+ if self._lifecycle:
475
+ self._lifecycle.on_triggered(
476
+ job,
477
+ job_result.trigger,
478
+ block,
479
+ None,
480
+ )
481
+ except DatabaseError as e:
482
+ self._handle_db_busy(e)
483
+ result.errors.append(f"{job_id}: {e}")
484
+ break
330
485
 
331
486
  return result
332
487
 
@@ -352,19 +507,16 @@ class JobRunner:
352
507
  metrics = get_metrics()
353
508
  start_time = time.perf_counter()
354
509
 
510
+ check_token = CancellationToken()
355
511
  # Build check context (phase-specific)
356
- ctx = self._build_check_context(job, block)
512
+ ctx = self._build_check_context(job, block, check_token)
357
513
 
358
514
  from brawny.scripting import set_job_context
359
515
 
360
516
  try:
361
517
  # Use async path if loop is available
362
- if self._loop is not None and self._loop_thread_id is not None:
363
- # Assert we're on the correct thread (loop owner)
364
- assert threading.get_ident() == self._loop_thread_id, \
365
- "check_job called from wrong thread"
366
-
367
- trigger = self._loop.run_until_complete(
518
+ if self._loop is not None:
519
+ trigger = run_sync(
368
520
  asyncio.wait_for(
369
521
  self._run_check_async(job, block, ctx),
370
522
  timeout=job.check_timeout_seconds,
@@ -384,11 +536,12 @@ class JobRunner:
384
536
  block_number=ctx.block.number,
385
537
  )
386
538
  try:
387
- # Call with or without ctx based on signature
388
- if _accepts_ctx(type(job), "check"):
389
- return job.check(ctx)
390
- else:
391
- return job.check()
539
+ with job_network_guard():
540
+ # Call with or without ctx based on signature
541
+ if _accepts_ctx(type(job), "check"):
542
+ return job.check(ctx)
543
+ else:
544
+ return job.check()
392
545
  finally:
393
546
  set_job_context(False)
394
547
  reset_rpc_job_context(thread_rpc_ctx_token)
@@ -415,11 +568,13 @@ class JobRunner:
415
568
  job_id=job.job_id,
416
569
  triggered=True,
417
570
  trigger=trigger,
571
+ check_token=check_token,
418
572
  )
419
573
  else:
420
- return JobResult(job_id=job.job_id, triggered=False)
574
+ return JobResult(job_id=job.job_id, triggered=False, check_token=check_token)
421
575
 
422
576
  except (asyncio.TimeoutError, FuturesTimeout):
577
+ check_token.cancel()
423
578
  logger.error(
424
579
  LogEvents.JOB_CHECK_TIMEOUT,
425
580
  job_id=job.job_id,
@@ -436,16 +591,30 @@ class JobRunner:
436
591
  return JobResult(
437
592
  job_id=job.job_id,
438
593
  error=TimeoutError(f"Job check timed out after {job.check_timeout_seconds}s"),
594
+ check_token=check_token,
439
595
  )
440
596
 
597
+ except asyncio.CancelledError:
598
+ raise
599
+ except (DatabaseError, RPCError) as e:
600
+ logger.warning(
601
+ "job.check.error",
602
+ job_id=job.job_id,
603
+ chain_id=self._chain_id,
604
+ block_number=block.block_number,
605
+ error=str(e),
606
+ )
607
+ return JobResult(job_id=job.job_id, error=e, check_token=check_token)
441
608
  except Exception as e:
442
609
  logger.error(
443
610
  "job.check.error",
444
611
  job_id=job.job_id,
612
+ chain_id=self._chain_id,
445
613
  block_number=block.block_number,
446
614
  error=str(e),
615
+ exc_info=True,
447
616
  )
448
- return JobResult(job_id=job.job_id, error=e)
617
+ raise
449
618
  finally:
450
619
  duration = time.perf_counter() - start_time
451
620
  metrics.histogram(JOB_CHECK_SECONDS).observe(
@@ -470,15 +639,16 @@ class JobRunner:
470
639
  )
471
640
 
472
641
  try:
473
- # Call with or without ctx based on signature
474
- if _accepts_ctx(type(job), "check"):
475
- result = job.check(ctx)
476
- else:
477
- result = job.check()
642
+ with job_network_guard():
643
+ # Call with or without ctx based on signature
644
+ if _accepts_ctx(type(job), "check"):
645
+ result = job.check(ctx)
646
+ else:
647
+ result = job.check()
478
648
 
479
- if inspect.isawaitable(result):
480
- return await result
481
- return result
649
+ if inspect.isawaitable(result):
650
+ return await result
651
+ return result
482
652
  finally:
483
653
  set_job_context(False)
484
654
  reset_rpc_job_context(async_rpc_ctx_token)
@@ -486,7 +656,26 @@ class JobRunner:
486
656
  _job_ctx.reset(ctx_token)
487
657
  _current_job.reset(job_token)
488
658
 
489
- def _build_check_context(self, job: Job, block: BlockInfo) -> CheckContext:
659
+ def _handle_db_busy(self, error: DatabaseError) -> None:
660
+ if self._controls is None:
661
+ logger.warning("db.busy_sustained", error=str(error))
662
+ return
663
+ self._db.set_runtime_control(
664
+ control="pause_new_intents",
665
+ active=True,
666
+ expires_at=datetime.utcnow() + timedelta(seconds=300),
667
+ reason="db_busy",
668
+ actor="runner",
669
+ mode="auto",
670
+ )
671
+ logger.warning("db.busy_sustained", error=str(error))
672
+
673
+ def _build_check_context(
674
+ self,
675
+ job: Job,
676
+ block: BlockInfo,
677
+ cancellation_token: CancellationToken,
678
+ ) -> CheckContext:
490
679
  """Build a CheckContext for job check phase.
491
680
 
492
681
  Args:
@@ -508,7 +697,7 @@ class JobRunner:
508
697
  number=block.block_number,
509
698
  timestamp=block.timestamp,
510
699
  hash=block.block_hash,
511
- base_fee=0, # TODO: Get from block if available
700
+ base_fee=block.base_fee,
512
701
  chain_id=block.chain_id,
513
702
  )
514
703
 
@@ -520,8 +709,10 @@ class JobRunner:
520
709
  kv=DatabaseJobKVStore(self._db, job.job_id),
521
710
  job_id=job.job_id,
522
711
  rpc=rpc,
712
+ http=self._http_client,
523
713
  logger=logger.bind(job_id=job.job_id, chain_id=block.chain_id),
524
714
  contracts=contracts,
715
+ cancellation_token=cancellation_token,
525
716
  _db=self._db,
526
717
  )
527
718
 
@@ -550,7 +741,7 @@ class JobRunner:
550
741
  number=block.block_number,
551
742
  timestamp=block.timestamp,
552
743
  hash=block.block_hash,
553
- base_fee=0,
744
+ base_fee=block.base_fee,
554
745
  chain_id=block.chain_id,
555
746
  )
556
747
 
@@ -562,6 +753,7 @@ class JobRunner:
562
753
  job_id=job.job_id,
563
754
  signer_address=signer_address,
564
755
  rpc=rpc,
756
+ http=self._http_client,
565
757
  logger=logger.bind(job_id=job.job_id, chain_id=block.chain_id),
566
758
  contracts=contracts,
567
759
  kv=DatabaseJobKVStore(self._db, job.job_id), # KVReader (read-only access)
@@ -572,6 +764,7 @@ class JobRunner:
572
764
  job: Job,
573
765
  block: BlockInfo,
574
766
  trigger: Trigger,
767
+ cancellation_token: CancellationToken | None = None,
575
768
  ) -> tuple[TxIntent | None, bool]:
576
769
  """Create a transaction intent for a triggered job.
577
770
 
@@ -580,11 +773,80 @@ class JobRunner:
580
773
  block: Block information
581
774
  trigger: Trigger with intent details
582
775
  """
776
+ from brawny.model.errors import CancelledCheckError
583
777
  from brawny.tx.intent import create_intent
584
778
 
585
779
  # Resolve signer address for build context
586
780
  signer_address = job.signer_address
587
781
 
782
+ cooldown_cfg = self._config.intent_cooldown
783
+ if cooldown_cfg.enabled:
784
+ job_override = job.cooldown_seconds
785
+ effective_cooldown = (
786
+ job_override if job_override is not None else cooldown_cfg.default_seconds
787
+ )
788
+ if effective_cooldown < 0:
789
+ raise ValueError("cooldown_seconds cannot be negative")
790
+ if effective_cooldown == 0:
791
+ pass
792
+ else:
793
+ if effective_cooldown > cooldown_cfg.max_seconds:
794
+ effective_cooldown = cooldown_cfg.max_seconds
795
+
796
+ now = int(time.time())
797
+ target_key = job.cooldown_key(trigger)
798
+ if target_key is None:
799
+ target_key = "*"
800
+ target_key_str = str(target_key)
801
+ if len(target_key_str) > 64:
802
+ target_key_str = hashlib.sha256(target_key_str.encode("utf-8")).hexdigest()[:16]
803
+
804
+ cooldown_key = (
805
+ f"{job.job_id}:{self._chain_id}:{signer_address.lower()}:{target_key_str}"
806
+ )
807
+ try:
808
+ allowed, last_intent_at = self._db.should_create_intent(
809
+ cooldown_key,
810
+ now,
811
+ int(effective_cooldown),
812
+ )
813
+ except asyncio.CancelledError:
814
+ raise
815
+ except DatabaseError as e:
816
+ logger.warning(
817
+ "intent.cooldown_error",
818
+ job_id=job.job_id,
819
+ chain_id=self._chain_id,
820
+ error=str(e)[:200],
821
+ )
822
+ metrics = get_metrics()
823
+ metrics.counter(INTENT_COOLDOWN_ERRORS).inc(chain_id=self._chain_id)
824
+ except Exception as e:
825
+ logger.error(
826
+ "intent.cooldown_error",
827
+ job_id=job.job_id,
828
+ chain_id=self._chain_id,
829
+ error=str(e)[:200],
830
+ exc_info=True,
831
+ )
832
+ raise
833
+ else:
834
+ if not allowed:
835
+ last_seen = last_intent_at if last_intent_at is not None else now
836
+ next_allowed_at = last_seen + int(effective_cooldown)
837
+ logger.debug(
838
+ "intent.cooldown_skip",
839
+ reason="cooldown_active",
840
+ job_id=job.job_id,
841
+ chain_id=self._chain_id,
842
+ signer=signer_address,
843
+ cooldown_seconds=int(effective_cooldown),
844
+ next_allowed_at=next_allowed_at,
845
+ )
846
+ metrics = get_metrics()
847
+ metrics.counter(INTENT_COOLDOWN_SKIPPED).inc(chain_id=self._chain_id)
848
+ return None, False
849
+
588
850
  # Build context for build_tx (phase-specific)
589
851
  ctx = self._build_build_context(job, block, trigger, signer_address)
590
852
 
@@ -597,15 +859,16 @@ class JobRunner:
597
859
  rpc_ctx_token = set_rpc_job_context(job.job_id)
598
860
  set_job_context(True)
599
861
  try:
600
- # Support legacy build_intent(trigger) API:
601
- # If job has build_intent but didn't override build_tx, use legacy API
602
- if hasattr(job, "build_intent") and type(job).build_tx is Job.build_tx:
603
- return job.build_intent(ctx.trigger)
604
- # Call with or without ctx based on signature
605
- if _accepts_ctx(type(job), "build_tx"):
606
- return job.build_tx(ctx)
607
- else:
608
- return job.build_tx()
862
+ with job_network_guard():
863
+ # Support legacy build_intent(trigger) API:
864
+ # If job has build_intent but didn't override build_tx, use legacy API
865
+ if hasattr(job, "build_intent") and type(job).build_tx is Job.build_tx:
866
+ return job.build_intent(ctx.trigger)
867
+ # Call with or without ctx based on signature
868
+ if _accepts_ctx(type(job), "build_tx"):
869
+ return job.build_tx(ctx)
870
+ else:
871
+ return job.build_tx()
609
872
  finally:
610
873
  set_job_context(False)
611
874
  reset_rpc_job_context(rpc_ctx_token)
@@ -660,23 +923,30 @@ class JobRunner:
660
923
  existing_status = existing.get("status")
661
924
  existing_claimed_at = existing.get("claimed_at")
662
925
  existing_attempts = 0
926
+ attempts_read_error = False
663
927
  if existing_id:
664
- try:
665
- existing_attempts = len(
666
- self._db.get_attempts_for_intent(UUID(existing_id))
667
- )
668
- except Exception:
669
- existing_attempts = 0
670
- logger.info(
671
- "intent.create.skipped_inflight",
672
- job_id=job.job_id,
673
- signer=signer_address,
674
- to_address=spec.to_address,
675
- existing_intent_id=str(existing_id) if existing_id else None,
676
- existing_status=existing_status,
677
- existing_claimed_at=existing_claimed_at,
678
- existing_attempt_count=existing_attempts,
679
- )
928
+ existing_attempts = self._safe_get_attempts_for_intent(
929
+ intent_id=existing_id,
930
+ job_id=job.job_id,
931
+ signer=signer_address,
932
+ to_address=spec.to_address,
933
+ existing_status=existing_status,
934
+ existing_claimed_at=existing_claimed_at,
935
+ )
936
+ attempts_read_error = existing_attempts < 0
937
+ if not attempts_read_error:
938
+ logger.info(
939
+ "intent.create.skipped_inflight",
940
+ job_id=job.job_id,
941
+ signer=signer_address,
942
+ to_address=spec.to_address,
943
+ intent_id=str(existing_id) if existing_id else None,
944
+ existing_intent_id=str(existing_id) if existing_id else None,
945
+ existing_status=existing_status,
946
+ existing_claimed_at=existing_claimed_at,
947
+ existing_attempt_count=existing_attempts,
948
+ chain_id=self._chain_id,
949
+ )
680
950
  if len(inflight) > 1:
681
951
  logger.warning(
682
952
  "invariant.multiple_inflight_intents",
@@ -687,16 +957,20 @@ class JobRunner:
687
957
  )
688
958
  return None, False
689
959
 
690
- intent, is_new = create_intent(
691
- db=self._db,
692
- job_id=job.job_id,
693
- chain_id=self._chain_id,
694
- spec=spec,
695
- idem_parts=idem_parts,
696
- broadcast_group=broadcast_group,
697
- broadcast_endpoints=broadcast_endpoints,
698
- trigger=trigger,
699
- )
960
+ try:
961
+ intent, is_new = create_intent(
962
+ db=self._db,
963
+ job_id=job.job_id,
964
+ chain_id=self._chain_id,
965
+ spec=spec,
966
+ idem_parts=idem_parts,
967
+ broadcast_group=broadcast_group,
968
+ broadcast_endpoints=broadcast_endpoints,
969
+ trigger=trigger,
970
+ cancellation_token=cancellation_token,
971
+ )
972
+ except CancelledCheckError:
973
+ return None, False
700
974
 
701
975
  if is_new and self._on_intent_created:
702
976
  self._on_intent_created(str(intent.intent_id))