brawny 0.1.13__py3-none-any.whl → 0.1.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. brawny/__init__.py +2 -0
  2. brawny/_context.py +5 -5
  3. brawny/_rpc/__init__.py +36 -12
  4. brawny/_rpc/broadcast.py +14 -13
  5. brawny/_rpc/caller.py +243 -0
  6. brawny/_rpc/client.py +539 -0
  7. brawny/_rpc/clients.py +11 -11
  8. brawny/_rpc/context.py +23 -0
  9. brawny/_rpc/errors.py +465 -31
  10. brawny/_rpc/gas.py +7 -6
  11. brawny/_rpc/pool.py +18 -0
  12. brawny/_rpc/retry.py +266 -0
  13. brawny/_rpc/retry_policy.py +81 -0
  14. brawny/accounts.py +28 -9
  15. brawny/alerts/__init__.py +15 -18
  16. brawny/alerts/abi_resolver.py +212 -36
  17. brawny/alerts/base.py +2 -2
  18. brawny/alerts/contracts.py +77 -10
  19. brawny/alerts/errors.py +30 -3
  20. brawny/alerts/events.py +38 -5
  21. brawny/alerts/health.py +19 -13
  22. brawny/alerts/send.py +513 -55
  23. brawny/api.py +39 -11
  24. brawny/assets/AGENTS.md +325 -0
  25. brawny/async_runtime.py +48 -0
  26. brawny/chain.py +3 -3
  27. brawny/cli/commands/__init__.py +2 -0
  28. brawny/cli/commands/console.py +69 -19
  29. brawny/cli/commands/contract.py +2 -2
  30. brawny/cli/commands/controls.py +121 -0
  31. brawny/cli/commands/health.py +2 -2
  32. brawny/cli/commands/job_dev.py +6 -5
  33. brawny/cli/commands/jobs.py +99 -2
  34. brawny/cli/commands/maintenance.py +13 -29
  35. brawny/cli/commands/migrate.py +1 -0
  36. brawny/cli/commands/run.py +10 -3
  37. brawny/cli/commands/script.py +8 -3
  38. brawny/cli/commands/signer.py +143 -26
  39. brawny/cli/helpers.py +0 -3
  40. brawny/cli_templates.py +25 -349
  41. brawny/config/__init__.py +4 -1
  42. brawny/config/models.py +43 -57
  43. brawny/config/parser.py +268 -57
  44. brawny/config/validation.py +52 -15
  45. brawny/daemon/context.py +4 -2
  46. brawny/daemon/core.py +185 -63
  47. brawny/daemon/loops.py +166 -98
  48. brawny/daemon/supervisor.py +261 -0
  49. brawny/db/__init__.py +14 -26
  50. brawny/db/base.py +248 -151
  51. brawny/db/global_cache.py +11 -1
  52. brawny/db/migrate.py +175 -28
  53. brawny/db/migrations/001_init.sql +4 -3
  54. brawny/db/migrations/010_add_nonce_gap_index.sql +1 -1
  55. brawny/db/migrations/011_add_job_logs.sql +1 -2
  56. brawny/db/migrations/012_add_claimed_by.sql +2 -2
  57. brawny/db/migrations/013_attempt_unique.sql +10 -0
  58. brawny/db/migrations/014_add_lease_expires_at.sql +5 -0
  59. brawny/db/migrations/015_add_signer_alias.sql +14 -0
  60. brawny/db/migrations/016_runtime_controls_and_quarantine.sql +32 -0
  61. brawny/db/migrations/017_add_job_drain.sql +6 -0
  62. brawny/db/migrations/018_add_nonce_reset_audit.sql +20 -0
  63. brawny/db/migrations/019_add_job_cooldowns.sql +8 -0
  64. brawny/db/migrations/020_attempt_unique_initial.sql +7 -0
  65. brawny/db/ops/__init__.py +3 -25
  66. brawny/db/ops/logs.py +1 -2
  67. brawny/db/queries.py +47 -91
  68. brawny/db/serialized.py +65 -0
  69. brawny/db/sqlite/__init__.py +1001 -0
  70. brawny/db/sqlite/connection.py +231 -0
  71. brawny/db/sqlite/execute.py +116 -0
  72. brawny/db/sqlite/mappers.py +190 -0
  73. brawny/db/sqlite/repos/attempts.py +372 -0
  74. brawny/db/sqlite/repos/block_state.py +102 -0
  75. brawny/db/sqlite/repos/cache.py +104 -0
  76. brawny/db/sqlite/repos/intents.py +1021 -0
  77. brawny/db/sqlite/repos/jobs.py +200 -0
  78. brawny/db/sqlite/repos/maintenance.py +182 -0
  79. brawny/db/sqlite/repos/signers_nonces.py +566 -0
  80. brawny/db/sqlite/tx.py +119 -0
  81. brawny/http.py +194 -0
  82. brawny/invariants.py +11 -24
  83. brawny/jobs/base.py +8 -0
  84. brawny/jobs/job_validation.py +2 -1
  85. brawny/keystore.py +83 -7
  86. brawny/lifecycle.py +64 -12
  87. brawny/logging.py +0 -2
  88. brawny/metrics.py +84 -12
  89. brawny/model/contexts.py +111 -9
  90. brawny/model/enums.py +1 -0
  91. brawny/model/errors.py +18 -0
  92. brawny/model/types.py +47 -131
  93. brawny/network_guard.py +133 -0
  94. brawny/networks/__init__.py +5 -5
  95. brawny/networks/config.py +1 -7
  96. brawny/networks/manager.py +14 -11
  97. brawny/runtime_controls.py +74 -0
  98. brawny/scheduler/poller.py +11 -7
  99. brawny/scheduler/reorg.py +95 -39
  100. brawny/scheduler/runner.py +442 -168
  101. brawny/scheduler/shutdown.py +3 -3
  102. brawny/script_tx.py +3 -3
  103. brawny/telegram.py +53 -7
  104. brawny/testing.py +1 -0
  105. brawny/timeout.py +38 -0
  106. brawny/tx/executor.py +922 -308
  107. brawny/tx/intent.py +54 -16
  108. brawny/tx/monitor.py +31 -12
  109. brawny/tx/nonce.py +212 -90
  110. brawny/tx/replacement.py +69 -18
  111. brawny/tx/retry_policy.py +24 -0
  112. brawny/tx/stages/types.py +75 -0
  113. brawny/types.py +18 -0
  114. brawny/utils.py +41 -0
  115. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/METADATA +3 -3
  116. brawny-0.1.22.dist-info/RECORD +163 -0
  117. brawny/_rpc/manager.py +0 -982
  118. brawny/_rpc/selector.py +0 -156
  119. brawny/db/base_new.py +0 -165
  120. brawny/db/mappers.py +0 -182
  121. brawny/db/migrations/008_add_transactions.sql +0 -72
  122. brawny/db/ops/attempts.py +0 -108
  123. brawny/db/ops/blocks.py +0 -83
  124. brawny/db/ops/cache.py +0 -93
  125. brawny/db/ops/intents.py +0 -296
  126. brawny/db/ops/jobs.py +0 -110
  127. brawny/db/ops/nonces.py +0 -322
  128. brawny/db/postgres.py +0 -2535
  129. brawny/db/postgres_new.py +0 -196
  130. brawny/db/sqlite.py +0 -2733
  131. brawny/db/sqlite_new.py +0 -191
  132. brawny-0.1.13.dist-info/RECORD +0 -141
  133. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/WHEEL +0 -0
  134. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/entry_points.txt +0 -0
  135. {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/top_level.txt +0 -0
brawny/alerts/send.py CHANGED
@@ -19,11 +19,16 @@ Usage:
 from __future__ import annotations
 
 import asyncio
+import hashlib
 import threading
+import time
+from collections import deque
 from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Callable, Coroutine
+from typing import TYPE_CHECKING, Any, Callable
+
+from cachetools import TTLCache
 
 if TYPE_CHECKING:
     from brawny.telegram import TelegramBot
@@ -31,6 +36,19 @@ if TYPE_CHECKING:
     import httpx
 
 from brawny.logging import get_logger
+from brawny.metrics import (
+    ALERTS_ENQUEUED,
+    ALERTS_DROPPED,
+    ALERTS_LAST_ERROR_TIMESTAMP,
+    ALERTS_LAST_SUCCESS_TIMESTAMP,
+    ALERTS_OLDEST_QUEUED_AGE_SECONDS,
+    ALERTS_QUEUE_DEPTH,
+    ALERTS_RETRIED,
+    ALERTS_SENT,
+    ALERTS_WORKER_ALIVE,
+    get_metrics,
+)
+from brawny.network_guard import allow_network_calls
 
 logger = get_logger(__name__)
 
@@ -70,29 +88,382 @@ class AlertConfig:
 # NOTE: No module-level httpx.AsyncClient - asyncio objects are not safe to share
 # across multiple event loops / loop lifetimes. For low-volume alerts, we create
 # a fresh client per request (httpx context manager handles cleanup).
-_last_sent: dict[str, datetime] = {}
+# Multi-threaded access - protected by _last_sent_lock
+# Medium cardinality keys (job_id:event:dest:dest_id): maxsize=10K, ttl=1h
+_last_sent: TTLCache[str, datetime] = TTLCache(maxsize=10_000, ttl=3600)
 # Use threading.Lock, not asyncio.Lock - avoids event loop binding issues
 _last_sent_lock = threading.Lock()
 
+ALERT_QUEUE_MAXSIZE = 1000
+ALERT_SEND_MAX_ATTEMPTS = 5
+ALERT_SEND_BACKOFF_BASE_SECONDS = 1.0
+ALERT_SEND_BACKOFF_MAX_SECONDS = 30.0
+ALERT_WORKER_POLL_SECONDS = 0.1
+ALERT_FLUSH_TIMEOUT_SECONDS = 3.0
+ALERT_LOG_THROTTLE_SECONDS = 60.0
+ALERT_HEALTH_MAX_OLDEST_AGE_SECONDS = 120.0
 
-async def send_alert(payload: AlertPayload, config: AlertConfig) -> None:
-    """Send alert to configured destinations. Fire-and-forget."""
-    tasks: list[Coroutine[Any, Any, None]] = []
 
+@dataclass
+class _AlertTask:
+    payload: AlertPayload
+    destination_type: str
+    destination_id: str
+    channel: str
+    enqueued_at: float
+    attempt: int = 0
+    next_attempt_at: float = 0.0
+    alert_id: str = ""
+    telegram_token: str | None = None
+    webhook_url: str | None = None
+
+
+class AlertService:
+    def __init__(
+        self,
+        *,
+        maxsize: int,
+        max_attempts: int,
+        backoff_base_seconds: float,
+        backoff_max_seconds: float,
+        health_max_oldest_age_seconds: float,
+    ) -> None:
+        self._queue: deque[_AlertTask] = deque()
+        self._delayed: list[_AlertTask] = []
+        self._maxsize = maxsize
+        self._max_attempts = max_attempts
+        self._backoff_base_seconds = backoff_base_seconds
+        self._backoff_max_seconds = backoff_max_seconds
+        self._health_max_oldest_age_seconds = health_max_oldest_age_seconds
+        self._accepting = True
+        self._stop = False
+        self._stop_deadline: float | None = None
+        self._worker_task: asyncio.Task | None = None
+        self._wakeup: asyncio.Event | None = None
+        self._worker_alive = False
+        self._last_success_ts: float | None = None
+        self._last_error_ts: float | None = None
+        self._last_error_type: str | None = None
+        self._last_error_message: str | None = None
+        self._log_throttle: dict[str, float] = {}
+
+    async def start(self) -> None:
+        if self._worker_task and not self._worker_task.done():
+            return
+        self._accepting = True
+        self._stop = False
+        self._stop_deadline = None
+        self._wakeup = asyncio.Event()
+        self._worker_task = asyncio.create_task(self._run(), name="alert-sender")
+
+    async def stop(self, flush_timeout: float) -> None:
+        self._accepting = False
+        self._stop = True
+        self._stop_deadline = time.time() + flush_timeout
+        if self._wakeup is not None:
+            self._wakeup.set()
+        task = self._worker_task
+        if task is None:
+            return
+        try:
+            await asyncio.wait_for(task, timeout=flush_timeout)
+        except asyncio.TimeoutError:
+            task.cancel()
+        self._update_queue_metrics(now=time.time())
+
+    def enqueue(self, task: _AlertTask) -> bool:
+        if not self._accepting:
+            self._record_drop("shutdown", channel=task.channel)
+            return False
+        if self._queue_size() >= self._maxsize:
+            self._record_drop("queue_full", channel=task.channel)
+            self._log_throttled(
+                "queue_full",
+                "alert.queue_full",
+                maxsize=self._maxsize,
+                channel=task.channel,
+            )
+            return False
+        self._queue.append(task)
+        metrics = get_metrics()
+        metrics.counter(ALERTS_ENQUEUED).inc()
+        self._update_queue_metrics(now=time.time())
+        if self._wakeup is not None:
+            self._wakeup.set()
+        return True
+
+    def configure_health_threshold(self, max_oldest_age_seconds: float) -> None:
+        self._health_max_oldest_age_seconds = max_oldest_age_seconds
+
+    async def _run(self) -> None:
+        self._set_worker_alive(True)
+        try:
+            while True:
+                now = time.time()
+                if self._stop and self._stop_deadline and now >= self._stop_deadline:
+                    self._drop_remaining("shutdown_timeout")
+                    break
+                self._move_due_delayed(now)
+                if self._stop and not self._queue and not self._delayed:
+                    break
+                if self._queue:
+                    task = self._queue.popleft()
+                    self._update_queue_metrics(now=now)
+                    await self._process_task(task)
+                    continue
+                wait = self._next_wait_seconds(now)
+                try:
+                    if self._wakeup is not None:
+                        self._wakeup.clear()
+                        await asyncio.wait_for(self._wakeup.wait(), timeout=wait)
+                    else:
+                        await asyncio.sleep(wait)
+                except asyncio.TimeoutError:
+                    pass
+        finally:
+            self._set_worker_alive(False)
+
+    async def _process_task(self, task: _AlertTask) -> None:
+        metrics = get_metrics()
+        task.attempt += 1
+        attempt = task.attempt
+        self._log_state(task, state="sending")
+        try:
+            await _send_task(task)
+        except Exception as exc:
+            retryable, error_type = _classify_error(exc)
+            self._record_error(error_type, str(exc))
+            if retryable and attempt < self._max_attempts:
+                metrics.counter(ALERTS_RETRIED).inc()
+                task.next_attempt_at = time.time() + _backoff_seconds(
+                    attempt,
+                    base_seconds=self._backoff_base_seconds,
+                    max_seconds=self._backoff_max_seconds,
+                )
+                self._log_state(task, state="retry_scheduled", error_type=error_type)
+                self._delayed.append(task)
+                self._update_queue_metrics(now=time.time())
+                if self._wakeup is not None:
+                    self._wakeup.set()
+                return
+            reason = "max_attempts" if attempt >= self._max_attempts else "non_retryable"
+            self._log_state(task, state="dropped", error_type=error_type)
+            self._record_drop(reason, channel=task.channel)
+            return
+
+        metrics.counter(ALERTS_SENT).inc()
+        self._record_success()
+        self._log_state(task, state="sent")
+
+    def _record_drop(self, reason: str, *, channel: str) -> None:
+        metrics = get_metrics()
+        metrics.counter(ALERTS_DROPPED).inc(reason=reason, channel=channel)
+
+    def _record_success(self) -> None:
+        self._last_success_ts = time.time()
+        metrics = get_metrics()
+        metrics.gauge(ALERTS_LAST_SUCCESS_TIMESTAMP).set(self._last_success_ts)
+
+    def _record_error(self, error_type: str, message: str) -> None:
+        self._last_error_ts = time.time()
+        self._last_error_type = error_type
+        self._last_error_message = message[:200]
+        metrics = get_metrics()
+        metrics.gauge(ALERTS_LAST_ERROR_TIMESTAMP).set(self._last_error_ts)
+
+    def _queue_size(self) -> int:
+        return len(self._queue) + len(self._delayed)
+
+    def _oldest_age_seconds(self, now: float) -> float:
+        if not self._queue and not self._delayed:
+            return 0.0
+        oldest = min(
+            [task.enqueued_at for task in self._queue]
+            + [task.enqueued_at for task in self._delayed]
+        )
+        return max(0.0, now - oldest)
+
+    def _update_queue_metrics(self, now: float) -> None:
+        metrics = get_metrics()
+        metrics.gauge(ALERTS_QUEUE_DEPTH).set(self._queue_size())
+        metrics.gauge(ALERTS_OLDEST_QUEUED_AGE_SECONDS).set(self._oldest_age_seconds(now))
+
+    def _move_due_delayed(self, now: float) -> None:
+        if not self._delayed:
+            return
+        due: list[_AlertTask] = []
+        remaining: list[_AlertTask] = []
+        for task in self._delayed:
+            if task.next_attempt_at <= now:
+                due.append(task)
+            else:
+                remaining.append(task)
+        self._delayed = remaining
+        if due:
+            self._queue.extend(due)
+            self._update_queue_metrics(now=now)
+
+    def _next_wait_seconds(self, now: float) -> float:
+        if not self._delayed:
+            return ALERT_WORKER_POLL_SECONDS
+        next_due = min(task.next_attempt_at for task in self._delayed)
+        wait = max(0.0, next_due - now)
+        return min(ALERT_WORKER_POLL_SECONDS, wait)
+
+    def _set_worker_alive(self, alive: bool) -> None:
+        self._worker_alive = alive
+        metrics = get_metrics()
+        metrics.gauge(ALERTS_WORKER_ALIVE).set(1.0 if alive else 0.0)
+
+    def _drop_remaining(self, reason: str) -> None:
+        while self._queue:
+            task = self._queue.popleft()
+            self._record_drop(reason, channel=task.channel)
+        while self._delayed:
+            task = self._delayed.pop()
+            self._record_drop(reason, channel=task.channel)
+        self._update_queue_metrics(now=time.time())
+
+    def _log_state(self, task: _AlertTask, *, state: str, error_type: str | None = None) -> None:
+        logger.info(
+            "alert.delivery_state",
+            alert_id=task.alert_id,
+            attempt=task.attempt,
+            state=state,
+            error_type=error_type,
+            channel=task.channel,
+        )
+
+    def _log_throttled(self, reason: str, event: str, **fields: object) -> None:
+        now = time.time()
+        last = self._log_throttle.get(reason)
+        if last is not None and now - last < ALERT_LOG_THROTTLE_SECONDS:
+            return
+        self._log_throttle[reason] = now
+        logger.warning(event, reason=reason, **fields)
+
+    def health_snapshot(self) -> dict[str, object]:
+        now = time.time()
+        queue_depth = self._queue_size()
+        oldest_age = self._oldest_age_seconds(now)
+        alive = self._worker_alive
+        healthy = queue_depth == 0 or (alive and oldest_age < self._health_max_oldest_age_seconds)
+        return {
+            "alive": alive,
+            "queue_depth": queue_depth,
+            "oldest_queued_age_seconds": oldest_age,
+            "healthy": healthy,
+            "last_success_timestamp": self._last_success_ts,
+            "last_error_timestamp": self._last_error_ts,
+            "last_error_type": self._last_error_type,
+            "last_error_message": self._last_error_message,
+        }
+
+
+def _make_task(
+    payload: AlertPayload,
+    *,
+    destination_type: str,
+    destination_id: str,
+    telegram_token: str | None = None,
+    webhook_url: str | None = None,
+) -> _AlertTask:
+    enqueued_at = time.time()
+    alert_id = _make_alert_id(payload, destination_type, destination_id)
+    return _AlertTask(
+        payload=payload,
+        destination_type=destination_type,
+        destination_id=destination_id,
+        channel=destination_type,
+        enqueued_at=enqueued_at,
+        next_attempt_at=enqueued_at,
+        alert_id=alert_id,
+        telegram_token=telegram_token,
+        webhook_url=webhook_url,
+    )
+
+
+def _make_alert_id(payload: AlertPayload, destination_type: str, destination_id: str) -> str:
+    raw = f"{destination_type}:{destination_id}:{payload.job_id}:{payload.event_type.value}:{payload.message}"
+    return hashlib.sha1(raw.encode("utf-8")).hexdigest()[:12]
+
+
+def _backoff_seconds(attempt: int, *, base_seconds: float, max_seconds: float) -> float:
+    return min(base_seconds * (2 ** (attempt - 1)), max_seconds)
+
+
+def _classify_error(exc: Exception) -> tuple[bool, str]:
+    if isinstance(exc, httpx.TimeoutException):
+        return True, "timeout"
+    if isinstance(exc, httpx.RequestError):
+        return True, "network_error"
+    if isinstance(exc, httpx.HTTPStatusError):
+        status = exc.response.status_code
+        error_type = f"http_{status}"
+        if status == 429 or 500 <= status < 600:
+            return True, error_type
+        if status in (400, 401, 403, 404):
+            return False, error_type
+        return False, error_type
+    return False, type(exc).__name__
+
+
+_alert_service: AlertService | None = None
+
+
+def set_alert_service(service: AlertService | None) -> None:
+    global _alert_service
+    _alert_service = service
+
+
+def _require_alert_service() -> AlertService:
+    if _alert_service is None:
+        raise RuntimeError("AlertService is not initialized")
+    return _alert_service
+
+
+async def send_alert(payload: AlertPayload, config: AlertConfig) -> None:
+    """Enqueue alert for background delivery. Never blocks core path."""
+    service = _require_alert_service()
     if config.telegram_token and config.telegram_chat_ids:
         for chat_id in config.telegram_chat_ids:
             if _should_send(payload, "telegram", chat_id, config.rate_limit_seconds):
-                tasks.append(_send_telegram(config.telegram_token, chat_id, payload))
+                service.enqueue(
+                    _make_task(
+                        payload,
+                        destination_type="telegram",
+                        destination_id=str(chat_id),
+                        telegram_token=config.telegram_token,
+                    )
+                )
 
     if config.webhook_url:
         if _should_send(payload, "webhook", config.webhook_url, config.rate_limit_seconds):
-            tasks.append(_send_webhook(config.webhook_url, payload))
+            service.enqueue(
+                _make_task(
+                    payload,
+                    destination_type="webhook",
+                    destination_id=config.webhook_url,
+                    webhook_url=config.webhook_url,
+                )
+            )
+
+
+def enqueue_alert(payload: AlertPayload, config: AlertConfig) -> None:
+    """Sync wrapper for enqueuing alerts from non-async code."""
+    from brawny.async_runtime import run_sync
+
+    run_sync(send_alert(payload, config))
+
+
+def configure_alert_worker(*, health_max_oldest_age_seconds: float | None = None) -> None:
+    service = _require_alert_service()
+    if health_max_oldest_age_seconds is not None:
+        service.configure_health_threshold(health_max_oldest_age_seconds)
 
-    if tasks:
-        results = await asyncio.gather(*tasks, return_exceptions=True)
-        for i, result in enumerate(results):
-            if isinstance(result, Exception):
-                _log_failure(payload, tasks[i], result)
+
+def get_alert_worker_health() -> dict[str, object]:
+    service = _require_alert_service()
+    return service.health_snapshot()
 
 
 def _should_send(
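
Note: to see how the pieces above fit together, the daemon would wire the new queue-based sender roughly as sketched below at startup and shutdown. This is a sketch, not code from this diff; only AlertService, set_alert_service, start()/stop() and the ALERT_* constants are taken from it, and the surrounding main() is assumed.

# Sketch only: wiring the background alert worker (startup/shutdown code assumed;
# AlertService, set_alert_service and the ALERT_* constants are the ones added above).
import asyncio

from brawny.alerts import send as alerts


async def main() -> None:
    service = alerts.AlertService(
        maxsize=alerts.ALERT_QUEUE_MAXSIZE,
        max_attempts=alerts.ALERT_SEND_MAX_ATTEMPTS,
        backoff_base_seconds=alerts.ALERT_SEND_BACKOFF_BASE_SECONDS,
        backoff_max_seconds=alerts.ALERT_SEND_BACKOFF_MAX_SECONDS,
        health_max_oldest_age_seconds=alerts.ALERT_HEALTH_MAX_OLDEST_AGE_SECONDS,
    )
    alerts.set_alert_service(service)
    await service.start()
    try:
        ...  # daemon work; send_alert()/enqueue_alert() only enqueue and return
    finally:
        await service.stop(flush_timeout=alerts.ALERT_FLUSH_TIMEOUT_SECONDS)


asyncio.run(main())
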
@@ -131,9 +502,10 @@ async def _send_telegram(token: str, chat_id: str, payload: AlertPayload) -> None:
         "parse_mode": parse_mode,
         "disable_web_page_preview": True,
     }
-    async with httpx.AsyncClient(timeout=10.0) as client:
-        resp = await client.post(url, json=data)
-        resp.raise_for_status()
+    with allow_network_calls(reason="alerts"):
+        async with httpx.AsyncClient(timeout=10.0) as client:
+            resp = await client.post(url, json=data)
+            resp.raise_for_status()
 
 
 async def _send_webhook(url: str, payload: AlertPayload) -> None:
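
Note: allow_network_calls comes from the new brawny/network_guard.py listed in the file table above; its implementation is not part of this hunk. One plausible shape, assuming a context-variable opt-in flag (purely illustrative, not the real module), would be:

# Illustrative only: a contextvar-based guard in the spirit of
# brawny.network_guard.allow_network_calls (the actual module is not shown here).
from contextlib import contextmanager
from contextvars import ContextVar
from typing import Iterator

_network_reason: ContextVar[str | None] = ContextVar("network_reason", default=None)


@contextmanager
def allow_network_calls(*, reason: str) -> Iterator[None]:
    token = _network_reason.set(reason)  # record why outbound HTTP is permitted
    try:
        yield
    finally:
        _network_reason.reset(token)


def network_allowed() -> bool:
    return _network_reason.get() is not None
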
@@ -149,48 +521,42 @@ async def _send_webhook(url: str, payload: AlertPayload) -> None:
 
     Do not add fields without versioning discussion.
     """
-    async with httpx.AsyncClient(timeout=10.0) as client:
-        resp = await client.post(
-            url,
-            json={
-                "job_id": payload.job_id,
-                "job_name": payload.job_name,
-                "event_type": payload.event_type.value,
-                "message": payload.message,
-                "chain_id": payload.chain_id,
-                "timestamp": payload.timestamp.isoformat() + "Z",
-            },
-        )
-        resp.raise_for_status()
+    with allow_network_calls(reason="alerts"):
+        async with httpx.AsyncClient(timeout=10.0) as client:
+            resp = await client.post(
+                url,
+                json={
+                    "job_id": payload.job_id,
+                    "job_name": payload.job_name,
+                    "event_type": payload.event_type.value,
+                    "message": payload.message,
+                    "chain_id": payload.chain_id,
+                    "timestamp": payload.timestamp.isoformat() + "Z",
+                },
+            )
+            resp.raise_for_status()
 
 
-def _log_failure(payload: AlertPayload, task: Coroutine[Any, Any, None], error: Exception) -> None:
-    """Log alert failure with enough context to debug."""
-    task_name = task.__qualname__ if hasattr(task, "__qualname__") else str(task)
+async def _send_task(task: _AlertTask) -> None:
+    if task.destination_type == "telegram":
+        if task.telegram_token is None:
+            raise RuntimeError("telegram_token is required")
+        await _send_telegram(task.telegram_token, task.destination_id, task.payload)
+        return
+    if task.destination_type == "webhook":
+        if task.webhook_url is None:
+            raise RuntimeError("webhook_url is required")
+        await _send_webhook(task.webhook_url, task.payload)
+        return
+    raise RuntimeError(f"Unknown destination type: {task.destination_type}")
 
-    if "telegram" in task_name.lower():
-        logger.warning(
-            "alert_delivery_failed",
-            job_id=payload.job_id,
-            event_type=payload.event_type.value,
-            destination="telegram",
-            error=str(error),
-        )
-    elif "webhook" in task_name.lower():
-        logger.warning(
-            "alert_delivery_failed",
-            job_id=payload.job_id,
-            event_type=payload.event_type.value,
-            destination="webhook",
-            error=str(error),
-        )
-    else:
-        logger.warning(
-            "alert_delivery_failed",
-            job_id=payload.job_id,
-            event_type=payload.event_type.value,
-            error=str(error),
-        )
+
+def flush_alert_queue(timeout_seconds: float | None = None) -> None:
+    timeout = ALERT_FLUSH_TIMEOUT_SECONDS if timeout_seconds is None else timeout_seconds
+    service = _require_alert_service()
+    from brawny.async_runtime import run_sync
+
+    run_sync(service.stop(flush_timeout=timeout))
 
 
 # =============================================================================
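
Note: with the defaults added above (base 1.0s, cap 30.0s, 5 attempts), the retry schedule works out to delays of 1s, 2s, 4s and 8s after the first four failed attempts; the fifth failure drops the alert, so the 30s cap only matters for larger attempt counts. A quick standalone check (hypothetical snippet mirroring _backoff_seconds):

# Hypothetical check of the backoff schedule implied by the constants above.
def backoff_seconds(attempt: int, *, base_seconds: float = 1.0, max_seconds: float = 30.0) -> float:
    return min(base_seconds * (2 ** (attempt - 1)), max_seconds)


for attempt in range(1, 6):
    # attempts 1-4 schedule a retry; attempt 5 hits ALERT_SEND_MAX_ATTEMPTS and is dropped
    print(attempt, backoff_seconds(attempt))  # 1.0, 2.0, 4.0, 8.0, 16.0
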
@@ -299,7 +665,9 @@ async def _send_alert_logged(payload: AlertPayload, config: AlertConfig) -> None:
 # =============================================================================
 
 # Separate rate limiting for health alerts (prevents job alert noise from blocking health)
-_health_last_sent: dict[str, datetime] = {}
+# Multi-threaded access - protected by _health_lock
+# Low cardinality keys (chat IDs): maxsize=1K, ttl=1h
+_health_last_sent: TTLCache[str, datetime] = TTLCache(maxsize=1_000, ttl=3600)
 _health_lock = threading.Lock()
 
 HEALTH_RATE_LIMIT_SECONDS = 1.0  # Min interval between health messages to same chat
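
Note: the dict to TTLCache swap here (and for _last_sent earlier) bounds the rate-limit bookkeeping, while reads and writes still go through the existing locks. Reduced to its core, the check looks roughly like this (illustrative, not the module's _should_send or health helpers verbatim):

# Illustrative rate-limit check against a lock-protected TTLCache.
from __future__ import annotations

import threading
from datetime import datetime, timedelta

from cachetools import TTLCache

_last: TTLCache[str, datetime] = TTLCache(maxsize=1_000, ttl=3600)
_lock = threading.Lock()


def rate_limited(key: str, min_interval_seconds: float) -> bool:
    now = datetime.now()
    with _lock:
        prev = _last.get(key)
        if prev is not None and now - prev < timedelta(seconds=min_interval_seconds):
            return True  # too soon; caller should skip this send
        _last[key] = now
        return False
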
@@ -362,3 +730,93 @@ def create_send_health(bot: "TelegramBot") -> "Callable[[str, str], None]":
         )
 
     return send_health
+
+
+# =============================================================================
+# JobAlertSender for ctx.alert() in Lifecycle Hooks
+# =============================================================================
+
+
+class JobAlertSender:
+    """Alert sender bound to a specific job's routing configuration.
+
+    Used by lifecycle contexts (TriggerContext, SuccessContext, FailureContext)
+    to provide ctx.alert() that routes to job-specific destinations.
+
+    This class implements the AlertSender protocol from model.contexts.
+    """
+
+    def __init__(
+        self,
+        *,
+        telegram_bot: "TelegramBot | None",
+        telegram_config: Any,  # TelegramConfig
+        job_alert_to: list[str] | None,
+        job_id: str,
+    ) -> None:
+        """Initialize with job-specific routing.
+
+        Args:
+            telegram_bot: TelegramBot instance (None if not configured)
+            telegram_config: TelegramConfig with chats, default, parse_mode
+            job_alert_to: Job-specific alert destinations (or None for default)
+            job_id: Job ID for logging
+        """
+        self._bot = telegram_bot
+        self._tg_config = telegram_config
+        self._job_alert_to = job_alert_to
+        self._job_id = job_id
+
+    def send(
+        self,
+        message: str,
+        *,
+        to: str | list[str] | None = None,
+        parse_mode: str | None = None,
+    ) -> None:
+        """Send alert to configured destinations.
+
+        Routing priority:
+        1. `to` parameter (explicit override)
+        2. job_alert_to (job-specific config)
+        3. telegram.default (global default)
+
+        Args:
+            message: Alert text (up to 4096 characters)
+            to: Override routing target (name, ID, or list)
+            parse_mode: "Markdown", "MarkdownV2", "HTML", or None for config default
+        """
+        if not self._bot or not self._tg_config:
+            return  # Silent no-op (warned once at startup)
+
+        from brawny.alerts.routing import resolve_targets
+
+        # Determine target
+        if to is not None:
+            target = to
+        else:
+            target = self._job_alert_to
+
+        # Resolve to chat IDs
+        chat_ids = resolve_targets(
+            target,
+            self._tg_config.chats,
+            self._tg_config.default,
+            job_id=self._job_id,
+        )
+
+        if not chat_ids:
+            return  # No targets configured
+
+        # Send to each resolved chat
+        for chat_id in chat_ids:
+            effective_parse_mode = (
+                parse_mode if parse_mode is not None
+                else self._tg_config.parse_mode or "Markdown"
+            )
+            self._bot.send_message(
+                message,
+                chat_id=chat_id,
+                parse_mode=effective_parse_mode,
+                disable_web_page_preview=True,
+            )
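
Note: JobAlertSender is what backs ctx.alert() in lifecycle hooks. The routing priority from its docstring plays out roughly as below; the bot/config objects and the example values ("oncall", "ops", cfg.telegram, the job id) are assumed, not taken from this diff.

# Sketch: JobAlertSender routing (bot, cfg and the chat names are assumed).
sender = JobAlertSender(
    telegram_bot=bot,              # TelegramBot built at startup, or None
    telegram_config=cfg.telegram,  # provides .chats, .default, .parse_mode
    job_alert_to=["oncall"],       # job-specific alert_to from the job config
    job_id="rebalance-vault",
)

# 1. Explicit override wins: resolve_targets() maps "ops" to its chat ID(s).
sender.send("manual check requested", to="ops")

# 2. No override: falls back to the job's alert_to (["oncall"]).
sender.send("rebalance executed", parse_mode="HTML")

# 3. With no job alert_to, telegram.default would be used; with no bot
#    configured at all, send() is a silent no-op.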