brawny 0.1.13__py3-none-any.whl → 0.1.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- brawny/__init__.py +2 -0
- brawny/_context.py +5 -5
- brawny/_rpc/__init__.py +36 -12
- brawny/_rpc/broadcast.py +14 -13
- brawny/_rpc/caller.py +243 -0
- brawny/_rpc/client.py +539 -0
- brawny/_rpc/clients.py +11 -11
- brawny/_rpc/context.py +23 -0
- brawny/_rpc/errors.py +465 -31
- brawny/_rpc/gas.py +7 -6
- brawny/_rpc/pool.py +18 -0
- brawny/_rpc/retry.py +266 -0
- brawny/_rpc/retry_policy.py +81 -0
- brawny/accounts.py +28 -9
- brawny/alerts/__init__.py +15 -18
- brawny/alerts/abi_resolver.py +212 -36
- brawny/alerts/base.py +2 -2
- brawny/alerts/contracts.py +77 -10
- brawny/alerts/errors.py +30 -3
- brawny/alerts/events.py +38 -5
- brawny/alerts/health.py +19 -13
- brawny/alerts/send.py +513 -55
- brawny/api.py +39 -11
- brawny/assets/AGENTS.md +325 -0
- brawny/async_runtime.py +48 -0
- brawny/chain.py +3 -3
- brawny/cli/commands/__init__.py +2 -0
- brawny/cli/commands/console.py +69 -19
- brawny/cli/commands/contract.py +2 -2
- brawny/cli/commands/controls.py +121 -0
- brawny/cli/commands/health.py +2 -2
- brawny/cli/commands/job_dev.py +6 -5
- brawny/cli/commands/jobs.py +99 -2
- brawny/cli/commands/maintenance.py +13 -29
- brawny/cli/commands/migrate.py +1 -0
- brawny/cli/commands/run.py +10 -3
- brawny/cli/commands/script.py +8 -3
- brawny/cli/commands/signer.py +143 -26
- brawny/cli/helpers.py +0 -3
- brawny/cli_templates.py +25 -349
- brawny/config/__init__.py +4 -1
- brawny/config/models.py +43 -57
- brawny/config/parser.py +268 -57
- brawny/config/validation.py +52 -15
- brawny/daemon/context.py +4 -2
- brawny/daemon/core.py +185 -63
- brawny/daemon/loops.py +166 -98
- brawny/daemon/supervisor.py +261 -0
- brawny/db/__init__.py +14 -26
- brawny/db/base.py +248 -151
- brawny/db/global_cache.py +11 -1
- brawny/db/migrate.py +175 -28
- brawny/db/migrations/001_init.sql +4 -3
- brawny/db/migrations/010_add_nonce_gap_index.sql +1 -1
- brawny/db/migrations/011_add_job_logs.sql +1 -2
- brawny/db/migrations/012_add_claimed_by.sql +2 -2
- brawny/db/migrations/013_attempt_unique.sql +10 -0
- brawny/db/migrations/014_add_lease_expires_at.sql +5 -0
- brawny/db/migrations/015_add_signer_alias.sql +14 -0
- brawny/db/migrations/016_runtime_controls_and_quarantine.sql +32 -0
- brawny/db/migrations/017_add_job_drain.sql +6 -0
- brawny/db/migrations/018_add_nonce_reset_audit.sql +20 -0
- brawny/db/migrations/019_add_job_cooldowns.sql +8 -0
- brawny/db/migrations/020_attempt_unique_initial.sql +7 -0
- brawny/db/ops/__init__.py +3 -25
- brawny/db/ops/logs.py +1 -2
- brawny/db/queries.py +47 -91
- brawny/db/serialized.py +65 -0
- brawny/db/sqlite/__init__.py +1001 -0
- brawny/db/sqlite/connection.py +231 -0
- brawny/db/sqlite/execute.py +116 -0
- brawny/db/sqlite/mappers.py +190 -0
- brawny/db/sqlite/repos/attempts.py +372 -0
- brawny/db/sqlite/repos/block_state.py +102 -0
- brawny/db/sqlite/repos/cache.py +104 -0
- brawny/db/sqlite/repos/intents.py +1021 -0
- brawny/db/sqlite/repos/jobs.py +200 -0
- brawny/db/sqlite/repos/maintenance.py +182 -0
- brawny/db/sqlite/repos/signers_nonces.py +566 -0
- brawny/db/sqlite/tx.py +119 -0
- brawny/http.py +194 -0
- brawny/invariants.py +11 -24
- brawny/jobs/base.py +8 -0
- brawny/jobs/job_validation.py +2 -1
- brawny/keystore.py +83 -7
- brawny/lifecycle.py +64 -12
- brawny/logging.py +0 -2
- brawny/metrics.py +84 -12
- brawny/model/contexts.py +111 -9
- brawny/model/enums.py +1 -0
- brawny/model/errors.py +18 -0
- brawny/model/types.py +47 -131
- brawny/network_guard.py +133 -0
- brawny/networks/__init__.py +5 -5
- brawny/networks/config.py +1 -7
- brawny/networks/manager.py +14 -11
- brawny/runtime_controls.py +74 -0
- brawny/scheduler/poller.py +11 -7
- brawny/scheduler/reorg.py +95 -39
- brawny/scheduler/runner.py +442 -168
- brawny/scheduler/shutdown.py +3 -3
- brawny/script_tx.py +3 -3
- brawny/telegram.py +53 -7
- brawny/testing.py +1 -0
- brawny/timeout.py +38 -0
- brawny/tx/executor.py +922 -308
- brawny/tx/intent.py +54 -16
- brawny/tx/monitor.py +31 -12
- brawny/tx/nonce.py +212 -90
- brawny/tx/replacement.py +69 -18
- brawny/tx/retry_policy.py +24 -0
- brawny/tx/stages/types.py +75 -0
- brawny/types.py +18 -0
- brawny/utils.py +41 -0
- {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/METADATA +3 -3
- brawny-0.1.22.dist-info/RECORD +163 -0
- brawny/_rpc/manager.py +0 -982
- brawny/_rpc/selector.py +0 -156
- brawny/db/base_new.py +0 -165
- brawny/db/mappers.py +0 -182
- brawny/db/migrations/008_add_transactions.sql +0 -72
- brawny/db/ops/attempts.py +0 -108
- brawny/db/ops/blocks.py +0 -83
- brawny/db/ops/cache.py +0 -93
- brawny/db/ops/intents.py +0 -296
- brawny/db/ops/jobs.py +0 -110
- brawny/db/ops/nonces.py +0 -322
- brawny/db/postgres.py +0 -2535
- brawny/db/postgres_new.py +0 -196
- brawny/db/sqlite.py +0 -2733
- brawny/db/sqlite_new.py +0 -191
- brawny-0.1.13.dist-info/RECORD +0 -141
- {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/WHEEL +0 -0
- {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/entry_points.txt +0 -0
- {brawny-0.1.13.dist-info → brawny-0.1.22.dist-info}/top_level.txt +0 -0
brawny/alerts/send.py
CHANGED

@@ -19,11 +19,16 @@ Usage:
 from __future__ import annotations
 
 import asyncio
+import hashlib
 import threading
+import time
+from collections import deque
 from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Callable
+from typing import TYPE_CHECKING, Any, Callable
+
+from cachetools import TTLCache
 
 if TYPE_CHECKING:
     from brawny.telegram import TelegramBot
@@ -31,6 +36,19 @@ if TYPE_CHECKING:
 import httpx
 
 from brawny.logging import get_logger
+from brawny.metrics import (
+    ALERTS_ENQUEUED,
+    ALERTS_DROPPED,
+    ALERTS_LAST_ERROR_TIMESTAMP,
+    ALERTS_LAST_SUCCESS_TIMESTAMP,
+    ALERTS_OLDEST_QUEUED_AGE_SECONDS,
+    ALERTS_QUEUE_DEPTH,
+    ALERTS_RETRIED,
+    ALERTS_SENT,
+    ALERTS_WORKER_ALIVE,
+    get_metrics,
+)
+from brawny.network_guard import allow_network_calls
 
 logger = get_logger(__name__)
 
@@ -70,29 +88,382 @@ class AlertConfig:
 # NOTE: No module-level httpx.AsyncClient - asyncio objects are not safe to share
 # across multiple event loops / loop lifetimes. For low-volume alerts, we create
 # a fresh client per request (httpx context manager handles cleanup).
-
+# Multi-threaded access - protected by _last_sent_lock
+# Medium cardinality keys (job_id:event:dest:dest_id): maxsize=10K, ttl=1h
+_last_sent: TTLCache[str, datetime] = TTLCache(maxsize=10_000, ttl=3600)
 # Use threading.Lock, not asyncio.Lock - avoids event loop binding issues
 _last_sent_lock = threading.Lock()
 
+ALERT_QUEUE_MAXSIZE = 1000
+ALERT_SEND_MAX_ATTEMPTS = 5
+ALERT_SEND_BACKOFF_BASE_SECONDS = 1.0
+ALERT_SEND_BACKOFF_MAX_SECONDS = 30.0
+ALERT_WORKER_POLL_SECONDS = 0.1
+ALERT_FLUSH_TIMEOUT_SECONDS = 3.0
+ALERT_LOG_THROTTLE_SECONDS = 60.0
+ALERT_HEALTH_MAX_OLDEST_AGE_SECONDS = 120.0
 
-async def send_alert(payload: AlertPayload, config: AlertConfig) -> None:
-    """Send alert to configured destinations. Fire-and-forget."""
-    tasks: list[Coroutine[Any, Any, None]] = []
 
+@dataclass
+class _AlertTask:
+    payload: AlertPayload
+    destination_type: str
+    destination_id: str
+    channel: str
+    enqueued_at: float
+    attempt: int = 0
+    next_attempt_at: float = 0.0
+    alert_id: str = ""
+    telegram_token: str | None = None
+    webhook_url: str | None = None
+
+
+class AlertService:
+    def __init__(
+        self,
+        *,
+        maxsize: int,
+        max_attempts: int,
+        backoff_base_seconds: float,
+        backoff_max_seconds: float,
+        health_max_oldest_age_seconds: float,
+    ) -> None:
+        self._queue: deque[_AlertTask] = deque()
+        self._delayed: list[_AlertTask] = []
+        self._maxsize = maxsize
+        self._max_attempts = max_attempts
+        self._backoff_base_seconds = backoff_base_seconds
+        self._backoff_max_seconds = backoff_max_seconds
+        self._health_max_oldest_age_seconds = health_max_oldest_age_seconds
+        self._accepting = True
+        self._stop = False
+        self._stop_deadline: float | None = None
+        self._worker_task: asyncio.Task | None = None
+        self._wakeup: asyncio.Event | None = None
+        self._worker_alive = False
+        self._last_success_ts: float | None = None
+        self._last_error_ts: float | None = None
+        self._last_error_type: str | None = None
+        self._last_error_message: str | None = None
+        self._log_throttle: dict[str, float] = {}
+
+    async def start(self) -> None:
+        if self._worker_task and not self._worker_task.done():
+            return
+        self._accepting = True
+        self._stop = False
+        self._stop_deadline = None
+        self._wakeup = asyncio.Event()
+        self._worker_task = asyncio.create_task(self._run(), name="alert-sender")
+
+    async def stop(self, flush_timeout: float) -> None:
+        self._accepting = False
+        self._stop = True
+        self._stop_deadline = time.time() + flush_timeout
+        if self._wakeup is not None:
+            self._wakeup.set()
+        task = self._worker_task
+        if task is None:
+            return
+        try:
+            await asyncio.wait_for(task, timeout=flush_timeout)
+        except asyncio.TimeoutError:
+            task.cancel()
+        self._update_queue_metrics(now=time.time())
+
+    def enqueue(self, task: _AlertTask) -> bool:
+        if not self._accepting:
+            self._record_drop("shutdown", channel=task.channel)
+            return False
+        if self._queue_size() >= self._maxsize:
+            self._record_drop("queue_full", channel=task.channel)
+            self._log_throttled(
+                "queue_full",
+                "alert.queue_full",
+                maxsize=self._maxsize,
+                channel=task.channel,
+            )
+            return False
+        self._queue.append(task)
+        metrics = get_metrics()
+        metrics.counter(ALERTS_ENQUEUED).inc()
+        self._update_queue_metrics(now=time.time())
+        if self._wakeup is not None:
+            self._wakeup.set()
+        return True
+
+    def configure_health_threshold(self, max_oldest_age_seconds: float) -> None:
+        self._health_max_oldest_age_seconds = max_oldest_age_seconds
+
+    async def _run(self) -> None:
+        self._set_worker_alive(True)
+        try:
+            while True:
+                now = time.time()
+                if self._stop and self._stop_deadline and now >= self._stop_deadline:
+                    self._drop_remaining("shutdown_timeout")
+                    break
+                self._move_due_delayed(now)
+                if self._stop and not self._queue and not self._delayed:
+                    break
+                if self._queue:
+                    task = self._queue.popleft()
+                    self._update_queue_metrics(now=now)
+                    await self._process_task(task)
+                    continue
+                wait = self._next_wait_seconds(now)
+                try:
+                    if self._wakeup is not None:
+                        self._wakeup.clear()
+                        await asyncio.wait_for(self._wakeup.wait(), timeout=wait)
+                    else:
+                        await asyncio.sleep(wait)
+                except asyncio.TimeoutError:
+                    pass
+        finally:
+            self._set_worker_alive(False)
+
+    async def _process_task(self, task: _AlertTask) -> None:
+        metrics = get_metrics()
+        task.attempt += 1
+        attempt = task.attempt
+        self._log_state(task, state="sending")
+        try:
+            await _send_task(task)
+        except Exception as exc:
+            retryable, error_type = _classify_error(exc)
+            self._record_error(error_type, str(exc))
+            if retryable and attempt < self._max_attempts:
+                metrics.counter(ALERTS_RETRIED).inc()
+                task.next_attempt_at = time.time() + _backoff_seconds(
+                    attempt,
+                    base_seconds=self._backoff_base_seconds,
+                    max_seconds=self._backoff_max_seconds,
+                )
+                self._log_state(task, state="retry_scheduled", error_type=error_type)
+                self._delayed.append(task)
+                self._update_queue_metrics(now=time.time())
+                if self._wakeup is not None:
+                    self._wakeup.set()
+                return
+            reason = "max_attempts" if attempt >= self._max_attempts else "non_retryable"
+            self._log_state(task, state="dropped", error_type=error_type)
+            self._record_drop(reason, channel=task.channel)
+            return
+
+        metrics.counter(ALERTS_SENT).inc()
+        self._record_success()
+        self._log_state(task, state="sent")
+
+    def _record_drop(self, reason: str, *, channel: str) -> None:
+        metrics = get_metrics()
+        metrics.counter(ALERTS_DROPPED).inc(reason=reason, channel=channel)
+
+    def _record_success(self) -> None:
+        self._last_success_ts = time.time()
+        metrics = get_metrics()
+        metrics.gauge(ALERTS_LAST_SUCCESS_TIMESTAMP).set(self._last_success_ts)
+
+    def _record_error(self, error_type: str, message: str) -> None:
+        self._last_error_ts = time.time()
+        self._last_error_type = error_type
+        self._last_error_message = message[:200]
+        metrics = get_metrics()
+        metrics.gauge(ALERTS_LAST_ERROR_TIMESTAMP).set(self._last_error_ts)
+
+    def _queue_size(self) -> int:
+        return len(self._queue) + len(self._delayed)
+
+    def _oldest_age_seconds(self, now: float) -> float:
+        if not self._queue and not self._delayed:
+            return 0.0
+        oldest = min(
+            [task.enqueued_at for task in self._queue]
+            + [task.enqueued_at for task in self._delayed]
+        )
+        return max(0.0, now - oldest)
+
+    def _update_queue_metrics(self, now: float) -> None:
+        metrics = get_metrics()
+        metrics.gauge(ALERTS_QUEUE_DEPTH).set(self._queue_size())
+        metrics.gauge(ALERTS_OLDEST_QUEUED_AGE_SECONDS).set(self._oldest_age_seconds(now))
+
+    def _move_due_delayed(self, now: float) -> None:
+        if not self._delayed:
+            return
+        due: list[_AlertTask] = []
+        remaining: list[_AlertTask] = []
+        for task in self._delayed:
+            if task.next_attempt_at <= now:
+                due.append(task)
+            else:
+                remaining.append(task)
+        self._delayed = remaining
+        if due:
+            self._queue.extend(due)
+            self._update_queue_metrics(now=now)
+
+    def _next_wait_seconds(self, now: float) -> float:
+        if not self._delayed:
+            return ALERT_WORKER_POLL_SECONDS
+        next_due = min(task.next_attempt_at for task in self._delayed)
+        wait = max(0.0, next_due - now)
+        return min(ALERT_WORKER_POLL_SECONDS, wait)
+
+    def _set_worker_alive(self, alive: bool) -> None:
+        self._worker_alive = alive
+        metrics = get_metrics()
+        metrics.gauge(ALERTS_WORKER_ALIVE).set(1.0 if alive else 0.0)
+
+    def _drop_remaining(self, reason: str) -> None:
+        while self._queue:
+            task = self._queue.popleft()
+            self._record_drop(reason, channel=task.channel)
+        while self._delayed:
+            task = self._delayed.pop()
+            self._record_drop(reason, channel=task.channel)
+        self._update_queue_metrics(now=time.time())
+
+    def _log_state(self, task: _AlertTask, *, state: str, error_type: str | None = None) -> None:
+        logger.info(
+            "alert.delivery_state",
+            alert_id=task.alert_id,
+            attempt=task.attempt,
+            state=state,
+            error_type=error_type,
+            channel=task.channel,
+        )
+
+    def _log_throttled(self, reason: str, event: str, **fields: object) -> None:
+        now = time.time()
+        last = self._log_throttle.get(reason)
+        if last is not None and now - last < ALERT_LOG_THROTTLE_SECONDS:
+            return
+        self._log_throttle[reason] = now
+        logger.warning(event, reason=reason, **fields)
+
+    def health_snapshot(self) -> dict[str, object]:
+        now = time.time()
+        queue_depth = self._queue_size()
+        oldest_age = self._oldest_age_seconds(now)
+        alive = self._worker_alive
+        healthy = queue_depth == 0 or (alive and oldest_age < self._health_max_oldest_age_seconds)
+        return {
+            "alive": alive,
+            "queue_depth": queue_depth,
+            "oldest_queued_age_seconds": oldest_age,
+            "healthy": healthy,
+            "last_success_timestamp": self._last_success_ts,
+            "last_error_timestamp": self._last_error_ts,
+            "last_error_type": self._last_error_type,
+            "last_error_message": self._last_error_message,
+        }
+
+
+def _make_task(
+    payload: AlertPayload,
+    *,
+    destination_type: str,
+    destination_id: str,
+    telegram_token: str | None = None,
+    webhook_url: str | None = None,
+) -> _AlertTask:
+    enqueued_at = time.time()
+    alert_id = _make_alert_id(payload, destination_type, destination_id)
+    return _AlertTask(
+        payload=payload,
+        destination_type=destination_type,
+        destination_id=destination_id,
+        channel=destination_type,
+        enqueued_at=enqueued_at,
+        next_attempt_at=enqueued_at,
+        alert_id=alert_id,
+        telegram_token=telegram_token,
+        webhook_url=webhook_url,
+    )
+
+
+def _make_alert_id(payload: AlertPayload, destination_type: str, destination_id: str) -> str:
+    raw = f"{destination_type}:{destination_id}:{payload.job_id}:{payload.event_type.value}:{payload.message}"
+    return hashlib.sha1(raw.encode("utf-8")).hexdigest()[:12]
+
+
+def _backoff_seconds(attempt: int, *, base_seconds: float, max_seconds: float) -> float:
+    return min(base_seconds * (2 ** (attempt - 1)), max_seconds)
+
+
+def _classify_error(exc: Exception) -> tuple[bool, str]:
+    if isinstance(exc, httpx.TimeoutException):
+        return True, "timeout"
+    if isinstance(exc, httpx.RequestError):
+        return True, "network_error"
+    if isinstance(exc, httpx.HTTPStatusError):
+        status = exc.response.status_code
+        error_type = f"http_{status}"
+        if status == 429 or 500 <= status < 600:
+            return True, error_type
+        if status in (400, 401, 403, 404):
+            return False, error_type
+        return False, error_type
+    return False, type(exc).__name__
+
+_alert_service: AlertService | None = None
+
+
+def set_alert_service(service: AlertService | None) -> None:
+    global _alert_service
+    _alert_service = service
+
+
+def _require_alert_service() -> AlertService:
+    if _alert_service is None:
+        raise RuntimeError("AlertService is not initialized")
+    return _alert_service
+
+
+async def send_alert(payload: AlertPayload, config: AlertConfig) -> None:
+    """Enqueue alert for background delivery. Never blocks core path."""
+    service = _require_alert_service()
     if config.telegram_token and config.telegram_chat_ids:
         for chat_id in config.telegram_chat_ids:
             if _should_send(payload, "telegram", chat_id, config.rate_limit_seconds):
-
+                service.enqueue(
+                    _make_task(
+                        payload,
+                        destination_type="telegram",
+                        destination_id=str(chat_id),
+                        telegram_token=config.telegram_token,
+                    )
+                )
 
     if config.webhook_url:
         if _should_send(payload, "webhook", config.webhook_url, config.rate_limit_seconds):
-
+            service.enqueue(
+                _make_task(
+                    payload,
+                    destination_type="webhook",
+                    destination_id=config.webhook_url,
+                    webhook_url=config.webhook_url,
+                )
+            )
+
+
+def enqueue_alert(payload: AlertPayload, config: AlertConfig) -> None:
+    """Sync wrapper for enqueuing alerts from non-async code."""
+    from brawny.async_runtime import run_sync
+
+    run_sync(send_alert(payload, config))
+
+
+def configure_alert_worker(*, health_max_oldest_age_seconds: float | None = None) -> None:
+    service = _require_alert_service()
+    if health_max_oldest_age_seconds is not None:
+        service.configure_health_threshold(health_max_oldest_age_seconds)
 
-
-
-
-
-            _log_failure(payload, tasks[i], result)
+
+def get_alert_worker_health() -> dict[str, object]:
+    service = _require_alert_service()
+    return service.health_snapshot()
 
 
 def _should_send(
@@ -131,9 +502,10 @@ async def _send_telegram(token: str, chat_id: str, payload: AlertPayload) -> None:
         "parse_mode": parse_mode,
         "disable_web_page_preview": True,
     }
-
-
-
+    with allow_network_calls(reason="alerts"):
+        async with httpx.AsyncClient(timeout=10.0) as client:
+            resp = await client.post(url, json=data)
+            resp.raise_for_status()
 
 
 async def _send_webhook(url: str, payload: AlertPayload) -> None:
@@ -149,48 +521,42 @@ async def _send_webhook(url: str, payload: AlertPayload) -> None:
 
     Do not add fields without versioning discussion.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
+    with allow_network_calls(reason="alerts"):
+        async with httpx.AsyncClient(timeout=10.0) as client:
+            resp = await client.post(
+                url,
+                json={
+                    "job_id": payload.job_id,
+                    "job_name": payload.job_name,
+                    "event_type": payload.event_type.value,
+                    "message": payload.message,
+                    "chain_id": payload.chain_id,
+                    "timestamp": payload.timestamp.isoformat() + "Z",
+                },
+            )
+            resp.raise_for_status()
 
 
-def
-
-
+async def _send_task(task: _AlertTask) -> None:
+    if task.destination_type == "telegram":
+        if task.telegram_token is None:
+            raise RuntimeError("telegram_token is required")
+        await _send_telegram(task.telegram_token, task.destination_id, task.payload)
+        return
+    if task.destination_type == "webhook":
+        if task.webhook_url is None:
+            raise RuntimeError("webhook_url is required")
+        await _send_webhook(task.webhook_url, task.payload)
+        return
+    raise RuntimeError(f"Unknown destination type: {task.destination_type}")
 
-
-
-
-
-
-
-
-        )
-    elif "webhook" in task_name.lower():
-        logger.warning(
-            "alert_delivery_failed",
-            job_id=payload.job_id,
-            event_type=payload.event_type.value,
-            destination="webhook",
-            error=str(error),
-        )
-    else:
-        logger.warning(
-            "alert_delivery_failed",
-            job_id=payload.job_id,
-            event_type=payload.event_type.value,
-            error=str(error),
-        )
+
+def flush_alert_queue(timeout_seconds: float | None = None) -> None:
+    timeout = ALERT_FLUSH_TIMEOUT_SECONDS if timeout_seconds is None else timeout_seconds
+    service = _require_alert_service()
+    from brawny.async_runtime import run_sync
+
+    run_sync(service.stop(flush_timeout=timeout))
 
 
 # =============================================================================
@@ -299,7 +665,9 @@ async def _send_alert_logged(payload: AlertPayload, config: AlertConfig) -> None:
 # =============================================================================
 
 # Separate rate limiting for health alerts (prevents job alert noise from blocking health)
-
+# Multi-threaded access - protected by _health_lock
+# Low cardinality keys (chat IDs): maxsize=1K, ttl=1h
+_health_last_sent: TTLCache[str, datetime] = TTLCache(maxsize=1_000, ttl=3600)
 _health_lock = threading.Lock()
 
 HEALTH_RATE_LIMIT_SECONDS = 1.0  # Min interval between health messages to same chat
@@ -362,3 +730,93 @@ def create_send_health(bot: "TelegramBot") -> "Callable[[str, str], None]":
         )
 
     return send_health
+
+
+# =============================================================================
+# JobAlertSender for ctx.alert() in Lifecycle Hooks
+# =============================================================================
+
+
+class JobAlertSender:
+    """Alert sender bound to a specific job's routing configuration.
+
+    Used by lifecycle contexts (TriggerContext, SuccessContext, FailureContext)
+    to provide ctx.alert() that routes to job-specific destinations.
+
+    This class implements the AlertSender protocol from model.contexts.
+    """
+
+    def __init__(
+        self,
+        *,
+        telegram_bot: "TelegramBot | None",
+        telegram_config: Any,  # TelegramConfig
+        job_alert_to: list[str] | None,
+        job_id: str,
+    ) -> None:
+        """Initialize with job-specific routing.
+
+        Args:
+            telegram_bot: TelegramBot instance (None if not configured)
+            telegram_config: TelegramConfig with chats, default, parse_mode
+            job_alert_to: Job-specific alert destinations (or None for default)
+            job_id: Job ID for logging
+        """
+        self._bot = telegram_bot
+        self._tg_config = telegram_config
+        self._job_alert_to = job_alert_to
+        self._job_id = job_id
+
+    def send(
+        self,
+        message: str,
+        *,
+        to: str | list[str] | None = None,
+        parse_mode: str | None = None,
+    ) -> None:
+        """Send alert to configured destinations.
+
+        Routing priority:
+        1. `to` parameter (explicit override)
+        2. job_alert_to (job-specific config)
+        3. telegram.default (global default)
+
+        Args:
+            message: Alert text (up to 4096 characters)
+            to: Override routing target (name, ID, or list)
+            parse_mode: "Markdown", "MarkdownV2", "HTML", or None for config default
+        """
+        if not self._bot or not self._tg_config:
+            return  # Silent no-op (warned once at startup)
+
+        from brawny.alerts.routing import resolve_targets
+
+        # Determine target
+        if to is not None:
+            target = to
+        else:
+            target = self._job_alert_to
+
+        # Resolve to chat IDs
+        chat_ids = resolve_targets(
+            target,
+            self._tg_config.chats,
+            self._tg_config.default,
+            job_id=self._job_id,
+        )
+
+        if not chat_ids:
+            return  # No targets configured
+
+        # Send to each resolved chat
+        for chat_id in chat_ids:
+            effective_parse_mode = (
+                parse_mode if parse_mode is not None
+                else self._tg_config.parse_mode or "Markdown"
+            )
+            self._bot.send_message(
+                message,
+                chat_id=chat_id,
+                parse_mode=effective_parse_mode,
+                disable_web_page_preview=True,
+            )