rlwatch 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rlwatch/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ """rlwatch - Real-time GRPO/PPO training instability detection."""
2
+
3
# Runtime-visible package version. Keep in sync with the version of the
# published distribution (this file ships in the rlwatch 0.3.0 wheel).
__version__ = "0.3.0"
4
+
5
+ from rlwatch.core import attach, log_step, get_monitor, RLWatch
6
+ from rlwatch.config import RLWatchConfig, load_config
7
+
8
+ __all__ = ["attach", "log_step", "get_monitor", "RLWatch", "RLWatchConfig", "load_config"]
rlwatch/alerts.py ADDED
@@ -0,0 +1,483 @@
1
+ """Alert delivery channels — console, Slack, email, Discord, generic webhook.
2
+
3
+ This module is the only place in the codebase that's allowed to make network
4
+ calls (CLAUDE.md cardinal rule #4). The CI forbidden-pattern grep enforces
5
+ this — all ``urllib.request`` / ``requests`` / ``httpx`` references must live
6
+ here.
7
+
8
+ Every sender follows the same shape:
9
+ - Constructed with config, holds no global state.
10
+ - ``send(alert, run_id)`` is called from a daemon thread by ``AlertManager``.
11
+ - Catches and logs every exception. **Never raises into the training loop.**
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import logging
18
+ import smtplib
19
+ import string
20
+ import threading
21
+ from datetime import datetime, timezone
22
+ from email.mime.multipart import MIMEMultipart
23
+ from email.mime.text import MIMEText
24
+ from typing import Optional
25
+
26
+ from rlwatch.config import AlertConfig, DiscordConfig, WebhookConfig
27
+ from rlwatch.detectors import Alert
28
+
29
+ logger = logging.getLogger("rlwatch.alerts")
30
+
31
+
32
class AlertManager:
    """Gate alerts behind a per-run cap and a cooldown, then fan them out.

    One sender object is created per enabled channel at construction time.
    ``send`` always logs to the console and hands each network channel its
    own daemon thread so delivery never blocks the caller.
    """

    def __init__(self, config: AlertConfig, run_id: str = ""):
        self.config = config
        self.run_id = run_id
        # Number of alerts that passed ``should_send`` so far.
        self._alert_count = 0
        # Step at which each (detector, severity) pair last fired. Keeping
        # severities separate means a critical is not muted by the cooldown
        # of an earlier warning from the same detector.
        self._last_alert_step: dict[tuple[str, str], int] = {}
        # Per-detector bookkeeping slot; nothing in this class reads or
        # updates it after construction.
        self._last_warning_step: dict[str, int] = {}

        # A channel gets a sender only when it is enabled *and* has a target.
        self._slack_client: Optional[_SlackSender] = (
            _SlackSender(config.slack.webhook_url)
            if config.slack.enabled and config.slack.webhook_url
            else None
        )
        self._email_client: Optional[_EmailSender] = (
            _EmailSender(
                host=config.email.smtp_host,
                port=config.email.smtp_port,
                user=config.email.smtp_user,
                password=config.email.smtp_password,
                from_addr=config.email.from_addr,
                to_addrs=config.email.to_addrs,
            )
            if config.email.enabled and config.email.to_addrs
            else None
        )
        self._discord_client: Optional[_DiscordSender] = (
            _DiscordSender(config.discord)
            if config.discord.enabled and config.discord.webhook_url
            else None
        )
        self._webhook_client: Optional[_WebhookSender] = (
            _WebhookSender(config.webhook)
            if config.webhook.enabled and config.webhook.url
            else None
        )

    def should_send(self, alert: Alert) -> bool:
        """Return True when ``alert`` is allowed out right now.

        Two gates apply, in order:

        * the run-wide budget ``config.max_alerts_per_run``;
        * a cooldown of ``config.cooldown_steps`` steps, tracked separately
          for every (detector, severity) pair. Because the key includes the
          severity, a critical alert is not held back by a recent warning
          from the same detector, but it still honours its own cooldown.
        """
        settings = self.config
        if self._alert_count >= settings.max_alerts_per_run:
            return False

        cooldown = settings.cooldown_steps
        # Sentinel far enough in the past that a first alert always passes.
        never_fired = -cooldown - 1
        previous_step = self._last_alert_step.get(
            (alert.detector, alert.severity), never_fired
        )
        return not (alert.step - previous_step < cooldown)

    def send(self, alert: Alert) -> bool:
        """Deliver ``alert`` on every configured channel without blocking.

        Returns False when the alert was suppressed by ``should_send``,
        True once it has been logged and dispatched.
        """
        if not self.should_send(alert):
            return False

        self._alert_count += 1
        self._last_alert_step[(alert.detector, alert.severity)] = alert.step

        # The console always gets the alert, synchronously.
        _log_alert_console(alert, self.run_id)

        # Each network channel runs in its own daemon thread.
        channels = (
            self._slack_client,
            self._email_client,
            self._discord_client,
            self._webhook_client,
        )
        for channel in channels:
            if channel:
                threading.Thread(
                    target=channel.send,
                    args=(alert, self.run_id),
                    daemon=True,
                ).start()

        return True

    @property
    def total_alerts_sent(self) -> int:
        """Count of alerts that were not suppressed."""
        return self._alert_count
138
+
139
+
140
def _log_alert_console(alert: Alert, run_id: str):
    """Print an alert to stderr as a Rich panel, or log it as plain text.

    This runs synchronously inside ``AlertManager.send``, so it must not
    raise. Dynamic text (detector, run id, message, recommendation) is
    escaped before being placed in Rich markup; otherwise a stray ``[...]``
    in an alert message would be parsed as a style tag and could raise a
    markup error. If Rich is not installed, or rendering fails for any
    other reason, the alert is written through the module logger instead.

    Args:
        alert: The alert to display.
        run_id: Identifier of the training run, shown next to the step.
    """
    try:
        from rich.console import Console
        from rich.markup import escape
        from rich.panel import Panel

        severity_color = "red" if alert.severity == "critical" else "yellow"
        title = (
            f"[bold {severity_color}]rlwatch {alert.severity.upper()}: "
            f"{escape(str(alert.detector))}[/]"
        )
        body = (
            f"[bold]Step {alert.step}[/] | Run: {escape(str(run_id))}\n\n"
            f"{escape(str(alert.message))}\n\n"
            f"[dim]Recommendation:[/] {escape(str(alert.recommendation))}"
        )
        Console(stderr=True).print(
            Panel(body, title=title, border_style=severity_color)
        )
    except Exception:
        # Covers a missing Rich install (ImportError) as well as any
        # rendering failure: the alert is still surfaced, just unstyled.
        prefix = "CRITICAL" if alert.severity == "critical" else "WARNING"
        logger.warning(
            "[rlwatch %s] %s at step %d: %s | %s",
            prefix, alert.detector, alert.step, alert.message, alert.recommendation,
        )
162
+
163
+
164
class _SlackSender:
    """Posts alerts to a Slack incoming webhook as a block-based message."""

    def __init__(self, webhook_url: str):
        self.webhook_url = webhook_url

    def send(self, alert: Alert, run_id: str):
        """Build the Slack blocks for ``alert`` and post them.

        Any failure (missing ``slack_sdk``, network error, non-200 reply)
        is logged and swallowed so the caller is never interrupted.
        """
        try:
            from slack_sdk.webhook import WebhookClient

            client = WebhookClient(self.webhook_url)

            if alert.severity == "critical":
                emoji = ":rotating_light:"
            else:
                emoji = ":warning:"

            header_block = {
                "type": "header",
                "text": {
                    "type": "plain_text",
                    "text": f"{emoji} rlwatch {alert.severity.upper()}: {alert.detector}",
                },
            }
            run_block = {
                "type": "section",
                "fields": [
                    {"type": "mrkdwn", "text": f"*Run:* `{run_id}`"},
                    {"type": "mrkdwn", "text": f"*Step:* {alert.step}"},
                ],
            }
            message_block = {
                "type": "section",
                "text": {"type": "mrkdwn", "text": alert.message},
            }
            action_block = {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": f"*Recommended action:* {alert.recommendation}",
                },
            }
            blocks = [header_block, run_block, message_block, action_block]

            # One field per metric that actually has a value; floats are
            # shown with four decimals, everything else via str().
            metric_fields = [
                {
                    "type": "mrkdwn",
                    "text": "`{}`: {}".format(
                        name,
                        f"{value:.4f}" if isinstance(value, float) else str(value),
                    ),
                }
                for name, value in alert.metric_values.items()
                if value is not None
            ]

            if metric_fields:
                # Slack limits fields to 10
                blocks.append({"type": "section", "fields": metric_fields[:10]})

            response = client.send(blocks=blocks)
            if response.status_code != 200:
                logger.error("Slack webhook returned %d: %s", response.status_code, response.body)
        except Exception as e:
            logger.error("Failed to send Slack alert: %s", e)
228
+
229
+
230
class _EmailSender:
    """Sends alerts as a multipart (plain text + HTML) email over SMTP."""

    def __init__(
        self,
        host: str,
        port: int,
        user: str,
        password: str,
        from_addr: str,
        to_addrs: list[str],
    ):
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.from_addr = from_addr
        self.to_addrs = to_addrs

    def send(self, alert: Alert, run_id: str):
        """Build the message and deliver it; log instead of raising on failure.

        The connection is upgraded with STARTTLS, and a login is attempted
        only when both a user and a password are configured.
        """
        try:
            msg = self._build_message(alert, run_id)

            with smtplib.SMTP(self.host, self.port) as server:
                server.starttls()
                if self.user and self.password:
                    server.login(self.user, self.password)
                server.sendmail(self.from_addr, self.to_addrs, msg.as_string())

            logger.info("Email alert sent to %s", self.to_addrs)
        except Exception as e:
            logger.error("Failed to send email alert: %s", e)

    def _build_message(self, alert: Alert, run_id: str) -> MIMEMultipart:
        """Return the multipart message for ``alert`` without sending it.

        All dynamic values are HTML-escaped in the HTML part so that
        characters such as ``<``, ``>`` and ``&`` in a message, run id or
        metric name are displayed literally instead of being parsed as
        markup. The plain-text part carries the raw values.
        """
        from html import escape

        severity = alert.severity.upper()

        msg = MIMEMultipart("alternative")
        msg["Subject"] = f"[rlwatch {severity}] {alert.detector} — Run {run_id} Step {alert.step}"
        msg["From"] = self.from_addr
        msg["To"] = ", ".join(self.to_addrs)

        # Metrics without a value are omitted; floats get four decimals.
        metrics = [
            (str(name), f"{value:.4f}" if isinstance(value, float) else str(value))
            for name, value in alert.metric_values.items()
            if value is not None
        ]

        # Plain text
        text = (
            f"rlwatch {severity}: {alert.detector}\n\n"
            f"Run: {run_id}\n"
            f"Step: {alert.step}\n\n"
            f"{alert.message}\n\n"
            f"Recommendation: {alert.recommendation}\n\n"
            f"Metrics:\n"
        ) + "".join(f" {name}: {value}\n" for name, value in metrics)

        # HTML
        heading_color = "red" if alert.severity == "critical" else "orange"
        rows = "".join(
            f"<tr><td><code>{escape(name)}</code></td><td>{escape(value)}</td></tr>"
            for name, value in metrics
        )
        html_body = (
            "<html>\n"
            "<body>\n"
            f'<h2 style="color: {heading_color}">\n'
            f"rlwatch {escape(severity)}: {escape(str(alert.detector))}\n"
            "</h2>\n"
            f"<p><strong>Run:</strong> <code>{escape(str(run_id))}</code>"
            f" | <strong>Step:</strong> {escape(str(alert.step))}</p>\n"
            f"<p>{escape(str(alert.message))}</p>\n"
            f"<p><strong>Recommendation:</strong> {escape(str(alert.recommendation))}</p>\n"
            "<h3>Metrics</h3>\n"
            '<table border="1" cellpadding="5" cellspacing="0">\n'
            f"{rows}</table></body></html>"
        )

        msg.attach(MIMEText(text, "plain"))
        msg.attach(MIMEText(html_body, "html"))
        return msg
301
+
302
+
303
+ # ---------------------------------------------------------------------------
304
+ # Discord webhook sender
305
+ # ---------------------------------------------------------------------------
306
class _DiscordSender:
    """Sends alerts to a Discord channel via the webhook API.

    Discord webhooks accept JSON at ``https://discord.com/api/webhooks/{id}/{token}``
    with optional ``content`` (plain text), ``embeds`` (rich blocks), ``username``,
    and ``avatar_url`` fields. We use one embed per alert with severity-coded
    color and an emoji-prefixed title.
    """

    def __init__(self, config: DiscordConfig):
        self.config = config

    def send(self, alert: Alert, run_id: str):
        """POST the alert to the configured webhook; log instead of raising."""
        try:
            from urllib.error import HTTPError, URLError
            from urllib.request import Request, urlopen

            data = json.dumps(self._build_payload(alert, run_id)).encode("utf-8")
            req = Request(
                self.config.webhook_url,
                data=data,
                headers={"Content-Type": "application/json"},
                method="POST",
            )
            with urlopen(req, timeout=10) as resp:
                # Discord returns 204 No Content on success.
                if resp.status >= 300:
                    logger.error("Discord webhook returned %d", resp.status)
        except (HTTPError, URLError) as e:
            logger.error("Failed to send Discord alert: %s", e)
        except Exception as e:
            logger.error("Unexpected Discord send error: %s", e)

    def _build_payload(self, alert: Alert, run_id: str) -> dict:
        """Return the JSON-serialisable webhook payload for ``alert``.

        Metrics whose value is ``None`` are dropped *before* the list is
        capped at 10, so missing values never use up the cap and push real
        metrics out of the embed.
        """
        is_critical = alert.severity == "critical"
        emoji = "🚨" if is_critical else "⚠️"
        color = 0xFF0000 if is_critical else 0xFFA500

        fields = [
            {"name": "Run", "value": f"`{run_id}`", "inline": True},
            {"name": "Step", "value": str(alert.step), "inline": True},
            {
                "name": "Recommended action",
                "value": alert.recommendation,
                "inline": False,
            },
        ]
        # Discord caps embed fields at 25; keep at most 10 metrics to leave
        # headroom and stay readable.
        present = [
            (name, value)
            for name, value in alert.metric_values.items()
            if value is not None
        ]
        for name, value in present[:10]:
            formatted = f"{value:.4f}" if isinstance(value, float) else str(value)
            fields.append({"name": f"`{name}`", "value": formatted, "inline": True})

        payload: dict = {
            "username": self.config.username,
            "embeds": [
                {
                    "title": f"{emoji} rlwatch {alert.severity.upper()}: {alert.detector}",
                    "description": alert.message,
                    "color": color,
                    "fields": fields,
                }
            ],
        }
        if self.config.avatar_url:
            payload["avatar_url"] = self.config.avatar_url
        # Mention configured roles only on critical alerts so warnings
        # don't ping the on-call rotation in the middle of the night.
        if is_critical and self.config.mention_role_ids:
            payload["content"] = " ".join(
                f"<@&{rid}>" for rid in self.config.mention_role_ids
            )
        return payload
384
+
385
+
386
+ # ---------------------------------------------------------------------------
387
+ # Generic HTTP webhook sender
388
+ # ---------------------------------------------------------------------------
389
# Fallback request body for the generic webhook, used by ``_WebhookSender``
# when the configured ``template_json`` is empty. It is a ``string.Template``
# source: every ``${name}`` placeholder is replaced at send time.
# ``${step}`` and ``${metrics_json}`` are deliberately left unquoted because
# they are filled with a JSON number and a JSON object respectively; all
# other placeholders sit inside JSON string literals.
_DEFAULT_WEBHOOK_TEMPLATE = """{
"detector": "${detector}",
"severity": "${severity}",
"step": ${step},
"run_id": "${run_id}",
"message": "${message}",
"recommendation": "${recommendation}",
"metrics": ${metrics_json},
"timestamp": "${timestamp}"
}"""
399
+
400
+
401
def _json_escape(s: Optional[str]) -> str:
    """Escape a value so it can be substituted into a JSON string slot.

    Uses ``json.dumps`` and strips the surrounding quotes, which handles
    quotes, backslashes, newlines, control characters and non-ASCII text.

    Args:
        s: Text to escape. ``None`` yields an empty string. Any other
            non-string value is converted with ``str()`` first; without
            that, ``json.dumps`` would emit no surrounding quotes and the
            slice below would cut off real characters (``42`` -> ``""``).

    Returns:
        The escaped text, without enclosing double quotes.
    """
    if s is None:
        return ""
    if not isinstance(s, str):
        s = str(s)
    # json.dumps on a str always returns '"..."'; drop the two quote chars.
    return json.dumps(s)[1:-1]
411
+
412
+
413
class _WebhookSender:
    """Generic HTTP webhook sender with ``string.Template`` substitution.

    POSTs (or PUTs) a JSON body to a user-supplied URL. The body is built
    from a ``string.Template`` so users can customize the payload shape for
    whatever downstream system they're feeding (incident tracker, internal
    log aggregator, custom Slack-of-record, etc.).

    Substitutable fields:
    ${detector} — alert.detector (JSON-escaped)
    ${severity} — "critical" | "warning" (JSON-escaped)
    ${severity_upper} — "CRITICAL" | "WARNING" (JSON-escaped)
    ${step} — int (unquoted in default template — numeric slot)
    ${message} — alert.message (JSON-escaped)
    ${recommendation} — alert.recommendation (JSON-escaped)
    ${run_id} — manager run_id (JSON-escaped)
    ${timestamp} — ISO8601 UTC at send time
    ${metrics_json} — JSON object of alert.metric_values, unquoted (object
    slot); NaN and +/-Infinity values are emitted as ``null``

    The substituted body is validated with ``json.loads`` before sending.
    Invalid JSON is logged and dropped — we never POST something that won't
    parse on the other end.
    """

    def __init__(self, config: WebhookConfig):
        self.config = config

    def send(self, alert: Alert, run_id: str):
        """Render, validate and send the webhook body; log instead of raising."""
        try:
            from urllib.error import HTTPError, URLError
            from urllib.request import Request, urlopen

            body = self._render_body(alert, run_id)

            # Validate the substituted body is still parseable JSON. A
            # malformed custom template should fail loudly here, not on the
            # receiving server.
            try:
                json.loads(body)
            except json.JSONDecodeError as e:
                logger.error(
                    "Webhook template produced invalid JSON after substitution: %s",
                    e,
                )
                return

            req = Request(
                self.config.url,
                data=body.encode("utf-8"),
                # A missing/None header mapping is treated as "no extras".
                headers={
                    "Content-Type": "application/json",
                    **(self.config.headers or {}),
                },
                method=self.config.method,
            )
            with urlopen(req, timeout=self.config.timeout_seconds) as resp:
                if resp.status >= 300:
                    logger.error("Webhook returned %d", resp.status)
        except (HTTPError, URLError) as e:
            logger.error("Failed to send webhook alert: %s", e)
        except Exception as e:
            logger.error("Unexpected webhook send error: %s", e)

    @staticmethod
    def _sanitize_metrics(metric_values):
        """Replace non-finite floats with ``None`` in a metrics mapping.

        ``json.dumps`` writes NaN and infinities as the bare tokens ``NaN`` /
        ``Infinity``, which are not valid JSON. Python's own ``json.loads``
        accepts them, so the validation in ``send`` would pass while a
        strict receiver rejects the request. Anything that is not a dict is
        returned unchanged.
        """
        import math

        if not isinstance(metric_values, dict):
            return metric_values
        return {
            name: None
            if isinstance(value, float) and not math.isfinite(value)
            else value
            for name, value in metric_values.items()
        }

    def _render_body(self, alert: Alert, run_id: str) -> str:
        """Return the request body with every placeholder substituted.

        Every value that lands in a JSON string slot is escaped, including
        the detector name and severity, so a quote or backslash in any of
        them cannot break the document.
        """
        tmpl_str = self.config.template_json or _DEFAULT_WEBHOOK_TEMPLATE
        return string.Template(tmpl_str).safe_substitute(
            detector=_json_escape(alert.detector),
            severity=_json_escape(alert.severity),
            severity_upper=_json_escape(alert.severity.upper()),
            step=alert.step,
            message=_json_escape(alert.message),
            recommendation=_json_escape(alert.recommendation),
            run_id=_json_escape(run_id),
            metrics_json=json.dumps(self._sanitize_metrics(alert.metric_values)),
            timestamp=datetime.now(timezone.utc).isoformat(),
        )