alpha-engine-lib 0.32.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. alpha_engine_lib/__init__.py +3 -0
  2. alpha_engine_lib/agent_schemas.py +663 -0
  3. alpha_engine_lib/alerts.py +576 -0
  4. alpha_engine_lib/arcticdb.py +340 -0
  5. alpha_engine_lib/collector_results.py +69 -0
  6. alpha_engine_lib/cost.py +665 -0
  7. alpha_engine_lib/dates.py +273 -0
  8. alpha_engine_lib/decision_capture.py +462 -0
  9. alpha_engine_lib/ec2_spot.py +363 -0
  10. alpha_engine_lib/email_sender.py +206 -0
  11. alpha_engine_lib/eval_artifacts.py +361 -0
  12. alpha_engine_lib/logging.py +303 -0
  13. alpha_engine_lib/model_pricing.yaml +73 -0
  14. alpha_engine_lib/pillars.py +756 -0
  15. alpha_engine_lib/pipeline_status/__init__.py +70 -0
  16. alpha_engine_lib/pipeline_status/read.py +541 -0
  17. alpha_engine_lib/pipeline_status/registry.py +368 -0
  18. alpha_engine_lib/pipeline_status/templates.py +120 -0
  19. alpha_engine_lib/preflight.py +444 -0
  20. alpha_engine_lib/rag/__init__.py +39 -0
  21. alpha_engine_lib/rag/db.py +96 -0
  22. alpha_engine_lib/rag/embeddings.py +63 -0
  23. alpha_engine_lib/rag/migrations/0001_content_tsv.sql +39 -0
  24. alpha_engine_lib/rag/rerank.py +377 -0
  25. alpha_engine_lib/rag/retrieval.py +465 -0
  26. alpha_engine_lib/rag/schema.sql +65 -0
  27. alpha_engine_lib/reconcile.py +203 -0
  28. alpha_engine_lib/secrets.py +186 -0
  29. alpha_engine_lib/sources/__init__.py +35 -0
  30. alpha_engine_lib/sources/protocols.py +227 -0
  31. alpha_engine_lib/ssm_log_capture.py +274 -0
  32. alpha_engine_lib/telegram.py +165 -0
  33. alpha_engine_lib/trading_calendar.py +236 -0
  34. alpha_engine_lib/transparency.py +746 -0
  35. alpha_engine_lib/transparency_inventory.yaml +260 -0
  36. alpha_engine_lib/universe.py +83 -0
  37. alpha_engine_lib-0.32.0.dist-info/METADATA +217 -0
  38. alpha_engine_lib-0.32.0.dist-info/RECORD +40 -0
  39. alpha_engine_lib-0.32.0.dist-info/WHEEL +5 -0
  40. alpha_engine_lib-0.32.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,576 @@
1
+ """
2
+ Unified failure-surveillance fan-out for Alpha Engine modules.
3
+
4
+ Consolidation substrate for the **"fire an operator alert from a failure
5
+ site"** pattern that has appeared inline across the fleet:
6
+
7
+ * :file:`alpha-engine/infrastructure/health_checker.sh` — raw ``curl`` to
8
+ Telegram bot API
9
+ * :file:`alpha-engine-data/infrastructure/lambdas/changelog-incident-mirror/deploy.sh`
10
+ — raw ``aws sns publish`` to ``alpha-engine-alerts``
11
+ * ROADMAP L116/L117 — names 5 more Lambda-deploying repos that need the
12
+ same canary-rollback alert primitive ("Mirror in all 5 Lambda-deploying
13
+ repos … same recurrence class as ``feedback_env_regression_recurs_per_repo_spot_script``
14
+ — fix forward across all repos in one pass, not per-repo at incident time")
15
+
16
+ Per the ``~/Development/CLAUDE.md`` SOTA / institutional-approach rule
17
+ (sub-sub-rule: lift to lib when ≥2 consumers exist), this module is the
18
+ canonical Python primitive backing all consumers. Bash callers reach it
19
+ via the CLI entry (``python -m alpha_engine_lib.alerts publish ...``) —
20
+ mirrors the :mod:`alpha_engine_lib.transparency` ``--cadence daily/weekly``
21
+ CLI convention.
22
+
23
+ **Public API:**
24
+
25
+ - :func:`publish` — fan-out to both SNS (``alpha-engine-alerts`` topic →
26
+ email) and Telegram (``@nous_ergon_alerts_bot`` channel) by default.
27
+ Each channel is independently best-effort — failure in one does not
28
+ block the other. Returns a :class:`PublishResult` dataclass with the
29
+ per-channel outcome for caller observability.
30
+ - CLI: ``python -m alpha_engine_lib.alerts publish --message "..."
31
+ --severity error --source "..."``. Designed for Bash failure-trap
32
+ callers (``cleanup()`` in spot dispatchers, ``deploy.sh`` rollback
33
+ branches). Exit code is ``0`` if *either* channel succeeded, ``1`` if
34
+ *both* failed.
35
+
36
+ **Severity tiering.** ``severity`` is a free-form string that is
37
+ prepended to the message (``[ERROR] ...`` / ``[WARNING] ...``) for both
38
+ channels. Telegram pushes (``disable_notification=False``) for
39
+ ``error``/``critical``; in-channel silent for ``info``/``warning``. SNS
40
+ delivery is identical regardless of severity — downstream subscribers
41
+ choose how to fan out.
42
+
43
+ **SNS topic resolution.** Defaults to
44
+ ``arn:aws:sns:{region}:{account_id}:alpha-engine-alerts``, with
45
+ ``region`` from ``AWS_REGION``/``AWS_DEFAULT_REGION`` (fallback
46
+ ``us-east-1``) and ``account_id`` resolved via ``sts:GetCallerIdentity``.
47
+ Override with the ``--sns-topic-arn`` CLI flag or ``sns_topic_arn``
48
+ kwarg.
49
+
50
+ **Failure behavior.** Never raises. SNS errors (boto3 ``ClientError``,
51
+ network) and Telegram errors both log at WARNING and return a
52
+ :class:`PublishResult` with the failed channel marked ``ok=False``. This
53
+ is by design — the caller is already in a failure path; secondary
54
+ surveillance failure must not mask the primary error.
55
+ """
56
+
57
+ from __future__ import annotations
58
+
59
+ import hashlib
60
+ import json
61
+ import logging
62
+ import os
63
+ import sys
64
+ from dataclasses import dataclass, field
65
+ from datetime import datetime, timedelta, timezone
66
+ from typing import Final
67
+
68
# Module-level logger used by every helper in this file.
logger = logging.getLogger(__name__)

# Topic name appended to the account-scoped ARN when no explicit ARN is given.
DEFAULT_SNS_TOPIC_NAME: Final[str] = "alpha-engine-alerts"
# Region used when neither AWS_REGION nor AWS_DEFAULT_REGION is set.
DEFAULT_REGION: Final[str] = "us-east-1"
# Lowercase severities that trigger a Telegram push notification; every
# other severity is delivered silently in-channel.
SEVERITY_PUSH: Final[frozenset[str]] = frozenset(("critical", "error"))

# ── Dedup (v0.24.0) ──────────────────────────────────────────────────────────
# A caller-supplied ``dedup_key`` makes ``publish`` persist a marker object at
# ``s3://{dedup_bucket}/{DEDUP_MARKER_PREFIX}/{sha1(dedup_key)[:16]}.json``
# once a publish has succeeded. While that marker is younger than
# ``dedup_window_min`` minutes, further calls carrying the same ``dedup_key``
# are skipped. Details live in the :func:`publish` docstring.
DEFAULT_DEDUP_BUCKET: Final[str] = "alpha-engine-research"
DEDUP_MARKER_PREFIX: Final[str] = "_alerts/_dedup"
DEFAULT_DEDUP_WINDOW_MIN: Final[int] = 60
83
+
84
+
85
@dataclass
class ChannelResult:
    """Outcome of a single delivery channel within one :func:`publish` call."""

    # True when the channel reported a successful delivery.
    ok: bool
    # Free-form diagnostic text (success detail or error description).
    detail: str = ""
91
+
92
+
93
@dataclass
class PublishResult:
    """Combined per-channel outcome of one :func:`publish` call.

    The ``sns`` and ``telegram`` outcomes are tracked separately, so one
    channel may have delivered while the other failed. Callers normally
    gate on :attr:`any_ok` (at least one channel delivered); use
    :attr:`all_ok` when both channels are required.

    If the call carried a ``dedup_key`` whose earlier publish is still
    inside the dedup window, no channel is attempted and
    :attr:`dedup_skipped` is True. Both gates report True in that case,
    because the earlier successful publish already put the alert in the
    operator's hands.
    """

    # Both channels start out as "not attempted" until publish() fills them in.
    sns: ChannelResult = field(
        default_factory=lambda: ChannelResult(ok=False, detail="not attempted"),
    )
    telegram: ChannelResult = field(
        default_factory=lambda: ChannelResult(ok=False, detail="not attempted"),
    )
    # Set when the publish was suppressed by an in-window dedup marker.
    dedup_skipped: bool = False
    # Human-readable explanation accompanying a dedup skip.
    dedup_reason: str = ""

    @property
    def any_ok(self) -> bool:
        """Lenient gate: dedup skip, or at least one channel delivered."""
        return True if self.dedup_skipped else (self.sns.ok or self.telegram.ok)

    @property
    def all_ok(self) -> bool:
        """Strict gate: dedup skip, or both channels delivered."""
        return True if self.dedup_skipped else (self.sns.ok and self.telegram.ok)
125
+
126
+
127
def _resolve_sns_topic_arn(explicit: str | None) -> str | None:
    """Work out which SNS topic ARN to publish to.

    A truthy ``explicit`` value is returned untouched. Otherwise the ARN
    is assembled from the region (``AWS_REGION``, then
    ``AWS_DEFAULT_REGION``, then :data:`DEFAULT_REGION`), the account id
    reported by ``sts:GetCallerIdentity`` and
    :data:`DEFAULT_SNS_TOPIC_NAME`.

    :returns: The ARN, or ``None`` when the account id could not be
        resolved (the failure is logged at WARNING, never raised).
    """
    if explicit:
        return explicit

    # First non-empty region variable wins; fall back to the default.
    for env_var in ("AWS_REGION", "AWS_DEFAULT_REGION"):
        region = os.environ.get(env_var)
        if region:
            break
    else:
        region = DEFAULT_REGION

    try:
        import boto3

        sts_client = boto3.client("sts", region_name=region)
        identity = sts_client.get_caller_identity()
        account_id = identity["Account"]
    except Exception as exc:  # boto3 missing, STS unreachable, creds bad
        logger.warning("alerts.publish: SNS topic ARN resolution failed: %s", exc)
        return None

    return f"arn:aws:sns:{region}:{account_id}:{DEFAULT_SNS_TOPIC_NAME}"
144
+
145
+
146
def _format_message(message: str, severity: str, source: str | None) -> str:
    """Render the alert as ``[SEVERITY] source: message``.

    The severity is uppercased inside the bracketed tag. The ``source: ``
    segment is left out entirely when ``source`` is falsy.
    """
    source_prefix = f"{source}: " if source else ""
    return f"[{severity.upper()}] {source_prefix}{message}"
152
+
153
+
154
def _publish_sns(arn: str, message: str, subject: str | None = None) -> ChannelResult:
    """Publish ``message`` to the SNS topic ``arn``; never raises.

    :param arn: Full topic ARN. The client region is read from the ARN's
        fourth colon-separated field; when that field is missing or empty
        :data:`DEFAULT_REGION` is used instead.
    :param message: Message body, sent as-is.
    :param subject: Optional subject line. It is sanitised before sending
        (see inline comment) and omitted when nothing printable remains.
    :returns: ``ChannelResult(ok=True, detail=<MessageId>)`` on success;
        ``ChannelResult(ok=False, detail="sns error: ...")`` on any failure
        (boto3 missing, client error, network), which is also logged at
        WARNING.
    """
    try:
        import boto3

        # Malformed ARNs (too few fields / empty region) fall back to the
        # default region rather than failing before SNS is even contacted;
        # SNS itself then reports the invalid ARN.
        arn_fields = arn.split(":")
        region = (arn_fields[3] if len(arn_fields) > 3 else "") or DEFAULT_REGION
        client = boto3.client("sns", region_name=region)

        kwargs: dict = {"TopicArn": arn, "Message": message}
        if subject:
            # SNS rejects subjects containing line breaks or control
            # characters, so blank out *every* non-printable character
            # (not only CR/LF). AWS documents the length limit as "less
            # than 100 characters", hence the cap at 99.
            cleaned = "".join(ch if ch.isprintable() else " " for ch in subject)
            cleaned = cleaned.strip()[:99]
            if cleaned:
                kwargs["Subject"] = cleaned

        resp = client.publish(**kwargs)
        return ChannelResult(ok=True, detail=resp.get("MessageId", "<no id>"))
    except Exception as exc:
        logger.warning("alerts.publish: SNS publish failed: %s", exc)
        return ChannelResult(ok=False, detail=f"sns error: {exc!r}")
170
+
171
+
172
def _publish_telegram(message: str, severity: str) -> ChannelResult:
    """Send ``message`` through the Telegram helper; never raises.

    Severities listed in :data:`SEVERITY_PUSH` (compared case-insensitively)
    are sent with notifications enabled; anything else is sent with
    ``disable_notification=True``.

    :returns: ``ok=True`` / ``"sent"`` when ``send_message`` returned a
        truthy value, otherwise ``ok=False`` with an explanatory detail.
    """
    try:
        from alpha_engine_lib.telegram import send_message

        wants_push = severity.lower() in SEVERITY_PUSH
        delivered = send_message(message, disable_notification=not wants_push)
        if delivered:
            return ChannelResult(ok=True, detail="sent")
        return ChannelResult(ok=False, detail="send_message returned False")
    except Exception as exc:  # send_message itself never raises, but defensive
        logger.warning("alerts.publish: Telegram fan-out failed: %s", exc)
        return ChannelResult(ok=False, detail=f"telegram error: {exc!r}")
183
+
184
+
185
def _dedup_marker_key(dedup_key: str) -> str:
    """Derive the S3 object key of the marker for ``dedup_key``.

    The key is ``{DEDUP_MARKER_PREFIX}/<first 16 hex chars of SHA-1>.json``.
    Hashing keeps the stored path opaque and fixed-width, which makes
    bucket listings easier to scan; the raw ``dedup_key`` is kept in the
    marker's JSON body for debugging instead.
    """
    hasher = hashlib.sha1()
    hasher.update(dedup_key.encode("utf-8"))
    short_digest = hasher.hexdigest()[:16]
    return f"{DEDUP_MARKER_PREFIX}/{short_digest}.json"
195
+
196
+
197
def _check_dedup_marker(
    bucket: str,
    marker_key: str,
    *,
    dedup_window_min: int | None,
) -> tuple[bool, str]:
    """Check whether a recent publish for this dedup_key is still in window.

    :param bucket: S3 bucket holding the marker objects.
    :param marker_key: Object key of the marker to inspect.
    :param dedup_window_min: Window length in minutes. ``None`` means
        "forever" — any readable marker suppresses publishes indefinitely.
    :returns: ``(within_window, reason)``. ``within_window=True`` means the
        caller should skip the publish; ``False`` means proceed. ``reason``
        is a human-readable explanation for logs.

    Fail-safe: every error path (boto3 missing, S3 error, unreadable or
    malformed marker) returns ``(False, "<description>")`` so the caller
    proceeds to publish. An extra alert is preferable to silently dropping
    a real failure-surveillance event because the marker was unusable.
    """
    try:
        import boto3
        from botocore.exceptions import ClientError
    except ImportError as exc:
        return False, f"boto3 unavailable: {exc!r}"

    try:
        # Client construction is inside the guarded region too: it can
        # raise on broken botocore configuration, and this function must
        # not propagate exceptions into a caller's failure path.
        client = boto3.client("s3")
        resp = client.get_object(Bucket=bucket, Key=marker_key)
        payload = json.loads(resp["Body"].read())
    except ClientError as exc:
        code = exc.response.get("Error", {}).get("Code", "")
        if code == "NoSuchKey":
            return False, "no marker"
        logger.warning(
            "alerts.publish: dedup marker check errored (fail-safe to publish): %s",
            exc,
        )
        return False, f"marker check error: {exc!r}"
    except Exception as exc:  # network, JSON parse, client construction
        logger.warning(
            "alerts.publish: dedup marker parse failed (fail-safe to publish): %s",
            exc,
        )
        return False, f"marker parse error: {exc!r}"

    if dedup_window_min is None:
        return True, "marker exists; dedup_window_min=None (forever)"

    # Valid JSON that is not an object (list, string, number) has no
    # timestamp fields to inspect; treat it like a corrupt marker.
    if not isinstance(payload, dict):
        return False, f"marker payload is not a JSON object: {type(payload).__name__}"

    last_at_str = payload.get("last_published_at") or payload.get("first_published_at")
    if not last_at_str:
        return False, "marker missing timestamp"
    try:
        last_at = datetime.fromisoformat(last_at_str.replace("Z", "+00:00"))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError cover non-string timestamp values.
        return False, f"marker timestamp unparseable: {last_at_str!r}"
    if last_at.tzinfo is None:
        # A timestamp without an offset parses to a naive datetime, and
        # subtracting it from the aware "now" would raise TypeError.
        # Markers written by this module are UTC, so assume UTC.
        last_at = last_at.replace(tzinfo=timezone.utc)

    now = datetime.now(timezone.utc)
    elapsed = now - last_at
    window = timedelta(minutes=dedup_window_min)
    if elapsed < window:
        remaining = window - elapsed
        return True, (
            f"within {dedup_window_min}min window "
            f"(last published {int(elapsed.total_seconds())}s ago; "
            f"{int(remaining.total_seconds())}s remaining)"
        )
    return False, f"marker expired ({int(elapsed.total_seconds())}s ago > {dedup_window_min}min)"
264
+
265
+
266
def _write_dedup_marker(
    bucket: str,
    marker_key: str,
    *,
    dedup_key: str,
    formatted_message: str,
) -> None:
    """Persist (or refresh) the dedup marker after a successful publish.

    Read-modify-write: when a usable marker already exists its
    ``first_published_at`` is preserved and ``publish_count`` is
    incremented; otherwise a fresh marker is started. The marker also
    records the raw ``dedup_key``, ``last_published_at`` (now, UTC) and the
    first 200 characters of the formatted message.

    Best-effort — every failure (boto3 missing, client construction, read,
    write) is logged at WARNING and swallowed. Worst case is one duplicate
    alert on the next call within the window.

    :param bucket: S3 bucket holding the marker objects.
    :param marker_key: Object key of the marker to write.
    :param dedup_key: Caller's original dedup key, stored for debugging.
    :param formatted_message: Rendered alert text; only a preview is stored.
    """
    try:
        import boto3
        from botocore.exceptions import ClientError
    except ImportError as exc:
        logger.warning(
            "alerts.publish: dedup marker write skipped — boto3 unavailable: %s",
            exc,
        )
        return

    try:
        # Guarded so a botocore configuration error cannot escape this
        # best-effort helper.
        client = boto3.client("s3")
    except Exception as exc:  # noqa: BLE001
        logger.warning(
            "alerts.publish: dedup marker write failed "
            "(best-effort, swallowed; next call within window may re-publish): %s",
            exc,
        )
        return

    now_iso = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    # Read-modify-write so first_published_at is preserved across window
    # refreshes; publish_count grows monotonically.
    first_published_at = now_iso
    publish_count = 1
    try:
        resp = client.get_object(Bucket=bucket, Key=marker_key)
        prior = json.loads(resp["Body"].read())
        if isinstance(prior, dict):
            # Only carry over a real timestamp string; a null or otherwise
            # malformed value would poison every later marker.
            prior_first = prior.get("first_published_at")
            if isinstance(prior_first, str) and prior_first:
                first_published_at = prior_first
            publish_count = int(prior.get("publish_count", 0)) + 1
    except ClientError as exc:
        code = exc.response.get("Error", {}).get("Code", "")
        if code != "NoSuchKey":
            # Non-fatal — fall through to write a fresh marker.
            logger.warning(
                "alerts.publish: dedup marker RMW read failed (writing fresh): %s",
                exc,
            )
    except Exception as exc:  # noqa: BLE001 — corrupt marker / network
        # Previously swallowed silently; log so a persistently corrupt
        # marker is visible, then overwrite it with a fresh one.
        logger.warning(
            "alerts.publish: dedup marker RMW read unusable (writing fresh): %s",
            exc,
        )

    payload = {
        "dedup_key": dedup_key,
        "first_published_at": first_published_at,
        "last_published_at": now_iso,
        "publish_count": publish_count,
        "message_preview": formatted_message[:200],
    }
    try:
        client.put_object(
            Bucket=bucket,
            Key=marker_key,
            Body=json.dumps(payload).encode("utf-8"),
            ContentType="application/json",
        )
    except Exception as exc:  # noqa: BLE001
        logger.warning(
            "alerts.publish: dedup marker write failed "
            "(best-effort, swallowed; next call within window may re-publish): %s",
            exc,
        )
331
+
332
+
333
def publish(
    message: str,
    *,
    severity: str = "error",
    source: str | None = None,
    sns: bool = True,
    telegram: bool = True,
    sns_topic_arn: str | None = None,
    dedup_key: str | None = None,
    dedup_window_min: int | None = DEFAULT_DEDUP_WINDOW_MIN,
    dedup_bucket: str | None = None,
) -> PublishResult:
    """Fan out a failure alert to the operator-surveillance channels.

    Default: publish to both ``alpha-engine-alerts`` SNS (→ email) AND
    Telegram (``@nous_ergon_alerts_bot``). Pass ``sns=False`` /
    ``telegram=False`` to suppress individual channels (useful for
    tests, or for callers that have a narrower target).

    **Dedup** (v0.24.0). When ``dedup_key`` is provided, the call
    checks an S3 marker at
    ``s3://{dedup_bucket}/_alerts/_dedup/{sha1(dedup_key)[:16]}.json``.
    If the marker exists and the last publish for that key is within
    ``dedup_window_min`` minutes (default ``60``; ``None`` = forever),
    the publish is suppressed and :attr:`PublishResult.dedup_skipped`
    is True. After a successful fresh publish, the marker is written
    (or refreshed) with an incremented ``publish_count``. Use cases:

    - **One email per cost anomaly** even when ``evaluate.py`` runs
      multiple times for the same date — pass a deterministic
      ``dedup_key`` derived from the anomaly inputs.
    - **One alert per Lambda canary rollback episode** even when 8
      Lambda repos cascade-fail from one shared lib regression — pass
      ``dedup_key=f"canary-rollback-{lib_pin_sha}"`` so the cascading
      deploys all collapse to one operator email.
    - **Once-per-hour throttling** on noisy WARN paths — pass any
      stable key + leave the default 60min window.

    Dedup is best-effort: any error during the check — including an
    unexpected exception from the dedup machinery itself — falls through
    to publish (better an extra alert than a silent drop). Marker write
    failure after a successful publish is logged but does NOT propagate
    (worst case is one duplicate next call within window).

    :param message: The alert body. Severity tag + source prefix are
        prepended automatically (e.g. ``"[ERROR] spot_backtest.sh: <body>"``).
    :param severity: Free-form severity string (``error`` / ``critical``
        push on Telegram; everything else is silent in-channel). The tag
        is uppercased in the rendered message.
    :param source: Optional source identifier (script path, repo, Lambda
        name) inserted between the tag and the message body. Helps the
        operator triage at a glance.
    :param sns: When ``False``, skip the SNS publish entirely.
    :param telegram: When ``False``, skip the Telegram fan-out entirely.
    :param sns_topic_arn: Explicit topic ARN. Defaults to
        ``arn:aws:sns:{region}:{account_id}:alpha-engine-alerts`` resolved
        from env + STS.
    :param dedup_key: Opaque caller-chosen string. Same key + same
        window ⇒ at most one publish per window. ``None`` (default)
        disables dedup entirely; legacy callers behave unchanged.
    :param dedup_window_min: Window in minutes after which a fresh
        publish is allowed for the same ``dedup_key``. Default
        ``60``. Pass ``None`` for "forever" (publish once per
        ``dedup_key`` for the lifetime of the marker bucket).
    :param dedup_bucket: S3 bucket holding the markers. Defaults to
        ``alpha-engine-research`` (the shared corpus bucket).
    :returns: :class:`PublishResult` — caller can inspect per-channel
        outcomes. :attr:`PublishResult.any_ok` is the typical success
        gate; :attr:`PublishResult.all_ok` is the strict variant.
        On dedup-skip, :attr:`PublishResult.dedup_skipped` is True and
        :attr:`PublishResult.dedup_reason` explains why.
    """
    result = PublishResult()
    formatted = _format_message(message, severity, source)

    # ── Dedup check (pre-publish) ────────────────────────────────────────
    marker_key: str | None = None
    bucket = dedup_bucket or DEFAULT_DEDUP_BUCKET
    if dedup_key:
        # The caller is already on a failure path, so nothing in the dedup
        # machinery may raise and mask the primary error. Any unexpected
        # exception is treated as "not within window" and we publish.
        try:
            marker_key = _dedup_marker_key(dedup_key)
            within_window, reason = _check_dedup_marker(
                bucket, marker_key, dedup_window_min=dedup_window_min,
            )
        except Exception as exc:  # noqa: BLE001
            logger.warning(
                "alerts.publish: dedup check raised (fail-safe to publish): %s",
                exc,
            )
            within_window, reason = False, f"dedup check error: {exc!r}"
        if within_window:
            result.dedup_skipped = True
            result.dedup_reason = reason
            result.sns = ChannelResult(ok=False, detail="suppressed by dedup")
            result.telegram = ChannelResult(ok=False, detail="suppressed by dedup")
            logger.info(
                "alerts.publish: skipped publish for dedup_key=%r (%s)",
                dedup_key, reason,
            )
            return result

    # ── Publish ──────────────────────────────────────────────────────────
    if sns:
        arn = _resolve_sns_topic_arn(sns_topic_arn)
        if arn is None:
            result.sns = ChannelResult(ok=False, detail="topic ARN resolution failed")
        else:
            # SNS subject — concise header, falls back to severity tag.
            subject = f"Alpha Engine alert [{severity.upper()}]"
            if source:
                subject += f" — {source}"
            result.sns = _publish_sns(arn, formatted, subject=subject)

    if telegram:
        result.telegram = _publish_telegram(formatted, severity=severity)

    # ── Dedup marker write (post-publish, only if any channel succeeded) ─
    if marker_key and (result.sns.ok or result.telegram.ok):
        try:
            _write_dedup_marker(
                bucket, marker_key,
                dedup_key=dedup_key, formatted_message=formatted,
            )
        except Exception as exc:  # noqa: BLE001
            # The alert has already been delivered; a marker failure must
            # not turn that success into an exception for the caller.
            logger.warning(
                "alerts.publish: dedup marker write raised (swallowed): %s",
                exc,
            )

    return result
449
+
450
+
451
+ # ─── CLI entry ──────────────────────────────────────────────────────────────
452
+ # Designed for Bash callers that need failure surveillance from a script
453
+ # (spot dispatcher `cleanup` traps, deploy.sh rollback branches, etc.).
454
+ # Mirrors the :mod:`alpha_engine_lib.transparency` ``python -m`` pattern so
455
+ # Bash callers reach this primitive without bootstrapping a full Python
456
+ # project. Exit code is 0 if *any* channel succeeded, 1 if both failed.
457
+
458
+
459
def main(argv: list[str] | None = None) -> int:
    """CLI entry point for ``python -m alpha_engine_lib.alerts``.

    Parses the ``publish`` sub-command, forwards its options to
    :func:`publish`, and prints a one-line status to stderr.

    :param argv: Argument list to parse; ``None`` lets argparse read
        ``sys.argv[1:]``.
    :returns: Process exit code — ``0`` when :attr:`PublishResult.any_ok`
        is true (a channel delivered, or the publish was dedup-skipped),
        ``1`` otherwise. argparse itself exits via ``SystemExit`` on usage
        errors and ``--help``.
    """
    import argparse

    parser = argparse.ArgumentParser(
        prog="python -m alpha_engine_lib.alerts",
        description=(
            "Publish a failure alert to alpha-engine's operator-surveillance "
            "channels (SNS topic alpha-engine-alerts + Telegram). Designed "
            "for Bash callers — exit code 0 if any channel succeeded, 1 if "
            "both failed. Never raises."
        ),
    )
    subparsers = parser.add_subparsers(dest="cmd", required=True)

    pub = subparsers.add_parser("publish", help="Publish an alert message.")
    pub.add_argument("--message", required=True, help="Alert body text.")
    pub.add_argument(
        "--severity",
        default="error",
        help=(
            "Severity tag (default: error). 'error' and 'critical' push on "
            "Telegram; all others are silent in-channel."
        ),
    )
    pub.add_argument(
        "--source",
        default=None,
        help=(
            "Optional source identifier (script path, repo, Lambda name) "
            "rendered between the severity tag and the message body."
        ),
    )
    pub.add_argument("--no-sns", action="store_true", help="Skip SNS publish.")
    pub.add_argument("--no-telegram", action="store_true", help="Skip Telegram fan-out.")
    pub.add_argument(
        "--sns-topic-arn",
        default=None,
        help=(
            "Override the SNS topic ARN. Defaults to "
            "arn:aws:sns:{region}:{account_id}:alpha-engine-alerts."
        ),
    )
    pub.add_argument(
        "--dedup-key",
        default=None,
        # argparse %-formats help strings, so the literal percent signs in
        # the `date` example must be doubled; un-escaped they make
        # `publish --help` fail with "unsupported format character".
        help=(
            "Optional opaque dedup key. When set, ``publish`` checks an "
            "S3 marker first and suppresses the alert if an earlier "
            "publish for the same key is within --dedup-window-min. "
            "Use for cost anomalies / canary rollback episodes / any "
            "noisy WARN path that benefits from rate-limiting. Bash "
            "callers typically pass a bucketed timestamp, e.g. "
            "--dedup-key \"canary-rollback-$(date -u +%%Y%%m%%d%%H)\"."
        ),
    )
    pub.add_argument(
        "--dedup-window-min",
        type=int,
        default=DEFAULT_DEDUP_WINDOW_MIN,
        help=(
            f"Window in minutes after which a fresh publish is allowed for "
            f"the same --dedup-key (default: {DEFAULT_DEDUP_WINDOW_MIN}). "
            "Pass 0 for 'forever' (publish once per --dedup-key for the "
            "lifetime of the marker bucket)."
        ),
    )
    pub.add_argument(
        "--dedup-bucket",
        default=None,
        help=(
            f"S3 bucket holding the dedup markers. Defaults to "
            f"{DEFAULT_DEDUP_BUCKET!r}."
        ),
    )

    args = parser.parse_args(argv)

    logging.basicConfig(level=logging.WARNING)

    # CLI convention: --dedup-window-min 0 = forever; map to None for the
    # Python API (whose default is 60 + None=forever).
    window_min: int | None
    if args.dedup_window_min == 0:
        window_min = None
    else:
        window_min = args.dedup_window_min

    result = publish(
        args.message,
        severity=args.severity,
        source=args.source,
        sns=not args.no_sns,
        telegram=not args.no_telegram,
        sns_topic_arn=args.sns_topic_arn,
        dedup_key=args.dedup_key,
        dedup_window_min=window_min,
        dedup_bucket=args.dedup_bucket,
    )

    # One-line status to stderr (stdout reserved for structured output if
    # any caller starts parsing it). Bash callers can ignore.
    if result.dedup_skipped:
        print(
            f"alerts.publish: dedup_skipped=True ({result.dedup_reason})",
            file=sys.stderr,
        )
    else:
        print(
            f"alerts.publish: sns.ok={result.sns.ok} ({result.sns.detail}); "
            f"telegram.ok={result.telegram.ok} ({result.telegram.detail})",
            file=sys.stderr,
        )

    return 0 if result.any_ok else 1
573
+
574
+
575
# Script entry: hand main()'s return value to the interpreter as the exit code.
if __name__ == "__main__":
    raise SystemExit(main())