alpha-engine-lib 0.32.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alpha_engine_lib/__init__.py +3 -0
- alpha_engine_lib/agent_schemas.py +663 -0
- alpha_engine_lib/alerts.py +576 -0
- alpha_engine_lib/arcticdb.py +340 -0
- alpha_engine_lib/collector_results.py +69 -0
- alpha_engine_lib/cost.py +665 -0
- alpha_engine_lib/dates.py +273 -0
- alpha_engine_lib/decision_capture.py +462 -0
- alpha_engine_lib/ec2_spot.py +363 -0
- alpha_engine_lib/email_sender.py +206 -0
- alpha_engine_lib/eval_artifacts.py +361 -0
- alpha_engine_lib/logging.py +303 -0
- alpha_engine_lib/model_pricing.yaml +73 -0
- alpha_engine_lib/pillars.py +756 -0
- alpha_engine_lib/pipeline_status/__init__.py +70 -0
- alpha_engine_lib/pipeline_status/read.py +541 -0
- alpha_engine_lib/pipeline_status/registry.py +368 -0
- alpha_engine_lib/pipeline_status/templates.py +120 -0
- alpha_engine_lib/preflight.py +444 -0
- alpha_engine_lib/rag/__init__.py +39 -0
- alpha_engine_lib/rag/db.py +96 -0
- alpha_engine_lib/rag/embeddings.py +63 -0
- alpha_engine_lib/rag/migrations/0001_content_tsv.sql +39 -0
- alpha_engine_lib/rag/rerank.py +377 -0
- alpha_engine_lib/rag/retrieval.py +465 -0
- alpha_engine_lib/rag/schema.sql +65 -0
- alpha_engine_lib/reconcile.py +203 -0
- alpha_engine_lib/secrets.py +186 -0
- alpha_engine_lib/sources/__init__.py +35 -0
- alpha_engine_lib/sources/protocols.py +227 -0
- alpha_engine_lib/ssm_log_capture.py +274 -0
- alpha_engine_lib/telegram.py +165 -0
- alpha_engine_lib/trading_calendar.py +236 -0
- alpha_engine_lib/transparency.py +746 -0
- alpha_engine_lib/transparency_inventory.yaml +260 -0
- alpha_engine_lib/universe.py +83 -0
- alpha_engine_lib-0.32.0.dist-info/METADATA +217 -0
- alpha_engine_lib-0.32.0.dist-info/RECORD +40 -0
- alpha_engine_lib-0.32.0.dist-info/WHEEL +5 -0
- alpha_engine_lib-0.32.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,576 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Unified failure-surveillance fan-out for Alpha Engine modules.
|
|
3
|
+
|
|
4
|
+
Consolidation substrate for the **"fire an operator alert from a failure
|
|
5
|
+
site"** pattern that has appeared inline across the fleet:
|
|
6
|
+
|
|
7
|
+
* :file:`alpha-engine/infrastructure/health_checker.sh` — raw ``curl`` to
|
|
8
|
+
Telegram bot API
|
|
9
|
+
* :file:`alpha-engine-data/infrastructure/lambdas/changelog-incident-mirror/deploy.sh`
|
|
10
|
+
— raw ``aws sns publish`` to ``alpha-engine-alerts``
|
|
11
|
+
* ROADMAP L116/L117 — names 5 more Lambda-deploying repos that need the
|
|
12
|
+
same canary-rollback alert primitive ("Mirror in all 5 Lambda-deploying
|
|
13
|
+
repos … same recurrence class as ``feedback_env_regression_recurs_per_repo_spot_script``
|
|
14
|
+
— fix forward across all repos in one pass, not per-repo at incident time")
|
|
15
|
+
|
|
16
|
+
Per the ``~/Development/CLAUDE.md`` SOTA / institutional-approach rule
|
|
17
|
+
(sub-sub-rule: lift to lib when ≥2 consumers exist), this module is the
|
|
18
|
+
canonical Python primitive backing all consumers. Bash callers reach it
|
|
19
|
+
via the CLI entry (``python -m alpha_engine_lib.alerts publish ...``) —
|
|
20
|
+
mirrors the :mod:`alpha_engine_lib.transparency` ``--cadence daily/weekly``
|
|
21
|
+
CLI convention.
|
|
22
|
+
|
|
23
|
+
**Public API:**
|
|
24
|
+
|
|
25
|
+
- :func:`publish` — fan-out to both SNS (``alpha-engine-alerts`` topic →
|
|
26
|
+
email) and Telegram (``@nous_ergon_alerts_bot`` channel) by default.
|
|
27
|
+
Each channel is independently best-effort — failure in one does not
|
|
28
|
+
block the other. Returns a :class:`PublishResult` dataclass with the
|
|
29
|
+
per-channel outcome for caller observability.
|
|
30
|
+
- CLI: ``python -m alpha_engine_lib.alerts publish --message "..."
|
|
31
|
+
--severity error --source "..."``. Designed for Bash failure-trap
|
|
32
|
+
callers (``cleanup()`` in spot dispatchers, ``deploy.sh`` rollback
|
|
33
|
+
branches). Exit code is ``0`` if *either* channel succeeded, ``1`` if
|
|
34
|
+
*both* failed.
|
|
35
|
+
|
|
36
|
+
**Severity tiering.** ``severity`` is a free-form string that is
|
|
37
|
+
prepended to the message (``[ERROR] ...`` / ``[WARNING] ...``) for both
|
|
38
|
+
channels. Telegram pushes (``disable_notification=False``) for
|
|
39
|
+
``error``/``critical``; in-channel silent for ``info``/``warning``. SNS
|
|
40
|
+
delivery is identical regardless of severity — downstream subscribers
|
|
41
|
+
choose how to fan out.
|
|
42
|
+
|
|
43
|
+
**SNS topic resolution.** Defaults to
|
|
44
|
+
``arn:aws:sns:{region}:{account_id}:alpha-engine-alerts``, with
|
|
45
|
+
``region`` from ``AWS_REGION``/``AWS_DEFAULT_REGION`` (fallback
|
|
46
|
+
``us-east-1``) and ``account_id`` resolved via ``sts:GetCallerIdentity``.
|
|
47
|
+
Override with the ``--sns-topic-arn`` CLI flag or ``sns_topic_arn``
|
|
48
|
+
kwarg.
|
|
49
|
+
|
|
50
|
+
**Failure behavior.** Never raises. SNS errors (boto3 ``ClientError``,
|
|
51
|
+
network) and Telegram errors both log at WARNING and return a
|
|
52
|
+
:class:`PublishResult` with the failed channel marked ``ok=False``. This
|
|
53
|
+
is by design — the caller is already in a failure path; secondary
|
|
54
|
+
surveillance failure must not mask the primary error.
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
from __future__ import annotations
|
|
58
|
+
|
|
59
|
+
import hashlib
|
|
60
|
+
import json
|
|
61
|
+
import logging
|
|
62
|
+
import os
|
|
63
|
+
import sys
|
|
64
|
+
from dataclasses import dataclass, field
|
|
65
|
+
from datetime import datetime, timedelta, timezone
|
|
66
|
+
from typing import Final
|
|
67
|
+
|
|
68
|
+
logger = logging.getLogger(__name__)

# Name of the SNS topic used when the caller does not supply an explicit ARN.
DEFAULT_SNS_TOPIC_NAME: Final[str] = "alpha-engine-alerts"
# Region used when neither AWS_REGION nor AWS_DEFAULT_REGION is set.
DEFAULT_REGION: Final[str] = "us-east-1"
# Severities (lower-case) that trigger an audible Telegram notification;
# every other severity is delivered silently in-channel.
SEVERITY_PUSH: Final[frozenset[str]] = frozenset(("error", "critical"))

# ── Dedup (v0.24.0) ──────────────────────────────────────────────────────────
# With a ``dedup_key``, ``publish`` records a marker object at
# ``s3://{dedup_bucket}/{DEDUP_MARKER_PREFIX}/{sha1(dedup_key)[:16]}.json``
# once a publish has succeeded. Later calls carrying the same ``dedup_key``
# that arrive within ``dedup_window_min`` minutes see the marker and skip
# the publish. Details live in the :func:`publish` docstring.
DEFAULT_DEDUP_BUCKET: Final[str] = "alpha-engine-research"
DEDUP_MARKER_PREFIX: Final[str] = "_alerts/_dedup"
DEFAULT_DEDUP_WINDOW_MIN: Final[int] = 60
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@dataclass
class ChannelResult:
    """Outcome of one delivery attempt on a single alert channel.

    ``ok`` is the success flag for the channel; ``detail`` is a free-form
    note about the attempt and defaults to the empty string.
    """

    ok: bool
    detail: str = ""
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
@dataclass
class PublishResult:
    """Combined per-channel outcome of one :func:`publish` call.

    The ``sns`` and ``telegram`` results are tracked separately, so one
    channel can succeed while the other fails. Both start out as
    ``ok=False`` / ``"not attempted"``. :attr:`any_ok` is the usual gate
    (at least one channel delivered); :attr:`all_ok` is the strict form
    that requires both.

    If the call was suppressed by dedup (an earlier publish for the same
    ``dedup_key`` is still inside its window), :attr:`dedup_skipped` is
    True, no channel is attempted, and both :attr:`any_ok` and
    :attr:`all_ok` report True — the operator already has the alert from
    the earlier successful publish.
    """

    sns: ChannelResult = field(
        default_factory=lambda: ChannelResult(ok=False, detail="not attempted")
    )
    telegram: ChannelResult = field(
        default_factory=lambda: ChannelResult(ok=False, detail="not attempted")
    )
    dedup_skipped: bool = False
    dedup_reason: str = ""

    @property
    def any_ok(self) -> bool:
        """True when dedup suppressed the call or at least one channel succeeded."""
        return True if self.dedup_skipped else (self.sns.ok or self.telegram.ok)

    @property
    def all_ok(self) -> bool:
        """True when dedup suppressed the call or both channels succeeded."""
        return True if self.dedup_skipped else (self.sns.ok and self.telegram.ok)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _resolve_sns_topic_arn(explicit: str | None) -> str | None:
|
|
128
|
+
"""Return the SNS topic ARN, resolving from env + STS if not explicit."""
|
|
129
|
+
if explicit:
|
|
130
|
+
return explicit
|
|
131
|
+
region = (
|
|
132
|
+
os.environ.get("AWS_REGION")
|
|
133
|
+
or os.environ.get("AWS_DEFAULT_REGION")
|
|
134
|
+
or DEFAULT_REGION
|
|
135
|
+
)
|
|
136
|
+
try:
|
|
137
|
+
import boto3
|
|
138
|
+
|
|
139
|
+
account_id = boto3.client("sts", region_name=region).get_caller_identity()["Account"]
|
|
140
|
+
except Exception as exc: # boto3 missing, STS unreachable, creds bad
|
|
141
|
+
logger.warning("alerts.publish: SNS topic ARN resolution failed: %s", exc)
|
|
142
|
+
return None
|
|
143
|
+
return f"arn:aws:sns:{region}:{account_id}:{DEFAULT_SNS_TOPIC_NAME}"
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _format_message(message: str, severity: str, source: str | None) -> str:
|
|
147
|
+
"""Prepend severity tag + source prefix to the message body."""
|
|
148
|
+
tag = f"[{severity.upper()}]"
|
|
149
|
+
if source:
|
|
150
|
+
return f"{tag} {source}: {message}"
|
|
151
|
+
return f"{tag} {message}"
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _publish_sns(arn: str, message: str, subject: str | None = None) -> ChannelResult:
    """Publish ``message`` to the SNS topic ``arn``; best-effort, never raises.

    :param arn: Topic ARN. The client region is read from the ARN's fourth
        colon-separated field; when that field is absent or empty the
        module's ``DEFAULT_REGION`` is used instead.
    :param message: Fully formatted alert body.
    :param subject: Optional subject line. It is sanitised to satisfy the
        SNS ``Subject`` constraints (no line breaks or control characters,
        fewer than 100 characters). If nothing printable remains after
        sanitising, the subject is omitted rather than sent blank.
    :returns: ``ChannelResult(ok=True, detail=<MessageId>)`` on success, or
        ``ChannelResult(ok=False, detail="sns error: ...")`` on any failure
        (which is also logged at WARNING).
    """
    # SNS requires the subject to be *fewer than* 100 characters.
    max_subject_chars = 99
    try:
        import boto3

        # ARN layout is arn:partition:service:region:account:resource.
        arn_fields = arn.split(":")
        region = (arn_fields[3] if len(arn_fields) > 3 else "") or DEFAULT_REGION
        client = boto3.client("sns", region_name=region)
        kwargs: dict = {"TopicArn": arn, "Message": message}
        if subject:
            # Replace every C0/C1 control character (incl. \n, \r, \t, DEL)
            # with a space so the subject stays on one line.
            cleaned = "".join(
                " " if (ord(ch) < 0x20 or 0x7F <= ord(ch) <= 0x9F) else ch
                for ch in subject
            )
            cleaned = cleaned[:max_subject_chars].strip()
            if cleaned:
                kwargs["Subject"] = cleaned
        resp = client.publish(**kwargs)
        return ChannelResult(ok=True, detail=resp.get("MessageId", "<no id>"))
    except Exception as exc:
        logger.warning("alerts.publish: SNS publish failed: %s", exc)
        return ChannelResult(ok=False, detail=f"sns error: {exc!r}")
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _publish_telegram(message: str, severity: str) -> ChannelResult:
    """Send ``message`` through the Telegram helper; best-effort, never raises.

    Severities listed in ``SEVERITY_PUSH`` (compared case-insensitively) are
    sent with notifications enabled; all others are sent with
    ``disable_notification=True``. Any exception is logged at WARNING and
    reported as a failed :class:`ChannelResult`.
    """
    try:
        from alpha_engine_lib.telegram import send_message

        # Only error/critical should buzz the operator's phone.
        notify_loudly = severity.lower() in SEVERITY_PUSH
        delivered = send_message(message, disable_notification=not notify_loudly)
        if delivered:
            return ChannelResult(ok=True, detail="sent")
        return ChannelResult(ok=False, detail="send_message returned False")
    except Exception as exc:  # send_message itself never raises, but defensive
        logger.warning("alerts.publish: Telegram fan-out failed: %s", exc)
        return ChannelResult(ok=False, detail=f"telegram error: {exc!r}")
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _dedup_marker_key(dedup_key: str) -> str:
    """Map a caller-supplied ``dedup_key`` to its S3 marker object key.

    The key is ``{DEDUP_MARKER_PREFIX}/<first 16 hex chars of SHA-1>.json``.
    Hashing keeps the object name opaque and fixed-width, which makes
    operator-facing bucket listings easier to scan; the raw ``dedup_key``
    is stored inside the marker body for debugging.
    """
    hasher = hashlib.sha1()
    hasher.update(dedup_key.encode("utf-8"))
    short_digest = hasher.hexdigest()[:16]
    return f"{DEDUP_MARKER_PREFIX}/{short_digest}.json"
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _check_dedup_marker(
    bucket: str,
    marker_key: str,
    *,
    dedup_window_min: int | None,
) -> tuple[bool, str]:
    """Decide whether an earlier publish for this marker is still in window.

    :param bucket: S3 bucket holding the dedup markers.
    :param marker_key: Object key of the marker to inspect.
    :param dedup_window_min: Suppression window in minutes. ``None`` means
        "forever": any existing, readable marker suppresses the publish.
    :returns: ``(within_window, reason)``. ``within_window=True`` tells the
        caller to skip the publish; ``False`` tells it to proceed.
        ``reason`` is a human-readable explanation.

    Fail-safe: every problem (boto3 missing, S3 client construction failure,
    S3 errors other than ``NoSuchKey``, unreadable or malformed marker
    contents) yields ``(False, "<description>")`` so the caller publishes.
    An extra alert is preferable to silently dropping a real failure event
    because the marker could not be read.
    """
    try:
        import boto3
        from botocore.exceptions import ClientError
    except ImportError as exc:
        return False, f"boto3 unavailable: {exc!r}"

    # Client construction can itself fail (e.g. broken local AWS config);
    # keep it guarded so this function never raises.
    try:
        client = boto3.client("s3")
    except Exception as exc:
        logger.warning(
            "alerts.publish: dedup marker check skipped — S3 client unavailable "
            "(fail-safe to publish): %s",
            exc,
        )
        return False, f"s3 client error: {exc!r}"

    try:
        resp = client.get_object(Bucket=bucket, Key=marker_key)
        payload = json.loads(resp["Body"].read())
    except ClientError as exc:
        code = exc.response.get("Error", {}).get("Code", "")
        if code == "NoSuchKey":
            return False, "no marker"
        logger.warning(
            "alerts.publish: dedup marker check errored (fail-safe to publish): %s",
            exc,
        )
        return False, f"marker check error: {exc!r}"
    except Exception as exc:  # network, JSON parse
        logger.warning(
            "alerts.publish: dedup marker parse failed (fail-safe to publish): %s",
            exc,
        )
        return False, f"marker parse error: {exc!r}"

    if dedup_window_min is None:
        return True, "marker exists; dedup_window_min=None (forever)"

    # A marker that is valid JSON but not an object carries no timestamp.
    if not isinstance(payload, dict):
        return False, f"marker payload is not a JSON object: {type(payload).__name__}"

    last_at_str = payload.get("last_published_at") or payload.get("first_published_at")
    if not last_at_str:
        return False, "marker missing timestamp"
    try:
        last_at = datetime.fromisoformat(last_at_str.replace("Z", "+00:00"))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError cover non-string timestamp values.
        return False, f"marker timestamp unparseable: {last_at_str!r}"
    if last_at.tzinfo is None:
        # Subtracting a naive datetime from an aware one raises TypeError.
        # Markers written by _write_dedup_marker are UTC, so read a
        # zone-less timestamp as UTC.
        last_at = last_at.replace(tzinfo=timezone.utc)

    now = datetime.now(timezone.utc)
    elapsed = now - last_at
    window = timedelta(minutes=dedup_window_min)
    if elapsed < window:
        remaining = window - elapsed
        return True, (
            f"within {dedup_window_min}min window "
            f"(last published {int(elapsed.total_seconds())}s ago; "
            f"{int(remaining.total_seconds())}s remaining)"
        )
    return False, f"marker expired ({int(elapsed.total_seconds())}s ago > {dedup_window_min}min)"
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _write_dedup_marker(
    bucket: str,
    marker_key: str,
    *,
    dedup_key: str,
    formatted_message: str,
) -> None:
    """Persist (or refresh) the dedup marker after a successful publish.

    Read-modify-write: when a marker already exists its
    ``first_published_at`` is kept and ``publish_count`` is incremented;
    otherwise a fresh marker is started with a count of 1.
    ``last_published_at`` is always set to the current UTC time.

    :param bucket: S3 bucket holding the dedup markers.
    :param marker_key: Object key of the marker to write.
    :param dedup_key: Original caller-supplied key, stored in the marker
        body for debugging.
    :param formatted_message: Rendered alert text; its first 200 characters
        are stored as ``message_preview``.

    Best-effort: every failure is logged at WARNING and swallowed. The worst
    case is one duplicate alert on the next call within the window.
    """
    try:
        import boto3
        from botocore.exceptions import ClientError
    except ImportError as exc:
        logger.warning(
            "alerts.publish: dedup marker write skipped — boto3 unavailable: %s",
            exc,
        )
        return

    # Client construction can fail on broken local AWS configuration; it
    # must not escape a best-effort function.
    try:
        client = boto3.client("s3")
    except Exception as exc:
        logger.warning(
            "alerts.publish: dedup marker write skipped — S3 client unavailable: %s",
            exc,
        )
        return

    now_iso = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    # Read-modify-write so first_published_at is preserved across window
    # refreshes; publish_count grows monotonically.
    first_published_at = now_iso
    publish_count = 1
    try:
        resp = client.get_object(Bucket=bucket, Key=marker_key)
        prior = json.loads(resp["Body"].read())
        # ``or`` also covers an explicit JSON null in the prior marker.
        first_published_at = prior.get("first_published_at") or now_iso
        publish_count = int(prior.get("publish_count") or 0) + 1
    except ClientError as exc:
        code = exc.response.get("Error", {}).get("Code", "")
        if code != "NoSuchKey":
            # Non-fatal — fall through to write a fresh marker.
            logger.warning(
                "alerts.publish: dedup marker RMW read failed (writing fresh): %s",
                exc,
            )
    except Exception as exc:  # network / JSON parse / corrupt marker — overwrite
        logger.warning(
            "alerts.publish: dedup marker RMW parse failed (overwriting): %s",
            exc,
        )

    payload = {
        "dedup_key": dedup_key,
        "first_published_at": first_published_at,
        "last_published_at": now_iso,
        "publish_count": publish_count,
        "message_preview": formatted_message[:200],
    }
    try:
        client.put_object(
            Bucket=bucket,
            Key=marker_key,
            Body=json.dumps(payload).encode("utf-8"),
            ContentType="application/json",
        )
    except Exception as exc:  # noqa: BLE001
        logger.warning(
            "alerts.publish: dedup marker write failed "
            "(best-effort, swallowed; next call within window may re-publish): %s",
            exc,
        )
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def publish(
    message: str,
    *,
    severity: str = "error",
    source: str | None = None,
    sns: bool = True,
    telegram: bool = True,
    sns_topic_arn: str | None = None,
    dedup_key: str | None = None,
    dedup_window_min: int | None = DEFAULT_DEDUP_WINDOW_MIN,
    dedup_bucket: str | None = None,
) -> PublishResult:
    """Fan out a failure alert to the operator-surveillance channels.

    Default: publish to both ``alpha-engine-alerts`` SNS (→ email) AND
    Telegram (``@nous_ergon_alerts_bot``). Pass ``sns=False`` /
    ``telegram=False`` to suppress individual channels (useful for
    tests, or for callers that have a narrower target).

    **Dedup** (v0.24.0). When ``dedup_key`` is provided, the call
    checks an S3 marker at
    ``s3://{dedup_bucket}/_alerts/_dedup/{sha1(dedup_key)[:16]}.json``.
    If the marker exists and the last publish for that key is within
    ``dedup_window_min`` minutes (default ``60``; ``None`` = forever),
    the publish is suppressed and :attr:`PublishResult.dedup_skipped`
    is True. After a successful fresh publish, the marker is written
    (or refreshed) with an incremented ``publish_count``. Use cases:

    - **One email per cost anomaly** even when ``evaluate.py`` runs
      multiple times for the same date — pass a deterministic
      ``dedup_key`` derived from the anomaly inputs.
    - **One alert per Lambda canary rollback episode** even when 8
      Lambda repos cascade-fail from one shared lib regression — pass
      ``dedup_key=f"canary-rollback-{lib_pin_sha}"`` so the cascading
      deploys all collapse to one operator email.
    - **Once-per-hour throttling** on noisy WARN paths — pass any
      stable key + leave the default 60min window.

    Dedup is best-effort: any error during the check — including an
    unexpected exception from the dedup machinery itself — falls through
    to publish (better an extra alert than a silent drop). Marker write
    failure after a successful publish is logged but does NOT propagate
    (worst case is one duplicate next call within window).

    :param message: The alert body. Severity tag + source prefix are
        prepended automatically (e.g. ``"[ERROR] spot_backtest.sh: <body>"``).
    :param severity: Free-form severity string (``error`` / ``critical``
        push on Telegram; everything else is silent in-channel). The tag
        is uppercased in the rendered message.
    :param source: Optional source identifier (script path, repo, Lambda
        name) inserted between the tag and the message body.
    :param sns: When ``False``, skip the SNS publish entirely.
    :param telegram: When ``False``, skip the Telegram fan-out entirely.
    :param sns_topic_arn: Explicit topic ARN. Defaults to
        ``arn:aws:sns:{region}:{account_id}:alpha-engine-alerts`` resolved
        from env + STS.
    :param dedup_key: Opaque caller-chosen string. Same key + same
        window ⇒ at most one publish per window. ``None`` (default)
        disables dedup entirely.
    :param dedup_window_min: Window in minutes after which a fresh
        publish is allowed for the same ``dedup_key``. Default ``60``.
        Pass ``None`` for "forever".
    :param dedup_bucket: S3 bucket holding the markers. Defaults to
        ``alpha-engine-research``.
    :returns: :class:`PublishResult` — caller can inspect per-channel
        outcomes. :attr:`PublishResult.any_ok` is the typical success
        gate; :attr:`PublishResult.all_ok` is the strict variant.
        On dedup-skip, :attr:`PublishResult.dedup_skipped` is True and
        :attr:`PublishResult.dedup_reason` explains why.
    """
    result = PublishResult()
    formatted = _format_message(message, severity, source)

    # ── Dedup check (pre-publish) ────────────────────────────────────────
    marker_key: str | None = None
    bucket = dedup_bucket or DEFAULT_DEDUP_BUCKET
    if dedup_key:
        within_window, reason = False, ""
        try:
            marker_key = _dedup_marker_key(dedup_key)
            within_window, reason = _check_dedup_marker(
                bucket, marker_key, dedup_window_min=dedup_window_min,
            )
        except Exception as exc:
            # The caller is already on a failure path: a dedup problem must
            # never stop the alert from going out, so proceed to publish.
            logger.warning(
                "alerts.publish: dedup check raised (fail-safe to publish): %s",
                exc,
            )
        if within_window:
            result.dedup_skipped = True
            result.dedup_reason = reason
            result.sns = ChannelResult(ok=False, detail="suppressed by dedup")
            result.telegram = ChannelResult(ok=False, detail="suppressed by dedup")
            logger.info(
                "alerts.publish: skipped publish for dedup_key=%r (%s)",
                dedup_key, reason,
            )
            return result

    # ── Publish ──────────────────────────────────────────────────────────
    if sns:
        arn = _resolve_sns_topic_arn(sns_topic_arn)
        if arn is None:
            result.sns = ChannelResult(ok=False, detail="topic ARN resolution failed")
        else:
            # SNS subject — concise header: severity tag, plus source if given.
            subject = f"Alpha Engine alert [{severity.upper()}]"
            if source:
                subject += f" — {source}"
            result.sns = _publish_sns(arn, formatted, subject=subject)

    if telegram:
        result.telegram = _publish_telegram(formatted, severity=severity)

    # ── Dedup marker write (post-publish, only if any channel succeeded) ─
    if marker_key and (result.sns.ok or result.telegram.ok):
        try:
            _write_dedup_marker(
                bucket, marker_key,
                dedup_key=dedup_key, formatted_message=formatted,
            )
        except Exception as exc:
            # The alert is already delivered; a marker failure only risks
            # one duplicate later and must not surface to the caller.
            logger.warning(
                "alerts.publish: dedup marker write raised "
                "(best-effort, swallowed): %s",
                exc,
            )

    return result
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
# ─── CLI entry ──────────────────────────────────────────────────────────────
|
|
452
|
+
# Designed for Bash callers that need failure surveillance from a script
|
|
453
|
+
# (spot dispatcher `cleanup` traps, deploy.sh rollback branches, etc.).
|
|
454
|
+
# Mirrors the :mod:`alpha_engine_lib.transparency` ``python -m`` pattern so
|
|
455
|
+
# Bash callers reach this primitive without bootstrapping a full Python
|
|
456
|
+
# project. Exit code is 0 if *any* channel succeeded, 1 if both failed.
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, publish one alert, return an exit code.

    :param argv: Argument list to parse; ``None`` lets argparse read
        ``sys.argv[1:]``.
    :returns: ``0`` when :attr:`PublishResult.any_ok` is true (at least one
        channel delivered, or the alert was suppressed by dedup), else ``1``.

    A one-line status summary is written to stderr. ``--dedup-window-min 0``
    is translated to ``None`` ("forever") for the Python API.
    """
    import argparse

    parser = argparse.ArgumentParser(
        prog="python -m alpha_engine_lib.alerts",
        description=(
            "Publish a failure alert to alpha-engine's operator-surveillance "
            "channels (SNS topic alpha-engine-alerts + Telegram). Designed "
            "for Bash callers — exit code 0 if any channel succeeded, 1 if "
            "both failed. Never raises."
        ),
    )
    subparsers = parser.add_subparsers(dest="cmd", required=True)

    pub = subparsers.add_parser("publish", help="Publish an alert message.")
    pub.add_argument("--message", required=True, help="Alert body text.")
    pub.add_argument(
        "--severity",
        default="error",
        help=(
            "Severity tag (default: error). 'error' and 'critical' push on "
            "Telegram; all others are silent in-channel."
        ),
    )
    pub.add_argument(
        "--source",
        default=None,
        help=(
            "Optional source identifier (script path, repo, Lambda name) "
            "rendered between the severity tag and the message body."
        ),
    )
    pub.add_argument("--no-sns", action="store_true", help="Skip SNS publish.")
    pub.add_argument("--no-telegram", action="store_true", help="Skip Telegram fan-out.")
    pub.add_argument(
        "--sns-topic-arn",
        default=None,
        help=(
            "Override the SNS topic ARN. Defaults to "
            "arn:aws:sns:{region}:{account_id}:alpha-engine-alerts."
        ),
    )
    pub.add_argument(
        "--dedup-key",
        default=None,
        # argparse %-formats help strings, so the literal percent signs of
        # the ``date`` format below must be doubled; unescaped they make
        # ``publish --help`` fail with "unsupported format character".
        help=(
            "Optional opaque dedup key. When set, ``publish`` checks an "
            "S3 marker first and suppresses the alert if an earlier "
            "publish for the same key is within --dedup-window-min. "
            "Use for cost anomalies / canary rollback episodes / any "
            "noisy WARN path that benefits from rate-limiting. Bash "
            "callers typically pass a bucketed timestamp, e.g. "
            "--dedup-key \"canary-rollback-$(date -u +%%Y%%m%%d%%H)\"."
        ),
    )
    pub.add_argument(
        "--dedup-window-min",
        type=int,
        default=DEFAULT_DEDUP_WINDOW_MIN,
        help=(
            "Window in minutes after which a fresh publish is allowed for "
            f"the same --dedup-key (default: {DEFAULT_DEDUP_WINDOW_MIN}). "
            "Pass 0 for 'forever' (publish once per --dedup-key for the "
            "lifetime of the marker bucket)."
        ),
    )
    pub.add_argument(
        "--dedup-bucket",
        default=None,
        help=(
            "S3 bucket holding the dedup markers. Defaults to "
            f"{DEFAULT_DEDUP_BUCKET!r}."
        ),
    )

    args = parser.parse_args(argv)

    logging.basicConfig(level=logging.WARNING)

    # CLI convention: --dedup-window-min 0 = forever; map to None for the
    # Python API (whose default is 60 + None=forever).
    window_min: int | None
    if args.dedup_window_min == 0:
        window_min = None
    else:
        window_min = args.dedup_window_min

    result = publish(
        args.message,
        severity=args.severity,
        source=args.source,
        sns=not args.no_sns,
        telegram=not args.no_telegram,
        sns_topic_arn=args.sns_topic_arn,
        dedup_key=args.dedup_key,
        dedup_window_min=window_min,
        dedup_bucket=args.dedup_bucket,
    )

    # One-line status to stderr (stdout reserved for structured output if
    # any caller starts parsing it). Bash callers can ignore.
    if result.dedup_skipped:
        print(
            f"alerts.publish: dedup_skipped=True ({result.dedup_reason})",
            file=sys.stderr,
        )
    else:
        print(
            f"alerts.publish: sns.ok={result.sns.ok} ({result.sns.detail}); "
            f"telegram.ok={result.telegram.ok} ({result.telegram.detail})",
            file=sys.stderr,
        )

    return 0 if result.any_ok else 1
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
# Script entry: run the CLI and hand its return value to the interpreter
# as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|