spanforge 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spanforge/__init__.py +815 -0
- spanforge/_ansi.py +93 -0
- spanforge/_batch_exporter.py +409 -0
- spanforge/_cli.py +2094 -0
- spanforge/_cli_audit.py +639 -0
- spanforge/_cli_compliance.py +711 -0
- spanforge/_cli_cost.py +243 -0
- spanforge/_cli_ops.py +791 -0
- spanforge/_cli_phase11.py +356 -0
- spanforge/_hooks.py +337 -0
- spanforge/_server.py +1708 -0
- spanforge/_span.py +1036 -0
- spanforge/_store.py +288 -0
- spanforge/_stream.py +664 -0
- spanforge/_trace.py +335 -0
- spanforge/_tracer.py +254 -0
- spanforge/actor.py +141 -0
- spanforge/alerts.py +469 -0
- spanforge/auto.py +464 -0
- spanforge/baseline.py +335 -0
- spanforge/cache.py +635 -0
- spanforge/compliance.py +325 -0
- spanforge/config.py +532 -0
- spanforge/consent.py +228 -0
- spanforge/consumer.py +377 -0
- spanforge/core/__init__.py +5 -0
- spanforge/core/compliance_mapping.py +1254 -0
- spanforge/cost.py +600 -0
- spanforge/debug.py +548 -0
- spanforge/deprecations.py +205 -0
- spanforge/drift.py +482 -0
- spanforge/egress.py +58 -0
- spanforge/eval.py +648 -0
- spanforge/event.py +1064 -0
- spanforge/exceptions.py +240 -0
- spanforge/explain.py +178 -0
- spanforge/export/__init__.py +69 -0
- spanforge/export/append_only.py +337 -0
- spanforge/export/cloud.py +357 -0
- spanforge/export/datadog.py +497 -0
- spanforge/export/grafana.py +320 -0
- spanforge/export/jsonl.py +195 -0
- spanforge/export/openinference.py +158 -0
- spanforge/export/otel_bridge.py +294 -0
- spanforge/export/otlp.py +811 -0
- spanforge/export/otlp_bridge.py +233 -0
- spanforge/export/redis_backend.py +282 -0
- spanforge/export/siem_schema.py +98 -0
- spanforge/export/siem_splunk.py +264 -0
- spanforge/export/siem_syslog.py +212 -0
- spanforge/export/webhook.py +299 -0
- spanforge/exporters/__init__.py +30 -0
- spanforge/exporters/console.py +271 -0
- spanforge/exporters/jsonl.py +144 -0
- spanforge/exporters/sqlite.py +142 -0
- spanforge/gate.py +1150 -0
- spanforge/governance.py +181 -0
- spanforge/hitl.py +295 -0
- spanforge/http.py +187 -0
- spanforge/inspect.py +427 -0
- spanforge/integrations/__init__.py +45 -0
- spanforge/integrations/_pricing.py +280 -0
- spanforge/integrations/anthropic.py +388 -0
- spanforge/integrations/azure_openai.py +133 -0
- spanforge/integrations/bedrock.py +292 -0
- spanforge/integrations/crewai.py +251 -0
- spanforge/integrations/gemini.py +351 -0
- spanforge/integrations/groq.py +442 -0
- spanforge/integrations/langchain.py +349 -0
- spanforge/integrations/langgraph.py +306 -0
- spanforge/integrations/llamaindex.py +373 -0
- spanforge/integrations/ollama.py +287 -0
- spanforge/integrations/openai.py +368 -0
- spanforge/integrations/together.py +483 -0
- spanforge/io.py +214 -0
- spanforge/lint.py +322 -0
- spanforge/metrics.py +417 -0
- spanforge/metrics_export.py +343 -0
- spanforge/migrate.py +402 -0
- spanforge/model_registry.py +278 -0
- spanforge/models.py +389 -0
- spanforge/namespaces/__init__.py +254 -0
- spanforge/namespaces/audit.py +256 -0
- spanforge/namespaces/cache.py +237 -0
- spanforge/namespaces/chain.py +77 -0
- spanforge/namespaces/confidence.py +72 -0
- spanforge/namespaces/consent.py +92 -0
- spanforge/namespaces/cost.py +179 -0
- spanforge/namespaces/decision.py +143 -0
- spanforge/namespaces/diff.py +157 -0
- spanforge/namespaces/drift.py +80 -0
- spanforge/namespaces/eval_.py +251 -0
- spanforge/namespaces/feedback.py +241 -0
- spanforge/namespaces/fence.py +193 -0
- spanforge/namespaces/guard.py +105 -0
- spanforge/namespaces/hitl.py +91 -0
- spanforge/namespaces/latency.py +72 -0
- spanforge/namespaces/prompt.py +190 -0
- spanforge/namespaces/redact.py +173 -0
- spanforge/namespaces/retrieval.py +379 -0
- spanforge/namespaces/runtime_governance.py +494 -0
- spanforge/namespaces/template.py +208 -0
- spanforge/namespaces/tool_call.py +77 -0
- spanforge/namespaces/trace.py +1029 -0
- spanforge/normalizer.py +171 -0
- spanforge/plugins.py +82 -0
- spanforge/presidio_backend.py +349 -0
- spanforge/processor.py +258 -0
- spanforge/prompt_registry.py +418 -0
- spanforge/py.typed +0 -0
- spanforge/redact.py +914 -0
- spanforge/regression.py +192 -0
- spanforge/runtime_policy.py +159 -0
- spanforge/sampling.py +511 -0
- spanforge/schema.py +183 -0
- spanforge/schemas/v1.0/schema.json +170 -0
- spanforge/schemas/v2.0/schema.json +536 -0
- spanforge/sdk/__init__.py +625 -0
- spanforge/sdk/_base.py +584 -0
- spanforge/sdk/_base.pyi +71 -0
- spanforge/sdk/_exceptions.py +1096 -0
- spanforge/sdk/_types.py +2184 -0
- spanforge/sdk/alert.py +1514 -0
- spanforge/sdk/alert.pyi +56 -0
- spanforge/sdk/audit.py +1196 -0
- spanforge/sdk/audit.pyi +67 -0
- spanforge/sdk/cec.py +1215 -0
- spanforge/sdk/cec.pyi +37 -0
- spanforge/sdk/config.py +641 -0
- spanforge/sdk/config.pyi +55 -0
- spanforge/sdk/enterprise.py +714 -0
- spanforge/sdk/enterprise.pyi +79 -0
- spanforge/sdk/explain.py +170 -0
- spanforge/sdk/fallback.py +432 -0
- spanforge/sdk/feedback.py +351 -0
- spanforge/sdk/gate.py +874 -0
- spanforge/sdk/gate.pyi +51 -0
- spanforge/sdk/identity.py +2114 -0
- spanforge/sdk/identity.pyi +47 -0
- spanforge/sdk/lineage.py +175 -0
- spanforge/sdk/observe.py +1065 -0
- spanforge/sdk/observe.pyi +50 -0
- spanforge/sdk/operator.py +338 -0
- spanforge/sdk/pii.py +1473 -0
- spanforge/sdk/pii.pyi +119 -0
- spanforge/sdk/pipelines.py +458 -0
- spanforge/sdk/pipelines.pyi +39 -0
- spanforge/sdk/policy.py +930 -0
- spanforge/sdk/rag.py +594 -0
- spanforge/sdk/rbac.py +280 -0
- spanforge/sdk/registry.py +430 -0
- spanforge/sdk/registry.pyi +46 -0
- spanforge/sdk/scope.py +279 -0
- spanforge/sdk/secrets.py +293 -0
- spanforge/sdk/secrets.pyi +25 -0
- spanforge/sdk/security.py +560 -0
- spanforge/sdk/security.pyi +57 -0
- spanforge/sdk/trust.py +472 -0
- spanforge/sdk/trust.pyi +41 -0
- spanforge/secrets.py +799 -0
- spanforge/signing.py +1179 -0
- spanforge/stats.py +100 -0
- spanforge/stream.py +560 -0
- spanforge/testing.py +378 -0
- spanforge/testing_mocks.py +1052 -0
- spanforge/trace.py +199 -0
- spanforge/types.py +696 -0
- spanforge/ulid.py +300 -0
- spanforge/validate.py +379 -0
- spanforge-1.0.0.dist-info/METADATA +1509 -0
- spanforge-1.0.0.dist-info/RECORD +174 -0
- spanforge-1.0.0.dist-info/WHEEL +4 -0
- spanforge-1.0.0.dist-info/entry_points.txt +5 -0
- spanforge-1.0.0.dist-info/licenses/LICENSE +128 -0
spanforge/sdk/alert.py
ADDED
|
@@ -0,0 +1,1514 @@
|
|
|
1
|
+
"""spanforge.sdk.alert — SpanForge sf-alert Alert Routing Service (Phase 7).
|
|
2
|
+
|
|
3
|
+
Implements the full sf-alert API surface: topic-based publish, per-sink circuit
|
|
4
|
+
breakers, 5-minute deduplication, per-project rate limiting, alert grouping,
|
|
5
|
+
CRITICAL escalation policy, maintenance-window suppression, webhook HMAC signing,
|
|
6
|
+
and integrations with Slack, Teams, PagerDuty, OpsGenie, VictorOps, Incident.io,
|
|
7
|
+
SMS (Twilio), and generic HMAC-signed webhooks.
|
|
8
|
+
|
|
9
|
+
Architecture
|
|
10
|
+
------------
|
|
11
|
+
* :meth:`publish` is the **primary entry point**. It validates the topic,
|
|
12
|
+
checks maintenance windows, deduplicates by ``(topic, project_id)``, applies
|
|
13
|
+
per-project rate limits, enqueues the alert, and returns a
|
|
14
|
+
:class:`~spanforge.sdk._types.PublishResult` immediately.
|
|
15
|
+
* A **background worker thread** drains the queue and dispatches to each
|
|
16
|
+
configured sink through its own :class:`~spanforge.sdk._base._CircuitBreaker`.
|
|
17
|
+
* **CRITICAL alerts** schedule a :class:`threading.Timer` for auto-escalation
|
|
18
|
+
after ``escalation_wait_seconds`` (default: 900 s = 15 min).
|
|
19
|
+
:meth:`acknowledge` cancels the timer.
|
|
20
|
+
* All alert emissions are appended to ``sf-audit`` schema ``spanforge.alert.v1``
|
|
21
|
+
on a best-effort basis (failures are logged at DEBUG level).
|
|
22
|
+
|
|
23
|
+
Topic registry (ALT-002, ALT-003)
|
|
24
|
+
-----------------------------------
|
|
25
|
+
Eight built-in topics match HallucCheck's published event taxonomy. Additional
|
|
26
|
+
topics can be registered with :meth:`register_topic`. Publishing to an unknown
|
|
27
|
+
topic logs a WARNING and routes to the catch-all sink list if configured.
|
|
28
|
+
|
|
29
|
+
Deduplication (ALT-010)
|
|
30
|
+
------------------------
|
|
31
|
+
The same ``(topic, project_id)`` pair is suppressed for ``dedup_window_seconds``
|
|
32
|
+
(default: 300 s). Per-topic windows override the client default.
|
|
33
|
+
|
|
34
|
+
Alert grouping (ALT-011)
|
|
35
|
+
--------------------------
|
|
36
|
+
Multiple alerts sharing the same ``(topic_prefix, project_id)`` within a 2-minute
|
|
37
|
+
window are coalesced into a single notification. The group is flushed when the
|
|
38
|
+
timer fires or when the window elapses.
|
|
39
|
+
|
|
40
|
+
Escalation policy (ALT-020, ALT-021)
|
|
41
|
+
--------------------------------------
|
|
42
|
+
CRITICAL severity alerts schedule an escalation timer. When the timer fires the
|
|
43
|
+
alert is re-dispatched to the escalation sink list. Call :meth:`acknowledge` to
|
|
44
|
+
cancel the timer.
|
|
45
|
+
|
|
46
|
+
Sink integrations
|
|
47
|
+
-----------------
|
|
48
|
+
All sinks live in this module:
|
|
49
|
+
|
|
50
|
+
* :class:`WebhookAlerter` — generic HMAC-signed webhook (ALT-034)
|
|
51
|
+
* :class:`OpsGenieAlerter` — OpsGenie Alert API v2 (ALT-030)
|
|
52
|
+
* :class:`VictorOpsAlerter` — VictorOps / Splunk On-Call (ALT-031)
|
|
53
|
+
* :class:`IncidentIOAlerter` — Incident.io (ALT-032)
|
|
54
|
+
* :class:`SMSAlerter` — Twilio SMS (ALT-033)
|
|
55
|
+
* :class:`TeamsAdaptiveCardAlerter` — enhanced Teams Adaptive Card (ALT-035)
|
|
56
|
+
|
|
57
|
+
The existing ``spanforge.alerts`` sinks (Slack, Teams, PagerDuty, Email) are
|
|
58
|
+
re-exported here for convenience.
|
|
59
|
+
|
|
60
|
+
Security requirements
|
|
61
|
+
---------------------
|
|
62
|
+
* Webhook HMAC secrets are never logged.  :class:`WebhookAlerter` signs each
|
|
63
|
+
request body with HMAC-SHA256 (receivers verify via :func:`hmac.compare_digest`) and sets the
|
|
64
|
+
``X-SF-Signature: sha256=<hex>`` header.
|
|
65
|
+
* PagerDuty and OpsGenie integration keys are stored in ``repr=False`` fields.
|
|
66
|
+
* All remote URLs are validated with :func:`_validate_http_url` (same guard used
|
|
67
|
+
in ``observe.py``) before each request.
|
|
68
|
+
* The audit log appended to sf-audit uses ``best_effort=True``; any failure is
|
|
69
|
+
swallowed at DEBUG level so alerting itself is never blocked by audit issues.
|
|
70
|
+
"""
|
|
71
|
+
|
|
72
|
+
from __future__ import annotations
|
|
73
|
+
|
|
74
|
+
import hashlib
|
|
75
|
+
import hmac as _hmac
|
|
76
|
+
import ipaddress
|
|
77
|
+
import json
|
|
78
|
+
import logging
|
|
79
|
+
import os
|
|
80
|
+
import queue
|
|
81
|
+
import threading
|
|
82
|
+
import time
|
|
83
|
+
import urllib.error
|
|
84
|
+
import urllib.parse
|
|
85
|
+
import urllib.request
|
|
86
|
+
import uuid
|
|
87
|
+
from dataclasses import dataclass, field
|
|
88
|
+
from datetime import datetime, timezone
|
|
89
|
+
from typing import Any, Union
|
|
90
|
+
|
|
91
|
+
from spanforge.sdk._base import (
|
|
92
|
+
SFClientConfig,
|
|
93
|
+
SFServiceClient,
|
|
94
|
+
_CircuitBreaker,
|
|
95
|
+
_SlidingWindowRateLimiter,
|
|
96
|
+
)
|
|
97
|
+
from spanforge.sdk._exceptions import (
|
|
98
|
+
SFAlertRateLimitedError,
|
|
99
|
+
)
|
|
100
|
+
from spanforge.sdk._types import (
|
|
101
|
+
AlertRecord,
|
|
102
|
+
AlertStatusInfo,
|
|
103
|
+
MaintenanceWindow,
|
|
104
|
+
PublishResult,
|
|
105
|
+
TopicRegistration,
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
__all__ = [
|
|
109
|
+
"KNOWN_TOPICS",
|
|
110
|
+
"IncidentIOAlerter",
|
|
111
|
+
"OpsGenieAlerter",
|
|
112
|
+
"SFAlertClient",
|
|
113
|
+
"SMSAlerter",
|
|
114
|
+
"TeamsAdaptiveCardAlerter",
|
|
115
|
+
"VictorOpsAlerter",
|
|
116
|
+
"WebhookAlerter",
|
|
117
|
+
]
|
|
118
|
+
|
|
119
|
+
_log = logging.getLogger(__name__)
|
|
120
|
+
|
|
121
|
+
# ---------------------------------------------------------------------------
|
|
122
|
+
# Constants
|
|
123
|
+
# ---------------------------------------------------------------------------
|
|
124
|
+
|
|
125
|
+
#: The eight HallucCheck-defined topics wired at design time.
|
|
126
|
+
KNOWN_TOPICS: frozenset[str] = frozenset(
|
|
127
|
+
{
|
|
128
|
+
"halluccheck.drift.amber",
|
|
129
|
+
"halluccheck.drift.red",
|
|
130
|
+
"halluccheck.bias.critical",
|
|
131
|
+
"halluccheck.prri.red",
|
|
132
|
+
"halluccheck.benchmark.regression",
|
|
133
|
+
"halluccheck.pii.detected",
|
|
134
|
+
"halluccheck.secrets.detected",
|
|
135
|
+
"halluccheck.trust_gate.failed",
|
|
136
|
+
},
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
_DEDUP_WINDOW_DEFAULT: float = 300.0 # 5 min
|
|
140
|
+
_GROUP_WINDOW_SECS: float = 120.0 # 2 min
|
|
141
|
+
_ESCALATION_WAIT_DEFAULT: float = 900.0 # 15 min
|
|
142
|
+
_QUEUE_MAX: int = 1_000
|
|
143
|
+
_RATE_LIMIT_PER_MINUTE: int = 60
|
|
144
|
+
_HISTORY_MAX: int = 10_000
|
|
145
|
+
|
|
146
|
+
# Severity ordinal for escalation gating
|
|
147
|
+
_SEVERITY_RANK: dict[str, int] = {"info": 0, "warning": 1, "high": 2, "critical": 3}
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
# ---------------------------------------------------------------------------
|
|
151
|
+
# URL validation (SSRF guard)
|
|
152
|
+
# ---------------------------------------------------------------------------
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _validate_http_url(url: str) -> None:
|
|
156
|
+
"""Raise :exc:`ValueError` if *url* is not a safe HTTP/HTTPS URL.
|
|
157
|
+
|
|
158
|
+
Rejects:
|
|
159
|
+
* Non-HTTP/HTTPS schemes.
|
|
160
|
+
* Private/loopback IP targets (unless ``SPANFORGE_ALLOW_LOOPBACK=1``).
|
|
161
|
+
* Overly long URLs (> 2 048 chars).
|
|
162
|
+
"""
|
|
163
|
+
if len(url) > 2048:
|
|
164
|
+
raise ValueError(f"URL too long: {len(url)} chars (max 2048)")
|
|
165
|
+
parsed = urllib.parse.urlparse(url)
|
|
166
|
+
if parsed.scheme not in ("http", "https"):
|
|
167
|
+
raise ValueError(f"Unsupported scheme {parsed.scheme!r}; only http/https allowed")
|
|
168
|
+
hostname = parsed.hostname or ""
|
|
169
|
+
if os.environ.get("SPANFORGE_ALLOW_LOOPBACK", "").lower() not in ("1", "true", "yes"):
|
|
170
|
+
try:
|
|
171
|
+
addr = ipaddress.ip_address(hostname)
|
|
172
|
+
if addr.is_private or addr.is_loopback or addr.is_link_local:
|
|
173
|
+
raise ValueError(f"Destination IP {hostname!r} is private/loopback (SSRF guard)")
|
|
174
|
+
except ValueError as exc:
|
|
175
|
+
if "SSRF" in str(exc):
|
|
176
|
+
raise
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
# ---------------------------------------------------------------------------
|
|
180
|
+
# Sink implementations (Phase 7 additions)
|
|
181
|
+
# ---------------------------------------------------------------------------
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
@dataclass
class WebhookAlerter:
    """Generic HMAC-signed webhook sink (ALT-034).

    Sends a JSON POST with an ``X-SF-Signature: sha256=<hmac>`` header.  The
    HMAC-SHA256 digest is computed over the exact UTF-8 request body using
    the configured *secret*; receivers should recompute it and compare with
    :func:`hmac.compare_digest`.

    Args:
        url:     Webhook endpoint URL.
        secret:  HMAC signing secret.  Excluded from ``repr`` and never logged.
        timeout: HTTP timeout in seconds.
    """

    url: str
    secret: str = field(repr=False, default="")
    timeout: int = 10

    def _encode_body(
        self,
        title: str,
        message: str,
        severity: str,
        extra: dict[str, Any] | None,
    ) -> bytes:
        """Return the UTF-8 JSON request body for one alert.

        Keys in *extra* are merged on top of ``title`` / ``message`` /
        ``severity``.  Values that are not JSON-native (datetimes, UUIDs, ...)
        are rendered with :func:`str` rather than raising ``TypeError``, in
        line with the other sinks in this module that stringify payload
        values.
        """
        body: dict[str, Any] = {"title": title, "message": message, "severity": severity}
        if extra:
            body.update(extra)
        return json.dumps(body, default=str).encode()

    def _signature(self, data: bytes) -> str:
        """Return the hex HMAC-SHA256 of *data* keyed with ``self.secret``."""
        return _hmac.new(self.secret.encode(), data, hashlib.sha256).hexdigest()

    def send(
        self,
        title: str,
        message: str,
        severity: str = "warning",
        extra: dict[str, Any] | None = None,
    ) -> None:
        """POST alert JSON with HMAC signature.

        Args:
            title:    Alert title.
            message:  Alert body text.
            severity: Severity label, forwarded verbatim.
            extra:    Optional additional key/value pairs merged into the body.

        Raises:
            ValueError: If ``self.url`` is rejected by ``_validate_http_url``.

        Network failures (:class:`urllib.error.URLError`, including HTTP error
        statuses) are logged at WARNING level and not re-raised.
        """
        _validate_http_url(self.url)
        data = self._encode_body(title, message, severity, extra)
        req = urllib.request.Request(
            self.url,
            data=data,
            headers={
                "Content-Type": "application/json",
                # The signature covers exactly the bytes sent as the body.
                "X-SF-Signature": f"sha256={self._signature(data)}",
            },
            method="POST",
        )
        try:
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:  # nosec B310
                if resp.status not in (200, 201, 202, 204):
                    _log.warning("WebhookAlerter: unexpected status %s", resp.status)
        except urllib.error.URLError as exc:
            _log.warning("WebhookAlerter: request failed: %s", exc)
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
@dataclass
class OpsGenieAlerter:
    """OpsGenie Alert API v2 sink (ALT-030).

    Args:
        api_key:  OpsGenie API key.  Excluded from ``repr`` and never logged.
        region:   ``"us"`` (default) or ``"eu"``; matched case-insensitively.
        timeout:  HTTP timeout in seconds.
    """

    api_key: str = field(repr=False)
    region: str = "us"
    timeout: int = 10

    # SpanForge severity -> OpsGenie priority.
    _PRIORITY_MAP: dict[str, str] = field(
        init=False,
        repr=False,
        default_factory=lambda: {
            "info": "P5",
            "warning": "P3",
            "high": "P2",
            "critical": "P1",
        },
    )

    # OpsGenie "Create Alert" field-length limits.  Plain (un-annotated) class
    # attributes, so they are not dataclass fields.
    _MESSAGE_MAX_CHARS = 130
    _DESCRIPTION_MAX_CHARS = 15_000

    def _url(self) -> str:
        """Return the alerts endpoint for the configured region.

        Any value other than ``"eu"`` (ignoring case and surrounding
        whitespace) selects the default US endpoint.
        """
        if self.region.strip().lower() == "eu":
            return "https://api.eu.opsgenie.com/v2/alerts"
        return "https://api.opsgenie.com/v2/alerts"

    def _build_payload(
        self,
        title: str,
        message: str,
        severity: str,
        extra: dict[str, Any] | None,
    ) -> dict[str, Any]:
        """Assemble the JSON-serialisable request body for one alert.

        ``message`` and ``description`` are clamped to the API's documented
        maximum lengths so an over-long title does not get the whole alert
        rejected.  Unknown severities map to priority ``P3``.
        """
        payload: dict[str, Any] = {
            "message": title[: self._MESSAGE_MAX_CHARS],
            "description": message[: self._DESCRIPTION_MAX_CHARS],
            "priority": self._PRIORITY_MAP.get(severity.lower(), "P3"),
            "tags": [f"severity:{severity}", "spanforge"],
        }
        if extra:
            # OpsGenie "details" is a string-to-string map.
            payload["details"] = {str(k): str(v) for k, v in extra.items()}
        return payload

    def send(
        self,
        title: str,
        message: str,
        severity: str = "warning",
        extra: dict[str, Any] | None = None,
    ) -> None:
        """Create an OpsGenie alert.

        Args:
            title:    Sent as the alert ``message``.
            message:  Sent as the alert ``description``.
            severity: Mapped to an OpsGenie priority (``P1``..``P5``).
            extra:    Optional key/value pairs sent as ``details``.

        Raises:
            ValueError: If the endpoint URL is rejected by ``_validate_http_url``.

        Network failures (:class:`urllib.error.URLError`, including HTTP error
        statuses) are logged at WARNING level and not re-raised.
        """
        url = self._url()
        _validate_http_url(url)
        data = json.dumps(self._build_payload(title, message, severity, extra)).encode()
        req = urllib.request.Request(
            url,
            data=data,
            headers={
                "Content-Type": "application/json",
                "Authorization": f"GenieKey {self.api_key}",
            },
            method="POST",
        )
        try:
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:  # nosec B310
                if resp.status not in (200, 201, 202):
                    _log.warning("OpsGenieAlerter: unexpected status %s", resp.status)
        except urllib.error.URLError as exc:
            _log.warning("OpsGenieAlerter: request failed: %s", exc)
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
@dataclass
class VictorOpsAlerter:
    """VictorOps / Splunk On-Call sink (ALT-031).

    Args:
        rest_endpoint_url: VictorOps REST endpoint URL including routing key.
        timeout:           HTTP timeout in seconds.
    """

    rest_endpoint_url: str
    timeout: int = 10

    # SpanForge severity -> VictorOps ``message_type``.
    _MSG_TYPE_MAP: dict[str, str] = field(
        init=False,
        repr=False,
        default_factory=lambda: {
            "info": "INFO",
            "warning": "WARNING",
            "high": "CRITICAL",
            "critical": "CRITICAL",
        },
    )

    def _encode_body(
        self,
        title: str,
        message: str,
        severity: str,
        extra: dict[str, Any] | None,
    ) -> bytes:
        """Return the UTF-8 JSON request body for one alert.

        Unknown severities map to ``"WARNING"``.  Keys in *extra* are merged
        on top of the three core fields.  Values that are not JSON-native are
        rendered with :func:`str` instead of raising ``TypeError``, in line
        with the other sinks in this module that stringify payload values.
        """
        payload: dict[str, Any] = {
            "message_type": self._MSG_TYPE_MAP.get(severity.lower(), "WARNING"),
            "entity_display_name": title,
            "state_message": message,
        }
        if extra:
            payload.update(extra)
        return json.dumps(payload, default=str).encode()

    def send(
        self,
        title: str,
        message: str,
        severity: str = "warning",
        extra: dict[str, Any] | None = None,
    ) -> None:
        """POST to VictorOps REST endpoint.

        Args:
            title:    Sent as ``entity_display_name``.
            message:  Sent as ``state_message``.
            severity: Mapped to a VictorOps ``message_type``.
            extra:    Optional additional key/value pairs merged into the body.

        Raises:
            ValueError: If ``rest_endpoint_url`` is rejected by
                ``_validate_http_url``.

        Network failures (:class:`urllib.error.URLError`, including HTTP error
        statuses) are logged at WARNING level and not re-raised.
        """
        _validate_http_url(self.rest_endpoint_url)
        req = urllib.request.Request(
            self.rest_endpoint_url,
            data=self._encode_body(title, message, severity, extra),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        try:
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:  # nosec B310
                if resp.status not in (200, 201, 202):
                    _log.warning("VictorOpsAlerter: unexpected status %s", resp.status)
        except urllib.error.URLError as exc:
            _log.warning("VictorOpsAlerter: request failed: %s", exc)
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
@dataclass
class IncidentIOAlerter:
    """Incident.io sink (ALT-032).

    Posts a new incident to the Incident.io REST endpoint held in ``_URL``.

    Args:
        api_key: Incident.io API key, sent as a Bearer token.  Excluded from
                 ``repr`` and never logged.
        timeout: HTTP timeout in seconds.
    """

    api_key: str = field(repr=False)
    timeout: int = 10

    # SpanForge severity -> Incident.io severity name.
    _SEVERITY_MAP: dict[str, str] = field(
        init=False,
        repr=False,
        default_factory=lambda: {
            "info": "minor",
            "warning": "major",
            "high": "major",
            "critical": "critical",
        },
    )

    _URL: str = "https://api.incident.io/v1/incidents"

    def send(
        self,
        title: str,
        message: str,
        severity: str = "warning",
        extra: dict[str, Any] | None = None,
    ) -> None:
        """Create an Incident.io incident.

        *title* becomes the incident ``name`` and *message* its ``summary``.
        Unknown severities fall back to ``"major"``.  Each item of *extra* is
        sent as a text-valued custom field entry.  Request failures are logged
        at WARNING level rather than raised.
        """
        endpoint = self._URL
        _validate_http_url(endpoint)
        incident: dict[str, Any] = {
            "name": title,
            "summary": message,
            "severity": {"name": self._SEVERITY_MAP.get(severity.lower(), "major")},
            "visibility": "public",
        }
        if extra:
            entries = []
            for key, value in extra.items():
                entries.append(
                    {
                        "custom_field": {"name": str(key)},
                        "values": [{"value_text": str(value)}],
                    },
                )
            incident["custom_field_entries"] = entries
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }
        request = urllib.request.Request(
            endpoint,
            data=json.dumps(incident).encode(),
            headers=headers,
            method="POST",
        )
        try:
            with urllib.request.urlopen(request, timeout=self.timeout) as response:  # nosec B310
                status = response.status
                if status not in (200, 201, 202):
                    _log.warning("IncidentIOAlerter: unexpected status %s", status)
        except urllib.error.URLError as err:
            _log.warning("IncidentIOAlerter: request failed: %s", err)
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
@dataclass
class SMSAlerter:
    """Twilio SMS sink (ALT-033).  Enterprise tier only.

    Sends a 160-character-limited SMS via the Twilio REST API, one request
    per recipient.

    Args:
        account_sid:  Twilio Account SID.
        auth_token:   Twilio Auth Token.  Excluded from ``repr`` and never logged.
        from_number:  Twilio phone number (E.164 format, e.g. ``"+15005550006"``).
        to_numbers:   List of recipient phone numbers (E.164 format).
        timeout:      HTTP timeout in seconds.
    """

    account_sid: str
    auth_token: str = field(repr=False)
    from_number: str
    to_numbers: list[str] = field(default_factory=list)
    timeout: int = 10

    def _auth_header(self) -> str:
        """Return the HTTP Basic ``Authorization`` value (``account_sid:auth_token``)."""
        import base64  # local import: not among this module's top-level imports

        cred = f"{self.account_sid}:{self.auth_token}".encode()
        return f"Basic {base64.b64encode(cred).decode()}"

    def send(
        self,
        title: str,
        message: str,
        severity: str = "warning",
        extra: dict[str, Any] | None = None,
    ) -> None:
        """Send SMS to all configured recipients.

        The text is ``"[SEVERITY] title: message"`` truncated to 160
        characters.  *extra* is accepted for interface parity with the other
        sinks and is not used.

        Raises:
            ValueError: If the Twilio URL is rejected by ``_validate_http_url``.

        A failure for one recipient (:class:`urllib.error.URLError`) is logged
        at WARNING level and does not stop delivery to the remaining ones.
        """
        if not self.to_numbers:
            _log.warning("SMSAlerter: no recipients configured, skipping")
            return
        body = f"[{severity.upper()}] {title}: {message}"[:160]
        url = f"https://api.twilio.com/2010-04-01/Accounts/{self.account_sid}/Messages.json"
        _validate_http_url(url)
        # Loop-invariant: the same credentials and headers apply to every recipient.
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
            "Authorization": self._auth_header(),
        }
        for to_number in self.to_numbers:
            form_data = urllib.parse.urlencode(
                {"From": self.from_number, "To": to_number, "Body": body},
            ).encode()
            req = urllib.request.Request(url, data=form_data, headers=headers, method="POST")
            try:
                with urllib.request.urlopen(req, timeout=self.timeout) as resp:  # nosec B310
                    if resp.status not in (200, 201):
                        _log.warning(
                            "SMSAlerter: unexpected status %s for %s",
                            resp.status,
                            to_number,
                        )
            except urllib.error.URLError as exc:
                _log.warning("SMSAlerter: request failed for %s: %s", to_number, exc)
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
@dataclass
class TeamsAdaptiveCardAlerter:
    """Enhanced Microsoft Teams Adaptive Card sink (ALT-035).

    Posts an Adaptive Card made of a severity-coloured title, the message
    text, an optional fact table built from the payload fields, and
    Acknowledge / Silence action buttons.

    Args:
        webhook_url: Teams channel Incoming Webhook URL.
        timeout:     HTTP timeout in seconds.
    """

    webhook_url: str
    timeout: int = 10

    # SpanForge severity -> Adaptive Card text colour.
    _COLOUR_MAP: dict[str, str] = field(
        init=False,
        repr=False,
        default_factory=lambda: {
            "info": "Good",
            "warning": "Warning",
            "high": "Warning",
            "critical": "Attention",
        },
    )

    def send(
        self,
        title: str,
        message: str,
        severity: str = "warning",
        extra: dict[str, Any] | None = None,
    ) -> None:
        """POST an Adaptive Card to the Teams webhook.

        Unknown severities use the ``"Warning"`` colour.  Every item of
        *extra* becomes one stringified row of the card's fact table; the
        table is left out when there are no rows.  Request failures are
        logged at WARNING level rather than raised.
        """
        hook = self.webhook_url
        _validate_http_url(hook)
        band = self._COLOUR_MAP.get(severity.lower(), "Warning")

        fact_rows = []
        for key, value in (extra or {}).items():
            fact_rows.append({"title": str(key), "value": str(value)})

        heading = {
            "type": "TextBlock",
            "text": title,
            "weight": "Bolder",
            "size": "Medium",
            "color": band,
        }
        elements: list[dict[str, Any]] = [
            heading,
            {"type": "TextBlock", "text": message, "wrap": True},
        ]
        if fact_rows:
            elements.append({"type": "FactSet", "facts": fact_rows})

        buttons = [
            {
                "type": "Action.Submit",
                "title": "Acknowledge",
                "data": {"action": "acknowledge"},
            },
            {
                "type": "Action.Submit",
                "title": "Silence",
                "data": {"action": "silence"},
            },
        ]
        card = {
            "$schema": "http://adaptivecards.io/schemas/adaptive-card.json",
            "type": "AdaptiveCard",
            "version": "1.3",
            "body": elements,
            "actions": buttons,
        }
        envelope = {
            "type": "message",
            "attachments": [
                {
                    "contentType": "application/vnd.microsoft.card.adaptive",
                    "content": card,
                },
            ],
        }
        request = urllib.request.Request(
            hook,
            data=json.dumps(envelope).encode(),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        try:
            with urllib.request.urlopen(request, timeout=self.timeout) as response:  # nosec B310
                status = response.status
                if status not in (200, 202):
                    _log.warning("TeamsAdaptiveCardAlerter: unexpected status %s", status)
        except urllib.error.URLError as err:
            _log.warning("TeamsAdaptiveCardAlerter: request failed: %s", err)
|
|
579
|
+
|
|
580
|
+
|
|
581
|
+
# ---------------------------------------------------------------------------
|
|
582
|
+
# Sink wrapper (circuit breaker per sink)
|
|
583
|
+
# ---------------------------------------------------------------------------
|
|
584
|
+
|
|
585
|
+
#: A type alias for any sink that supports a ``send()`` method.
_Alerter = Union[
    WebhookAlerter,
    OpsGenieAlerter,
    VictorOpsAlerter,
    IncidentIOAlerter,
    SMSAlerter,
    TeamsAdaptiveCardAlerter,
    Any,
]


@dataclass
class _SinkWrapper:
    """Wraps a sink instance with its own circuit breaker and a name."""

    alerter: _Alerter
    name: str
    cb: _CircuitBreaker = field(default_factory=_CircuitBreaker)

    @staticmethod
    def _accepts_extra(send: Any) -> bool | None:
        """Report whether *send* can be called with an ``extra=`` keyword.

        Returns ``True`` / ``False`` when the signature can be inspected, or
        ``None`` when it cannot (e.g. some C-implemented callables).
        """
        import inspect  # local import: not among this module's top-level imports

        try:
            params = inspect.signature(send).parameters.values()
        except (TypeError, ValueError):
            return None
        return any(
            param.kind is param.VAR_KEYWORD
            or (param.name == "extra" and param.kind is not param.POSITIONAL_ONLY)
            for param in params
        )

    def _call_send(
        self,
        send: Any,
        title: str,
        message: str,
        severity: str,
        extra: dict[str, Any] | None,
    ) -> None:
        """Invoke *send*, passing ``extra`` only to sinks that accept it.

        The decision is made from the signature rather than by catching
        ``TypeError``: a ``TypeError`` raised *inside* a modern sink (for
        example while serialising the payload) must surface as a dispatch
        failure, not trigger a silent retry that drops ``extra``.
        """
        accepts = self._accepts_extra(send)
        if accepts is None:
            # Signature not introspectable: fall back to probing by call.
            try:
                send(title, message, severity=severity, extra=extra)
            except TypeError:
                send(title, message, severity=severity)
        elif accepts:
            send(title, message, severity=severity, extra=extra)
        else:
            # Older sinks (from alerts.py) don't accept extra kwarg
            send(title, message, severity=severity)

    def dispatch(
        self,
        title: str,
        message: str,
        severity: str,
        extra: dict[str, Any] | None = None,
    ) -> bool:
        """Send alert through the wrapped alerter, updating the circuit breaker.

        Args:
            title:    Alert title.
            message:  Alert body text.
            severity: Severity label forwarded to the sink.
            extra:    Optional payload forwarded to sinks that accept it.

        Returns:
            ``True`` if the alert was sent successfully.  ``False`` if the
            circuit is open, the alerter has no callable ``send``, or the
            sink raised; the latter two are recorded as circuit-breaker
            failures.
        """
        if self.cb.is_open():
            _log.debug("_SinkWrapper[%s]: circuit open, skipping", self.name)
            return False
        send = getattr(self.alerter, "send", None)
        if not callable(send):
            # Nothing was delivered, so this must not count as a success.
            self.cb.record_failure()
            _log.error("_SinkWrapper[%s]: alerter has no callable send()", self.name)
            return False
        try:
            self._call_send(send, title, message, severity, extra)
        except Exception:
            self.cb.record_failure()
            _log.exception("_SinkWrapper[%s]: dispatch error", self.name)
            return False
        self.cb.record_success()
        return True
|
|
634
|
+
|
|
635
|
+
|
|
636
|
+
# ---------------------------------------------------------------------------
|
|
637
|
+
# Queue item
|
|
638
|
+
# ---------------------------------------------------------------------------
|
|
639
|
+
|
|
640
|
+
|
|
641
|
+
@dataclass
|
|
642
|
+
class _QueueItem:
|
|
643
|
+
alert_id: str
|
|
644
|
+
topic: str
|
|
645
|
+
title: str
|
|
646
|
+
message: str
|
|
647
|
+
severity: str
|
|
648
|
+
project_id: str
|
|
649
|
+
payload: dict[str, Any]
|
|
650
|
+
runbook_url: str | None
|
|
651
|
+
is_escalation: bool = False
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
# ---------------------------------------------------------------------------
|
|
655
|
+
# SFAlertClient
|
|
656
|
+
# ---------------------------------------------------------------------------
|
|
657
|
+
|
|
658
|
+
|
|
659
|
+
class SFAlertClient(SFServiceClient):
|
|
660
|
+
"""SpanForge sf-alert Alert Routing Service client.
|
|
661
|
+
|
|
662
|
+
Topic-based publish/subscribe model with deduplication, escalation policy,
|
|
663
|
+
per-sink circuit breakers, per-project rate limiting, and audit logging.
|
|
664
|
+
|
|
665
|
+
All operations are **thread-safe**.
|
|
666
|
+
|
|
667
|
+
Args:
|
|
668
|
+
config: :class:`~spanforge.sdk._base.SFClientConfig` loaded
|
|
669
|
+
from env or via :func:`~spanforge.sdk.configure`.
|
|
670
|
+
sinks: Optional list of sink instances pre-wired at
|
|
671
|
+
construction time. Sinks are also auto-discovered
|
|
672
|
+
from ``SPANFORGE_ALERT_*`` environment variables.
|
|
673
|
+
dedup_window_seconds: Client-wide deduplication window (default: 300 s).
|
|
674
|
+
rate_limit_per_minute: Per-project alert rate limit (default: 60).
|
|
675
|
+
escalation_wait_seconds: Seconds before a CRITICAL alert auto-escalates
|
|
676
|
+
(default: 900 s = 15 min).
|
|
677
|
+
escalation_sinks: Sink names to route escalated alerts to. If
|
|
678
|
+
empty, all sinks are used for escalation.
|
|
679
|
+
|
|
680
|
+
Environment variables
|
|
681
|
+
---------------------
|
|
682
|
+
.. code-block:: text
|
|
683
|
+
|
|
684
|
+
SPANFORGE_ALERT_SLACK_WEBHOOK → SlackAlerter (from spanforge.alerts)
|
|
685
|
+
SPANFORGE_ALERT_TEAMS_WEBHOOK → TeamsAdaptiveCardAlerter
|
|
686
|
+
SPANFORGE_ALERT_PAGERDUTY_KEY → PagerDutyAlerter (from spanforge.alerts)
|
|
687
|
+
SPANFORGE_ALERT_OPSGENIE_KEY → OpsGenieAlerter
|
|
688
|
+
SPANFORGE_ALERT_OPSGENIE_REGION → OpsGenieAlerter region (us|eu)
|
|
689
|
+
SPANFORGE_ALERT_VICTOROPS_URL → VictorOpsAlerter
|
|
690
|
+
SPANFORGE_ALERT_WEBHOOK_URL → WebhookAlerter
|
|
691
|
+
SPANFORGE_ALERT_WEBHOOK_SECRET → WebhookAlerter HMAC secret
|
|
692
|
+
SPANFORGE_ALERT_DEDUP_SECONDS → dedup_window_seconds (default: 300)
|
|
693
|
+
SPANFORGE_ALERT_RATE_LIMIT → rate_limit_per_minute (default: 60)
|
|
694
|
+
SPANFORGE_ALERT_ESCALATION_WAIT → escalation_wait_seconds (default: 900)
|
|
695
|
+
"""
|
|
696
|
+
|
|
697
|
+
def __init__(
    self,
    config: SFClientConfig,
    sinks: list[_Alerter] | None = None,
    *,
    dedup_window_seconds: float | None = None,
    rate_limit_per_minute: int | None = None,
    escalation_wait_seconds: float | None = None,
    escalation_sinks: list[str] | None = None,
) -> None:
    """Set up configuration, sinks, bookkeeping state and the worker thread.

    Each tunable is resolved in this order: explicit keyword argument, then
    the matching ``SPANFORGE_ALERT_*`` environment variable (when non-empty),
    then the module default.  A daemon worker thread is started before the
    constructor returns.
    """
    super().__init__(config, "alert")
    self._lock = threading.RLock()

    # --- tunables: argument > environment > module default -------------
    if dedup_window_seconds is None:
        raw_dedup = os.environ.get("SPANFORGE_ALERT_DEDUP_SECONDS", "")
        dedup_window_seconds = float(raw_dedup) if raw_dedup else _DEDUP_WINDOW_DEFAULT
    self._dedup_window: float = dedup_window_seconds

    if rate_limit_per_minute is None:
        raw_limit = os.environ.get("SPANFORGE_ALERT_RATE_LIMIT", "")
        rate_limit_per_minute = int(raw_limit) if raw_limit else _RATE_LIMIT_PER_MINUTE
    self._rate_limit: int = rate_limit_per_minute

    if escalation_wait_seconds is None:
        raw_wait = os.environ.get("SPANFORGE_ALERT_ESCALATION_WAIT", "")
        escalation_wait_seconds = float(raw_wait) if raw_wait else _ESCALATION_WAIT_DEFAULT
    self._escalation_wait: float = escalation_wait_seconds

    self._escalation_sink_names: list[str] = escalation_sinks or []

    # --- topic registry, seeded with the built-in topics ----------------
    # Topics ending in one of these suffixes default to "critical".
    critical_suffixes = (".red", ".critical", ".failed")
    self._topic_registry: dict[str, TopicRegistration] = {
        builtin: TopicRegistration(
            topic=builtin,
            description=f"Built-in topic: {builtin}",
            default_severity=(
                "critical" if builtin.endswith(critical_suffixes) else "warning"
            ),
        )
        for builtin in KNOWN_TOPICS
    }

    # --- sinks: environment-discovered first, then caller-supplied ------
    self._sinks: list[_SinkWrapper] = []
    self._build_sinks_from_env()
    self._sinks.extend(
        _SinkWrapper(alerter=supplied, name=type(supplied).__name__.lower())
        for supplied in sinks or []
    )

    # --- per-project sliding-window rate limiter (one-minute window) ----
    self._rate_limiter: _SlidingWindowRateLimiter = _SlidingWindowRateLimiter(
        limit=self._rate_limit,
        window_seconds=60.0,
    )

    # --- runtime state ---------------------------------------------------
    # (topic, project_id) -> monotonic timestamp of the last accepted alert
    self._dedup: dict[tuple[str, str], float] = {}
    # (topic_prefix, project_id) -> alerts buffered during a group window
    self._group_buffers: dict[tuple[str, str], list[_QueueItem]] = {}
    self._group_timers: dict[tuple[str, str], threading.Timer] = {}
    self._maintenance_windows: list[MaintenanceWindow] = []
    # alert_id -> pending escalation timer / original item
    self._escalation_timers: dict[str, threading.Timer] = {}
    self._pending_escalation: dict[str, _QueueItem] = {}
    # Bounded in-memory alert history
    self._history: list[AlertRecord] = []
    # Session counters
    self._publish_count: int = 0
    self._suppress_count: int = 0

    # --- background dispatch: bounded queue + daemon worker -------------
    self._queue: queue.Queue[_QueueItem | None] = queue.Queue(maxsize=_QUEUE_MAX)
    self._worker = threading.Thread(
        target=self._worker_loop,
        name="sf-alert-worker",
        daemon=True,
    )
    self._worker.start()
|
|
783
|
+
|
|
784
|
+
# ------------------------------------------------------------------
|
|
785
|
+
# SFServiceClient abstract method
|
|
786
|
+
# ------------------------------------------------------------------
|
|
787
|
+
|
|
788
|
+
# ------------------------------------------------------------------
|
|
789
|
+
# Env-var sink discovery
|
|
790
|
+
# ------------------------------------------------------------------
|
|
791
|
+
|
|
792
|
+
def _build_sinks_from_env(self) -> None:
    """Auto-discover sinks from ``SPANFORGE_ALERT_*`` environment variables.

    Discovery is best-effort for *every* sink type: if constructing a sink
    raises (bad URL, unsupported region, import failure, ...), a warning is
    logged and the remaining sinks are still registered.  Previously only
    the Slack and PagerDuty branches were guarded, so a bad Teams, OpsGenie,
    VictorOps or webhook setting aborted client construction.
    """
    env = os.environ.get

    def _register(sink_name, class_label, build):
        # ``build`` is a zero-argument factory so import and constructor
        # errors are both raised inside the guarded region.
        try:
            self._sinks.append(_SinkWrapper(alerter=build(), name=sink_name))
        except Exception:
            _log.warning("Failed to create %s from env", class_label)

    # Slack
    slack_url = env("SPANFORGE_ALERT_SLACK_WEBHOOK", "")
    if slack_url:

        def _make_slack():
            from spanforge.alerts import SlackAlerter

            return SlackAlerter(webhook_url=slack_url)

        _register("slack", "SlackAlerter", _make_slack)

    # Teams
    teams_url = env("SPANFORGE_ALERT_TEAMS_WEBHOOK", "")
    if teams_url:
        _register(
            "teams",
            "TeamsAdaptiveCardAlerter",
            lambda: TeamsAdaptiveCardAlerter(webhook_url=teams_url),
        )

    # PagerDuty
    pd_key = env("SPANFORGE_ALERT_PAGERDUTY_KEY", "")
    if pd_key:

        def _make_pagerduty():
            from spanforge.alerts import PagerDutyAlerter

            return PagerDutyAlerter(integration_key=pd_key)

        _register("pagerduty", "PagerDutyAlerter", _make_pagerduty)

    # OpsGenie (region defaults to "us")
    og_key = env("SPANFORGE_ALERT_OPSGENIE_KEY", "")
    if og_key:
        region = env("SPANFORGE_ALERT_OPSGENIE_REGION", "us")
        _register(
            "opsgenie",
            "OpsGenieAlerter",
            lambda: OpsGenieAlerter(api_key=og_key, region=region),
        )

    # VictorOps
    vo_url = env("SPANFORGE_ALERT_VICTOROPS_URL", "")
    if vo_url:
        _register(
            "victorops",
            "VictorOpsAlerter",
            lambda: VictorOpsAlerter(rest_endpoint_url=vo_url),
        )

    # Generic webhook (secret is optional and defaults to "")
    wh_url = env("SPANFORGE_ALERT_WEBHOOK_URL", "")
    if wh_url:
        wh_secret = env("SPANFORGE_ALERT_WEBHOOK_SECRET", "")
        _register(
            "webhook",
            "WebhookAlerter",
            lambda: WebhookAlerter(url=wh_url, secret=wh_secret),
        )
|
|
853
|
+
|
|
854
|
+
# ------------------------------------------------------------------
|
|
855
|
+
# Public API — topic registry (ALT-003)
|
|
856
|
+
# ------------------------------------------------------------------
|
|
857
|
+
|
|
858
|
+
def register_topic(
    self,
    topic: str,
    description: str,
    default_severity: str = "warning",
    *,
    runbook_url: str | None = None,
    dedup_window_seconds: float | None = None,
) -> None:
    """Add (or overwrite) a topic in the registry.

    Registering a topic that already exists replaces the previous entry,
    including built-in topics.

    Args:
        topic: Dot-separated topic string used as the registry key.
        description: Human-readable purpose of the topic.
        default_severity: Severity applied when ``publish()`` is called
            without an explicit one (``"info"``, ``"warning"``, ``"high"``,
            or ``"critical"``).
        runbook_url: Optional runbook link attached to alerts on this topic.
        dedup_window_seconds: Optional per-topic override of the client-wide
            deduplication window.
    """
    registration = TopicRegistration(
        topic=topic,
        description=description,
        default_severity=default_severity,
        runbook_url=runbook_url,
        dedup_window_seconds=dedup_window_seconds,
    )
    # The registry is shared with publish(), so mutate it under the lock.
    with self._lock:
        self._topic_registry[topic] = registration
|
|
886
|
+
|
|
887
|
+
# ------------------------------------------------------------------
|
|
888
|
+
# Public API — maintenance windows (ALT-012)
|
|
889
|
+
# ------------------------------------------------------------------
|
|
890
|
+
|
|
891
|
+
def set_maintenance_window(
    self,
    project_id: str,
    start: datetime,
    end: datetime,
) -> None:
    """Register a maintenance window.

    During the window all alerts for *project_id* are suppressed.

    Window bounds are later compared against a timezone-aware "now".
    Comparing a naive datetime with an aware one raises ``TypeError``, which
    would break every subsequent ``publish()`` call.  Naive bounds are
    therefore interpreted as UTC and stored as aware values.

    Args:
        project_id: Project whose alerts should be suppressed.
        start: Window start.  Naive values are treated as UTC.
        end: Window end.  Naive values are treated as UTC.
    """
    # ``utcoffset() is None`` is the datetime module's definition of "naive".
    if start.utcoffset() is None:
        start = start.replace(tzinfo=timezone.utc)
    if end.utcoffset() is None:
        end = end.replace(tzinfo=timezone.utc)

    mw = MaintenanceWindow(project_id=project_id, start=start, end=end)
    with self._lock:
        self._maintenance_windows.append(mw)
    # Audit outside the lock; the audit append is best-effort.
    self._append_audit_record(
        {
            "event": "maintenance_window_set",
            "project_id": project_id,
            "start": start.isoformat(),
            "end": end.isoformat(),
        },
    )
|
|
917
|
+
|
|
918
|
+
def remove_maintenance_windows(self, project_id: str) -> int:
    """Drop every maintenance window registered for *project_id*.

    Windows belonging to other projects are kept in their original order.

    Returns:
        How many windows were removed (``0`` when none matched).
    """
    with self._lock:
        kept = [
            window
            for window in self._maintenance_windows
            if window.project_id != project_id
        ]
        removed = len(self._maintenance_windows) - len(kept)
        self._maintenance_windows = kept
        return removed
|
|
929
|
+
|
|
930
|
+
# ------------------------------------------------------------------
|
|
931
|
+
# Public API — publish (ALT-001, ALT-050)
|
|
932
|
+
# ------------------------------------------------------------------
|
|
933
|
+
|
|
934
|
+
def publish(
    self,
    topic: str,
    payload: dict[str, Any],
    *,
    severity: str | None = None,
    project_id: str | None = None,
) -> PublishResult:
    """Publish an alert to the given *topic*.

    Steps:
    1. Resolve topic registration (warn on unknown topics).
    2. Resolve effective severity.
    3. Check maintenance window suppression.
    4. Check per-project rate limit.
    5. Check deduplication window.
    6. Group with a recent alert of the same topic prefix, or enqueue for
       background dispatch.

    Delivery happens on the worker thread, so ``routed_to`` in the returned
    result is always empty here.

    Args:
        topic: Dot-separated topic identifier.
        payload: Arbitrary payload dict. **Never include raw secrets.**
        severity: Explicit severity override. Defaults to the topic's
            ``default_severity`` (``"warning"`` for unknown topics).
        project_id: Project scope. Defaults to ``config.project_id``.

    Returns:
        :class:`~spanforge.sdk._types.PublishResult` with ``alert_id``,
        ``routed_to``, and ``suppressed``.

    Raises:
        :exc:`~spanforge.sdk._exceptions.SFAlertRateLimitedError` when the
        per-project rate limit is exceeded **and** the client is in
        strict mode (``local_fallback_enabled=False``).
    """
    pid = project_id or self._config.project_id or ""
    alert_id = str(uuid.uuid4())

    with self._lock:
        self._publish_count += 1

        # Topic lookup
        reg = self._topic_registry.get(topic)
        if reg is None:
            _log.warning(
                "sf-alert: unknown topic %r — routing to catch-all. "
                "Register custom topics with register_topic().",
                topic,
            )
        resolved_severity = severity or (reg.default_severity if reg else "warning")
        runbook_url = reg.runbook_url if reg else None
        per_topic_dedup = reg.dedup_window_seconds if reg else None
        effective_dedup = per_topic_dedup if per_topic_dedup is not None else self._dedup_window

        # Maintenance window check
        if self._is_maintenance_window(pid):
            self._suppress_count += 1
            _log.debug(
                "sf-alert: suppressed %r — maintenance window for project %r",
                topic,
                pid,
            )
            return PublishResult(alert_id=alert_id, routed_to=[], suppressed=True)

        # Rate limit check
        if not self._rate_limiter.record(pid or "__global__"):
            self._suppress_count += 1
            _log.warning(
                "sf-alert: rate limit %d/min exceeded for project %r; alert suppressed",
                self._rate_limit,
                pid,
            )
            if not self._config.local_fallback_enabled:
                raise SFAlertRateLimitedError(pid, self._rate_limit)
            return PublishResult(alert_id=alert_id, routed_to=[], suppressed=True)

        # Deduplication check.  A missing entry means "never seen": the
        # monotonic clock has an arbitrary origin and may be smaller than
        # the dedup window, so 0.0 must not be used as a "long ago" default.
        dedup_key = (topic, pid)
        now = time.monotonic()
        last_ts = self._dedup.get(dedup_key)
        if last_ts is not None and now - last_ts < effective_dedup:
            self._suppress_count += 1
            _log.debug("sf-alert: suppressed %r (dedup window %.0fs)", topic, effective_dedup)
            return PublishResult(alert_id=alert_id, routed_to=[], suppressed=True)
        self._dedup[dedup_key] = now

    # Build summary message
    title = f"[{resolved_severity.upper()}] {topic}"
    message = _build_message(topic, payload, runbook_url)
    item = _QueueItem(
        alert_id=alert_id,
        topic=topic,
        title=title,
        message=message,
        severity=resolved_severity,
        project_id=pid,
        payload=payload,
        runbook_url=runbook_url,
    )

    # Alert grouping check (ALT-011)
    # The FIRST alert in a group is dispatched immediately.
    # Subsequent alerts sharing the same (topic_prefix, project_id) within
    # _GROUP_WINDOW_SECS are coalesced and flushed as one notification.
    group_key = (_topic_prefix(topic), pid)
    with self._lock:
        if group_key in self._group_buffers:
            # Add to existing group buffer; dispatch deferred until flush
            self._group_buffers[group_key].append(item)
            _log.debug("sf-alert: grouped %r into existing group %r", topic, group_key)
            return PublishResult(alert_id=alert_id, routed_to=[], suppressed=False)
        # Start a new group window; the first item is enqueued immediately
        self._group_buffers[group_key] = []  # buffer for SUBSEQUENT items only
        timer = threading.Timer(
            _GROUP_WINDOW_SECS,
            self._flush_group,
            args=(group_key,),
        )
        timer.daemon = True
        self._group_timers[group_key] = timer
        timer.start()

    # Enqueue the first item for immediate dispatch
    try:
        self._queue.put_nowait(item)
    except queue.Full:
        # Make room by discarding the oldest queued item.
        try:
            self._queue.get_nowait()
        except queue.Empty:
            pass
        else:
            # The discarded item will never reach the worker, so settle its
            # unfinished-task count here to keep Queue.join() accurate.
            self._queue.task_done()
        try:
            self._queue.put_nowait(item)
        except queue.Full:
            pass
        _log.warning("sf-alert: alert queue full (%d items), oldest item dropped", _QUEUE_MAX)

    return PublishResult(alert_id=alert_id, routed_to=[], suppressed=False)
|
|
1069
|
+
|
|
1070
|
+
# ------------------------------------------------------------------
|
|
1071
|
+
# Public API — acknowledge (ALT-020)
|
|
1072
|
+
# ------------------------------------------------------------------
|
|
1073
|
+
|
|
1074
|
+
def acknowledge(self, alert_id: str) -> bool:
    """Acknowledge an alert and stop its pending auto-escalation.

    Only alerts that currently have an escalation timer can be acknowledged;
    for any other ``alert_id`` nothing changes.

    Args:
        alert_id: The UUID returned by :meth:`publish`.

    Returns:
        ``True`` if a pending escalation timer was found and cancelled,
        ``False`` otherwise.
    """
    with self._lock:
        pending_timer = self._escalation_timers.pop(alert_id, None)
        self._pending_escalation.pop(alert_id, None)
        if pending_timer is None:
            return False

        pending_timer.cancel()
        _log.debug("sf-alert: escalation cancelled for alert %s", alert_id)
        # Reflect the acknowledgement in the in-memory history.
        self._update_history_status(alert_id, "acknowledged")
        return True
|
|
1093
|
+
|
|
1094
|
+
# ------------------------------------------------------------------
|
|
1095
|
+
# Public API — alert history (ALT-042)
|
|
1096
|
+
# ------------------------------------------------------------------
|
|
1097
|
+
|
|
1098
|
+
def get_alert_history(
    self,
    *,
    project_id: str | None = None,
    topic: str | None = None,
    from_dt: datetime | None = None,
    to_dt: datetime | None = None,
    status: str | None = None,
    limit: int = 100,
) -> list[AlertRecord]:
    """Query the in-memory alert history.

    Date bounds are compared as instants, not as ISO strings, so bounds in
    any timezone select the correct records.  Naive bounds (and naive
    stored timestamps) are interpreted as UTC.

    Args:
        project_id: Filter by project.
        topic: Filter by topic.
        from_dt: Include alerts at or after this datetime.
        to_dt: Include alerts at or before this datetime.
        status: Filter by status: ``"open"``, ``"acknowledged"``,
            or ``"resolved"``.
        limit: Maximum number of results (default: 100).  Values below
            zero are treated as zero.

    Returns:
        Most-recent-first list of matching :class:`~spanforge.sdk._types.AlertRecord`.
    """

    def _as_utc(moment: datetime) -> datetime:
        # Give naive datetimes a UTC tzinfo so comparisons never raise.
        if moment.utcoffset() is None:
            return moment.replace(tzinfo=timezone.utc)
        return moment

    lower = _as_utc(from_dt) if from_dt is not None else None
    upper = _as_utc(to_dt) if to_dt is not None else None

    def _within_bounds(rec: Any) -> bool:
        try:
            when = _as_utc(datetime.fromisoformat(rec.timestamp))
        except (TypeError, ValueError):
            # Timestamp is not ISO-parseable: fall back to the original
            # lexical comparison rather than dropping the record.
            return (from_dt is None or rec.timestamp >= from_dt.isoformat()) and (
                to_dt is None or rec.timestamp <= to_dt.isoformat()
            )
        return (lower is None or when >= lower) and (upper is None or when <= upper)

    # Snapshot under the lock; filter and sort on the private copy.
    with self._lock:
        results = list(self._history)

    # Filter
    if project_id:
        results = [r for r in results if r.project_id == project_id]
    if topic:
        results = [r for r in results if r.topic == topic]
    if status:
        results = [r for r in results if r.status == status]
    if lower is not None or upper is not None:
        results = [r for r in results if _within_bounds(r)]

    # Most recent first
    results.sort(key=lambda r: r.timestamp, reverse=True)
    # A negative slice bound would silently drop records from the tail.
    return results[: max(limit, 0)]
|
|
1142
|
+
|
|
1143
|
+
# ------------------------------------------------------------------
|
|
1144
|
+
# publish_async (F-10)
|
|
1145
|
+
# ------------------------------------------------------------------
|
|
1146
|
+
|
|
1147
|
+
async def publish_async(
    self,
    topic: str,
    payload: dict[str, Any],
    *,
    severity: str | None = None,
    project_id: str | None = None,
) -> PublishResult:
    """Async variant of :meth:`publish`.

    Runs :meth:`publish` in the event loop's default executor so the loop is
    not blocked by rate-limit checks or deduplication lookups.  Exceptions
    raised by :meth:`publish` propagate to the awaiting caller.

    Args:
        topic: Alert topic string.
        payload: Alert payload dict.
        severity: Optional severity override.
        project_id: Optional project scope override.

    Returns:
        :class:`~spanforge.sdk._types.PublishResult`.
    """
    import asyncio
    import functools

    # Inside a coroutine a loop is always running; get_running_loop() is the
    # call intended for this situation and never creates a new loop.
    loop = asyncio.get_running_loop()
    call = functools.partial(
        self.publish,
        topic,
        payload,
        severity=severity,
        project_id=project_id,
    )
    return await loop.run_in_executor(None, call)
|
|
1183
|
+
|
|
1184
|
+
# ------------------------------------------------------------------
|
|
1185
|
+
# Public API — status / health
|
|
1186
|
+
# ------------------------------------------------------------------
|
|
1187
|
+
|
|
1188
|
+
def get_status(self) -> AlertStatusInfo:
    """Snapshot the client's health and session counters.

    Counters, the active maintenance-window count and the topic count are
    read under the client lock.  Queue depth and sink breaker state are read
    afterwards.  The status is ``"ok"`` when no sink's circuit breaker is
    open, and ``"degraded"`` otherwise.
    """
    with self._lock:
        published = self._publish_count
        suppressed = self._suppress_count
        current = datetime.now(timezone.utc)
        live_windows = len(
            [mw for mw in self._maintenance_windows if mw.start <= current <= mw.end]
        )
        topic_total = len(self._topic_registry)

    backlog = self._queue.qsize()
    sinks_ok = not any(wrapper.cb.is_open() for wrapper in self._sinks)
    if sinks_ok:
        label = "ok"
    else:
        label = "degraded"

    return AlertStatusInfo(
        status=label,
        publish_count=published,
        suppress_count=suppressed,
        queue_depth=backlog,
        registered_topics=topic_total,
        active_maintenance_windows=live_windows,
        healthy=sinks_ok,
    )
|
|
1209
|
+
|
|
1210
|
+
@property
def healthy(self) -> bool:
    """Whether every sink is currently usable.

    ``True`` when no sink circuit breaker is open (including when there are
    no sinks at all); ``False`` as soon as one open breaker is found.
    """
    for wrapper in self._sinks:
        if wrapper.cb.is_open():
            return False
    return True
|
|
1214
|
+
|
|
1215
|
+
# ------------------------------------------------------------------
|
|
1216
|
+
# Public API — sink management
|
|
1217
|
+
# ------------------------------------------------------------------
|
|
1218
|
+
|
|
1219
|
+
def add_sink(self, alerter: _Alerter, name: str | None = None) -> None:
    """Register an additional sink while the client is running.

    The sink is wrapped with its own circuit breaker and appended after the
    sinks that are already registered.

    Args:
        alerter: Sink instance with a ``send()`` method.
        name: Optional display name; when omitted or empty, the lower-cased
            class name of *alerter* is used.
    """
    if name:
        label = name
    else:
        label = type(alerter).__name__.lower()
    wrapper = _SinkWrapper(alerter=alerter, name=label)
    # The sink list is read by the worker thread, so append under the lock.
    with self._lock:
        self._sinks.append(wrapper)
|
|
1229
|
+
|
|
1230
|
+
# ------------------------------------------------------------------
|
|
1231
|
+
# Graceful shutdown
|
|
1232
|
+
# ------------------------------------------------------------------
|
|
1233
|
+
|
|
1234
|
+
def shutdown(self, timeout: float = 5.0) -> None:
    """Drain the queue and stop the worker thread.

    Escalation timers are cancelled, pending group buffers are flushed, and a
    stop sentinel is queued behind the remaining alerts.  If the queue is
    full, the call waits (within *timeout*) for the worker to free a slot
    instead of dropping the sentinel, which previously left the worker
    running forever.

    Args:
        timeout: Total seconds to wait for the sentinel to be queued and
            the worker to finish (default: 5.0).
    """
    deadline = time.monotonic() + max(timeout, 0.0)

    def _remaining() -> float:
        # Queue.put rejects negative timeouts, so clamp at zero.
        return max(deadline - time.monotonic(), 0.0)

    # Cancel all escalation timers and forget the alerts they would escalate
    with self._lock:
        timers = list(self._escalation_timers.values())
        self._escalation_timers.clear()
        self._pending_escalation.clear()
    for t in timers:
        t.cancel()
    # Flush all groups
    with self._lock:
        group_keys = list(self._group_buffers.keys())
    for gk in group_keys:
        self._flush_group(gk)
    # Signal worker to stop
    try:
        self._queue.put(None, timeout=_remaining())
    except queue.Full:
        _log.warning(
            "sf-alert: queue still full after %.1fs; worker was not signalled to stop",
            timeout,
        )
    self._worker.join(timeout=_remaining())
|
|
1257
|
+
|
|
1258
|
+
# ------------------------------------------------------------------
|
|
1259
|
+
# Internal — group flushing
|
|
1260
|
+
# ------------------------------------------------------------------
|
|
1261
|
+
|
|
1262
|
+
def _flush_group(self, group_key: tuple[str, str]) -> None:
    """Flush a group window: coalesce buffered secondary items and enqueue one dispatch task.

    The first alert of a group was already dispatched when the window
    opened, so only alerts that arrived afterwards are buffered.  They are
    merged into one item that keeps the first buffered alert's identity,
    lists the other topics in its message and carries the highest severity
    of the group.

    Args:
        group_key: ``(topic_prefix, project_id)`` identifying the window.
    """
    with self._lock:
        items = self._group_buffers.pop(group_key, [])
        timer = self._group_timers.pop(group_key, None)
        if timer is not None:
            timer.cancel()
    if not items:
        # No secondary alerts buffered; the first item was already dispatched
        return

    # Coalesce: use the first item as the representative
    first = items[0]
    if len(items) > 1:
        extra_topics = ", ".join(i.topic for i in items[1:])
        first = _QueueItem(
            alert_id=first.alert_id,
            topic=first.topic,
            title=first.title,
            message=f"{first.message}\n(+{len(items) - 1} grouped: {extra_topics})",
            # Unknown severities rank lowest (0).
            severity=max(
                (i.severity for i in items),
                key=lambda s: _SEVERITY_RANK.get(s, 0),
            ),
            project_id=first.project_id,
            payload=first.payload,
            runbook_url=first.runbook_url,
        )

    try:
        self._queue.put_nowait(first)
    except queue.Full:
        # Drop oldest
        try:
            self._queue.get_nowait()
        except queue.Empty:
            pass
        else:
            # The dropped item never reaches the worker, so settle its
            # unfinished-task count here to keep Queue.join() accurate.
            self._queue.task_done()
        try:
            self._queue.put_nowait(first)
        except queue.Full:
            pass
        _log.warning("sf-alert: alert queue full (%d items), oldest item dropped", _QUEUE_MAX)
|
|
1304
|
+
|
|
1305
|
+
# ------------------------------------------------------------------
|
|
1306
|
+
# Internal — worker loop (ALT-050)
|
|
1307
|
+
# ------------------------------------------------------------------
|
|
1308
|
+
|
|
1309
|
+
def _worker_loop(self) -> None:
    """Run on the background thread: pull queued alerts and dispatch them.

    The loop polls with a one-second timeout and exits when it receives the
    ``None`` shutdown sentinel.  A failing dispatch is logged and does not
    stop the loop; every dispatched item is marked done on the queue.
    """
    while True:
        try:
            job = self._queue.get(timeout=1.0)
        except queue.Empty:
            # Nothing queued during this poll interval; keep waiting.
            continue

        if job is None:
            # Shutdown sentinel
            return

        try:
            self._dispatch(job)
        except Exception:
            _log.exception("sf-alert: unhandled error dispatching %r", job.topic)
        finally:
            self._queue.task_done()
|
|
1325
|
+
|
|
1326
|
+
def _dispatch(self, item: _QueueItem) -> None:
    """Deliver *item* to the sinks, then record, audit and arm escalation.

    Escalated items go only to the configured escalation sinks when at least
    one of them is registered; otherwise all sinks are used.  After
    delivery, the alert is appended to the bounded history and to the audit
    log.  A first-time critical alert gets an escalation timer.
    """
    # Snapshot shared lists so sinks are called without holding the lock.
    with self._lock:
        targets = list(self._sinks)
        preferred = list(self._escalation_sink_names)

    if item.is_escalation and preferred:
        narrowed = [sink for sink in targets if sink.name in preferred]
        if narrowed:
            targets = narrowed

    context: dict[str, Any] = {
        "alert_id": item.alert_id,
        "topic": item.topic,
        "project_id": item.project_id,
    }
    if item.runbook_url:
        context["runbook_url"] = item.runbook_url

    # Names of the sinks that reported a successful send, in sink order.
    delivered: list[str] = [
        sink.name
        for sink in targets
        if sink.dispatch(item.title, item.message, item.severity, context)
    ]

    # Record in history, keeping only the newest _HISTORY_MAX entries.
    entry = AlertRecord(
        alert_id=item.alert_id,
        topic=item.topic,
        severity=item.severity,
        project_id=item.project_id,
        payload=item.payload,
        sinks_notified=delivered,
        suppressed=False,
        status="open",
        timestamp=datetime.now(timezone.utc).isoformat(),
    )
    with self._lock:
        self._history.append(entry)
        if len(self._history) > _HISTORY_MAX:
            self._history = self._history[-_HISTORY_MAX:]

    # Audit log
    audit_entry = {
        "event": "alert.published",
        "alert_id": item.alert_id,
        "topic": item.topic,
        "severity": item.severity,
        "project_id": item.project_id,
        "sinks_notified": delivered,
        "suppressed": False,
    }
    self._append_audit_record(audit_entry)

    # Schedule escalation for CRITICAL (ALT-020); escalations never re-arm.
    rank = _SEVERITY_RANK.get(item.severity, 0)
    is_critical = rank >= _SEVERITY_RANK["critical"]
    if is_critical and not item.is_escalation:
        self._schedule_escalation(item)
|
|
1383
|
+
|
|
1384
|
+
# ------------------------------------------------------------------
|
|
1385
|
+
# Internal — escalation (ALT-020)
|
|
1386
|
+
# ------------------------------------------------------------------
|
|
1387
|
+
|
|
1388
|
+
def _schedule_escalation(self, item: _QueueItem) -> None:
    """Arm a daemon timer that escalates *item* unless it is acknowledged.

    The timer fires after :attr:`_escalation_wait` seconds.  Both the timer
    and the item are stored under the alert id so ``acknowledge()`` can
    cancel the escalation.
    """
    delay = self._escalation_wait
    key = item.alert_id

    countdown = threading.Timer(delay, self._fire_escalation, args=(key,))
    countdown.daemon = True

    # Register before starting, so a firing timer always finds its item.
    with self._lock:
        self._escalation_timers[key] = countdown
        self._pending_escalation[key] = item
        countdown.start()

    _log.debug("sf-alert: escalation scheduled for %s in %.0fs", key, delay)
|
|
1405
|
+
|
|
1406
|
+
def _fire_escalation(self, alert_id: str) -> None:
    """Timer callback: queue the unacknowledged alert again as an escalation.

    Does nothing when the alert is no longer pending (for example because it
    was acknowledged).  Otherwise a copy of the alert, marked
    ``is_escalation=True`` with prefixed title and message, is queued.  If
    the queue is full the escalation is dropped and a warning is logged.
    """
    with self._lock:
        original = self._pending_escalation.pop(alert_id, None)
        self._escalation_timers.pop(alert_id, None)
    if original is None:
        return

    waited = self._escalation_wait
    escalated_title = f"[ESCALATED] {original.title}"
    escalated_message = f"[AUTO-ESCALATED after {waited:.0f}s]\n{original.message}"
    escalated = _QueueItem(
        alert_id=original.alert_id,
        topic=original.topic,
        title=escalated_title,
        message=escalated_message,
        severity=original.severity,
        project_id=original.project_id,
        payload=original.payload,
        runbook_url=original.runbook_url,
        is_escalation=True,
    )
    _log.warning(
        "sf-alert: CRITICAL alert %r not acknowledged in %.0fs — escalating",
        alert_id,
        waited,
    )
    try:
        self._queue.put_nowait(escalated)
    except queue.Full:
        _log.warning("sf-alert: queue full during escalation; escalated alert dropped")
|
|
1433
|
+
|
|
1434
|
+
# ------------------------------------------------------------------
|
|
1435
|
+
# Internal — audit log (ALT-053)
|
|
1436
|
+
# ------------------------------------------------------------------
|
|
1437
|
+
|
|
1438
|
+
def _append_audit_record(self, record: dict[str, Any]) -> None:
    """Append *record* to sf-audit schema ``spanforge.alert.v1`` (best-effort).

    ``sf_audit`` is imported inside the function, so a failing import is
    handled the same way as a failing ``append`` call: the error is logged
    at DEBUG level and swallowed, and the caller never sees an exception.

    Args:
        record: Audit payload handed unchanged to ``sf_audit.append``.
    """
    try:
        from spanforge.sdk import sf_audit

        sf_audit.append(record, "spanforge.alert.v1")
    except Exception:
        # Still best-effort, but keep the traceback so that "unavailable"
        # and "append raised" can be told apart when debugging.
        _log.debug(
            "sf-alert: audit append skipped (sf_audit unavailable or error)",
            exc_info=True,
        )
|
|
1446
|
+
|
|
1447
|
+
# ------------------------------------------------------------------
|
|
1448
|
+
# Internal — helper predicates
|
|
1449
|
+
# ------------------------------------------------------------------
|
|
1450
|
+
|
|
1451
|
+
def _is_maintenance_window(self, project_id: str) -> bool:
    """Report whether *project_id* is inside a maintenance window right now.

    A window matches when its ``project_id`` equals the argument and the
    current UTC time lies within ``[start, end]`` (both ends inclusive).

    Must be called with ``self._lock`` held **or** within a context that
    doesn't need the lock (the caller holds it).
    """
    current = datetime.now(timezone.utc)
    return any(
        window.project_id == project_id and window.start <= current <= window.end
        for window in self._maintenance_windows
    )
|
|
1462
|
+
|
|
1463
|
+
def _update_history_status(self, alert_id: str, status: str) -> None:
    """Update the status field of a history record (best-effort).

    Scans ``self._history`` under ``self._lock`` and replaces the first
    record whose ``alert_id`` matches with a copy carrying the new
    *status*.  If no record matches, nothing happens.

    Args:
        alert_id: Identifier of the history record to update.
        status: New value for the record's ``status`` field.
    """
    from dataclasses import replace

    with self._lock:
        for index, rec in enumerate(self._history):
            if rec.alert_id == alert_id:
                # The record dataclass is frozen, so swap in a modified copy.
                # ``replace`` carries over every field, so fields added to
                # the record later are not silently reset here.
                self._history[index] = replace(rec, status=status)
                break
|
|
1481
|
+
|
|
1482
|
+
# ------------------------------------------------------------------
|
|
1483
|
+
# SFServiceClient — abstract requirement
|
|
1484
|
+
# ------------------------------------------------------------------
|
|
1485
|
+
|
|
1486
|
+
def _request(
    self,
    method: str,
    path: str,
    body: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Placeholder for the service-client request hook; always raises.

    Alert routing is purely outbound push, so this client offers no
    request/response interface.

    Raises:
        NotImplementedError: On every call.
    """
    reason = "SFAlertClient does not expose a request interface"
    raise NotImplementedError(reason)
|
|
1494
|
+
|
|
1495
|
+
|
|
1496
|
+
# ---------------------------------------------------------------------------
|
|
1497
|
+
# Module-level helpers
|
|
1498
|
+
# ---------------------------------------------------------------------------
|
|
1499
|
+
|
|
1500
|
+
|
|
1501
|
+
def _topic_prefix(topic: str) -> str:
    """Return the part of *topic* that precedes its final dot.

    A topic without any dot is returned unchanged.
    """
    head, dot, _tail = topic.rpartition(".")
    # ``dot`` is empty exactly when the topic contains no separator.
    return head if dot else topic
|
|
1505
|
+
|
|
1506
|
+
|
|
1507
|
+
def _build_message(topic: str, payload: dict[str, Any], runbook_url: str | None) -> str:
    """Render an alert as newline-separated, human-readable text.

    The first line names the topic, followed by one ``key: value`` line per
    payload entry in the payload's iteration order.  A final ``Runbook:``
    line is added only when *runbook_url* is truthy.
    """
    parts: list[str] = [f"Topic: {topic}"]
    parts.extend(f" {name}: {val}" for name, val in payload.items())
    if runbook_url:
        parts.append(f"Runbook: {runbook_url}")
    return "\n".join(parts)
|