spanforge 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. spanforge/__init__.py +815 -0
  2. spanforge/_ansi.py +93 -0
  3. spanforge/_batch_exporter.py +409 -0
  4. spanforge/_cli.py +2094 -0
  5. spanforge/_cli_audit.py +639 -0
  6. spanforge/_cli_compliance.py +711 -0
  7. spanforge/_cli_cost.py +243 -0
  8. spanforge/_cli_ops.py +791 -0
  9. spanforge/_cli_phase11.py +356 -0
  10. spanforge/_hooks.py +337 -0
  11. spanforge/_server.py +1708 -0
  12. spanforge/_span.py +1036 -0
  13. spanforge/_store.py +288 -0
  14. spanforge/_stream.py +664 -0
  15. spanforge/_trace.py +335 -0
  16. spanforge/_tracer.py +254 -0
  17. spanforge/actor.py +141 -0
  18. spanforge/alerts.py +469 -0
  19. spanforge/auto.py +464 -0
  20. spanforge/baseline.py +335 -0
  21. spanforge/cache.py +635 -0
  22. spanforge/compliance.py +325 -0
  23. spanforge/config.py +532 -0
  24. spanforge/consent.py +228 -0
  25. spanforge/consumer.py +377 -0
  26. spanforge/core/__init__.py +5 -0
  27. spanforge/core/compliance_mapping.py +1254 -0
  28. spanforge/cost.py +600 -0
  29. spanforge/debug.py +548 -0
  30. spanforge/deprecations.py +205 -0
  31. spanforge/drift.py +482 -0
  32. spanforge/egress.py +58 -0
  33. spanforge/eval.py +648 -0
  34. spanforge/event.py +1064 -0
  35. spanforge/exceptions.py +240 -0
  36. spanforge/explain.py +178 -0
  37. spanforge/export/__init__.py +69 -0
  38. spanforge/export/append_only.py +337 -0
  39. spanforge/export/cloud.py +357 -0
  40. spanforge/export/datadog.py +497 -0
  41. spanforge/export/grafana.py +320 -0
  42. spanforge/export/jsonl.py +195 -0
  43. spanforge/export/openinference.py +158 -0
  44. spanforge/export/otel_bridge.py +294 -0
  45. spanforge/export/otlp.py +811 -0
  46. spanforge/export/otlp_bridge.py +233 -0
  47. spanforge/export/redis_backend.py +282 -0
  48. spanforge/export/siem_schema.py +98 -0
  49. spanforge/export/siem_splunk.py +264 -0
  50. spanforge/export/siem_syslog.py +212 -0
  51. spanforge/export/webhook.py +299 -0
  52. spanforge/exporters/__init__.py +30 -0
  53. spanforge/exporters/console.py +271 -0
  54. spanforge/exporters/jsonl.py +144 -0
  55. spanforge/exporters/sqlite.py +142 -0
  56. spanforge/gate.py +1150 -0
  57. spanforge/governance.py +181 -0
  58. spanforge/hitl.py +295 -0
  59. spanforge/http.py +187 -0
  60. spanforge/inspect.py +427 -0
  61. spanforge/integrations/__init__.py +45 -0
  62. spanforge/integrations/_pricing.py +280 -0
  63. spanforge/integrations/anthropic.py +388 -0
  64. spanforge/integrations/azure_openai.py +133 -0
  65. spanforge/integrations/bedrock.py +292 -0
  66. spanforge/integrations/crewai.py +251 -0
  67. spanforge/integrations/gemini.py +351 -0
  68. spanforge/integrations/groq.py +442 -0
  69. spanforge/integrations/langchain.py +349 -0
  70. spanforge/integrations/langgraph.py +306 -0
  71. spanforge/integrations/llamaindex.py +373 -0
  72. spanforge/integrations/ollama.py +287 -0
  73. spanforge/integrations/openai.py +368 -0
  74. spanforge/integrations/together.py +483 -0
  75. spanforge/io.py +214 -0
  76. spanforge/lint.py +322 -0
  77. spanforge/metrics.py +417 -0
  78. spanforge/metrics_export.py +343 -0
  79. spanforge/migrate.py +402 -0
  80. spanforge/model_registry.py +278 -0
  81. spanforge/models.py +389 -0
  82. spanforge/namespaces/__init__.py +254 -0
  83. spanforge/namespaces/audit.py +256 -0
  84. spanforge/namespaces/cache.py +237 -0
  85. spanforge/namespaces/chain.py +77 -0
  86. spanforge/namespaces/confidence.py +72 -0
  87. spanforge/namespaces/consent.py +92 -0
  88. spanforge/namespaces/cost.py +179 -0
  89. spanforge/namespaces/decision.py +143 -0
  90. spanforge/namespaces/diff.py +157 -0
  91. spanforge/namespaces/drift.py +80 -0
  92. spanforge/namespaces/eval_.py +251 -0
  93. spanforge/namespaces/feedback.py +241 -0
  94. spanforge/namespaces/fence.py +193 -0
  95. spanforge/namespaces/guard.py +105 -0
  96. spanforge/namespaces/hitl.py +91 -0
  97. spanforge/namespaces/latency.py +72 -0
  98. spanforge/namespaces/prompt.py +190 -0
  99. spanforge/namespaces/redact.py +173 -0
  100. spanforge/namespaces/retrieval.py +379 -0
  101. spanforge/namespaces/runtime_governance.py +494 -0
  102. spanforge/namespaces/template.py +208 -0
  103. spanforge/namespaces/tool_call.py +77 -0
  104. spanforge/namespaces/trace.py +1029 -0
  105. spanforge/normalizer.py +171 -0
  106. spanforge/plugins.py +82 -0
  107. spanforge/presidio_backend.py +349 -0
  108. spanforge/processor.py +258 -0
  109. spanforge/prompt_registry.py +418 -0
  110. spanforge/py.typed +0 -0
  111. spanforge/redact.py +914 -0
  112. spanforge/regression.py +192 -0
  113. spanforge/runtime_policy.py +159 -0
  114. spanforge/sampling.py +511 -0
  115. spanforge/schema.py +183 -0
  116. spanforge/schemas/v1.0/schema.json +170 -0
  117. spanforge/schemas/v2.0/schema.json +536 -0
  118. spanforge/sdk/__init__.py +625 -0
  119. spanforge/sdk/_base.py +584 -0
  120. spanforge/sdk/_base.pyi +71 -0
  121. spanforge/sdk/_exceptions.py +1096 -0
  122. spanforge/sdk/_types.py +2184 -0
  123. spanforge/sdk/alert.py +1514 -0
  124. spanforge/sdk/alert.pyi +56 -0
  125. spanforge/sdk/audit.py +1196 -0
  126. spanforge/sdk/audit.pyi +67 -0
  127. spanforge/sdk/cec.py +1215 -0
  128. spanforge/sdk/cec.pyi +37 -0
  129. spanforge/sdk/config.py +641 -0
  130. spanforge/sdk/config.pyi +55 -0
  131. spanforge/sdk/enterprise.py +714 -0
  132. spanforge/sdk/enterprise.pyi +79 -0
  133. spanforge/sdk/explain.py +170 -0
  134. spanforge/sdk/fallback.py +432 -0
  135. spanforge/sdk/feedback.py +351 -0
  136. spanforge/sdk/gate.py +874 -0
  137. spanforge/sdk/gate.pyi +51 -0
  138. spanforge/sdk/identity.py +2114 -0
  139. spanforge/sdk/identity.pyi +47 -0
  140. spanforge/sdk/lineage.py +175 -0
  141. spanforge/sdk/observe.py +1065 -0
  142. spanforge/sdk/observe.pyi +50 -0
  143. spanforge/sdk/operator.py +338 -0
  144. spanforge/sdk/pii.py +1473 -0
  145. spanforge/sdk/pii.pyi +119 -0
  146. spanforge/sdk/pipelines.py +458 -0
  147. spanforge/sdk/pipelines.pyi +39 -0
  148. spanforge/sdk/policy.py +930 -0
  149. spanforge/sdk/rag.py +594 -0
  150. spanforge/sdk/rbac.py +280 -0
  151. spanforge/sdk/registry.py +430 -0
  152. spanforge/sdk/registry.pyi +46 -0
  153. spanforge/sdk/scope.py +279 -0
  154. spanforge/sdk/secrets.py +293 -0
  155. spanforge/sdk/secrets.pyi +25 -0
  156. spanforge/sdk/security.py +560 -0
  157. spanforge/sdk/security.pyi +57 -0
  158. spanforge/sdk/trust.py +472 -0
  159. spanforge/sdk/trust.pyi +41 -0
  160. spanforge/secrets.py +799 -0
  161. spanforge/signing.py +1179 -0
  162. spanforge/stats.py +100 -0
  163. spanforge/stream.py +560 -0
  164. spanforge/testing.py +378 -0
  165. spanforge/testing_mocks.py +1052 -0
  166. spanforge/trace.py +199 -0
  167. spanforge/types.py +696 -0
  168. spanforge/ulid.py +300 -0
  169. spanforge/validate.py +379 -0
  170. spanforge-1.0.0.dist-info/METADATA +1509 -0
  171. spanforge-1.0.0.dist-info/RECORD +174 -0
  172. spanforge-1.0.0.dist-info/WHEEL +4 -0
  173. spanforge-1.0.0.dist-info/entry_points.txt +5 -0
  174. spanforge-1.0.0.dist-info/licenses/LICENSE +128 -0
spanforge/sdk/alert.py ADDED
@@ -0,0 +1,1514 @@
1
+ """spanforge.sdk.alert — SpanForge sf-alert Alert Routing Service (Phase 7).
2
+
3
+ Implements the full sf-alert API surface: topic-based publish, per-sink circuit
4
+ breakers, 5-minute deduplication, per-project rate limiting, alert grouping,
5
+ CRITICAL escalation policy, maintenance-window suppression, webhook HMAC signing,
6
+ and integrations with Slack, Teams, PagerDuty, OpsGenie, VictorOps, Incident.io,
7
+ SMS (Twilio), and generic HMAC-signed webhooks.
8
+
9
+ Architecture
10
+ ------------
11
+ * :meth:`publish` is the **primary entry point**. It validates the topic,
12
+ checks maintenance windows, deduplicates by ``(topic, project_id)``, applies
13
+ per-project rate limits, enqueues the alert, and returns a
14
+ :class:`~spanforge.sdk._types.PublishResult` immediately.
15
+ * A **background worker thread** drains the queue and dispatches to each
16
+ configured sink through its own :class:`~spanforge.sdk._base._CircuitBreaker`.
17
+ * **CRITICAL alerts** schedule a :class:`threading.Timer` for auto-escalation
18
+ after ``escalation_wait_seconds`` (default: 900 s = 15 min).
19
+ :meth:`acknowledge` cancels the timer.
20
+ * All alert emissions are appended to ``sf-audit`` schema ``spanforge.alert.v1``
21
+ on a best-effort basis (failures are logged at DEBUG level).
22
+
23
+ Topic registry (ALT-002, ALT-003)
24
+ -----------------------------------
25
+ Eight built-in topics match HallucCheck's published event taxonomy. Additional
26
+ topics can be registered with :meth:`register_topic`. Publishing to an unknown
27
+ topic logs a WARNING and routes to the catch-all sink list if configured.
28
+
29
+ Deduplication (ALT-010)
30
+ ------------------------
31
+ The same ``(topic, project_id)`` pair is suppressed for ``dedup_window_seconds``
32
+ (default: 300 s). Per-topic windows override the client default.
33
+
34
+ Alert grouping (ALT-011)
35
+ --------------------------
36
+ Multiple alerts sharing the same ``(topic_prefix, project_id)`` within a 2-minute
37
+ window are coalesced into a single notification. The group is flushed when the
38
+ timer fires or when the window elapses.
39
+
40
+ Escalation policy (ALT-020, ALT-021)
41
+ --------------------------------------
42
+ CRITICAL severity alerts schedule an escalation timer. When the timer fires the
43
+ alert is re-dispatched to the escalation sink list. Call :meth:`acknowledge` to
44
+ cancel the timer.
45
+
46
+ Sink integrations
47
+ -----------------
48
+ All sinks live in this module:
49
+
50
+ * :class:`WebhookAlerter` — generic HMAC-signed webhook (ALT-034)
51
+ * :class:`OpsGenieAlerter` — OpsGenie Alert API v2 (ALT-030)
52
+ * :class:`VictorOpsAlerter` — VictorOps / Splunk On-Call (ALT-031)
53
+ * :class:`IncidentIOAlerter` — Incident.io (ALT-032)
54
+ * :class:`SMSAlerter` — Twilio SMS (ALT-033)
55
+ * :class:`TeamsAdaptiveCardAlerter` — enhanced Teams Adaptive Card (ALT-035)
56
+
57
+ The existing ``spanforge.alerts`` sinks (Slack, Teams, PagerDuty, Email) are
58
+ re-exported here for convenience.
59
+
60
+ Security requirements
61
+ ---------------------
62
+ * Webhook HMAC secrets are never logged. :class:`WebhookAlerter` signs each
63
+ request body and sets the ``X-SF-Signature: sha256=<hex>`` header; receivers
64
+ should verify it with :func:`hmac.compare_digest` (constant-time comparison).
65
+ * PagerDuty and OpsGenie integration keys are stored in ``repr=False`` fields.
66
+ * All remote URLs are validated with :func:`_validate_http_url` (same guard used
67
+ in ``observe.py``) before each request.
68
+ * The audit log appended to sf-audit uses ``best_effort=True``; any failure is
69
+ swallowed at DEBUG level so alerting itself is never blocked by audit issues.
70
+ """
71
+
72
+ from __future__ import annotations
73
+
74
+ import hashlib
75
+ import hmac as _hmac
76
+ import ipaddress
77
+ import json
78
+ import logging
79
+ import os
80
+ import queue
81
+ import threading
82
+ import time
83
+ import urllib.error
84
+ import urllib.parse
85
+ import urllib.request
86
+ import uuid
87
+ from dataclasses import dataclass, field
88
+ from datetime import datetime, timezone
89
+ from typing import Any, Union
90
+
91
+ from spanforge.sdk._base import (
92
+ SFClientConfig,
93
+ SFServiceClient,
94
+ _CircuitBreaker,
95
+ _SlidingWindowRateLimiter,
96
+ )
97
+ from spanforge.sdk._exceptions import (
98
+ SFAlertRateLimitedError,
99
+ )
100
+ from spanforge.sdk._types import (
101
+ AlertRecord,
102
+ AlertStatusInfo,
103
+ MaintenanceWindow,
104
+ PublishResult,
105
+ TopicRegistration,
106
+ )
107
+
108
+ __all__ = [
109
+ "KNOWN_TOPICS",
110
+ "IncidentIOAlerter",
111
+ "OpsGenieAlerter",
112
+ "SFAlertClient",
113
+ "SMSAlerter",
114
+ "TeamsAdaptiveCardAlerter",
115
+ "VictorOpsAlerter",
116
+ "WebhookAlerter",
117
+ ]
118
+
119
+ _log = logging.getLogger(__name__)
120
+
121
+ # ---------------------------------------------------------------------------
122
+ # Constants
123
+ # ---------------------------------------------------------------------------
124
+
125
#: The eight HallucCheck-defined topics wired at design time.
KNOWN_TOPICS: frozenset[str] = frozenset(
    (
        "halluccheck.drift.amber",
        "halluccheck.drift.red",
        "halluccheck.bias.critical",
        "halluccheck.prri.red",
        "halluccheck.benchmark.regression",
        "halluccheck.pii.detected",
        "halluccheck.secrets.detected",
        "halluccheck.trust_gate.failed",
    ),
)

# Timing defaults, all expressed in seconds.
_DEDUP_WINDOW_DEFAULT: float = 300.0  # 5 min
_GROUP_WINDOW_SECS: float = 120.0  # 2 min
_ESCALATION_WAIT_DEFAULT: float = 900.0  # 15 min

# Capacity / throughput defaults.
_QUEUE_MAX: int = 1_000
_RATE_LIMIT_PER_MINUTE: int = 60
_HISTORY_MAX: int = 10_000

# Severity ordinal for escalation gating: position in the tuple is the rank,
# lowest severity first.
_SEVERITY_RANK: dict[str, int] = {
    level: rank for rank, level in enumerate(("info", "warning", "high", "critical"))
}
148
+
149
+
150
+ # ---------------------------------------------------------------------------
151
+ # URL validation (SSRF guard)
152
+ # ---------------------------------------------------------------------------
153
+
154
+
155
def _validate_http_url(url: str) -> None:
    """Raise :exc:`ValueError` if *url* is not a safe HTTP/HTTPS URL.

    Rejects:
    * Overly long URLs (> 2 048 chars).
    * Non-HTTP/HTTPS schemes.
    * URLs without a host component (e.g. ``http:///path``).
    * Private/loopback/link-local **IP-literal** targets (unless
      ``SPANFORGE_ALLOW_LOOPBACK`` is set to ``1``, ``true`` or ``yes``).

    Note:
        Only IP literals are inspected. A DNS name is accepted as-is; it is
        not resolved here, so a name that resolves to a private address is
        not caught by this guard.

    Args:
        url: The destination URL to check.

    Raises:
        ValueError: If any of the checks above fails.
    """
    if len(url) > 2048:
        raise ValueError(f"URL too long: {len(url)} chars (max 2048)")

    parsed = urllib.parse.urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise ValueError(f"Unsupported scheme {parsed.scheme!r}; only http/https allowed")

    hostname = parsed.hostname or ""
    if not hostname:
        # A host-less URL can never be delivered; fail here with a clear message
        # instead of letting it surface later as an opaque transport error.
        raise ValueError("URL has no host component")

    if os.environ.get("SPANFORGE_ALLOW_LOOPBACK", "").lower() in ("1", "true", "yes"):
        return

    # Parse separately from the policy check so that a "not an IP literal"
    # ValueError can never be confused with the SSRF rejection below.
    try:
        addr = ipaddress.ip_address(hostname)
    except ValueError:
        return  # DNS name, not an IP literal: nothing further to check here.

    if addr.is_private or addr.is_loopback or addr.is_link_local:
        raise ValueError(f"Destination IP {hostname!r} is private/loopback (SSRF guard)")
177
+
178
+
179
+ # ---------------------------------------------------------------------------
180
+ # Sink implementations (Phase 7 additions)
181
+ # ---------------------------------------------------------------------------
182
+
183
+
184
@dataclass
class WebhookAlerter:
    """Alert sink that POSTs JSON to an arbitrary webhook endpoint (ALT-034).

    Each request carries an ``X-SF-Signature: sha256=<hmac>`` header. The
    digest is HMAC-SHA256 keyed with *secret*, computed over the exact bytes
    of the request body. Receivers recompute it and compare with
    :func:`hmac.compare_digest`.

    Args:
        url: Webhook endpoint URL.
        secret: HMAC signing secret. **Never logged** (excluded from ``repr``).
        timeout: HTTP timeout in seconds.
    """

    url: str
    secret: str = field(repr=False, default="")
    timeout: int = 10

    def send(
        self,
        title: str,
        message: str,
        severity: str = "warning",
        extra: dict[str, Any] | None = None,
    ) -> None:
        """Sign and POST one alert.

        Transport errors and unexpected HTTP statuses are logged at WARNING
        level and not raised; a URL rejected by the SSRF guard raises
        :exc:`ValueError`.
        """
        _validate_http_url(self.url)

        document: dict[str, Any] = {"title": title, "message": message, "severity": severity}
        # Caller-supplied fields are merged last, so they may overwrite the
        # three base keys.
        document.update(extra or {})
        raw = json.dumps(document).encode()

        digest = _hmac.new(self.secret.encode(), raw, hashlib.sha256).hexdigest()
        headers = {
            "Content-Type": "application/json",
            "X-SF-Signature": f"sha256={digest}",
        }
        request = urllib.request.Request(self.url, data=raw, headers=headers, method="POST")

        try:
            with urllib.request.urlopen(request, timeout=self.timeout) as response:  # nosec B310
                status = response.status
                if status not in (200, 201, 202, 204):
                    _log.warning("WebhookAlerter: unexpected status %s", status)
        except urllib.error.URLError as err:
            _log.warning("WebhookAlerter: request failed: %s", err)
231
+
232
+
233
@dataclass
class OpsGenieAlerter:
    """Alert sink for the OpsGenie Alert API v2 (ALT-030).

    Args:
        api_key: OpsGenie API key. **Never logged** (excluded from ``repr``).
        region: ``"us"`` (default) or ``"eu"``; any value other than ``"eu"``
            selects the US endpoint.
        timeout: HTTP timeout in seconds.
    """

    api_key: str = field(repr=False)
    region: str = "us"
    timeout: int = 10

    # SpanForge severity -> OpsGenie priority; unknown severities map to "P3".
    _PRIORITY_MAP: dict[str, str] = field(
        init=False,
        repr=False,
        default_factory=lambda: {
            "info": "P5",
            "warning": "P3",
            "high": "P2",
            "critical": "P1",
        },
    )

    def _url(self) -> str:
        """Return the alerts endpoint for the configured region."""
        return (
            "https://api.eu.opsgenie.com/v2/alerts"
            if self.region == "eu"
            else "https://api.opsgenie.com/v2/alerts"
        )

    def send(
        self,
        title: str,
        message: str,
        severity: str = "warning",
        extra: dict[str, Any] | None = None,
    ) -> None:
        """Create an OpsGenie alert.

        *extra* is stringified key-by-key into the alert's ``details`` map.
        Transport errors and unexpected statuses are logged, not raised.
        """
        endpoint = self._url()
        _validate_http_url(endpoint)

        alert: dict[str, Any] = {
            "message": title,
            "description": message,
            "priority": self._PRIORITY_MAP.get(severity.lower(), "P3"),
            "tags": [f"severity:{severity}", "spanforge"],
        }
        details = {str(key): str(value) for key, value in (extra or {}).items()}
        if details:
            alert["details"] = details

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"GenieKey {self.api_key}",
        }
        request = urllib.request.Request(
            endpoint,
            data=json.dumps(alert).encode(),
            headers=headers,
            method="POST",
        )
        try:
            with urllib.request.urlopen(request, timeout=self.timeout) as response:  # nosec B310
                status = response.status
                if status not in (200, 201, 202):
                    _log.warning("OpsGenieAlerter: unexpected status %s", status)
        except urllib.error.URLError as err:
            _log.warning("OpsGenieAlerter: request failed: %s", err)
298
+
299
+
300
@dataclass
class VictorOpsAlerter:
    """Alert sink for VictorOps / Splunk On-Call (ALT-031).

    Args:
        rest_endpoint_url: VictorOps REST endpoint URL including routing key.
        timeout: HTTP timeout in seconds.
    """

    rest_endpoint_url: str
    timeout: int = 10

    # SpanForge severity -> VictorOps ``message_type``; unknown -> "WARNING".
    _MSG_TYPE_MAP: dict[str, str] = field(
        init=False,
        repr=False,
        default_factory=lambda: {
            "info": "INFO",
            "warning": "WARNING",
            "high": "CRITICAL",
            "critical": "CRITICAL",
        },
    )

    def send(
        self,
        title: str,
        message: str,
        severity: str = "warning",
        extra: dict[str, Any] | None = None,
    ) -> None:
        """POST one alert to the VictorOps REST endpoint.

        Keys in *extra* are merged into the top level of the payload and may
        overwrite the base keys. Transport errors and unexpected statuses are
        logged, not raised.
        """
        endpoint = self.rest_endpoint_url
        _validate_http_url(endpoint)

        event: dict[str, Any] = {
            "message_type": self._MSG_TYPE_MAP.get(severity.lower(), "WARNING"),
            "entity_display_name": title,
            "state_message": message,
        }
        event.update(extra or {})

        request = urllib.request.Request(
            endpoint,
            data=json.dumps(event).encode(),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        try:
            with urllib.request.urlopen(request, timeout=self.timeout) as response:  # nosec B310
                status = response.status
                if status not in (200, 201, 202):
                    _log.warning("VictorOpsAlerter: unexpected status %s", status)
        except urllib.error.URLError as err:
            _log.warning("VictorOpsAlerter: request failed: %s", err)
353
+
354
+
355
@dataclass
class IncidentIOAlerter:
    """Alert sink for Incident.io (ALT-032).

    Each :meth:`send` call POSTs a new incident to the Incident.io REST API.

    Args:
        api_key: Incident.io API key. **Never logged** (excluded from ``repr``).
        timeout: HTTP timeout in seconds.
    """

    api_key: str = field(repr=False)
    timeout: int = 10

    # SpanForge severity -> Incident.io severity name; unknown -> "major".
    _SEVERITY_MAP: dict[str, str] = field(
        init=False,
        repr=False,
        default_factory=lambda: {
            "info": "minor",
            "warning": "major",
            "high": "major",
            "critical": "critical",
        },
    )

    # Incidents endpoint. Declared as a regular dataclass field, so it is also
    # accepted by the generated ``__init__``.
    _URL: str = "https://api.incident.io/v1/incidents"

    def send(
        self,
        title: str,
        message: str,
        severity: str = "warning",
        extra: dict[str, Any] | None = None,
    ) -> None:
        """Create an Incident.io incident.

        Each item of *extra* becomes one stringified custom-field entry.
        Transport errors and unexpected statuses are logged, not raised.
        """
        endpoint = self._URL
        _validate_http_url(endpoint)

        incident: dict[str, Any] = {
            "name": title,
            "summary": message,
            "severity": {"name": self._SEVERITY_MAP.get(severity.lower(), "major")},
            "visibility": "public",
        }
        entries = [
            {"custom_field": {"name": str(key)}, "values": [{"value_text": str(value)}]}
            for key, value in (extra or {}).items()
        ]
        if entries:
            incident["custom_field_entries"] = entries

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }
        request = urllib.request.Request(
            endpoint,
            data=json.dumps(incident).encode(),
            headers=headers,
            method="POST",
        )
        try:
            with urllib.request.urlopen(request, timeout=self.timeout) as response:  # nosec B310
                status = response.status
                if status not in (200, 201, 202):
                    _log.warning("IncidentIOAlerter: unexpected status %s", status)
        except urllib.error.URLError as err:
            _log.warning("IncidentIOAlerter: request failed: %s", err)
419
+
420
+
421
@dataclass
class SMSAlerter:
    """Twilio SMS sink (ALT-033). Enterprise tier only.

    Sends a 160-character-limited SMS via the Twilio REST API.

    Args:
        account_sid: Twilio Account SID.
        auth_token: Twilio Auth Token. **Never logged** (excluded from ``repr``).
        from_number: Twilio phone number (E.164 format, e.g. ``"+15005550006"``).
        to_numbers: List of recipient phone numbers (E.164 format).
        timeout: HTTP timeout in seconds (applied per recipient).
    """

    account_sid: str
    auth_token: str = field(repr=False)
    from_number: str
    to_numbers: list[str] = field(default_factory=list)
    timeout: int = 10

    def send(
        self,
        title: str,
        message: str,
        severity: str = "warning",
        extra: dict[str, Any] | None = None,
    ) -> None:
        """Send one SMS per configured recipient.

        The text is ``"[SEVERITY] title: message"`` truncated to 160
        characters. *extra* is accepted for interface parity with the other
        sinks and is not used. A failure for one recipient is logged at
        WARNING level and does not stop delivery to the remaining ones.
        """
        if not self.to_numbers:
            _log.warning("SMSAlerter: no recipients configured, skipping")
            return

        # Function-scope import: the module does not import base64 at top level.
        import base64

        body = f"[{severity.upper()}] {title}: {message}"[:160]
        url = f"https://api.twilio.com/2010-04-01/Accounts/{self.account_sid}/Messages.json"
        _validate_http_url(url)

        # Basic auth (account_sid:auth_token) and headers are identical for
        # every recipient, so build them once instead of once per message.
        b64 = base64.b64encode(f"{self.account_sid}:{self.auth_token}".encode()).decode()
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
            "Authorization": f"Basic {b64}",
        }

        for to_number in self.to_numbers:
            form_data = urllib.parse.urlencode(
                {"From": self.from_number, "To": to_number, "Body": body},
            ).encode()
            req = urllib.request.Request(url, data=form_data, headers=headers, method="POST")
            try:
                with urllib.request.urlopen(req, timeout=self.timeout) as resp:  # nosec B310
                    if resp.status not in (200, 201):
                        _log.warning(
                            "SMSAlerter: unexpected status %s for %s",
                            resp.status,
                            to_number,
                        )
            except urllib.error.URLError as exc:
                _log.warning("SMSAlerter: request failed for %s: %s", to_number, exc)
484
+
485
+
486
@dataclass
class TeamsAdaptiveCardAlerter:
    """Microsoft Teams sink that posts an Adaptive Card (ALT-035).

    The card has a severity-coloured title, the message text, an optional
    fact table built from the *extra* payload, and Acknowledge / Silence
    action buttons.

    Args:
        webhook_url: Teams channel Incoming Webhook URL.
        timeout: HTTP timeout in seconds.
    """

    webhook_url: str
    timeout: int = 10

    # SpanForge severity -> Adaptive Card text colour; unknown -> "Warning".
    _COLOUR_MAP: dict[str, str] = field(
        init=False,
        repr=False,
        default_factory=lambda: {
            "info": "Good",
            "warning": "Warning",
            "high": "Warning",
            "critical": "Attention",
        },
    )

    def send(
        self,
        title: str,
        message: str,
        severity: str = "warning",
        extra: dict[str, Any] | None = None,
    ) -> None:
        """POST an Adaptive Card to the Teams webhook.

        Transport errors and unexpected statuses are logged, not raised.
        """
        endpoint = self.webhook_url
        _validate_http_url(endpoint)

        heading = {
            "type": "TextBlock",
            "text": title,
            "weight": "Bolder",
            "size": "Medium",
            "color": self._COLOUR_MAP.get(severity.lower(), "Warning"),
        }
        card_body: list[dict[str, Any]] = [
            heading,
            {"type": "TextBlock", "text": message, "wrap": True},
        ]
        # The fact table is only added when there is something to show.
        facts = [{"title": str(key), "value": str(value)} for key, value in (extra or {}).items()]
        if facts:
            card_body.append({"type": "FactSet", "facts": facts})

        actions = [
            {"type": "Action.Submit", "title": label, "data": {"action": verb}}
            for label, verb in (("Acknowledge", "acknowledge"), ("Silence", "silence"))
        ]
        card = {
            "$schema": "http://adaptivecards.io/schemas/adaptive-card.json",
            "type": "AdaptiveCard",
            "version": "1.3",
            "body": card_body,
            "actions": actions,
        }
        envelope = {
            "type": "message",
            "attachments": [
                {
                    "contentType": "application/vnd.microsoft.card.adaptive",
                    "content": card,
                },
            ],
        }

        request = urllib.request.Request(
            endpoint,
            data=json.dumps(envelope).encode(),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        try:
            with urllib.request.urlopen(request, timeout=self.timeout) as response:  # nosec B310
                status = response.status
                if status not in (200, 202):
                    _log.warning("TeamsAdaptiveCardAlerter: unexpected status %s", status)
        except urllib.error.URLError as err:
            _log.warning("TeamsAdaptiveCardAlerter: request failed: %s", err)
579
+
580
+
581
+ # ---------------------------------------------------------------------------
582
+ # Sink wrapper (circuit breaker per sink)
583
+ # ---------------------------------------------------------------------------
584
+
585
#: A type alias for any sink that supports a ``send()`` method. The trailing
#: ``Any`` keeps the alias open to sinks defined outside this module.
_Alerter = Union[
    WebhookAlerter, OpsGenieAlerter, VictorOpsAlerter,
    IncidentIOAlerter, SMSAlerter, TeamsAdaptiveCardAlerter,
    Any,
]
595
+
596
+
597
@dataclass
class _SinkWrapper:
    """Wraps a sink instance with its own circuit breaker and a name.

    Attributes:
        alerter: The wrapped sink; expected to expose ``send(title, message,
            severity=..., extra=...)`` (``extra`` is optional for older sinks).
        name: Label used in log messages.
        cb: Circuit breaker guarding this sink only.
    """

    alerter: _Alerter
    name: str
    cb: _CircuitBreaker = field(default_factory=_CircuitBreaker)

    def dispatch(
        self,
        title: str,
        message: str,
        severity: str,
        extra: dict[str, Any] | None = None,
    ) -> bool:
        """Send alert through the wrapped alerter, updating the circuit breaker.

        Args:
            title: Alert title.
            message: Alert body text.
            severity: Severity label passed through to the sink.
            extra: Optional additional fields for sinks that accept them.

        Returns:
            ``True`` if the alert was sent successfully. ``False`` if the
            circuit is open, the wrapped object has no callable ``send``, or
            the sink raised.
        """
        if self.cb.is_open():
            _log.debug("_SinkWrapper[%s]: circuit open, skipping", self.name)
            return False

        send = getattr(self.alerter, "send", None)
        if not callable(send):
            # Nothing can be delivered through this object. Reporting success
            # here would make a misconfigured sink look healthy.
            self.cb.record_failure()
            _log.warning(
                "_SinkWrapper[%s]: %s has no callable send(); alert not delivered",
                self.name,
                type(self.alerter).__name__,
            )
            return False

        try:
            try:
                send(title, message, severity=severity, extra=extra)
            except TypeError:
                # Older sinks (from alerts.py) don't accept extra kwarg
                send(title, message, severity=severity)
        except Exception:
            self.cb.record_failure()
            _log.exception("_SinkWrapper[%s]: dispatch error", self.name)
            return False
        else:
            self.cb.record_success()
            return True
634
+
635
+
636
+ # ---------------------------------------------------------------------------
637
+ # Queue item
638
+ # ---------------------------------------------------------------------------
639
+
640
+
641
@dataclass
class _QueueItem:
    """One alert travelling through the async dispatch queue."""

    # Identity and routing.
    alert_id: str
    topic: str
    project_id: str = field(init=False, repr=False, compare=False, default="") if False else None  # placeholder removed below
652
+
653
+
654
+ # ---------------------------------------------------------------------------
655
+ # SFAlertClient
656
+ # ---------------------------------------------------------------------------
657
+
658
+
659
+ class SFAlertClient(SFServiceClient):
660
+ """SpanForge sf-alert Alert Routing Service client.
661
+
662
+ Topic-based publish/subscribe model with deduplication, escalation policy,
663
+ per-sink circuit breakers, per-project rate limiting, and audit logging.
664
+
665
+ All operations are **thread-safe**.
666
+
667
+ Args:
668
+ config: :class:`~spanforge.sdk._base.SFClientConfig` loaded
669
+ from env or via :func:`~spanforge.sdk.configure`.
670
+ sinks: Optional list of sink instances pre-wired at
671
+ construction time. Sinks are also auto-discovered
672
+ from ``SPANFORGE_ALERT_*`` environment variables.
673
+ dedup_window_seconds: Client-wide deduplication window (default: 300 s).
674
+ rate_limit_per_minute: Per-project alert rate limit (default: 60).
675
+ escalation_wait_seconds: Seconds before a CRITICAL alert auto-escalates
676
+ (default: 900 s = 15 min).
677
+ escalation_sinks: Sink names to route escalated alerts to. If
678
+ empty, all sinks are used for escalation.
679
+
680
+ Environment variables
681
+ ---------------------
682
+ .. code-block:: text
683
+
684
+ SPANFORGE_ALERT_SLACK_WEBHOOK → SlackAlerter (from spanforge.alerts)
685
+ SPANFORGE_ALERT_TEAMS_WEBHOOK → TeamsAdaptiveCardAlerter
686
+ SPANFORGE_ALERT_PAGERDUTY_KEY → PagerDutyAlerter (from spanforge.alerts)
687
+ SPANFORGE_ALERT_OPSGENIE_KEY → OpsGenieAlerter
688
+ SPANFORGE_ALERT_OPSGENIE_REGION → OpsGenieAlerter region (us|eu)
689
+ SPANFORGE_ALERT_VICTOROPS_URL → VictorOpsAlerter
690
+ SPANFORGE_ALERT_WEBHOOK_URL → WebhookAlerter
691
+ SPANFORGE_ALERT_WEBHOOK_SECRET → WebhookAlerter HMAC secret
692
+ SPANFORGE_ALERT_DEDUP_SECONDS → dedup_window_seconds (default: 300)
693
+ SPANFORGE_ALERT_RATE_LIMIT → rate_limit_per_minute (default: 60)
694
+ SPANFORGE_ALERT_ESCALATION_WAIT → escalation_wait_seconds (default: 900)
695
+ """
696
+
697
def __init__(
    self,
    config: SFClientConfig,
    sinks: list[_Alerter] | None = None,
    *,
    dedup_window_seconds: float | None = None,
    rate_limit_per_minute: int | None = None,
    escalation_wait_seconds: float | None = None,
    escalation_sinks: list[str] | None = None,
) -> None:
    """Build the client, wire its sinks and start the dispatch worker.

    For each tunable, an explicit keyword argument wins over the matching
    ``SPANFORGE_ALERT_*`` environment variable, which wins over the module
    default. Sinks discovered from the environment are registered before
    the ones passed in *sinks*.
    """
    super().__init__(config, "alert")
    self._lock = threading.RLock()

    def _setting(explicit: Any, env_name: str, cast: Any, fallback: Any) -> Any:
        # Precedence: explicit argument > non-empty env var > default.
        if explicit is not None:
            return explicit
        raw = os.environ.get(env_name, "")
        return cast(raw) if raw else fallback

    # Configuration
    self._dedup_window: float = _setting(
        dedup_window_seconds, "SPANFORGE_ALERT_DEDUP_SECONDS", float, _DEDUP_WINDOW_DEFAULT
    )
    self._rate_limit: int = _setting(
        rate_limit_per_minute, "SPANFORGE_ALERT_RATE_LIMIT", int, _RATE_LIMIT_PER_MINUTE
    )
    self._escalation_wait: float = _setting(
        escalation_wait_seconds,
        "SPANFORGE_ALERT_ESCALATION_WAIT",
        float,
        _ESCALATION_WAIT_DEFAULT,
    )
    self._escalation_sink_names: list[str] = escalation_sinks or []

    # Topic registry — pre-populate known topics. Topics ending in one of
    # these suffixes default to "critical", all others to "warning".
    critical_suffixes = (".red", ".critical", ".failed")
    self._topic_registry: dict[str, TopicRegistration] = {
        topic: TopicRegistration(
            topic=topic,
            description=f"Built-in topic: {topic}",
            default_severity="critical" if topic.endswith(critical_suffixes) else "warning",
        )
        for topic in KNOWN_TOPICS
    }

    # Sinks — env-var discovery first, then constructor-supplied ones,
    # each named after its lower-cased class name.
    self._sinks: list[_SinkWrapper] = []
    self._build_sinks_from_env()
    self._sinks.extend(
        _SinkWrapper(alerter=sink, name=type(sink).__name__.lower()) for sink in sinks or []
    )

    # Rate limiter (per project_id)
    self._rate_limiter: _SlidingWindowRateLimiter = _SlidingWindowRateLimiter(
        limit=self._rate_limit,
        window_seconds=60.0,
    )

    # Deduplication state
    self._dedup: dict[tuple[str, str], float] = {}

    # Alert grouping buffer and its flush timers
    self._group_buffers: dict[tuple[str, str], list[_QueueItem]] = {}
    self._group_timers: dict[tuple[str, str], threading.Timer] = {}

    # Maintenance windows
    self._maintenance_windows: list[MaintenanceWindow] = []

    # Escalation tracking
    self._escalation_timers: dict[str, threading.Timer] = {}
    self._pending_escalation: dict[str, _QueueItem] = {}

    # Alert history (bounded)
    self._history: list[AlertRecord] = []

    # Session stats
    self._publish_count: int = 0
    self._suppress_count: int = 0

    # Async dispatch queue + worker thread; started last so the worker
    # only ever sees fully initialised state.
    self._queue: queue.Queue[_QueueItem | None] = queue.Queue(maxsize=_QUEUE_MAX)
    self._worker = threading.Thread(
        target=self._worker_loop,
        name="sf-alert-worker",
        daemon=True,
    )
    self._worker.start()
783
+
784
+ # ------------------------------------------------------------------
785
+ # SFServiceClient abstract method
786
+ # ------------------------------------------------------------------
787
+
788
+ # ------------------------------------------------------------------
789
+ # Env-var sink discovery
790
+ # ------------------------------------------------------------------
791
+
792
def _build_sinks_from_env(self) -> None:
    """Auto-discover sinks from ``SPANFORGE_ALERT_*`` environment variables.

    Every sink is created on a best-effort basis: if constructing one
    fails, a warning is logged and discovery continues with the remaining
    sinks. Previously only the Slack and PagerDuty sinks were guarded, so
    a bad value for any other sink aborted client construction.
    """

    def _register(sink_name: str, label: str, factory: Any) -> None:
        # Build one sink via *factory* and append it; never raise.
        try:
            self._sinks.append(_SinkWrapper(alerter=factory(), name=sink_name))
        except Exception:
            # The exception text is deliberately not logged: webhook URLs
            # and integration keys are secrets and may appear in it.
            _log.warning("Failed to create %s from env", label)

    env = os.environ.get

    # Slack (imported lazily from spanforge.alerts)
    slack_url = env("SPANFORGE_ALERT_SLACK_WEBHOOK", "")
    if slack_url:

        def _make_slack() -> Any:
            from spanforge.alerts import SlackAlerter

            return SlackAlerter(webhook_url=slack_url)

        _register("slack", "SlackAlerter", _make_slack)

    # Teams
    teams_url = env("SPANFORGE_ALERT_TEAMS_WEBHOOK", "")
    if teams_url:
        _register(
            "teams",
            "TeamsAdaptiveCardAlerter",
            lambda: TeamsAdaptiveCardAlerter(webhook_url=teams_url),
        )

    # PagerDuty (imported lazily from spanforge.alerts)
    pd_key = env("SPANFORGE_ALERT_PAGERDUTY_KEY", "")
    if pd_key:

        def _make_pagerduty() -> Any:
            from spanforge.alerts import PagerDutyAlerter

            return PagerDutyAlerter(integration_key=pd_key)

        _register("pagerduty", "PagerDutyAlerter", _make_pagerduty)

    # OpsGenie
    og_key = env("SPANFORGE_ALERT_OPSGENIE_KEY", "")
    if og_key:
        region = env("SPANFORGE_ALERT_OPSGENIE_REGION", "us")
        _register(
            "opsgenie",
            "OpsGenieAlerter",
            lambda: OpsGenieAlerter(api_key=og_key, region=region),
        )

    # VictorOps
    vo_url = env("SPANFORGE_ALERT_VICTOROPS_URL", "")
    if vo_url:
        _register(
            "victorops",
            "VictorOpsAlerter",
            lambda: VictorOpsAlerter(rest_endpoint_url=vo_url),
        )

    # Generic webhook
    wh_url = env("SPANFORGE_ALERT_WEBHOOK_URL", "")
    if wh_url:
        wh_secret = env("SPANFORGE_ALERT_WEBHOOK_SECRET", "")
        _register(
            "webhook",
            "WebhookAlerter",
            lambda: WebhookAlerter(url=wh_url, secret=wh_secret),
        )
853
+
854
+ # ------------------------------------------------------------------
855
+ # Public API — topic registry (ALT-003)
856
+ # ------------------------------------------------------------------
857
+
858
def register_topic(
    self,
    topic: str,
    description: str,
    default_severity: str = "warning",
    *,
    runbook_url: str | None = None,
    dedup_window_seconds: float | None = None,
) -> None:
    """Add (or overwrite) an entry in the topic registry.

    Args:
        topic: Dot-separated topic string.
        description: Human-readable purpose.
        default_severity: Severity used when ``publish()`` is called
            without one (``"info"``, ``"warning"``, ``"high"``, or
            ``"critical"``).
        runbook_url: Optional URL to the runbook for this topic.
        dedup_window_seconds: Per-topic dedup window override.
    """
    fields = {
        "topic": topic,
        "description": description,
        "default_severity": default_severity,
        "runbook_url": runbook_url,
        "dedup_window_seconds": dedup_window_seconds,
    }
    entry = TopicRegistration(**fields)
    # An existing registration for the same topic is replaced.
    with self._lock:
        self._topic_registry[topic] = entry
886
+
887
+ # ------------------------------------------------------------------
888
+ # Public API — maintenance windows (ALT-012)
889
+ # ------------------------------------------------------------------
890
+
891
def set_maintenance_window(
    self,
    project_id: str,
    start: datetime,
    end: datetime,
) -> None:
    """Register a maintenance window.

    During the window all alerts for *project_id* are suppressed.

    Naive datetimes are interpreted as UTC and stored timezone-aware.
    The window is later compared against ``datetime.now(timezone.utc)``;
    a naive value would make that comparison raise ``TypeError`` inside
    ``publish()`` and ``get_status()``.

    Args:
        project_id: Project whose alerts should be suppressed.
        start: Window start (UTC-aware recommended; naive means UTC).
        end: Window end (UTC-aware recommended; naive means UTC).
    """
    if start.utcoffset() is None:
        start = start.replace(tzinfo=timezone.utc)
    if end.utcoffset() is None:
        end = end.replace(tzinfo=timezone.utc)

    mw = MaintenanceWindow(project_id=project_id, start=start, end=end)
    with self._lock:
        self._maintenance_windows.append(mw)
    # Best-effort audit trail of the configuration change.
    self._append_audit_record(
        {
            "event": "maintenance_window_set",
            "project_id": project_id,
            "start": start.isoformat(),
            "end": end.isoformat(),
        },
    )
917
+
918
def remove_maintenance_windows(self, project_id: str) -> int:
    """Drop every maintenance window registered for *project_id*.

    Returns:
        How many windows were removed.
    """
    with self._lock:
        kept = [
            window
            for window in self._maintenance_windows
            if window.project_id != project_id
        ]
        removed = len(self._maintenance_windows) - len(kept)
        self._maintenance_windows = kept
        return removed
929
+
930
+ # ------------------------------------------------------------------
931
+ # Public API — publish (ALT-001, ALT-050)
932
+ # ------------------------------------------------------------------
933
+
934
def publish(
    self,
    topic: str,
    payload: dict[str, Any],
    *,
    severity: str | None = None,
    project_id: str | None = None,
) -> PublishResult:
    """Publish an alert to the given *topic*.

    Steps:
    1. Resolve topic registration (warn on unknown topics).
    2. Resolve effective severity.
    3. Check maintenance window suppression.
    4. Check per-project rate limit.
    5. Check deduplication window.
    6. Enqueue for background dispatch.

    Args:
        topic: Dot-separated topic identifier.
        payload: Arbitrary payload dict. **Never include raw secrets.**
        severity: Explicit severity override. Defaults to the topic's
            ``default_severity``.
        project_id: Project scope. Defaults to ``config.project_id``.

    Returns:
        :class:`~spanforge.sdk._types.PublishResult` with ``alert_id``,
        ``routed_to``, and ``suppressed``. ``routed_to`` is always empty
        here because dispatch happens later on the worker thread.

    Raises:
        :exc:`~spanforge.sdk._exceptions.SFAlertRateLimitedError` when the
        per-project rate limit is exceeded **and** the client is in
        strict mode (``local_fallback_enabled=False``).
    """
    pid = project_id or self._config.project_id or ""
    alert_id = str(uuid.uuid4())

    with self._lock:
        self._publish_count += 1

        # Topic lookup
        reg = self._topic_registry.get(topic)
        if reg is None:
            _log.warning(
                "sf-alert: unknown topic %r — routing to catch-all. "
                "Register custom topics with register_topic().",
                topic,
            )
        resolved_severity = severity or (reg.default_severity if reg else "warning")
        runbook_url = reg.runbook_url if reg else None
        per_topic_dedup = reg.dedup_window_seconds if reg else None
        effective_dedup = per_topic_dedup if per_topic_dedup is not None else self._dedup_window

        # Maintenance window check
        if self._is_maintenance_window(pid):
            self._suppress_count += 1
            _log.debug(
                "sf-alert: suppressed %r — maintenance window for project %r",
                topic,
                pid,
            )
            return PublishResult(alert_id=alert_id, routed_to=[], suppressed=True)

        # Rate limit check
        if not self._rate_limiter.record(pid or "__global__"):
            self._suppress_count += 1
            _log.warning(
                "sf-alert: rate limit %d/min exceeded for project %r; alert suppressed",
                self._rate_limit,
                pid,
            )
            if not self._config.local_fallback_enabled:
                raise SFAlertRateLimitedError(pid, self._rate_limit)
            return PublishResult(alert_id=alert_id, routed_to=[], suppressed=True)

        # Deduplication check.
        # A missing entry must mean "never seen". The reference point of
        # time.monotonic() is undefined and can be smaller than the dedup
        # window (e.g. shortly after boot), so defaulting the last
        # timestamp to 0.0 would suppress the very first alert.
        dedup_key = (topic, pid)
        now = time.monotonic()
        last_ts = self._dedup.get(dedup_key)
        if last_ts is not None and now - last_ts < effective_dedup:
            self._suppress_count += 1
            _log.debug("sf-alert: suppressed %r (dedup window %.0fs)", topic, effective_dedup)
            return PublishResult(alert_id=alert_id, routed_to=[], suppressed=True)
        self._dedup[dedup_key] = now

    # Build summary message
    title = f"[{resolved_severity.upper()}] {topic}"
    message = _build_message(topic, payload, runbook_url)
    item = _QueueItem(
        alert_id=alert_id,
        topic=topic,
        title=title,
        message=message,
        severity=resolved_severity,
        project_id=pid,
        payload=payload,
        runbook_url=runbook_url,
    )

    # Alert grouping check (ALT-011)
    # The FIRST alert in a group is dispatched immediately.
    # Subsequent alerts sharing the same (topic_prefix, project_id) within
    # _GROUP_WINDOW_SECS are coalesced and flushed as one notification.
    group_key = (_topic_prefix(topic), pid)
    with self._lock:
        if group_key in self._group_buffers:
            # Add to existing group buffer; dispatch deferred until flush
            self._group_buffers[group_key].append(item)
            _log.debug("sf-alert: grouped %r into existing group %r", topic, group_key)
            return PublishResult(alert_id=alert_id, routed_to=[], suppressed=False)
        # Start a new group window; the first item is enqueued immediately
        self._group_buffers[group_key] = []  # buffer for SUBSEQUENT items only
        timer = threading.Timer(
            _GROUP_WINDOW_SECS,
            self._flush_group,
            args=(group_key,),
        )
        timer.daemon = True
        self._group_timers[group_key] = timer
        timer.start()

    # Enqueue the first item for immediate dispatch
    try:
        self._queue.put_nowait(item)
    except queue.Full:
        # Make room by discarding the oldest queued item.
        try:
            self._queue.get_nowait()
        except queue.Empty:
            pass
        else:
            # Every successful get() needs a matching task_done(),
            # otherwise Queue.join() would never return.
            self._queue.task_done()
        try:
            self._queue.put_nowait(item)
        except queue.Full:
            pass
        _log.warning("sf-alert: alert queue full (%d items), oldest item dropped", _QUEUE_MAX)

    return PublishResult(alert_id=alert_id, routed_to=[], suppressed=False)
1069
+
1070
+ # ------------------------------------------------------------------
1071
+ # Public API — acknowledge (ALT-020)
1072
+ # ------------------------------------------------------------------
1073
+
1074
def acknowledge(self, alert_id: str) -> bool:
    """Acknowledge a CRITICAL alert and stop its pending escalation.

    Args:
        alert_id: The UUID returned by :meth:`publish`.

    Returns:
        ``True`` if a pending escalation timer was found and cancelled,
        ``False`` otherwise.
    """
    with self._lock:
        pending_timer = self._escalation_timers.pop(alert_id, None)
        self._pending_escalation.pop(alert_id, None)

    if pending_timer is None:
        return False

    pending_timer.cancel()
    _log.debug("sf-alert: escalation cancelled for alert %s", alert_id)
    # Reflect the acknowledgement in the in-memory history.
    self._update_history_status(alert_id, "acknowledged")
    return True
1093
+
1094
+ # ------------------------------------------------------------------
1095
+ # Public API — alert history (ALT-042)
1096
+ # ------------------------------------------------------------------
1097
+
1098
def get_alert_history(
    self,
    *,
    project_id: str | None = None,
    topic: str | None = None,
    from_dt: datetime | None = None,
    to_dt: datetime | None = None,
    status: str | None = None,
    limit: int = 100,
) -> list[AlertRecord]:
    """Query the in-memory alert history.

    Time bounds are compared as real datetimes, not as ISO-8601 strings,
    so a bound given with a non-UTC offset is handled correctly. Naive
    bounds are interpreted as UTC.

    Args:
        project_id: Filter by project.
        topic: Filter by topic.
        from_dt: Include alerts at or after this datetime.
        to_dt: Include alerts at or before this datetime.
        status: Filter by status: ``"open"``, ``"acknowledged"``,
            or ``"resolved"``.
        limit: Maximum number of results (default: 100). Negative
            values are treated as ``0``.

    Returns:
        Most-recent-first list of matching :class:`~spanforge.sdk._types.AlertRecord`.
    """

    def _as_utc_aware(value: datetime) -> datetime:
        # Naive values are taken to be UTC so that they are comparable.
        if value.utcoffset() is None:
            return value.replace(tzinfo=timezone.utc)
        return value

    def _when(record: AlertRecord) -> datetime:
        # Record timestamps are ISO-8601 strings.
        return _as_utc_aware(datetime.fromisoformat(record.timestamp))

    # Work on a snapshot so filtering happens outside the lock.
    with self._lock:
        results = list(self._history)

    # Filter
    if project_id:
        results = [r for r in results if r.project_id == project_id]
    if topic:
        results = [r for r in results if r.topic == topic]
    if status:
        results = [r for r in results if r.status == status]
    if from_dt is not None:
        lower = _as_utc_aware(from_dt)
        results = [r for r in results if _when(r) >= lower]
    if to_dt is not None:
        upper = _as_utc_aware(to_dt)
        results = [r for r in results if _when(r) <= upper]

    # Most recent first
    results.sort(key=_when, reverse=True)
    # Clamp so a negative limit cannot silently trim from the tail.
    return results[: max(limit, 0)]
1142
+
1143
+ # ------------------------------------------------------------------
1144
+ # publish_async (F-10)
1145
+ # ------------------------------------------------------------------
1146
+
1147
async def publish_async(
    self,
    topic: str,
    payload: dict[str, Any],
    *,
    severity: str | None = None,
    project_id: str | None = None,
) -> PublishResult:
    """Async variant of :meth:`publish`.

    Runs :meth:`publish` in the running loop's default executor so the
    event loop is not blocked by rate-limit checks or deduplication
    lookups.

    Args:
        topic: Alert topic string.
        payload: Alert payload dict.
        severity: Optional severity override.
        project_id: Optional project scope override.

    Returns:
        :class:`~spanforge.sdk._types.PublishResult`.
    """
    import asyncio
    import functools

    # Inside a coroutine the running loop is the one to use;
    # get_running_loop() is the documented way to obtain it.
    loop = asyncio.get_running_loop()
    call = functools.partial(
        self.publish,
        topic,
        payload,
        severity=severity,
        project_id=project_id,
    )
    return await loop.run_in_executor(None, call)
1183
+
1184
+ # ------------------------------------------------------------------
1185
+ # Public API — status / health
1186
+ # ------------------------------------------------------------------
1187
+
1188
def get_status(self) -> AlertStatusInfo:
    """Snapshot the client's health and per-session counters."""
    with self._lock:
        published = self._publish_count
        suppressed = self._suppress_count
        current = datetime.now(timezone.utc)
        # Count only windows that are active at this instant.
        active_windows = 0
        for window in self._maintenance_windows:
            if window.start <= current <= window.end:
                active_windows += 1
        topic_total = len(self._topic_registry)

    depth = self._queue.qsize()
    # Healthy means that no sink's circuit breaker is open.
    sinks_ok = not any(wrapper.cb.is_open() for wrapper in self._sinks)
    return AlertStatusInfo(
        status="ok" if sinks_ok else "degraded",
        publish_count=published,
        suppress_count=suppressed,
        queue_depth=depth,
        registered_topics=topic_total,
        active_maintenance_windows=active_windows,
        healthy=sinks_ok,
    )
1209
+
1210
@property
def healthy(self) -> bool:
    """Whether every sink's circuit breaker is closed (``True`` with no sinks)."""
    return not any(wrapper.cb.is_open() for wrapper in self._sinks)
1214
+
1215
+ # ------------------------------------------------------------------
1216
+ # Public API — sink management
1217
+ # ------------------------------------------------------------------
1218
+
1219
def add_sink(self, alerter: _Alerter, name: str | None = None) -> None:
    """Register an additional sink while the client is running.

    Args:
        alerter: Sink instance with a ``send()`` method.
        name: Optional display name; when omitted, the lower-cased class
            name of *alerter* is used.
    """
    if name:
        label = name
    else:
        label = type(alerter).__name__.lower()
    with self._lock:
        wrapper = _SinkWrapper(alerter=alerter, name=label)
        self._sinks.append(wrapper)
1229
+
1230
+ # ------------------------------------------------------------------
1231
+ # Graceful shutdown
1232
+ # ------------------------------------------------------------------
1233
+
1234
def shutdown(self, timeout: float = 5.0) -> None:
    """Drain the queue and stop the worker thread.

    The stop sentinel is enqueued with a blocking ``put`` bounded by
    *timeout*. A non-blocking put on a full queue would lose the
    sentinel, so the worker would never stop. *timeout* bounds the whole
    call: time spent waiting for queue space is subtracted from the time
    left for joining the worker.

    Args:
        timeout: Seconds to wait for the worker to finish (default: 5.0).
            ``None`` waits indefinitely.
    """
    deadline = None if timeout is None else time.monotonic() + timeout

    def _remaining() -> float | None:
        # Time left in the overall budget; never negative.
        if deadline is None:
            return None
        return max(0.0, deadline - time.monotonic())

    # Cancel all escalation timers and forget the alerts awaiting them.
    with self._lock:
        timers = list(self._escalation_timers.values())
        self._escalation_timers.clear()
        self._pending_escalation.clear()
    for t in timers:
        t.cancel()

    # Flush all groups so buffered alerts still get dispatched.
    with self._lock:
        group_keys = list(self._group_buffers)
    for gk in group_keys:
        self._flush_group(gk)

    # Signal worker to stop
    try:
        self._queue.put(None, timeout=_remaining())
    except queue.Full:
        _log.warning("sf-alert: queue still full at shutdown; stop signal not delivered")
    self._worker.join(timeout=_remaining())
1257
+
1258
+ # ------------------------------------------------------------------
1259
+ # Internal — group flushing
1260
+ # ------------------------------------------------------------------
1261
+
1262
def _flush_group(self, group_key: tuple[str, str]) -> None:
    """Flush a group window: coalesce buffered secondary items and enqueue one dispatch task.

    The buffer and timer for *group_key* are removed. When nothing was
    buffered the call is a no-op, because the first alert of the group
    was already dispatched by ``publish()``.

    Args:
        group_key: ``(topic_prefix, project_id)`` identifying the group.
    """
    with self._lock:
        items = self._group_buffers.pop(group_key, [])
        timer = self._group_timers.pop(group_key, None)
    if timer is not None:
        timer.cancel()
    if not items:
        # No secondary alerts buffered; the first item was already dispatched
        return

    # Coalesce: use the first item as the representative
    first = items[0]
    if len(items) > 1:
        extra_topics = ", ".join(i.topic for i in items[1:])
        first = _QueueItem(
            alert_id=first.alert_id,
            topic=first.topic,
            title=first.title,
            message=f"{first.message}\n(+{len(items) - 1} grouped: {extra_topics})",
            # The coalesced alert carries the highest severity in the group.
            severity=max(
                (i.severity for i in items),
                key=lambda s: _SEVERITY_RANK.get(s, 0),
            ),
            project_id=first.project_id,
            payload=first.payload,
            runbook_url=first.runbook_url,
        )

    try:
        self._queue.put_nowait(first)
    except queue.Full:
        # Drop oldest
        try:
            self._queue.get_nowait()
        except queue.Empty:
            pass
        else:
            # Every successful get() needs a matching task_done(),
            # otherwise Queue.join() would never return.
            self._queue.task_done()
        try:
            self._queue.put_nowait(first)
        except queue.Full:
            pass
        _log.warning("sf-alert: alert queue full (%d items), oldest item dropped", _QUEUE_MAX)
1304
+
1305
+ # ------------------------------------------------------------------
1306
+ # Internal — worker loop (ALT-050)
1307
+ # ------------------------------------------------------------------
1308
+
1309
def _worker_loop(self) -> None:
    """Background thread: drain queue and dispatch to sinks.

    Runs until the ``None`` shutdown sentinel is received. Every item
    taken from the queue, the sentinel included, is marked done so that
    ``Queue.join()`` can return.
    """
    while True:
        try:
            item = self._queue.get(timeout=1.0)
        except queue.Empty:
            # The periodic wake-up keeps the loop responsive; just retry.
            continue
        try:
            if item is None:
                # Shutdown sentinel
                break
            self._dispatch(item)
        except Exception:
            # One failing alert must not kill the worker.
            _log.exception("sf-alert: unhandled error dispatching %r", item.topic)
        finally:
            # Also runs on ``break``, so the sentinel is accounted for.
            self._queue.task_done()
1325
+
1326
def _dispatch(self, item: _QueueItem) -> None:
    """Send *item* to the sinks, then record it in history and the audit log.

    Escalations go only to the configured escalation sinks when at least
    one of them is present; otherwise they go to every sink. A
    non-escalation alert of critical severity gets an escalation timer.
    """
    # Snapshot shared state so sinks are called without holding the lock.
    with self._lock:
        targets = list(self._sinks)
        escalation_names = list(self._escalation_sink_names)

    if item.is_escalation and escalation_names:
        preferred = [sink for sink in targets if sink.name in escalation_names]
        if preferred:
            targets = preferred

    extra: dict[str, Any] = {
        "alert_id": item.alert_id,
        "topic": item.topic,
        "project_id": item.project_id,
    }
    if item.runbook_url:
        extra["runbook_url"] = item.runbook_url

    # Names of the sinks that reported a successful delivery.
    delivered: list[str] = [
        sink.name
        for sink in targets
        if sink.dispatch(item.title, item.message, item.severity, extra)
    ]

    # Record in history, trimming to the newest _HISTORY_MAX entries.
    entry = AlertRecord(
        alert_id=item.alert_id,
        topic=item.topic,
        severity=item.severity,
        project_id=item.project_id,
        payload=item.payload,
        sinks_notified=delivered,
        suppressed=False,
        status="open",
        timestamp=datetime.now(timezone.utc).isoformat(),
    )
    with self._lock:
        self._history.append(entry)
        if len(self._history) > _HISTORY_MAX:
            self._history = self._history[-_HISTORY_MAX:]

    # Audit log
    audit_entry = {
        "event": "alert.published",
        "alert_id": item.alert_id,
        "topic": item.topic,
        "severity": item.severity,
        "project_id": item.project_id,
        "sinks_notified": delivered,
        "suppressed": False,
    }
    self._append_audit_record(audit_entry)

    # Schedule escalation for CRITICAL (ALT-020); never re-escalate.
    if item.is_escalation:
        return
    if _SEVERITY_RANK.get(item.severity, 0) >= _SEVERITY_RANK["critical"]:
        self._schedule_escalation(item)
1383
+
1384
+ # ------------------------------------------------------------------
1385
+ # Internal — escalation (ALT-020)
1386
+ # ------------------------------------------------------------------
1387
+
1388
def _schedule_escalation(self, item: _QueueItem) -> None:
    """Arm a daemon timer that escalates *item* after ``_escalation_wait`` seconds."""
    alert_id = item.alert_id
    wait = self._escalation_wait

    escalation_timer = threading.Timer(wait, self._fire_escalation, args=(alert_id,))
    escalation_timer.daemon = True

    # Register before starting so acknowledge() can find and cancel it.
    with self._lock:
        self._escalation_timers[alert_id] = escalation_timer
        self._pending_escalation[alert_id] = item
    escalation_timer.start()

    _log.debug(
        "sf-alert: escalation scheduled for %s in %.0fs",
        alert_id,
        wait,
    )
1405
+
1406
def _fire_escalation(self, alert_id: str) -> None:
    """Timer callback: re-enqueue the pending alert flagged as an escalation.

    Does nothing when the alert is no longer pending.
    """
    with self._lock:
        original = self._pending_escalation.pop(alert_id, None)
        self._escalation_timers.pop(alert_id, None)
    if original is None:
        return

    wait = self._escalation_wait
    escalated = _QueueItem(
        alert_id=original.alert_id,
        topic=original.topic,
        title=f"[ESCALATED] {original.title}",
        message=f"[AUTO-ESCALATED after {wait:.0f}s]\n{original.message}",
        severity=original.severity,
        project_id=original.project_id,
        payload=original.payload,
        runbook_url=original.runbook_url,
        is_escalation=True,
    )
    _log.warning(
        "sf-alert: CRITICAL alert %r not acknowledged in %.0fs — escalating",
        alert_id,
        wait,
    )
    try:
        self._queue.put_nowait(escalated)
    except queue.Full:
        # Unlike publish(), nothing is evicted here; the escalation is lost.
        _log.warning("sf-alert: queue full during escalation; escalated alert dropped")
1433
+
1434
+ # ------------------------------------------------------------------
1435
+ # Internal — audit log (ALT-053)
1436
+ # ------------------------------------------------------------------
1437
+
1438
def _append_audit_record(self, record: dict[str, Any]) -> None:
    """Write *record* to the audit log under schema ``spanforge.alert.v1``.

    Best-effort: any failure, including a failed import of the audit
    client, is logged at debug level and otherwise ignored.
    """
    schema = "spanforge.alert.v1"
    try:
        # Imported at call time rather than at module import.
        from spanforge.sdk import sf_audit as audit_client

        audit_client.append(record, schema)
    except Exception:
        _log.debug("sf-alert: audit append skipped (sf_audit unavailable or error)")
1446
+
1447
+ # ------------------------------------------------------------------
1448
+ # Internal — helper predicates
1449
+ # ------------------------------------------------------------------
1450
+
1451
def _is_maintenance_window(self, project_id: str) -> bool:
    """Return ``True`` when *project_id* is currently in a maintenance window.

    Window bounds that are naive datetimes are interpreted as UTC.
    Comparing them directly against the aware "now" would raise
    ``TypeError``.

    Must be called with ``self._lock`` held **or** within a context that
    doesn't need the lock (the caller holds it).
    """

    def _aware(value: datetime) -> datetime:
        # Naive values are taken to be UTC so that they are comparable.
        if value.utcoffset() is None:
            return value.replace(tzinfo=timezone.utc)
        return value

    now = datetime.now(timezone.utc)
    return any(
        mw.project_id == project_id and _aware(mw.start) <= now <= _aware(mw.end)
        for mw in self._maintenance_windows
    )
1462
+
1463
def _update_history_status(self, alert_id: str, status: str) -> None:
    """Set *status* on the first history record matching *alert_id*.

    Best-effort: nothing happens when no record matches.
    """
    with self._lock:
        for position, existing in enumerate(self._history):
            if existing.alert_id != alert_id:
                continue
            # Records are replaced instead of mutated in place.
            self._history[position] = AlertRecord(
                alert_id=existing.alert_id,
                topic=existing.topic,
                severity=existing.severity,
                project_id=existing.project_id,
                payload=existing.payload,
                sinks_notified=existing.sinks_notified,
                suppressed=existing.suppressed,
                status=status,
                timestamp=existing.timestamp,
            )
            return
1481
+
1482
+ # ------------------------------------------------------------------
1483
+ # SFServiceClient — abstract requirement
1484
+ # ------------------------------------------------------------------
1485
+
1486
def _request(
    self,
    method: str,
    path: str,
    body: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Unsupported for this client: alert routing only pushes outbound.

    Raises:
        NotImplementedError: Always.
    """
    reason = "SFAlertClient does not expose a request interface"
    raise NotImplementedError(reason)
1494
+
1495
+
1496
+ # ---------------------------------------------------------------------------
1497
+ # Module-level helpers
1498
+ # ---------------------------------------------------------------------------
1499
+
1500
+
1501
def _topic_prefix(topic: str) -> str:
    """Strip the final dot-separated segment; a topic without dots is returned unchanged."""
    head, dot, _tail = topic.rpartition(".")
    if not dot:
        return topic
    return head
1505
+
1506
+
1507
def _build_message(topic: str, payload: dict[str, Any], runbook_url: str | None) -> str:
    """Render the alert body: topic line, one line per payload entry, optional runbook line."""
    header = f"Topic: {topic}"
    body = [f" {key}: {value}" for key, value in payload.items()]
    footer = [f"Runbook: {runbook_url}"] if runbook_url else []
    return "\n".join([header, *body, *footer])