nthlayer-workers 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. nthlayer_workers/__init__.py +5 -0
  2. nthlayer_workers/cli.py +234 -0
  3. nthlayer_workers/correlate/__init__.py +1 -0
  4. nthlayer_workers/correlate/cli.py +847 -0
  5. nthlayer_workers/correlate/config.py +111 -0
  6. nthlayer_workers/correlate/correlation/__init__.py +1 -0
  7. nthlayer_workers/correlate/correlation/changes.py +87 -0
  8. nthlayer_workers/correlate/correlation/dedup.py +62 -0
  9. nthlayer_workers/correlate/correlation/engine.py +244 -0
  10. nthlayer_workers/correlate/correlation/temporal.py +79 -0
  11. nthlayer_workers/correlate/correlation/topology.py +104 -0
  12. nthlayer_workers/correlate/ingestion/__init__.py +1 -0
  13. nthlayer_workers/correlate/ingestion/protocol.py +10 -0
  14. nthlayer_workers/correlate/ingestion/severity.py +18 -0
  15. nthlayer_workers/correlate/ingestion/webhook.py +197 -0
  16. nthlayer_workers/correlate/notifications.py +85 -0
  17. nthlayer_workers/correlate/prometheus.py +234 -0
  18. nthlayer_workers/correlate/reasoning.py +375 -0
  19. nthlayer_workers/correlate/session.py +189 -0
  20. nthlayer_workers/correlate/snapshot/__init__.py +1 -0
  21. nthlayer_workers/correlate/snapshot/generator.py +170 -0
  22. nthlayer_workers/correlate/snapshot/model.py +177 -0
  23. nthlayer_workers/correlate/snapshot/token.py +14 -0
  24. nthlayer_workers/correlate/state.py +88 -0
  25. nthlayer_workers/correlate/store/__init__.py +5 -0
  26. nthlayer_workers/correlate/store/protocol.py +48 -0
  27. nthlayer_workers/correlate/store/sqlite.py +443 -0
  28. nthlayer_workers/correlate/summary.py +180 -0
  29. nthlayer_workers/correlate/traces/__init__.py +1 -0
  30. nthlayer_workers/correlate/traces/protocol.py +120 -0
  31. nthlayer_workers/correlate/traces/tempo.py +667 -0
  32. nthlayer_workers/correlate/traces/topology.py +39 -0
  33. nthlayer_workers/correlate/types.py +77 -0
  34. nthlayer_workers/correlate/worker.py +630 -0
  35. nthlayer_workers/learn/__init__.py +5 -0
  36. nthlayer_workers/learn/__main__.py +5 -0
  37. nthlayer_workers/learn/cli.py +164 -0
  38. nthlayer_workers/learn/retrospective.py +381 -0
  39. nthlayer_workers/learn/trends.py +102 -0
  40. nthlayer_workers/learn/worker.py +366 -0
  41. nthlayer_workers/measure/__init__.py +3 -0
  42. nthlayer_workers/measure/__main__.py +5 -0
  43. nthlayer_workers/measure/_parsing.py +15 -0
  44. nthlayer_workers/measure/adapters/__init__.py +0 -0
  45. nthlayer_workers/measure/adapters/_util.py +24 -0
  46. nthlayer_workers/measure/adapters/devin.py +119 -0
  47. nthlayer_workers/measure/adapters/gastown.py +88 -0
  48. nthlayer_workers/measure/adapters/prometheus.py +277 -0
  49. nthlayer_workers/measure/adapters/protocol.py +20 -0
  50. nthlayer_workers/measure/adapters/webhook.py +161 -0
  51. nthlayer_workers/measure/api/__init__.py +0 -0
  52. nthlayer_workers/measure/api/normalise.py +50 -0
  53. nthlayer_workers/measure/api/queue.py +243 -0
  54. nthlayer_workers/measure/api/response.py +51 -0
  55. nthlayer_workers/measure/api/server.py +504 -0
  56. nthlayer_workers/measure/calibration/__init__.py +0 -0
  57. nthlayer_workers/measure/calibration/loop.py +62 -0
  58. nthlayer_workers/measure/calibration/slos.py +212 -0
  59. nthlayer_workers/measure/calibration/verdict_calibration.py +31 -0
  60. nthlayer_workers/measure/cli.py +753 -0
  61. nthlayer_workers/measure/config.py +191 -0
  62. nthlayer_workers/measure/detection/__init__.py +6 -0
  63. nthlayer_workers/measure/detection/detector.py +82 -0
  64. nthlayer_workers/measure/detection/protocol.py +29 -0
  65. nthlayer_workers/measure/governance/__init__.py +0 -0
  66. nthlayer_workers/measure/governance/engine.py +163 -0
  67. nthlayer_workers/measure/manifest.py +77 -0
  68. nthlayer_workers/measure/notifications.py +53 -0
  69. nthlayer_workers/measure/pipeline/__init__.py +0 -0
  70. nthlayer_workers/measure/pipeline/evaluator.py +155 -0
  71. nthlayer_workers/measure/pipeline/router.py +160 -0
  72. nthlayer_workers/measure/store/__init__.py +0 -0
  73. nthlayer_workers/measure/store/protocol.py +38 -0
  74. nthlayer_workers/measure/store/sqlite.py +276 -0
  75. nthlayer_workers/measure/telemetry.py +116 -0
  76. nthlayer_workers/measure/tiering/__init__.py +0 -0
  77. nthlayer_workers/measure/tiering/classifier.py +58 -0
  78. nthlayer_workers/measure/tiering/promotion.py +118 -0
  79. nthlayer_workers/measure/trends/__init__.py +0 -0
  80. nthlayer_workers/measure/trends/tracker.py +72 -0
  81. nthlayer_workers/measure/types.py +75 -0
  82. nthlayer_workers/measure/worker.py +439 -0
  83. nthlayer_workers/observe/__init__.py +25 -0
  84. nthlayer_workers/observe/__main__.py +5 -0
  85. nthlayer_workers/observe/api/__init__.py +1 -0
  86. nthlayer_workers/observe/assessment.py +95 -0
  87. nthlayer_workers/observe/cli.py +737 -0
  88. nthlayer_workers/observe/config.py +11 -0
  89. nthlayer_workers/observe/db/__init__.py +1 -0
  90. nthlayer_workers/observe/decision_records.py +220 -0
  91. nthlayer_workers/observe/dependencies/__init__.py +18 -0
  92. nthlayer_workers/observe/dependencies/discovery.py +294 -0
  93. nthlayer_workers/observe/dependencies/providers/__init__.py +48 -0
  94. nthlayer_workers/observe/dependencies/providers/backstage.py +467 -0
  95. nthlayer_workers/observe/dependencies/providers/base.py +76 -0
  96. nthlayer_workers/observe/dependencies/providers/consul.py +518 -0
  97. nthlayer_workers/observe/dependencies/providers/etcd.py +360 -0
  98. nthlayer_workers/observe/dependencies/providers/kubernetes.py +682 -0
  99. nthlayer_workers/observe/dependencies/providers/prometheus.py +368 -0
  100. nthlayer_workers/observe/dependencies/providers/zookeeper.py +399 -0
  101. nthlayer_workers/observe/deployments/__init__.py +1 -0
  102. nthlayer_workers/observe/discovery/__init__.py +14 -0
  103. nthlayer_workers/observe/discovery/classifier.py +66 -0
  104. nthlayer_workers/observe/discovery/client.py +189 -0
  105. nthlayer_workers/observe/discovery/models.py +53 -0
  106. nthlayer_workers/observe/drift/__init__.py +26 -0
  107. nthlayer_workers/observe/drift/analyzer.py +383 -0
  108. nthlayer_workers/observe/drift/models.py +174 -0
  109. nthlayer_workers/observe/drift/patterns.py +88 -0
  110. nthlayer_workers/observe/explanation.py +118 -0
  111. nthlayer_workers/observe/gate/__init__.py +39 -0
  112. nthlayer_workers/observe/gate/conditions.py +92 -0
  113. nthlayer_workers/observe/gate/correlator.py +154 -0
  114. nthlayer_workers/observe/gate/evaluator.py +192 -0
  115. nthlayer_workers/observe/gate/policies.py +226 -0
  116. nthlayer_workers/observe/gate_adapter.py +40 -0
  117. nthlayer_workers/observe/incident.py +36 -0
  118. nthlayer_workers/observe/portfolio/__init__.py +17 -0
  119. nthlayer_workers/observe/portfolio/aggregator.py +168 -0
  120. nthlayer_workers/observe/portfolio/scorer.py +13 -0
  121. nthlayer_workers/observe/slo/__init__.py +19 -0
  122. nthlayer_workers/observe/slo/collector.py +235 -0
  123. nthlayer_workers/observe/slo/spec_loader.py +40 -0
  124. nthlayer_workers/observe/sqlite_store.py +152 -0
  125. nthlayer_workers/observe/store.py +92 -0
  126. nthlayer_workers/observe/verification/__init__.py +22 -0
  127. nthlayer_workers/observe/verification/exporter_guidance.py +146 -0
  128. nthlayer_workers/observe/verification/extractor.py +127 -0
  129. nthlayer_workers/observe/verification/models.py +101 -0
  130. nthlayer_workers/observe/verification/verifier.py +111 -0
  131. nthlayer_workers/observe/worker.py +332 -0
  132. nthlayer_workers/respond/__init__.py +2 -0
  133. nthlayer_workers/respond/__main__.py +4 -0
  134. nthlayer_workers/respond/agents/__init__.py +0 -0
  135. nthlayer_workers/respond/agents/base.py +556 -0
  136. nthlayer_workers/respond/agents/communication.py +115 -0
  137. nthlayer_workers/respond/agents/investigation.py +124 -0
  138. nthlayer_workers/respond/agents/remediation.py +219 -0
  139. nthlayer_workers/respond/agents/triage.py +132 -0
  140. nthlayer_workers/respond/cli.py +772 -0
  141. nthlayer_workers/respond/config.py +135 -0
  142. nthlayer_workers/respond/context_store.py +256 -0
  143. nthlayer_workers/respond/coordinator.py +487 -0
  144. nthlayer_workers/respond/metrics.py +104 -0
  145. nthlayer_workers/respond/notification_backends/__init__.py +1 -0
  146. nthlayer_workers/respond/notification_backends/ntfy_backend.py +158 -0
  147. nthlayer_workers/respond/notification_backends/protocol.py +59 -0
  148. nthlayer_workers/respond/notification_backends/slack_backend.py +203 -0
  149. nthlayer_workers/respond/notification_backends/stdout_backend.py +56 -0
  150. nthlayer_workers/respond/notifications.py +247 -0
  151. nthlayer_workers/respond/oncall/__init__.py +1 -0
  152. nthlayer_workers/respond/oncall/escalation.py +103 -0
  153. nthlayer_workers/respond/oncall/runner.py +193 -0
  154. nthlayer_workers/respond/oncall/schedule.py +243 -0
  155. nthlayer_workers/respond/safe_actions/__init__.py +0 -0
  156. nthlayer_workers/respond/safe_actions/actions.py +139 -0
  157. nthlayer_workers/respond/safe_actions/registry.py +171 -0
  158. nthlayer_workers/respond/safe_actions/webhook.py +194 -0
  159. nthlayer_workers/respond/server.py +357 -0
  160. nthlayer_workers/respond/sre/__init__.py +1 -0
  161. nthlayer_workers/respond/sre/brief.py +175 -0
  162. nthlayer_workers/respond/sre/delegation.py +101 -0
  163. nthlayer_workers/respond/sre/post_incident.py +146 -0
  164. nthlayer_workers/respond/sre/shift_report.py +129 -0
  165. nthlayer_workers/respond/sre/suppression.py +91 -0
  166. nthlayer_workers/respond/types.py +109 -0
  167. nthlayer_workers/respond/verdict_submission.py +56 -0
  168. nthlayer_workers/respond/worker.py +533 -0
  169. nthlayer_workers/respond/worker_helpers.py +140 -0
  170. nthlayer_workers/runner.py +198 -0
  171. nthlayer_workers-1.0.0.dist-info/METADATA +19 -0
  172. nthlayer_workers-1.0.0.dist-info/RECORD +175 -0
  173. nthlayer_workers-1.0.0.dist-info/WHEEL +5 -0
  174. nthlayer_workers-1.0.0.dist-info/entry_points.txt +2 -0
  175. nthlayer_workers-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,10 @@
1
+ """Ingester protocol — the ingestion contract for SitRep."""
2
+ from __future__ import annotations
3
+ from typing import Awaitable, Callable, Protocol, Union
4
+ from nthlayer_workers.correlate.types import SitRepEvent
5
+
6
+
7
class Ingester(Protocol):
    """Structural interface for event ingesters.

    Implementations expose an async ``start``/``stop`` pair and accept one
    event callback through ``on_event``.
    """

    async def start(self) -> None:
        """Start the ingester."""
        ...

    async def stop(self) -> None:
        """Stop the ingester."""
        ...

    def on_event(
        self,
        handler: Callable[[SitRepEvent], Union[Awaitable[None], None]],
    ) -> None:
        """Register ``handler``, which may return an awaitable or ``None``."""
        ...
@@ -0,0 +1,18 @@
1
+ """Severity pre-scoring from SLO targets. Pure arithmetic, no judgment."""
2
+ from __future__ import annotations
3
+ from nthlayer_workers.correlate.types import SitRepEvent
4
+
5
+
6
def pre_score(event: SitRepEvent, slo_targets: dict | None) -> float:
    """Pre-score severity from the event's own ``value``/``threshold`` pair.

    ``slo_targets`` acts only as a gate: scoring happens when it is not
    ``None`` and contains ``event.service``. The numbers themselves come from
    ``event.payload``::

        severity = min(1.0, max(0.0, (value - threshold) / threshold))

    Args:
        event: Event providing ``service``, ``severity`` and ``payload``.
        slo_targets: Mapping keyed by service name, or ``None``.

    Returns:
        The clamped relative overshoot, or ``event.severity`` unchanged when
        there is no SLO context, the payload is not a mapping, either number
        is missing or non-numeric, or the threshold is zero.
    """
    if slo_targets is None or event.service not in slo_targets:
        return event.severity

    # Payloads arrive from untrusted JSON; anything without ``.get`` (list,
    # string, ...) cannot carry a value/threshold pair.
    lookup = getattr(event.payload, "get", None)
    if lookup is None:
        return event.severity

    value = lookup("value")
    threshold = lookup("threshold")
    for number in (value, threshold):
        # bool is an int subclass but is never a meaningful measurement here.
        if isinstance(number, bool) or not isinstance(number, (int, float)):
            return event.severity
    if threshold == 0:
        return event.severity

    # NOTE(review): a negative threshold inverts the sign of the ratio —
    # confirm whether negative thresholds can occur upstream.
    return min(1.0, max(0.0, (value - threshold) / threshold))
@@ -0,0 +1,197 @@
1
+ """WebhookIngester — raw asyncio TCP HTTP server for event ingestion."""
2
+ from __future__ import annotations
3
+
4
+ import asyncio
5
+ import json
6
+ import uuid
7
+ from dataclasses import replace
8
+ from datetime import datetime, timezone
9
+ from typing import Callable, Awaitable
10
+
11
+ from nthlayer_workers.correlate.ingestion import severity as _severity
12
+ from nthlayer_workers.correlate.types import EventType, SitRepEvent
13
+
14
+ _MAX_HEADER_SIZE = 65_536 # 64 KB
15
+ _MAX_BODY_SIZE = 10 * 1024 * 1024 # 10 MB
16
+ _CONNECTION_TIMEOUT = 30 # seconds — per-connection deadline to prevent slowloris
17
+
18
+ _REQUIRED_FIELDS = {"source", "type", "service", "payload"}
19
+
20
+
21
class WebhookIngester:
    """Accepts HTTP POST requests and dispatches SitRepEvents to a registered handler.

    The server is built directly on ``asyncio.start_server``. Each connection
    carries a single request and is closed once a response has been written
    (or the per-connection timeout expires).
    """

    def __init__(
        self,
        host: str = "127.0.0.1",
        port: int = 8081,
        slo_targets: dict | None = None,
    ) -> None:
        """Store the listen address and optional SLO targets; nothing is bound yet."""
        self._host = host
        self._port = port
        self._slo_targets = slo_targets
        self._handler: Callable[[SitRepEvent], Awaitable[None] | None] | None = None
        self._server: asyncio.Server | None = None

    def on_event(
        self, handler: Callable[[SitRepEvent], Awaitable[None] | None]
    ) -> None:
        """Register the handler called for each valid incoming event.

        The handler may be a coroutine function or a plain callable; an
        awaitable return value is awaited before the HTTP response is sent.
        """
        self._handler = handler

    async def start(self) -> None:
        """Start listening for connections."""
        self._server = await asyncio.start_server(
            self._handle_connection, self._host, self._port
        )

    async def stop(self) -> None:
        """Stop the server and wait for it to close."""
        if self._server:
            self._server.close()
            await self._server.wait_closed()
            self._server = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    async def _respond(
        writer: asyncio.StreamWriter, status: str, body: bytes = b""
    ) -> None:
        """Write a minimal HTTP/1.1 response and flush it.

        ``status`` is the code plus reason phrase (e.g. ``"200 OK"``). A
        non-empty ``body`` is always sent as ``application/json``.
        """
        head = f"HTTP/1.1 {status}\r\n"
        if body:
            head += "Content-Type: application/json\r\n"
        head += f"Content-Length: {len(body)}\r\n\r\n"
        writer.write(head.encode() + body)
        await writer.drain()

    def _parse_body(self, body: bytes) -> SitRepEvent:
        """Parse, validate, and construct a SitRepEvent from raw JSON bytes.

        Raises:
            ValueError: if the body is not valid JSON, is not a JSON object,
                lacks a required field, names an unknown event type, or has a
                non-numeric ``severity``/``ttl``.
        """
        data = json.loads(body)
        if not isinstance(data, dict):
            raise ValueError("Request body must be a JSON object")
        missing = _REQUIRED_FIELDS - set(data.keys())
        if missing:
            raise ValueError(f"Missing required fields: {sorted(missing)}")

        # Normalise event type
        raw_type = data["type"]
        try:
            event_type = EventType(raw_type)
        except ValueError:
            raise ValueError(f"Unknown event type: {raw_type!r}") from None

        # Convert numeric fields up front so that bad input (null, list, ...)
        # surfaces as a ValueError — and therefore a 400 — not a TypeError.
        try:
            severity = float(data.get("severity", 0.5))
            ttl = int(data.get("ttl", 86400))
        except (TypeError, ValueError) as exc:
            raise ValueError(f"Invalid severity or ttl: {exc}") from exc
        if severity != severity:  # NaN never compares equal to itself
            raise ValueError("severity must not be NaN")

        # Auto-generate id and timestamp when absent
        event_id = data.get("id") or str(uuid.uuid4())
        timestamp = data.get("timestamp") or datetime.now(timezone.utc).isoformat()

        event = SitRepEvent(
            id=event_id,
            timestamp=timestamp,
            source=data["source"],
            type=event_type,
            service=data["service"],
            environment=data.get("environment", ""),
            severity=max(0.0, min(1.0, severity)),
            payload=data["payload"],
            dependencies=data.get("dependencies", []),
            dependents=data.get("dependents", []),
            ttl=ttl,
        )

        # Apply arithmetic severity pre-scoring if SLO targets are configured
        scored = _severity.pre_score(event, self._slo_targets)
        if scored != event.severity:
            event = replace(event, severity=scored)

        return event

    async def _handle_connection(
        self, reader: asyncio.StreamReader, writer: asyncio.StreamWriter
    ) -> None:
        """Handle a single HTTP connection with per-connection timeout."""
        try:
            await asyncio.wait_for(
                self._handle_request(reader, writer), timeout=_CONNECTION_TIMEOUT
            )
        except asyncio.TimeoutError:
            writer.close()
            try:
                await writer.wait_closed()
            except Exception:
                # Best effort: the peer may already have dropped the socket.
                pass

    async def _handle_request(
        self, reader: asyncio.StreamReader, writer: asyncio.StreamWriter
    ) -> None:
        """Inner request handler, called within a per-connection timeout.

        Responds with 431 (headers too large), 405 (non-POST), 400 (bad
        Content-Length or invalid body), 413 (body too large), 500 (handler
        raised) or 200. The writer is always closed on exit.
        """
        try:
            # --- Read headers until \r\n\r\n ---
            header_data = b""
            while b"\r\n\r\n" not in header_data:
                chunk = await reader.read(4096)
                if not chunk:
                    return
                header_data += chunk
                if len(header_data) > _MAX_HEADER_SIZE:
                    await self._respond(writer, "431 Request Header Fields Too Large")
                    return

            header_part, _, body_start = header_data.partition(b"\r\n\r\n")
            headers_text = header_part.decode("utf-8", errors="replace")
            lines = headers_text.split("\r\n")
            request_line = lines[0] if lines else ""

            # --- Method check (before reading a body we would discard) ---
            if request_line.split(" ", 1)[0] != "POST":
                await self._respond(writer, "405 Method Not Allowed")
                return

            # --- Parse Content-Length ---
            content_length = 0
            for line in lines[1:]:
                name, sep, raw_value = line.partition(":")
                if sep and name.strip().lower() == "content-length":
                    try:
                        content_length = int(raw_value.strip())
                    except ValueError:
                        content_length = -1  # flagged as malformed below
                    break

            if content_length < 0:
                await self._respond(
                    writer,
                    "400 Bad Request",
                    json.dumps({"error": "Invalid Content-Length header"}).encode(),
                )
                return

            if content_length > _MAX_BODY_SIZE:
                await self._respond(writer, "413 Payload Too Large")
                return

            # --- Read remaining body bytes ---
            body = body_start
            while len(body) < content_length:
                chunk = await reader.read(content_length - len(body))
                if not chunk:
                    # Peer closed early; the truncated body fails JSON parsing
                    # below and is reported as a 400.
                    break
                body += chunk

            # --- Parse, validate, and dispatch ---
            try:
                event = self._parse_body(body)
            except (
                json.JSONDecodeError,
                ValueError,
                KeyError,
                TypeError,
                AttributeError,
                RecursionError,
            ) as exc:
                await self._respond(
                    writer,
                    "400 Bad Request",
                    json.dumps({"error": str(exc)}).encode(),
                )
                return

            # Call the registered handler (async or sync)
            if self._handler is not None:
                try:
                    result = self._handler(event)
                    if hasattr(result, "__await__"):
                        await result
                except Exception:
                    # Tell the client the event was not processed, then
                    # re-raise so asyncio still reports the failure.
                    await self._respond(
                        writer,
                        "500 Internal Server Error",
                        json.dumps({"error": "event handler failed"}).encode(),
                    )
                    raise

            await self._respond(writer, "200 OK", b'{"status":"ok"}')

        finally:
            writer.close()
            try:
                await writer.wait_closed()
            except Exception:
                # Best effort: closing an already-broken connection may raise.
                pass
@@ -0,0 +1,85 @@
1
+ """Slack block builders for nthlayer-correlate verdicts."""
2
+ from __future__ import annotations
3
+
4
+
5
def build_correlation_blocks(verdict) -> tuple[list[dict], str]:
    """Build Slack blocks for root cause identification.

    Reads ``root_causes`` and ``blast_radius`` from ``verdict.metadata.custom``
    (treated as empty when missing or ``None``). Only the first root cause is
    shown, and at most the first five blast-radius entries are named; the
    count always reflects the full list.

    Blast-radius entries may be dicts with a ``"service"`` key or bare
    values; anything else is rendered with ``str()`` so one malformed entry
    cannot break the notification.

    Returns (blocks, fallback_text).
    """
    custom = getattr(verdict.metadata, "custom", {}) or {}
    service = verdict.subject.ref or "unknown"
    # ``or []`` also covers keys that are present but explicitly null.
    root_causes = custom.get("root_causes") or []
    blast_radius = custom.get("blast_radius") or []
    confidence = verdict.judgment.confidence

    if not root_causes:
        rc_text = "under investigation"
    elif isinstance(root_causes[0], dict):
        rc_text = root_causes[0].get("service", "unknown")
    else:
        rc_text = str(root_causes[0])

    blast_count = len(blast_radius)
    # str.join needs strings; a dict without "service" falls back to "unknown".
    blast_services = ", ".join(
        str(entry.get("service", "unknown")) if isinstance(entry, dict) else str(entry)
        for entry in blast_radius[:5]
    )

    text = f"\U0001f50d Root cause: {rc_text} \u2014 {blast_count} services in blast radius"

    blocks = [
        {
            "type": "section",
            "text": {
                "type": "mrkdwn",
                "text": f"*\U0001f50d ROOT CAUSE IDENTIFIED \u00b7 {service}*",
            },
        },
        {
            "type": "section",
            "text": {
                "type": "mrkdwn",
                "text": (
                    f"*Root cause:* {rc_text}\n"
                    f"*Blast radius:* {blast_count} services \u2014 {blast_services}\n"
                    "NthLayer correlated the breach with the service dependency graph."
                ),
            },
        },
        {
            "type": "context",
            "elements": [
                {
                    "type": "mrkdwn",
                    "text": f"nthlayer-correlate \u00b7 confidence {confidence:.2f} \u00b7 {verdict.id}",
                },
            ],
        },
    ]

    return blocks, text
56
+
57
+
58
def _slack_thread_ts_of(verdict) -> str | None:
    """Return ``slack_thread_ts`` from a verdict's custom metadata, or ``None``."""
    custom = getattr(verdict.metadata, "custom", {}) or {}
    return custom.get("slack_thread_ts") or None


def _context_thread_ts(verdict_store, ctx_id: str) -> str | None:
    """Look up one lineage-context verdict and return its thread ts, if any.

    Store or attribute errors are swallowed so that one unreadable ancestor
    does not stop the search through the remaining ones.
    """
    try:
        ctx_verdict = verdict_store.get(ctx_id)
        if ctx_verdict:
            return _slack_thread_ts_of(ctx_verdict)
    except Exception:
        pass
    return None


def find_slack_thread_ts(verdict_store, verdict_ids: list[str]) -> str | None:
    """Find the first ``slack_thread_ts`` among the given verdicts and their contexts.

    For each id in ``verdict_ids`` (in order) the verdict itself is checked
    first, then each verdict listed in its ``lineage.context``. The search is
    one level deep: contexts of contexts are not followed. The first
    non-empty value wins.

    Lookups are best effort: a missing verdict, a verdict without lineage, or
    an exception raised by the store simply moves the search on.

    Returns None if no thread_ts found.
    """
    for vid in verdict_ids:
        try:
            verdict = verdict_store.get(vid)
            if verdict is None:
                continue
            ts = _slack_thread_ts_of(verdict)
            if ts:
                return ts
            # A verdict may have no lineage at all; treat that as "no context".
            lineage = getattr(verdict, "lineage", None)
            for ctx_id in (getattr(lineage, "context", None) or []):
                ts = _context_thread_ts(verdict_store, ctx_id)
                if ts:
                    return ts
        except Exception:
            # Deliberate best effort: threading a Slack reply must never
            # break the caller, so skip this verdict and try the next one.
            continue
    return None
@@ -0,0 +1,234 @@
1
+ """Prometheus query helpers for live correlation."""
2
+ from __future__ import annotations
3
+
4
+ import uuid
5
+ from datetime import datetime, timezone
6
+ import httpx
7
+ import structlog
8
+
9
+ from nthlayer_workers.correlate.types import EventType, SitRepEvent
10
+
11
+ logger = structlog.get_logger(__name__)
12
+
13
+
14
async def fetch_alerts(
    client: httpx.AsyncClient,
    prometheus_url: str,
    services: set[str],
) -> list[SitRepEvent]:
    """Fetch currently firing alerts from Prometheus, filtered to given services.

    Calls ``GET {prometheus_url}/api/v1/alerts`` and turns every alert whose
    state is ``"firing"`` and whose ``service`` label is in ``services`` into
    a ``SitRepEvent``. Each event gets a fresh random id.

    Failures are best effort: an HTTP error or a malformed response is logged
    at debug level and whatever events were built so far are returned.
    """
    events: list[SitRepEvent] = []
    # Tolerate a trailing slash on the configured base URL.
    url = f"{prometheus_url.rstrip('/')}/api/v1/alerts"
    try:
        resp = await client.get(url, timeout=10.0)
        resp.raise_for_status()
        # ``or`` guards cover keys that are present but JSON null.
        data = resp.json().get("data") or {}
        for alert in data.get("alerts") or []:
            if alert.get("state") != "firing":
                continue
            labels = alert.get("labels") or {}
            service = labels.get("service", "")
            if service not in services:
                continue
            events.append(SitRepEvent(
                id=f"prom-alert-{uuid.uuid4().hex[:8]}",
                timestamp=alert.get("activeAt") or datetime.now(timezone.utc).isoformat(),
                source="prometheus",
                type=EventType.ALERT,
                service=service,
                environment=labels.get("environment", "production"),
                severity=_alert_severity(labels.get("severity", "warning")),
                payload={
                    "alert_name": labels.get("alertname", "unknown"),
                    "labels": labels,
                    "annotations": alert.get("annotations") or {},
                },
            ))
    except (httpx.HTTPError, ValueError, KeyError, AttributeError, TypeError) as exc:
        # AttributeError/TypeError cover responses whose JSON has an
        # unexpected shape (e.g. a list where an object is expected).
        logger.debug("Failed to fetch Prometheus alerts", error=str(exc))
    return events
49
+
50
+
51
# One row per SLO check: (metric name, breach label, breach predicate,
# severity mapping). Thresholds and scaling:
#   error budget  — breached below 0;       20% deficit    -> severity 1.0
#   p99 latency   — breached above 500 ms;  2 s p99        -> severity 1.0
#   error rate    — breached above 1%;      10% error rate -> severity 1.0
_BREACH_CHECKS = (
    (
        "slo:error_budget:ratio",
        "error_budget_exhausted",
        lambda v: v < 0.0,
        lambda v: min(1.0, abs(v) * 5),
    ),
    (
        "slo:http_request_duration_seconds:p99",
        "latency_exceeded",
        lambda v: v > 0.5,
        lambda v: min(1.0, v / 2.0),
    ),
    (
        "service:http_errors:rate5m",
        "error_rate_elevated",
        lambda v: v > 0.01,
        lambda v: min(1.0, v * 10),
    ),
)


def _escape_label_value(value: str) -> str:
    """Escape a string for use inside a double-quoted PromQL label matcher."""
    return str(value).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")


async def fetch_metric_breaches(
    client: httpx.AsyncClient,
    prometheus_url: str,
    services: set[str],
    window_minutes: int = 30,
) -> list[SitRepEvent]:
    """Query Prometheus for SLO metric breaches on given services.

    For every service, three instant queries are issued in order (error
    budget, p99 latency, error rate). Each one whose value crosses its
    threshold yields a ``METRIC_BREACH`` event. All events from one call
    share a single timestamp taken at call time.

    Args:
        client: HTTP client used for the queries.
        prometheus_url: Base URL of the Prometheus server.
        services: Service names to check.
        window_minutes: Accepted for interface compatibility; the queries are
            instant queries and do not currently use it.

    Returns:
        The breach events found; a query that fails or returns no sample
        contributes nothing.
    """
    events: list[SitRepEvent] = []
    timestamp = datetime.now(timezone.utc).isoformat()

    for service in services:
        # Service names come from specs/labels; escape them so a quote or
        # backslash cannot break (or alter) the PromQL selector.
        selector = f'{{service="{_escape_label_value(service)}"}}'
        for metric, breach, is_breached, to_severity in _BREACH_CHECKS:
            value = await _query_instant(client, prometheus_url, f"{metric}{selector}")
            if value is None or not is_breached(value):
                continue
            events.append(SitRepEvent(
                id=f"prom-metric-{uuid.uuid4().hex[:8]}",
                timestamp=timestamp,
                source="prometheus",
                type=EventType.METRIC_BREACH,
                service=service,
                environment="production",
                severity=to_severity(value),
                payload={
                    "metric": metric,
                    "value": value,
                    "breach": breach,
                },
            ))

    return events
126
+
127
+
128
def verdict_to_event(verdict) -> SitRepEvent:
    """Build a ``VERDICT`` SitRepEvent from a verdict-store record.

    The event id is the verdict id prefixed with ``verdict-``. Severity is the
    judgment's confidence when the action is ``"flag"`` and a fixed 0.2
    otherwise. The environment is always ``"production"``.
    """
    judgment = verdict.judgment
    subject = verdict.subject
    extras = getattr(verdict.metadata, "custom", {}) or {}

    # Timestamps may be datetime-like or already plain values.
    raw_ts = verdict.timestamp
    if hasattr(raw_ts, "isoformat"):
        stamp = raw_ts.isoformat()
    else:
        stamp = str(raw_ts)

    if judgment.action == "flag":
        score = judgment.confidence
    else:
        score = 0.2

    details = {
        "verdict_id": verdict.id,
        "action": judgment.action,
        "confidence": judgment.confidence,
        "slo_name": extras.get("slo_name"),
        "slo_type": extras.get("slo_type"),
        "breach": extras.get("breach"),
    }

    return SitRepEvent(
        id=f"verdict-{verdict.id}",
        timestamp=stamp,
        source=verdict.producer.system,
        type=EventType.VERDICT,
        service=subject.ref or subject.service or "unknown",
        environment="production",
        severity=score,
        payload=details,
    )
148
+
149
+
150
def load_dependency_graph(specs_dir: str) -> dict[str, dict]:
    """Load service dependency graph from OpenSRM specs directory.

    Every ``*.yaml`` file directly inside ``specs_dir`` is read in sorted
    order. The service name comes from ``metadata.name`` (falling back to the
    file stem), the tier from ``metadata.tier`` (default ``"standard"``), and
    dependencies from the ``name`` of each mapping in ``spec.dependencies``.

    Files that cannot be read or parsed, or whose top level is not a mapping,
    are skipped. Sections that are missing, null, or of the wrong type are
    treated as empty rather than raising.

    Returns dict mapping service name → {tier, dependencies, dependents}.
    An empty dict is returned when ``specs_dir`` is not a directory.
    """
    from pathlib import Path

    graph: dict[str, dict] = {}
    specs_path = Path(specs_dir)
    if not specs_path.is_dir():
        return graph

    # Imported only once there is something to parse.
    import yaml

    for spec_file in sorted(specs_path.glob("*.yaml")):
        try:
            raw = yaml.safe_load(spec_file.read_text(encoding="utf-8"))
        except Exception:
            # Best effort: one unreadable or invalid spec must not hide the rest.
            continue
        if not isinstance(raw, dict):
            continue

        # YAML keys that are present but empty load as None, so `.get(..., {})`
        # alone is not enough.
        metadata = raw.get("metadata")
        if not isinstance(metadata, dict):
            metadata = {}
        spec = raw.get("spec")
        if not isinstance(spec, dict):
            spec = {}
        declared = spec.get("dependencies")
        if not isinstance(declared, list):
            declared = []

        service = metadata.get("name", spec_file.stem)
        tier = metadata.get("tier", "standard")
        # Entries without a usable name cannot be linked; skip them.
        deps = [d["name"] for d in declared if isinstance(d, dict) and d.get("name")]

        graph[service] = {"tier": tier, "dependencies": deps, "dependents": []}

    # Build reverse dependencies
    for svc, info in graph.items():
        for dep in info["dependencies"]:
            if dep in graph:
                graph[dep]["dependents"].append(svc)

    return graph
186
+
187
+
188
def blast_radius_services(
    trigger_service: str,
    dependency_graph: dict[str, dict],
) -> set[str]:
    """Compute blast radius: trigger service + dependents (upstream consumers) + dependencies (downstream).

    Dependents are followed transitively (everything that directly or
    indirectly depends on the trigger). Dependencies are included one level
    deep only. Services absent from the graph, and ``dependents`` /
    ``dependencies`` entries that are missing or ``None``, contribute nothing.
    Cycles in the graph are handled.
    """
    affected = {trigger_service}

    # Walk dependents (who depends on the trigger service?). Visit order does
    # not affect the resulting set, so a stack (O(1) pop) is used.
    pending = list(dependency_graph.get(trigger_service, {}).get("dependents") or ())
    while pending:
        svc = pending.pop()
        if svc in affected:
            continue
        affected.add(svc)
        pending.extend(dependency_graph.get(svc, {}).get("dependents") or ())

    # Also include dependencies (downstream services)
    affected.update(dependency_graph.get(trigger_service, {}).get("dependencies") or ())
    return affected
205
+
206
+
207
async def _query_instant(
    client: httpx.AsyncClient,
    prometheus_url: str,
    query: str,
) -> float | None:
    """Execute a PromQL instant query and return the scalar value.

    Calls ``GET {prometheus_url}/api/v1/query`` and reads the value of the
    first result sample only.

    Returns:
        The sample value as a float, or ``None`` when the request fails, the
        response has an unexpected shape, there are no results, or the value
        is NaN.
    """
    import math

    try:
        resp = await client.get(
            # Tolerate a trailing slash on the configured base URL.
            f"{prometheus_url.rstrip('/')}/api/v1/query",
            params={"query": query},
            timeout=10.0,
        )
        resp.raise_for_status()
        # ``or`` guards cover keys that are present but JSON null.
        results = (resp.json().get("data") or {}).get("result") or []
        if not results:
            return None
        # A sample's "value" is a [timestamp, "<number>"] pair.
        val = float(results[0].get("value", [None, None])[1])
    except (httpx.HTTPError, ValueError, KeyError, IndexError, TypeError, AttributeError):
        # AttributeError covers JSON of an unexpected shape (e.g. a list
        # where an object is expected).
        return None
    return None if math.isnan(val) else val
229
+
230
+
231
# Known Prometheus severity labels and their 0.0-1.0 scores.
_ALERT_SEVERITY_SCORES = {"critical": 0.95, "warning": 0.6, "info": 0.3}


def _alert_severity(severity_label: str) -> float:
    """Map Prometheus alert severity label to 0.0-1.0.

    Matching ignores case and surrounding whitespace. Unknown labels and
    non-string values score 0.5.
    """
    if not isinstance(severity_label, str):
        return 0.5
    return _ALERT_SEVERITY_SCORES.get(severity_label.strip().lower(), 0.5)