nthlayer-workers 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nthlayer_workers/__init__.py +5 -0
- nthlayer_workers/cli.py +234 -0
- nthlayer_workers/correlate/__init__.py +1 -0
- nthlayer_workers/correlate/cli.py +847 -0
- nthlayer_workers/correlate/config.py +111 -0
- nthlayer_workers/correlate/correlation/__init__.py +1 -0
- nthlayer_workers/correlate/correlation/changes.py +87 -0
- nthlayer_workers/correlate/correlation/dedup.py +62 -0
- nthlayer_workers/correlate/correlation/engine.py +244 -0
- nthlayer_workers/correlate/correlation/temporal.py +79 -0
- nthlayer_workers/correlate/correlation/topology.py +104 -0
- nthlayer_workers/correlate/ingestion/__init__.py +1 -0
- nthlayer_workers/correlate/ingestion/protocol.py +10 -0
- nthlayer_workers/correlate/ingestion/severity.py +18 -0
- nthlayer_workers/correlate/ingestion/webhook.py +197 -0
- nthlayer_workers/correlate/notifications.py +85 -0
- nthlayer_workers/correlate/prometheus.py +234 -0
- nthlayer_workers/correlate/reasoning.py +375 -0
- nthlayer_workers/correlate/session.py +189 -0
- nthlayer_workers/correlate/snapshot/__init__.py +1 -0
- nthlayer_workers/correlate/snapshot/generator.py +170 -0
- nthlayer_workers/correlate/snapshot/model.py +177 -0
- nthlayer_workers/correlate/snapshot/token.py +14 -0
- nthlayer_workers/correlate/state.py +88 -0
- nthlayer_workers/correlate/store/__init__.py +5 -0
- nthlayer_workers/correlate/store/protocol.py +48 -0
- nthlayer_workers/correlate/store/sqlite.py +443 -0
- nthlayer_workers/correlate/summary.py +180 -0
- nthlayer_workers/correlate/traces/__init__.py +1 -0
- nthlayer_workers/correlate/traces/protocol.py +120 -0
- nthlayer_workers/correlate/traces/tempo.py +667 -0
- nthlayer_workers/correlate/traces/topology.py +39 -0
- nthlayer_workers/correlate/types.py +77 -0
- nthlayer_workers/correlate/worker.py +630 -0
- nthlayer_workers/learn/__init__.py +5 -0
- nthlayer_workers/learn/__main__.py +5 -0
- nthlayer_workers/learn/cli.py +164 -0
- nthlayer_workers/learn/retrospective.py +381 -0
- nthlayer_workers/learn/trends.py +102 -0
- nthlayer_workers/learn/worker.py +366 -0
- nthlayer_workers/measure/__init__.py +3 -0
- nthlayer_workers/measure/__main__.py +5 -0
- nthlayer_workers/measure/_parsing.py +15 -0
- nthlayer_workers/measure/adapters/__init__.py +0 -0
- nthlayer_workers/measure/adapters/_util.py +24 -0
- nthlayer_workers/measure/adapters/devin.py +119 -0
- nthlayer_workers/measure/adapters/gastown.py +88 -0
- nthlayer_workers/measure/adapters/prometheus.py +277 -0
- nthlayer_workers/measure/adapters/protocol.py +20 -0
- nthlayer_workers/measure/adapters/webhook.py +161 -0
- nthlayer_workers/measure/api/__init__.py +0 -0
- nthlayer_workers/measure/api/normalise.py +50 -0
- nthlayer_workers/measure/api/queue.py +243 -0
- nthlayer_workers/measure/api/response.py +51 -0
- nthlayer_workers/measure/api/server.py +504 -0
- nthlayer_workers/measure/calibration/__init__.py +0 -0
- nthlayer_workers/measure/calibration/loop.py +62 -0
- nthlayer_workers/measure/calibration/slos.py +212 -0
- nthlayer_workers/measure/calibration/verdict_calibration.py +31 -0
- nthlayer_workers/measure/cli.py +753 -0
- nthlayer_workers/measure/config.py +191 -0
- nthlayer_workers/measure/detection/__init__.py +6 -0
- nthlayer_workers/measure/detection/detector.py +82 -0
- nthlayer_workers/measure/detection/protocol.py +29 -0
- nthlayer_workers/measure/governance/__init__.py +0 -0
- nthlayer_workers/measure/governance/engine.py +163 -0
- nthlayer_workers/measure/manifest.py +77 -0
- nthlayer_workers/measure/notifications.py +53 -0
- nthlayer_workers/measure/pipeline/__init__.py +0 -0
- nthlayer_workers/measure/pipeline/evaluator.py +155 -0
- nthlayer_workers/measure/pipeline/router.py +160 -0
- nthlayer_workers/measure/store/__init__.py +0 -0
- nthlayer_workers/measure/store/protocol.py +38 -0
- nthlayer_workers/measure/store/sqlite.py +276 -0
- nthlayer_workers/measure/telemetry.py +116 -0
- nthlayer_workers/measure/tiering/__init__.py +0 -0
- nthlayer_workers/measure/tiering/classifier.py +58 -0
- nthlayer_workers/measure/tiering/promotion.py +118 -0
- nthlayer_workers/measure/trends/__init__.py +0 -0
- nthlayer_workers/measure/trends/tracker.py +72 -0
- nthlayer_workers/measure/types.py +75 -0
- nthlayer_workers/measure/worker.py +439 -0
- nthlayer_workers/observe/__init__.py +25 -0
- nthlayer_workers/observe/__main__.py +5 -0
- nthlayer_workers/observe/api/__init__.py +1 -0
- nthlayer_workers/observe/assessment.py +95 -0
- nthlayer_workers/observe/cli.py +737 -0
- nthlayer_workers/observe/config.py +11 -0
- nthlayer_workers/observe/db/__init__.py +1 -0
- nthlayer_workers/observe/decision_records.py +220 -0
- nthlayer_workers/observe/dependencies/__init__.py +18 -0
- nthlayer_workers/observe/dependencies/discovery.py +294 -0
- nthlayer_workers/observe/dependencies/providers/__init__.py +48 -0
- nthlayer_workers/observe/dependencies/providers/backstage.py +467 -0
- nthlayer_workers/observe/dependencies/providers/base.py +76 -0
- nthlayer_workers/observe/dependencies/providers/consul.py +518 -0
- nthlayer_workers/observe/dependencies/providers/etcd.py +360 -0
- nthlayer_workers/observe/dependencies/providers/kubernetes.py +682 -0
- nthlayer_workers/observe/dependencies/providers/prometheus.py +368 -0
- nthlayer_workers/observe/dependencies/providers/zookeeper.py +399 -0
- nthlayer_workers/observe/deployments/__init__.py +1 -0
- nthlayer_workers/observe/discovery/__init__.py +14 -0
- nthlayer_workers/observe/discovery/classifier.py +66 -0
- nthlayer_workers/observe/discovery/client.py +189 -0
- nthlayer_workers/observe/discovery/models.py +53 -0
- nthlayer_workers/observe/drift/__init__.py +26 -0
- nthlayer_workers/observe/drift/analyzer.py +383 -0
- nthlayer_workers/observe/drift/models.py +174 -0
- nthlayer_workers/observe/drift/patterns.py +88 -0
- nthlayer_workers/observe/explanation.py +118 -0
- nthlayer_workers/observe/gate/__init__.py +39 -0
- nthlayer_workers/observe/gate/conditions.py +92 -0
- nthlayer_workers/observe/gate/correlator.py +154 -0
- nthlayer_workers/observe/gate/evaluator.py +192 -0
- nthlayer_workers/observe/gate/policies.py +226 -0
- nthlayer_workers/observe/gate_adapter.py +40 -0
- nthlayer_workers/observe/incident.py +36 -0
- nthlayer_workers/observe/portfolio/__init__.py +17 -0
- nthlayer_workers/observe/portfolio/aggregator.py +168 -0
- nthlayer_workers/observe/portfolio/scorer.py +13 -0
- nthlayer_workers/observe/slo/__init__.py +19 -0
- nthlayer_workers/observe/slo/collector.py +235 -0
- nthlayer_workers/observe/slo/spec_loader.py +40 -0
- nthlayer_workers/observe/sqlite_store.py +152 -0
- nthlayer_workers/observe/store.py +92 -0
- nthlayer_workers/observe/verification/__init__.py +22 -0
- nthlayer_workers/observe/verification/exporter_guidance.py +146 -0
- nthlayer_workers/observe/verification/extractor.py +127 -0
- nthlayer_workers/observe/verification/models.py +101 -0
- nthlayer_workers/observe/verification/verifier.py +111 -0
- nthlayer_workers/observe/worker.py +332 -0
- nthlayer_workers/respond/__init__.py +2 -0
- nthlayer_workers/respond/__main__.py +4 -0
- nthlayer_workers/respond/agents/__init__.py +0 -0
- nthlayer_workers/respond/agents/base.py +556 -0
- nthlayer_workers/respond/agents/communication.py +115 -0
- nthlayer_workers/respond/agents/investigation.py +124 -0
- nthlayer_workers/respond/agents/remediation.py +219 -0
- nthlayer_workers/respond/agents/triage.py +132 -0
- nthlayer_workers/respond/cli.py +772 -0
- nthlayer_workers/respond/config.py +135 -0
- nthlayer_workers/respond/context_store.py +256 -0
- nthlayer_workers/respond/coordinator.py +487 -0
- nthlayer_workers/respond/metrics.py +104 -0
- nthlayer_workers/respond/notification_backends/__init__.py +1 -0
- nthlayer_workers/respond/notification_backends/ntfy_backend.py +158 -0
- nthlayer_workers/respond/notification_backends/protocol.py +59 -0
- nthlayer_workers/respond/notification_backends/slack_backend.py +203 -0
- nthlayer_workers/respond/notification_backends/stdout_backend.py +56 -0
- nthlayer_workers/respond/notifications.py +247 -0
- nthlayer_workers/respond/oncall/__init__.py +1 -0
- nthlayer_workers/respond/oncall/escalation.py +103 -0
- nthlayer_workers/respond/oncall/runner.py +193 -0
- nthlayer_workers/respond/oncall/schedule.py +243 -0
- nthlayer_workers/respond/safe_actions/__init__.py +0 -0
- nthlayer_workers/respond/safe_actions/actions.py +139 -0
- nthlayer_workers/respond/safe_actions/registry.py +171 -0
- nthlayer_workers/respond/safe_actions/webhook.py +194 -0
- nthlayer_workers/respond/server.py +357 -0
- nthlayer_workers/respond/sre/__init__.py +1 -0
- nthlayer_workers/respond/sre/brief.py +175 -0
- nthlayer_workers/respond/sre/delegation.py +101 -0
- nthlayer_workers/respond/sre/post_incident.py +146 -0
- nthlayer_workers/respond/sre/shift_report.py +129 -0
- nthlayer_workers/respond/sre/suppression.py +91 -0
- nthlayer_workers/respond/types.py +109 -0
- nthlayer_workers/respond/verdict_submission.py +56 -0
- nthlayer_workers/respond/worker.py +533 -0
- nthlayer_workers/respond/worker_helpers.py +140 -0
- nthlayer_workers/runner.py +198 -0
- nthlayer_workers-1.0.0.dist-info/METADATA +19 -0
- nthlayer_workers-1.0.0.dist-info/RECORD +175 -0
- nthlayer_workers-1.0.0.dist-info/WHEEL +5 -0
- nthlayer_workers-1.0.0.dist-info/entry_points.txt +2 -0
- nthlayer_workers-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""Ingester protocol — the ingestion contract for SitRep."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
from typing import Awaitable, Callable, Protocol, Union
|
|
4
|
+
from nthlayer_workers.correlate.types import SitRepEvent
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Ingester(Protocol):
    """Structural interface that every event ingester is expected to satisfy.

    Any object exposing these three members conforms; no inheritance is
    required because this is a ``typing.Protocol``.
    """

    async def start(self) -> None:
        """Start the ingester (coroutine)."""

    async def stop(self) -> None:
        """Stop the ingester (coroutine)."""

    def on_event(
        self,
        handler: Callable[[SitRepEvent], Union[Awaitable[None], None]],
    ) -> None:
        """Register ``handler`` as the callback for events.

        Per the annotation, the handler takes a single ``SitRepEvent`` and
        may either return ``None`` (sync) or an awaitable (async).
        """
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Severity pre-scoring from SLO targets. Pure arithmetic, no judgment."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
from nthlayer_workers.correlate.types import SitRepEvent
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def pre_score(event: SitRepEvent, slo_targets: dict | None) -> float:
    """Pre-score severity from the event's own value/threshold pair.

    severity = min(1.0, max(0.0, (value - threshold) / threshold))

    ``slo_targets`` only gates the computation: scoring happens when it is
    not ``None`` and contains ``event.service``. The numbers themselves are
    read from ``event.payload["value"]`` and ``event.payload["threshold"]``.

    Returns ``event.severity`` unchanged when there is no usable SLO context:
    no targets, service not listed, payload without ``value``/``threshold``,
    non-numeric values, a zero threshold, or a ratio that is not a number.
    """
    if slo_targets is None or event.service not in slo_targets:
        return event.severity

    payload = event.payload
    if not hasattr(payload, "get"):
        # Payload is not mapping-like, so there is nothing to score from.
        return event.severity

    raw_value = payload.get("value")
    raw_threshold = payload.get("threshold")
    if raw_value is None or raw_threshold is None:
        return event.severity

    # Payloads arrive from untrusted JSON; a string or list here must not
    # escape as TypeError, so treat anything non-numeric as "no context".
    try:
        value = float(raw_value)
        threshold = float(raw_threshold)
    except (TypeError, ValueError, OverflowError):
        return event.severity

    if threshold == 0:
        return event.severity

    ratio = (value - threshold) / threshold
    if ratio != ratio:  # NaN (e.g. NaN input or inf/inf) carries no signal
        return event.severity
    return min(1.0, max(0.0, ratio))
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""WebhookIngester — raw asyncio TCP HTTP server for event ingestion."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import asyncio
|
|
5
|
+
import json
|
|
6
|
+
import uuid
|
|
7
|
+
from dataclasses import replace
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
from typing import Callable, Awaitable
|
|
10
|
+
|
|
11
|
+
from nthlayer_workers.correlate.ingestion import severity as _severity
|
|
12
|
+
from nthlayer_workers.correlate.types import EventType, SitRepEvent
|
|
13
|
+
|
|
14
|
+
_MAX_HEADER_SIZE = 65_536 # 64 KB
|
|
15
|
+
_MAX_BODY_SIZE = 10 * 1024 * 1024 # 10 MB
|
|
16
|
+
_CONNECTION_TIMEOUT = 30 # seconds — per-connection deadline to prevent slowloris
|
|
17
|
+
|
|
18
|
+
_REQUIRED_FIELDS = {"source", "type", "service", "payload"}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class WebhookIngester:
    """Accepts HTTP POST requests and dispatches SitRepEvents to a registered handler.

    This is a deliberately small HTTP/1.1 endpoint built directly on
    ``asyncio.start_server``: it reads one request per connection, answers
    with a short JSON (or empty) response and then closes the connection.
    """

    def __init__(
        self,
        host: str = "127.0.0.1",
        port: int = 8081,
        slo_targets: dict | None = None,
    ) -> None:
        self._host = host
        self._port = port
        # Passed through to severity pre-scoring for every parsed event.
        self._slo_targets = slo_targets
        self._handler: Callable[[SitRepEvent], Awaitable[None] | None] | None = None
        self._server: asyncio.Server | None = None

    def on_event(
        self, handler: Callable[[SitRepEvent], Awaitable[None] | None]
    ) -> None:
        """Register the handler called for each valid incoming event.

        The handler may be synchronous or return an awaitable; awaitable
        results are awaited before the HTTP response is sent.
        """
        self._handler = handler

    async def start(self) -> None:
        """Start listening for connections."""
        self._server = await asyncio.start_server(
            self._handle_connection, self._host, self._port
        )

    async def stop(self) -> None:
        """Stop the server and wait for it to close."""
        if self._server:
            self._server.close()
            await self._server.wait_closed()
            self._server = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _parse_body(self, body: bytes) -> SitRepEvent:
        """Parse, validate, and construct a SitRepEvent from raw JSON bytes.

        Raises:
            ValueError: for any malformed input (invalid JSON, non-object
                body, missing fields, unknown event type, non-object payload,
                non-numeric ``severity``/``ttl``). ``json.JSONDecodeError``
                is a ``ValueError`` subclass, so callers can catch just that.
        """
        data = json.loads(body)
        if not isinstance(data, dict):
            # A JSON array/string/number has no fields to validate.
            raise ValueError("Request body must be a JSON object")

        missing = _REQUIRED_FIELDS - set(data.keys())
        if missing:
            raise ValueError(f"Missing required fields: {sorted(missing)}")

        # Normalise event type
        raw_type = data["type"]
        try:
            event_type = EventType(raw_type)
        except ValueError:
            raise ValueError(f"Unknown event type: {raw_type!r}") from None

        payload = data["payload"]
        if not isinstance(payload, dict):
            # Severity pre-scoring reads the payload with ``.get``.
            raise ValueError("Field 'payload' must be a JSON object")

        # Untrusted input: null, lists, or Infinity must produce a 400
        # rather than escaping as TypeError/OverflowError.
        try:
            severity = max(0.0, min(1.0, float(data.get("severity", 0.5))))
            ttl = int(data.get("ttl", 86400))
        except (TypeError, ValueError, OverflowError) as exc:
            raise ValueError(f"Invalid numeric field: {exc}") from exc

        event = SitRepEvent(
            # Auto-generate id and timestamp when absent
            id=data.get("id") or str(uuid.uuid4()),
            timestamp=data.get("timestamp") or datetime.now(timezone.utc).isoformat(),
            source=data["source"],
            type=event_type,
            service=data["service"],
            environment=data.get("environment", ""),
            severity=severity,
            payload=payload,
            dependencies=data.get("dependencies", []),
            dependents=data.get("dependents", []),
            ttl=ttl,
        )

        # Apply arithmetic severity pre-scoring if SLO targets are configured
        scored = _severity.pre_score(event, self._slo_targets)
        if scored != event.severity:
            event = replace(event, severity=scored)

        return event

    @staticmethod
    async def _send_response(
        writer: asyncio.StreamWriter, status: str, body: bytes = b""
    ) -> None:
        """Write a complete HTTP/1.1 response and flush it.

        ``status`` is the code plus reason phrase (e.g. ``"400 Bad Request"``).
        A non-empty ``body`` is sent as ``application/json``.
        """
        if body:
            head = (
                f"HTTP/1.1 {status}\r\nContent-Type: application/json\r\n"
                f"Content-Length: {len(body)}\r\n\r\n"
            )
        else:
            head = f"HTTP/1.1 {status}\r\nContent-Length: 0\r\n\r\n"
        writer.write(head.encode() + body)
        await writer.drain()

    @staticmethod
    async def _close_writer(writer: asyncio.StreamWriter) -> None:
        """Close the connection, ignoring errors raised while closing."""
        writer.close()
        try:
            await writer.wait_closed()
        except Exception:
            # Best effort: the peer may already have reset the connection.
            pass

    async def _handle_connection(
        self, reader: asyncio.StreamReader, writer: asyncio.StreamWriter
    ) -> None:
        """Handle a single HTTP connection with per-connection timeout."""
        try:
            await asyncio.wait_for(
                self._handle_request(reader, writer), timeout=_CONNECTION_TIMEOUT
            )
        except asyncio.TimeoutError:
            await self._close_writer(writer)

    async def _handle_request(
        self, reader: asyncio.StreamReader, writer: asyncio.StreamWriter
    ) -> None:
        """Inner request handler, called within a per-connection timeout.

        Responds 431 (headers too large), 413 (body too large), 405 (method
        other than POST), 400 (bad Content-Length or invalid event) or 200.
        The connection is always closed afterwards.
        """
        try:
            # --- Read headers until \r\n\r\n ---
            header_data = b""
            while b"\r\n\r\n" not in header_data:
                chunk = await reader.read(4096)
                if not chunk:
                    # Peer closed before finishing the headers.
                    return
                header_data += chunk
                if len(header_data) > _MAX_HEADER_SIZE:
                    await self._send_response(
                        writer, "431 Request Header Fields Too Large"
                    )
                    return

            header_part, _, body_start = header_data.partition(b"\r\n\r\n")
            lines = header_part.decode("utf-8", errors="replace").split("\r\n")
            request_line = lines[0] if lines else ""

            # --- Parse Content-Length ---
            content_length = 0
            for line in lines[1:]:
                if line.lower().startswith("content-length:"):
                    try:
                        content_length = int(line.split(":", 1)[1].strip())
                    except ValueError:
                        content_length = -1  # flagged as invalid below
                    break

            if content_length < 0:
                # Without this a bad header would abort the handler and the
                # client would see the connection drop with no response.
                error_body = json.dumps(
                    {"error": "Invalid Content-Length header"}
                ).encode()
                await self._send_response(writer, "400 Bad Request", error_body)
                return

            if content_length > _MAX_BODY_SIZE:
                await self._send_response(writer, "413 Payload Too Large")
                return

            # --- Read remaining body bytes ---
            body = body_start
            while len(body) < content_length:
                chunk = await reader.read(content_length - len(body))
                if not chunk:
                    # Truncated body: let JSON parsing report the error.
                    break
                body += chunk

            # --- Method check (exact token, not just a prefix) ---
            if request_line.split(" ", 1)[0] != "POST":
                await self._send_response(writer, "405 Method Not Allowed")
                return

            # --- Parse, validate, and dispatch ---
            try:
                event = self._parse_body(body)
            except (json.JSONDecodeError, ValueError, KeyError) as exc:
                error_body = json.dumps({"error": str(exc)}).encode()
                await self._send_response(writer, "400 Bad Request", error_body)
                return

            # Call the registered handler (async or sync)
            if self._handler is not None:
                outcome = self._handler(event)
                if asyncio.iscoroutine(outcome) or asyncio.isfuture(outcome):
                    await outcome

            await self._send_response(writer, "200 OK", b'{"status":"ok"}')

        finally:
            await self._close_writer(writer)
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""Slack block builders for nthlayer-correlate verdicts."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def build_correlation_blocks(verdict) -> tuple[list[dict], str]:
    """Build Slack blocks for root cause identification.

    Reads ``verdict.metadata.custom`` (keys ``root_causes`` and
    ``blast_radius``), ``verdict.subject.ref``, ``verdict.judgment.confidence``
    and ``verdict.id``. Entries in either list may be plain service names or
    dicts carrying a ``"service"`` key.

    Returns (blocks, fallback_text).
    """

    def _service_label(entry) -> str:
        # Accept both {"service": "..."} dicts and bare names; always return
        # a string so the ", ".join below cannot fail on a dict.
        if isinstance(entry, dict):
            return str(entry.get("service", "unknown"))
        return str(entry)

    custom = getattr(verdict.metadata, "custom", {}) or {}
    service = verdict.subject.ref or "unknown"
    # ``or []`` also covers keys that are present but explicitly null.
    root_causes = custom.get("root_causes") or []
    blast_radius = list(custom.get("blast_radius") or [])
    confidence = verdict.judgment.confidence

    rc_text = _service_label(root_causes[0]) if root_causes else "under investigation"
    blast_count = len(blast_radius)
    # Only the first five services are listed to keep the message short.
    blast_services = ", ".join(_service_label(b) for b in blast_radius[:5])
    if isinstance(confidence, (int, float)):
        confidence_text = f"{confidence:.2f}"
    else:
        confidence_text = "n/a"  # a missing confidence must not break the notification

    text = f"\U0001f50d Root cause: {rc_text} \u2014 {blast_count} services in blast radius"

    blocks = [
        {
            "type": "section",
            "text": {
                "type": "mrkdwn",
                "text": f"*\U0001f50d ROOT CAUSE IDENTIFIED \u00b7 {service}*",
            },
        },
        {
            "type": "section",
            "text": {
                "type": "mrkdwn",
                "text": (
                    f"*Root cause:* {rc_text}\n"
                    f"*Blast radius:* {blast_count} services \u2014 {blast_services}\n"
                    "NthLayer correlated the breach with the service dependency graph."
                ),
            },
        },
        {
            "type": "context",
            "elements": [
                {
                    "type": "mrkdwn",
                    "text": f"nthlayer-correlate \u00b7 confidence {confidence_text} \u00b7 {verdict.id}",
                },
            ],
        },
    ]

    return blocks, text
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def find_slack_thread_ts(verdict_store, verdict_ids: list[str]) -> str | None:
    """Return the first ``slack_thread_ts`` found for the given verdicts.

    For each id, in order, the verdict itself is checked first and then each
    verdict referenced by its ``lineage.context`` (one level only). The
    lookup is best-effort: any exception raised while inspecting a verdict
    is swallowed and the search moves on.

    Returns None if no thread_ts found in lineage.
    """

    def _thread_ts(record):
        # ``metadata.custom`` may be absent or None; treat both as empty.
        extras = getattr(record.metadata, "custom", {}) or {}
        return extras.get("slack_thread_ts")

    for verdict_id in verdict_ids:
        try:
            record = verdict_store.get(verdict_id)
            if record is None:
                continue

            found = _thread_ts(record)
            if found:
                return found

            # Walk up lineage
            for parent_id in (record.lineage.context or []):
                try:
                    parent = verdict_store.get(parent_id)
                    if not parent:
                        continue
                    found = _thread_ts(parent)
                    if found:
                        return found
                except Exception:
                    pass
        except Exception:
            pass
    return None
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
"""Prometheus query helpers for live correlation."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import uuid
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
import httpx
|
|
7
|
+
import structlog
|
|
8
|
+
|
|
9
|
+
from nthlayer_workers.correlate.types import EventType, SitRepEvent
|
|
10
|
+
|
|
11
|
+
logger = structlog.get_logger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
async def fetch_alerts(
    client: httpx.AsyncClient,
    prometheus_url: str,
    services: set[str],
) -> list[SitRepEvent]:
    """Fetch currently firing alerts from Prometheus, filtered to given services.

    Issues ``GET {prometheus_url}/api/v1/alerts`` and turns every alert whose
    state is ``"firing"`` and whose ``service`` label is in ``services`` into
    an ``EventType.ALERT`` event. HTTP and decoding failures are logged at
    debug level; whatever was collected up to that point is returned.
    """
    collected: list[SitRepEvent] = []
    try:
        response = await client.get(f"{prometheus_url}/api/v1/alerts", timeout=10.0)
        response.raise_for_status()
        for item in response.json().get("data", {}).get("alerts", []):
            if item.get("state") == "firing":
                tags = item.get("labels", {})
                svc = tags.get("service", "")
                if svc in services:
                    collected.append(
                        SitRepEvent(
                            id=f"prom-alert-{uuid.uuid4().hex[:8]}",
                            # Fall back to "now" when Prometheus gives no activeAt.
                            timestamp=item.get(
                                "activeAt", datetime.now(timezone.utc).isoformat()
                            ),
                            source="prometheus",
                            type=EventType.ALERT,
                            service=svc,
                            environment=tags.get("environment", "production"),
                            severity=_alert_severity(tags.get("severity", "warning")),
                            payload={
                                "alert_name": tags.get("alertname", "unknown"),
                                "labels": tags,
                                "annotations": item.get("annotations", {}),
                            },
                        )
                    )
    except (httpx.HTTPError, ValueError, KeyError) as exc:
        logger.debug("Failed to fetch Prometheus alerts", error=str(exc))
    return collected
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
async def fetch_metric_breaches(
    client: httpx.AsyncClient,
    prometheus_url: str,
    services: set[str],
    window_minutes: int = 30,
) -> list[SitRepEvent]:
    """Query Prometheus for SLO metric breaches on given services.

    For every service three instant queries are issued, in this order, and an
    ``EventType.METRIC_BREACH`` event is emitted for each one that breaches:

    * ``slo:error_budget:ratio`` below 0
    * ``slo:http_request_duration_seconds:p99`` above 0.5
    * ``service:http_errors:rate5m`` above 0.01

    Queries that return no usable sample are skipped. All events from one
    call share the same timestamp.

    ``window_minutes`` is accepted for interface compatibility but is not
    used: every check is an instant query.
    """
    # (metric, breach label, breach predicate, severity mapping)
    checks = (
        (
            "slo:error_budget:ratio",
            "error_budget_exhausted",
            lambda v: v < 0.0,
            lambda v: min(1.0, abs(v) * 5),  # 20% deficit → severity 1.0
        ),
        (
            "slo:http_request_duration_seconds:p99",
            "latency_exceeded",
            lambda v: v > 0.5,  # >500ms is concerning
            lambda v: min(1.0, v / 2.0),  # 2s p99 → severity 1.0
        ),
        (
            "service:http_errors:rate5m",
            "error_rate_elevated",
            lambda v: v > 0.01,  # >1% error rate
            lambda v: min(1.0, v * 10),  # 10% error rate → severity 1.0
        ),
    )

    events: list[SitRepEvent] = []
    stamp = datetime.now(timezone.utc).isoformat()

    for service in services:
        # Escape the name for use inside a double-quoted PromQL string so a
        # quote or backslash cannot break out of the label matcher.
        label_value = (
            service.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
        )
        for metric, breach, is_breach, severity_of in checks:
            value = await _query_instant(
                client, prometheus_url,
                f'{metric}{{service="{label_value}"}}',
            )
            if value is None or not is_breach(value):
                continue
            events.append(SitRepEvent(
                id=f"prom-metric-{uuid.uuid4().hex[:8]}",
                timestamp=stamp,
                source="prometheus",
                type=EventType.METRIC_BREACH,
                service=service,
                environment="production",
                severity=severity_of(value),
                payload={
                    "metric": metric,
                    "value": value,
                    "breach": breach,
                },
            ))

    return events
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def verdict_to_event(verdict) -> SitRepEvent:
    """Convert a verdict from the verdict store into a SitRepEvent.

    The event id is the verdict id prefixed with ``verdict-``. Severity is
    the judgment's confidence when the action is ``"flag"`` and a fixed 0.2
    otherwise. The environment is always ``"production"``.
    """
    extras = getattr(verdict.metadata, "custom", {}) or {}

    event_id = f"verdict-{verdict.id}"

    # Timestamps may be datetime-like objects or already plain values.
    if hasattr(verdict.timestamp, "isoformat"):
        when = verdict.timestamp.isoformat()
    else:
        when = str(verdict.timestamp)

    origin = verdict.producer.system
    target = verdict.subject.ref or verdict.subject.service or "unknown"

    if verdict.judgment.action == "flag":
        level = verdict.judgment.confidence
    else:
        level = 0.2

    details = {
        "verdict_id": verdict.id,
        "action": verdict.judgment.action,
        "confidence": verdict.judgment.confidence,
        "slo_name": extras.get("slo_name"),
        "slo_type": extras.get("slo_type"),
        "breach": extras.get("breach"),
    }

    return SitRepEvent(
        id=event_id,
        timestamp=when,
        source=origin,
        type=EventType.VERDICT,
        service=target,
        environment="production",
        severity=level,
        payload=details,
    )
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def load_dependency_graph(specs_dir: str) -> dict[str, dict]:
    """Load service dependency graph from OpenSRM specs directory.

    Every ``*.yaml`` file directly inside ``specs_dir`` is read (sorted by
    name). The service name comes from ``metadata.name`` (falling back to the
    file stem), the tier from ``metadata.tier`` (default ``"standard"``) and
    dependencies from the ``name`` of each mapping in ``spec.dependencies``.
    Files that cannot be read or parsed, or whose top level is not a mapping,
    are skipped. Malformed dependency entries are ignored.

    Returns dict mapping service name → {tier, dependencies, dependents}.
    An empty dict is returned when ``specs_dir`` is not a directory.
    """
    from pathlib import Path

    graph: dict[str, dict] = {}
    specs_path = Path(specs_dir)
    if not specs_path.is_dir():
        return graph

    # Deferred so a missing specs directory does not require PyYAML.
    import yaml

    for spec_file in sorted(specs_path.glob("*.yaml")):
        try:
            raw = yaml.safe_load(spec_file.read_text(encoding="utf-8"))
        except Exception:
            # Best effort: one unreadable spec must not block the others.
            continue
        if not isinstance(raw, dict):
            continue

        # ``metadata:`` / ``spec:`` with no value parse as None, not {}.
        metadata = raw.get("metadata")
        if not isinstance(metadata, dict):
            metadata = {}
        spec = raw.get("spec")
        if not isinstance(spec, dict):
            spec = {}

        name = metadata.get("name")
        service = name if isinstance(name, str) and name else spec_file.stem
        tier = metadata.get("tier", "standard")

        declared = spec.get("dependencies")
        if not isinstance(declared, list):
            declared = []
        # Skip entries without a usable name instead of aborting the load.
        deps = [
            d["name"]
            for d in declared
            if isinstance(d, dict) and isinstance(d.get("name"), str) and d["name"]
        ]

        graph[service] = {"tier": tier, "dependencies": deps, "dependents": []}

    # Build reverse dependencies
    for svc, info in graph.items():
        for dep in info["dependencies"]:
            if dep in graph:
                graph[dep]["dependents"].append(svc)

    return graph
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def blast_radius_services(
    trigger_service: str,
    dependency_graph: dict[str, dict],
) -> set[str]:
    """Compute blast radius: trigger service + dependents (upstream consumers) + dependencies (downstream).

    Dependents are followed transitively (everything that directly or
    indirectly depends on the trigger). Dependencies are included one level
    deep only. Services missing from ``dependency_graph`` are treated as
    having no edges.
    """
    affected = {trigger_service}

    # Walk dependents (who depends on the trigger service?). A stack gives
    # O(1) pops; visiting order is irrelevant because the result is a set.
    pending = list(dependency_graph.get(trigger_service, {}).get("dependents", []))
    while pending:
        svc = pending.pop()
        if svc in affected:
            continue  # already visited; also guards against cycles
        affected.add(svc)
        pending.extend(dependency_graph.get(svc, {}).get("dependents", []))

    # Also include dependencies (downstream services)
    affected.update(dependency_graph.get(trigger_service, {}).get("dependencies", []))
    return affected
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
async def _query_instant(
    client: httpx.AsyncClient,
    prometheus_url: str,
    query: str,
) -> float | None:
    """Execute a PromQL instant query and return the scalar value.

    Sends ``GET {prometheus_url}/api/v1/query`` with the query as a
    parameter. For a ``scalar`` result the value itself is returned; for
    other result types the value of the first sample is used.

    Returns None when the request fails, the result is empty, the value is
    NaN, or the response is not shaped as expected.
    """
    try:
        resp = await client.get(
            f"{prometheus_url}/api/v1/query",
            params={"query": query},
            timeout=10.0,
        )
        resp.raise_for_status()
        data = resp.json().get("data") or {}
        results = data.get("result") or []
        if not results:
            return None
        if data.get("resultType") == "scalar":
            # Scalar results are a bare [timestamp, "value"] pair rather
            # than a list of series.
            sample = results
        else:
            sample = results[0].get("value", [None, None])
        val = float(sample[1])
        if val != val:  # NaN
            return None
        return val
    except (
        httpx.HTTPError,
        ValueError,
        KeyError,
        IndexError,
        TypeError,
        AttributeError,  # unexpected JSON shape, e.g. a non-dict sample
    ):
        return None
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _alert_severity(severity_label: str) -> float:
    """Map Prometheus alert severity label to 0.0-1.0.

    ``critical`` → 0.95, ``warning`` → 0.6, ``info`` → 0.3; anything else
    (including non-string values) → 0.5. Matching ignores case and
    surrounding whitespace so ``"Critical"`` is not scored as unknown.
    """
    mapping = {"critical": 0.95, "warning": 0.6, "info": 0.3}
    if not isinstance(severity_label, str):
        return 0.5
    return mapping.get(severity_label.strip().lower(), 0.5)
|