nthlayer-workers 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nthlayer_workers/__init__.py +5 -0
- nthlayer_workers/cli.py +234 -0
- nthlayer_workers/correlate/__init__.py +1 -0
- nthlayer_workers/correlate/cli.py +847 -0
- nthlayer_workers/correlate/config.py +111 -0
- nthlayer_workers/correlate/correlation/__init__.py +1 -0
- nthlayer_workers/correlate/correlation/changes.py +87 -0
- nthlayer_workers/correlate/correlation/dedup.py +62 -0
- nthlayer_workers/correlate/correlation/engine.py +244 -0
- nthlayer_workers/correlate/correlation/temporal.py +79 -0
- nthlayer_workers/correlate/correlation/topology.py +104 -0
- nthlayer_workers/correlate/ingestion/__init__.py +1 -0
- nthlayer_workers/correlate/ingestion/protocol.py +10 -0
- nthlayer_workers/correlate/ingestion/severity.py +18 -0
- nthlayer_workers/correlate/ingestion/webhook.py +197 -0
- nthlayer_workers/correlate/notifications.py +85 -0
- nthlayer_workers/correlate/prometheus.py +234 -0
- nthlayer_workers/correlate/reasoning.py +375 -0
- nthlayer_workers/correlate/session.py +189 -0
- nthlayer_workers/correlate/snapshot/__init__.py +1 -0
- nthlayer_workers/correlate/snapshot/generator.py +170 -0
- nthlayer_workers/correlate/snapshot/model.py +177 -0
- nthlayer_workers/correlate/snapshot/token.py +14 -0
- nthlayer_workers/correlate/state.py +88 -0
- nthlayer_workers/correlate/store/__init__.py +5 -0
- nthlayer_workers/correlate/store/protocol.py +48 -0
- nthlayer_workers/correlate/store/sqlite.py +443 -0
- nthlayer_workers/correlate/summary.py +180 -0
- nthlayer_workers/correlate/traces/__init__.py +1 -0
- nthlayer_workers/correlate/traces/protocol.py +120 -0
- nthlayer_workers/correlate/traces/tempo.py +667 -0
- nthlayer_workers/correlate/traces/topology.py +39 -0
- nthlayer_workers/correlate/types.py +77 -0
- nthlayer_workers/correlate/worker.py +630 -0
- nthlayer_workers/learn/__init__.py +5 -0
- nthlayer_workers/learn/__main__.py +5 -0
- nthlayer_workers/learn/cli.py +164 -0
- nthlayer_workers/learn/retrospective.py +381 -0
- nthlayer_workers/learn/trends.py +102 -0
- nthlayer_workers/learn/worker.py +366 -0
- nthlayer_workers/measure/__init__.py +3 -0
- nthlayer_workers/measure/__main__.py +5 -0
- nthlayer_workers/measure/_parsing.py +15 -0
- nthlayer_workers/measure/adapters/__init__.py +0 -0
- nthlayer_workers/measure/adapters/_util.py +24 -0
- nthlayer_workers/measure/adapters/devin.py +119 -0
- nthlayer_workers/measure/adapters/gastown.py +88 -0
- nthlayer_workers/measure/adapters/prometheus.py +277 -0
- nthlayer_workers/measure/adapters/protocol.py +20 -0
- nthlayer_workers/measure/adapters/webhook.py +161 -0
- nthlayer_workers/measure/api/__init__.py +0 -0
- nthlayer_workers/measure/api/normalise.py +50 -0
- nthlayer_workers/measure/api/queue.py +243 -0
- nthlayer_workers/measure/api/response.py +51 -0
- nthlayer_workers/measure/api/server.py +504 -0
- nthlayer_workers/measure/calibration/__init__.py +0 -0
- nthlayer_workers/measure/calibration/loop.py +62 -0
- nthlayer_workers/measure/calibration/slos.py +212 -0
- nthlayer_workers/measure/calibration/verdict_calibration.py +31 -0
- nthlayer_workers/measure/cli.py +753 -0
- nthlayer_workers/measure/config.py +191 -0
- nthlayer_workers/measure/detection/__init__.py +6 -0
- nthlayer_workers/measure/detection/detector.py +82 -0
- nthlayer_workers/measure/detection/protocol.py +29 -0
- nthlayer_workers/measure/governance/__init__.py +0 -0
- nthlayer_workers/measure/governance/engine.py +163 -0
- nthlayer_workers/measure/manifest.py +77 -0
- nthlayer_workers/measure/notifications.py +53 -0
- nthlayer_workers/measure/pipeline/__init__.py +0 -0
- nthlayer_workers/measure/pipeline/evaluator.py +155 -0
- nthlayer_workers/measure/pipeline/router.py +160 -0
- nthlayer_workers/measure/store/__init__.py +0 -0
- nthlayer_workers/measure/store/protocol.py +38 -0
- nthlayer_workers/measure/store/sqlite.py +276 -0
- nthlayer_workers/measure/telemetry.py +116 -0
- nthlayer_workers/measure/tiering/__init__.py +0 -0
- nthlayer_workers/measure/tiering/classifier.py +58 -0
- nthlayer_workers/measure/tiering/promotion.py +118 -0
- nthlayer_workers/measure/trends/__init__.py +0 -0
- nthlayer_workers/measure/trends/tracker.py +72 -0
- nthlayer_workers/measure/types.py +75 -0
- nthlayer_workers/measure/worker.py +439 -0
- nthlayer_workers/observe/__init__.py +25 -0
- nthlayer_workers/observe/__main__.py +5 -0
- nthlayer_workers/observe/api/__init__.py +1 -0
- nthlayer_workers/observe/assessment.py +95 -0
- nthlayer_workers/observe/cli.py +737 -0
- nthlayer_workers/observe/config.py +11 -0
- nthlayer_workers/observe/db/__init__.py +1 -0
- nthlayer_workers/observe/decision_records.py +220 -0
- nthlayer_workers/observe/dependencies/__init__.py +18 -0
- nthlayer_workers/observe/dependencies/discovery.py +294 -0
- nthlayer_workers/observe/dependencies/providers/__init__.py +48 -0
- nthlayer_workers/observe/dependencies/providers/backstage.py +467 -0
- nthlayer_workers/observe/dependencies/providers/base.py +76 -0
- nthlayer_workers/observe/dependencies/providers/consul.py +518 -0
- nthlayer_workers/observe/dependencies/providers/etcd.py +360 -0
- nthlayer_workers/observe/dependencies/providers/kubernetes.py +682 -0
- nthlayer_workers/observe/dependencies/providers/prometheus.py +368 -0
- nthlayer_workers/observe/dependencies/providers/zookeeper.py +399 -0
- nthlayer_workers/observe/deployments/__init__.py +1 -0
- nthlayer_workers/observe/discovery/__init__.py +14 -0
- nthlayer_workers/observe/discovery/classifier.py +66 -0
- nthlayer_workers/observe/discovery/client.py +189 -0
- nthlayer_workers/observe/discovery/models.py +53 -0
- nthlayer_workers/observe/drift/__init__.py +26 -0
- nthlayer_workers/observe/drift/analyzer.py +383 -0
- nthlayer_workers/observe/drift/models.py +174 -0
- nthlayer_workers/observe/drift/patterns.py +88 -0
- nthlayer_workers/observe/explanation.py +118 -0
- nthlayer_workers/observe/gate/__init__.py +39 -0
- nthlayer_workers/observe/gate/conditions.py +92 -0
- nthlayer_workers/observe/gate/correlator.py +154 -0
- nthlayer_workers/observe/gate/evaluator.py +192 -0
- nthlayer_workers/observe/gate/policies.py +226 -0
- nthlayer_workers/observe/gate_adapter.py +40 -0
- nthlayer_workers/observe/incident.py +36 -0
- nthlayer_workers/observe/portfolio/__init__.py +17 -0
- nthlayer_workers/observe/portfolio/aggregator.py +168 -0
- nthlayer_workers/observe/portfolio/scorer.py +13 -0
- nthlayer_workers/observe/slo/__init__.py +19 -0
- nthlayer_workers/observe/slo/collector.py +235 -0
- nthlayer_workers/observe/slo/spec_loader.py +40 -0
- nthlayer_workers/observe/sqlite_store.py +152 -0
- nthlayer_workers/observe/store.py +92 -0
- nthlayer_workers/observe/verification/__init__.py +22 -0
- nthlayer_workers/observe/verification/exporter_guidance.py +146 -0
- nthlayer_workers/observe/verification/extractor.py +127 -0
- nthlayer_workers/observe/verification/models.py +101 -0
- nthlayer_workers/observe/verification/verifier.py +111 -0
- nthlayer_workers/observe/worker.py +332 -0
- nthlayer_workers/respond/__init__.py +2 -0
- nthlayer_workers/respond/__main__.py +4 -0
- nthlayer_workers/respond/agents/__init__.py +0 -0
- nthlayer_workers/respond/agents/base.py +556 -0
- nthlayer_workers/respond/agents/communication.py +115 -0
- nthlayer_workers/respond/agents/investigation.py +124 -0
- nthlayer_workers/respond/agents/remediation.py +219 -0
- nthlayer_workers/respond/agents/triage.py +132 -0
- nthlayer_workers/respond/cli.py +772 -0
- nthlayer_workers/respond/config.py +135 -0
- nthlayer_workers/respond/context_store.py +256 -0
- nthlayer_workers/respond/coordinator.py +487 -0
- nthlayer_workers/respond/metrics.py +104 -0
- nthlayer_workers/respond/notification_backends/__init__.py +1 -0
- nthlayer_workers/respond/notification_backends/ntfy_backend.py +158 -0
- nthlayer_workers/respond/notification_backends/protocol.py +59 -0
- nthlayer_workers/respond/notification_backends/slack_backend.py +203 -0
- nthlayer_workers/respond/notification_backends/stdout_backend.py +56 -0
- nthlayer_workers/respond/notifications.py +247 -0
- nthlayer_workers/respond/oncall/__init__.py +1 -0
- nthlayer_workers/respond/oncall/escalation.py +103 -0
- nthlayer_workers/respond/oncall/runner.py +193 -0
- nthlayer_workers/respond/oncall/schedule.py +243 -0
- nthlayer_workers/respond/safe_actions/__init__.py +0 -0
- nthlayer_workers/respond/safe_actions/actions.py +139 -0
- nthlayer_workers/respond/safe_actions/registry.py +171 -0
- nthlayer_workers/respond/safe_actions/webhook.py +194 -0
- nthlayer_workers/respond/server.py +357 -0
- nthlayer_workers/respond/sre/__init__.py +1 -0
- nthlayer_workers/respond/sre/brief.py +175 -0
- nthlayer_workers/respond/sre/delegation.py +101 -0
- nthlayer_workers/respond/sre/post_incident.py +146 -0
- nthlayer_workers/respond/sre/shift_report.py +129 -0
- nthlayer_workers/respond/sre/suppression.py +91 -0
- nthlayer_workers/respond/types.py +109 -0
- nthlayer_workers/respond/verdict_submission.py +56 -0
- nthlayer_workers/respond/worker.py +533 -0
- nthlayer_workers/respond/worker_helpers.py +140 -0
- nthlayer_workers/runner.py +198 -0
- nthlayer_workers-1.0.0.dist-info/METADATA +19 -0
- nthlayer_workers-1.0.0.dist-info/RECORD +175 -0
- nthlayer_workers-1.0.0.dist-info/WHEEL +5 -0
- nthlayer_workers-1.0.0.dist-info/entry_points.txt +2 -0
- nthlayer_workers-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,847 @@
|
|
|
1
|
+
"""SitRep CLI — serve, status, replay."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import asyncio
|
|
6
|
+
import os
|
|
7
|
+
import re
|
|
8
|
+
import signal
|
|
9
|
+
import sys
|
|
10
|
+
import tempfile
|
|
11
|
+
from dataclasses import replace
|
|
12
|
+
from datetime import datetime, timedelta, timezone
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
import structlog
|
|
16
|
+
import yaml
|
|
17
|
+
|
|
18
|
+
from nthlayer_workers.correlate.config import SitRepConfig, load_config
|
|
19
|
+
from nthlayer_workers.correlate.correlation.changes import find_change_candidates
|
|
20
|
+
from nthlayer_workers.correlate.correlation.dedup import deduplicate
|
|
21
|
+
from nthlayer_workers.correlate.correlation.engine import CorrelationEngine
|
|
22
|
+
from nthlayer_workers.correlate.correlation.temporal import group_temporal
|
|
23
|
+
from nthlayer_workers.correlate.correlation.topology import group_topology
|
|
24
|
+
from nthlayer_workers.correlate.ingestion.severity import pre_score
|
|
25
|
+
from nthlayer_workers.correlate.snapshot.generator import SnapshotBudget, SnapshotGenerator
|
|
26
|
+
from nthlayer_workers.correlate.snapshot.model import ModelInterface
|
|
27
|
+
from nthlayer_workers.correlate.state import StateMachine
|
|
28
|
+
from nthlayer_workers.correlate.store.sqlite import SQLiteEventStore
|
|
29
|
+
from nthlayer_workers.correlate.types import AgentState, EventType, SitRepEvent
|
|
30
|
+
|
|
31
|
+
# Module-level structlog logger used by the CLI commands in this file.
logger = structlog.get_logger()
|
|
32
|
+
|
|
33
|
+
# Fixed, timezone-aware origin ("T+0m") that parse_relative_time() adds
# scenario offsets to.
REFERENCE_TIME = datetime(2026, 1, 1, tzinfo=timezone.utc)
|
|
34
|
+
|
|
35
|
+
# Confidence decay window for temporal proximity heuristic (30 minutes)
|
|
36
|
+
# Divisor of the linear decay in _proximity_confidence(): a gap of this many
# seconds yields 1.0 - 1.0 = 0.0 before clamping (1800 s = 30 minutes).
_PROXIMITY_WINDOW_SECONDS = 1800.0
|
|
37
|
+
|
|
38
|
+
_DURATION_UNITS = {"ms": 0.001, "s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _parse_duration(value: str) -> timedelta:
|
|
42
|
+
"""Parse a duration string (e.g. '1h', '30m', '2h') to timedelta."""
|
|
43
|
+
for suffix, multiplier in sorted(_DURATION_UNITS.items(), key=lambda x: -len(x[0])):
|
|
44
|
+
if value.endswith(suffix):
|
|
45
|
+
return timedelta(seconds=float(value[:-len(suffix)]) * multiplier)
|
|
46
|
+
return timedelta(hours=1) # default fallback
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _proximity_confidence(seconds: float | None) -> float:
|
|
50
|
+
"""Heuristic confidence from temporal proximity of a change to a signal.
|
|
51
|
+
|
|
52
|
+
Decays linearly from 1.0 (simultaneous) to 0.0 (30 minutes apart).
|
|
53
|
+
Returns 0.5 for unknown proximity.
|
|
54
|
+
"""
|
|
55
|
+
if seconds is None:
|
|
56
|
+
return 0.5
|
|
57
|
+
from nthlayer_common.parsing import clamp
|
|
58
|
+
return clamp(1.0 - seconds / _PROXIMITY_WINDOW_SECONDS)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def parse_relative_time(at_str: str) -> str:
    """Convert a scenario offset of the form 'T+Nm' to an ISO 8601 timestamp.

    The offset is N whole minutes after ``REFERENCE_TIME``. Surrounding
    whitespace is ignored; anything else before or after the ``T+Nm`` token is
    rejected rather than silently dropped.

    Args:
        at_str: Offset string such as ``"T+15m"``.

    Returns:
        ``REFERENCE_TIME + N minutes`` formatted with ``datetime.isoformat()``.

    Raises:
        ValueError: If ``at_str`` is not a string or does not match 'T+Nm'.
    """
    # YAML can yield non-strings (e.g. a bare integer). Report those with the
    # same ValueError as any other malformed offset, not a TypeError from re.
    match = re.fullmatch(r"T\+(\d+)m", at_str.strip()) if isinstance(at_str, str) else None
    if not match:
        raise ValueError(f"Invalid time format: {at_str}")
    minutes = int(match.group(1))
    ts = REFERENCE_TIME + timedelta(minutes=minutes)
    return ts.isoformat()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def scenario_event_to_sitrep(evt_data: dict, index: int) -> SitRepEvent:
    """Build a SitRepEvent from one entry of a scenario's ``events`` list.

    Reads the ``at``, ``payload`` and ``type`` keys of ``evt_data``. The event
    id is derived from ``index``, the service comes from ``payload["service"]``
    (``"unknown"`` when absent), and source, environment and severity are fixed
    to ``"scenario"``, ``"production"`` and ``0.5``.
    """
    timestamp = parse_relative_time(evt_data["at"])
    body = evt_data["payload"]
    # Resolved before the type lookup so failures surface in the same order.
    service_name = body.get("service", "unknown")
    fields = {
        "id": f"scenario-evt-{index:04d}",
        "timestamp": timestamp,
        "source": "scenario",
        "type": EventType(evt_data["type"]),
        "service": service_name,
        "environment": "production",
        "severity": 0.5,
        "payload": body,
    }
    return SitRepEvent(**fields)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _build_topology_dict(scenario: dict) -> dict[str, Any] | None:
|
|
89
|
+
"""Build the topology dict expected by the correlation engine from scenario YAML."""
|
|
90
|
+
topo_section = scenario.get("topology")
|
|
91
|
+
if not topo_section:
|
|
92
|
+
return None
|
|
93
|
+
services = topo_section.get("services", [])
|
|
94
|
+
if not services:
|
|
95
|
+
return None
|
|
96
|
+
result: dict[str, Any] = {}
|
|
97
|
+
for svc in services:
|
|
98
|
+
name = svc["name"]
|
|
99
|
+
result[name] = {
|
|
100
|
+
"tier": svc.get("tier", "standard"),
|
|
101
|
+
"dependencies": svc.get("dependencies", []),
|
|
102
|
+
"dependents": svc.get("dependents", []),
|
|
103
|
+
}
|
|
104
|
+
return result
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def replay_command(
    scenario_path: str,
    config_path: str | None,
    no_model: bool,
    store_dir: str,
) -> int:
    """Replay a scenario fixture and print a correlation report.

    The scenario's events are written to ``<store_dir>/replay.db``, then
    deduplicated, severity-scored, grouped by time and topology, and matched
    against change candidates. Unless ``no_model`` is set, the resulting groups
    are also sent to the model for verdicts.

    Args:
        scenario_path: Path to the scenario YAML file. The scenario may sit at
            the document root or under a top-level ``scenario`` key.
        config_path: Optional config file, used only on the model path.
        no_model: Skip the model call when true.
        store_dir: Directory in which the replay database is created.

    Returns:
        0 on success; 2 when the scenario file cannot be read, is not valid
        YAML, or does not contain a mapping.
    """
    # Load scenario
    try:
        with open(scenario_path, encoding="utf-8") as f:
            raw = yaml.safe_load(f)
    except (OSError, UnicodeError, yaml.YAMLError) as exc:
        logger.error("scenario_load_failed", path=scenario_path, error=str(exc))
        return 2

    # safe_load returns None for an empty document and may return a list or
    # scalar, so reject anything that is not a mapping before calling .get().
    scenario = raw.get("scenario", raw) if isinstance(raw, dict) else None
    if not isinstance(scenario, dict):
        logger.error(
            "scenario_load_failed",
            path=scenario_path,
            error="scenario document is not a YAML mapping",
        )
        return 2

    # Parse events; malformed entries are logged and skipped. "events: null"
    # is treated the same as a missing key.
    events: list[SitRepEvent] = []
    for i, evt_data in enumerate(scenario.get("events") or []):
        try:
            events.append(scenario_event_to_sitrep(evt_data, i))
        except (ValueError, KeyError, TypeError, AttributeError) as exc:
            logger.warning("event_parse_failed", index=i, error=str(exc))

    # Build topology
    topology = _build_topology_dict(scenario)

    # Open temp store
    db_path = os.path.join(store_dir, "replay.db")
    store = SQLiteEventStore(db_path)

    try:
        groups = []
        if events:
            # Insert events
            store.insert_batch(events)

            # Compute the actual time window from events for correlation
            timestamps = [e.timestamp for e in events]
            start_ts = min(timestamps)
            end_ts = max(timestamps)
            # Add a buffer to ensure the window captures all events
            end_dt = datetime.fromisoformat(end_ts.replace("Z", "+00:00"))
            end_buffered = (end_dt + timedelta(minutes=1)).isoformat()

            # Run correlation sub-steps manually (since engine uses datetime.now())
            all_events = store.get_by_time_window(start_ts, end_buffered)

            if all_events:
                # Deduplicate
                deduped = deduplicate(all_events)

                # Severity enrichment
                enriched = []
                for event in deduped:
                    new_severity = pre_score(event, None)
                    if new_severity != event.severity:
                        event = replace(event, severity=new_severity)
                    enriched.append(event)

                # Compute window minutes from the event spread (at least 5)
                start_dt = datetime.fromisoformat(start_ts.replace("Z", "+00:00"))
                window_minutes = max(
                    int((end_dt - start_dt).total_seconds() / 60) + 1,
                    5,
                )

                # Temporal grouping
                temporal_groups = group_temporal(enriched, window_minutes=window_minutes)

                # Topology grouping
                topology_correlations = group_topology(temporal_groups, topology)

                # Change candidate indexing
                # Since get_recent_changes uses datetime('now'), we query the store
                # directly for change events in our scenario window
                change_candidates_map = find_change_candidates(
                    store, temporal_groups, topology=topology,
                    window_minutes=window_minutes + 30,
                )

                # Assemble correlation groups using the engine helper
                engine = CorrelationEngine()
                groups = engine.assemble_groups(
                    temporal_groups, topology_correlations,
                    change_candidates_map, topology,
                )

        # Report
        scenario_id = scenario.get("id", "unknown")
        print(f"\n=== Replay: {scenario_id} ===")
        print(f"Events inserted: {len(events)}")
        print(f"Correlation groups found: {len(groups)}")

        services_affected: set[str] = set()
        total_changes = 0
        for g in groups:
            services_affected.update(g.services)
            total_changes += len(g.change_candidates)
            print(f"  [{g.id}] P{g.priority}: {g.summary}")

        print(f"Services affected: {sorted(services_affected)}")
        print(f"Change candidates: {total_changes}")

        if not no_model:
            # Model-enabled path
            config = load_config(config_path) if config_path else SitRepConfig()
            generator = SnapshotGenerator(SnapshotBudget(config.token_budget))
            model = ModelInterface(config.model_name, config.model_max_tokens)

            if groups:
                prompt, cache_hit = generator.generate(groups, AgentState.WATCHING)
                try:
                    verdicts = asyncio.run(model.interpret(prompt, groups))
                    print(f"Verdicts created: {len(verdicts)}")
                except Exception as exc:
                    # Best effort: a failed model call must not fail the replay.
                    logger.warning("model_call_failed", error=str(exc))
                    print("Model call failed, skipping verdicts")
        else:
            print("Model: skipped (--no-model)")

        print()
        return 0

    finally:
        store.close()
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def status_command(config_path: str | None, store_dir: str | None = None) -> int:
    """Print event-store statistics and the database size. Returns exit code.

    The database is ``<store_dir>/sitrep-events.db`` when ``store_dir`` is
    given (for testing), otherwise the ``store_path`` from the loaded (or
    default) configuration. The agent state line always reports the WATCHING
    state.
    """
    config = SitRepConfig() if not config_path else load_config(config_path)
    db_path = os.path.join(store_dir, "sitrep-events.db") if store_dir else config.store_path

    store = SQLiteEventStore(db_path)
    try:
        stats = store.get_stats()

        print("\n=== SitRep Status ===")
        print(f"Agent state: {AgentState.WATCHING.value}")
        print(f"Event count: {stats['event_count']}")
        print(f"Oldest event: {stats['min_timestamp'] or 'none'}")
        print(f"Newest event: {stats['max_timestamp'] or 'none'}")

        # Human-readable database size: bytes, then KB, then MB
        size_bytes = os.path.getsize(db_path) if os.path.exists(db_path) else None
        if size_bytes is None:
            size_str = "0 B"
        elif size_bytes < 1024:
            size_str = f"{size_bytes} B"
        elif size_bytes < 1024 * 1024:
            size_str = f"{size_bytes / 1024:.1f} KB"
        else:
            size_str = f"{size_bytes / (1024 * 1024):.1f} MB"
        print(f"DB size: {size_str}")

        print()
        return 0
    finally:
        store.close()
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
async def _serve_loop(config: SitRepConfig) -> None:
    """Run the full serve pipeline until SIGTERM or SIGINT.

    Starts the webhook ingester, then repeats on the state machine's interval:
    drain queued events into the SQLite store, correlate, update the state
    machine, and call the model when the snapshot is not a cache hit. A failed
    cycle is logged and the loop continues.

    The store is closed on every exit path, including failures during setup,
    and the ingester is stopped whenever it was started.
    """
    from nthlayer_workers.correlate.ingestion.webhook import WebhookIngester

    store = SQLiteEventStore(config.store_path)
    started = False  # True once ingester.start() has returned
    try:
        ingester = WebhookIngester(config.ingestion_host, config.ingestion_port)
        engine = CorrelationEngine()
        generator = SnapshotGenerator(SnapshotBudget(config.token_budget))
        model = ModelInterface(config.model_name, config.model_max_tokens)
        state_machine = StateMachine()

        # Buffer events via queue to avoid concurrent SQLite writes
        event_queue: asyncio.Queue = asyncio.Queue(maxsize=10000)
        ingester.on_event(lambda event: event_queue.put_nowait(event))

        # Start ingester
        await ingester.start()
        started = True
        logger.info(
            "sitrep_started",
            host=config.ingestion_host,
            port=config.ingestion_port,
        )

        # Shutdown event
        shutdown = asyncio.Event()

        def _handle_signal() -> None:
            logger.info("shutdown_requested")
            shutdown.set()

        loop = asyncio.get_running_loop()
        for sig in (signal.SIGTERM, signal.SIGINT):
            loop.add_signal_handler(sig, _handle_signal)

        # Optionally open verdict store
        verdict_store = None
        try:
            from nthlayer_learn.store import VerdictStore
            verdict_store = VerdictStore(config.verdict_store_path)
        except Exception:
            # The verdict store is optional; run without it.
            logger.info("verdict_store_not_available")

        while not shutdown.is_set():
            interval = state_machine.get_interval()
            try:
                await asyncio.wait_for(shutdown.wait(), timeout=interval)
                break  # shutdown requested
            except asyncio.TimeoutError:
                pass  # normal cycle

            # Drain buffered events into store (single-threaded, safe)
            while not event_queue.empty():
                try:
                    event = event_queue.get_nowait()
                    store.insert(event)
                except asyncio.QueueEmpty:
                    break

            # Run correlation
            try:
                groups = engine.correlate(
                    store,
                    config.correlation_window_minutes,
                    topology=None,  # No manifests in Tier 1
                )

                model_healthy = True
                state_machine.update(groups, model_healthy)

                prompt, cache_hit = generator.generate(groups, state_machine.state)

                if not cache_hit and groups:
                    try:
                        await model.interpret(prompt, groups, verdict_store)
                    except Exception as exc:
                        logger.warning("model_call_failed", error=str(exc))
                        state_machine.update(groups, model_healthy=False)

                logger.info(
                    "correlation_cycle",
                    state=state_machine.state.value,
                    groups=len(groups),
                    cache_hit=cache_hit,
                )

            except Exception as exc:
                # One bad cycle must not take the service down.
                logger.error("correlation_error", error=str(exc))

    finally:
        try:
            if started:
                await ingester.stop()
        finally:
            # Close the store even when setup failed or stop() raised.
            store.close()
        if started:
            logger.info("sitrep_stopped")
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def serve_command(config_path: str | None) -> int:
    """Run the SitRep serve loop to completion and map the outcome to an exit code.

    Returns 0 when the loop ends normally or on KeyboardInterrupt, and 2 when
    the loop raises any other exception (which is logged as ``serve_failed``).
    """
    config = SitRepConfig() if not config_path else load_config(config_path)
    try:
        asyncio.run(_serve_loop(config))
    except KeyboardInterrupt:
        return 0
    except Exception as exc:
        logger.error("serve_failed", error=str(exc))
        return 2
    return 0
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def correlate_command(
|
|
392
|
+
trigger_verdict_id: str,
|
|
393
|
+
prometheus_url: str,
|
|
394
|
+
specs_dir: str,
|
|
395
|
+
verdict_store_path: str,
|
|
396
|
+
respond_args: str | None = None,
|
|
397
|
+
reasoning: bool = True,
|
|
398
|
+
reasoning_model: str | None = None,
|
|
399
|
+
decision_store_path: str | None = None,
|
|
400
|
+
trace_backend: object | None = None, # TraceBackend Protocol (not runtime_checkable)
|
|
401
|
+
trace_baseline_window: str = "1h",
|
|
402
|
+
) -> int:
|
|
403
|
+
"""Correlate signals from a trigger evaluation verdict.
|
|
404
|
+
|
|
405
|
+
Reads the trigger verdict from the store, queries Prometheus for correlated
|
|
406
|
+
signals across the blast radius, runs the correlation engine, and writes
|
|
407
|
+
a correlation verdict.
|
|
408
|
+
"""
|
|
409
|
+
from nthlayer_common.verdicts import (
|
|
410
|
+
SQLiteVerdictStore,
|
|
411
|
+
VerdictFilter,
|
|
412
|
+
create as verdict_create,
|
|
413
|
+
link as verdict_link,
|
|
414
|
+
)
|
|
415
|
+
|
|
416
|
+
from nthlayer_workers.correlate.prometheus import (
|
|
417
|
+
blast_radius_services,
|
|
418
|
+
fetch_alerts,
|
|
419
|
+
fetch_metric_breaches,
|
|
420
|
+
load_dependency_graph,
|
|
421
|
+
verdict_to_event,
|
|
422
|
+
)
|
|
423
|
+
|
|
424
|
+
log = structlog.get_logger("correlate_command")
|
|
425
|
+
|
|
426
|
+
# Open verdict store
|
|
427
|
+
verdict_store = SQLiteVerdictStore(verdict_store_path)
|
|
428
|
+
|
|
429
|
+
# Read trigger verdict
|
|
430
|
+
trigger = verdict_store.get(trigger_verdict_id)
|
|
431
|
+
if trigger is None:
|
|
432
|
+
log.error("Trigger verdict not found", verdict_id=trigger_verdict_id)
|
|
433
|
+
return 1
|
|
434
|
+
|
|
435
|
+
trigger_service = trigger.subject.ref or "unknown"
|
|
436
|
+
trigger_custom = getattr(trigger.metadata, "custom", {}) or {}
|
|
437
|
+
log.info(
|
|
438
|
+
"Trigger verdict loaded",
|
|
439
|
+
service=trigger_service,
|
|
440
|
+
slo_name=trigger_custom.get("slo_name"),
|
|
441
|
+
breach=trigger_custom.get("breach"),
|
|
442
|
+
)
|
|
443
|
+
|
|
444
|
+
# Load dependency graph from specs
|
|
445
|
+
dep_graph = load_dependency_graph(specs_dir)
|
|
446
|
+
|
|
447
|
+
# Compute blast radius
|
|
448
|
+
affected = blast_radius_services(trigger_service, dep_graph)
|
|
449
|
+
log.info("Blast radius computed", affected_services=sorted(affected))
|
|
450
|
+
|
|
451
|
+
# Gather events from Prometheus and verdict store (+ optional trace evidence)
|
|
452
|
+
async def _gather():
|
|
453
|
+
import httpx
|
|
454
|
+
|
|
455
|
+
events: list[SitRepEvent] = []
|
|
456
|
+
trace_evidence = None
|
|
457
|
+
|
|
458
|
+
async with httpx.AsyncClient() as client:
|
|
459
|
+
# 1. Prometheus alerts on affected services
|
|
460
|
+
alerts = await fetch_alerts(client, prometheus_url, affected)
|
|
461
|
+
events.extend(alerts)
|
|
462
|
+
|
|
463
|
+
# 2. Prometheus metric breaches on affected services
|
|
464
|
+
breaches = await fetch_metric_breaches(client, prometheus_url, affected)
|
|
465
|
+
events.extend(breaches)
|
|
466
|
+
|
|
467
|
+
# 3. Recent evaluation verdicts from store as events
|
|
468
|
+
recent = verdict_store.query(VerdictFilter(
|
|
469
|
+
producer_system="nthlayer-measure",
|
|
470
|
+
subject_type="evaluation",
|
|
471
|
+
limit=50,
|
|
472
|
+
))
|
|
473
|
+
for v in recent:
|
|
474
|
+
svc = v.subject.ref or v.subject.service or ""
|
|
475
|
+
if svc in affected:
|
|
476
|
+
events.append(verdict_to_event(v))
|
|
477
|
+
|
|
478
|
+
# 4. Trace evidence (optional, graceful degradation)
|
|
479
|
+
if trace_backend is not None:
|
|
480
|
+
try:
|
|
481
|
+
baseline_td = _parse_duration(trace_baseline_window)
|
|
482
|
+
end = datetime.now(tz=timezone.utc)
|
|
483
|
+
start = end - timedelta(minutes=30)
|
|
484
|
+
trace_evidence = await trace_backend.get_trace_evidence(
|
|
485
|
+
services=sorted(affected),
|
|
486
|
+
start=start,
|
|
487
|
+
end=end,
|
|
488
|
+
baseline_window=baseline_td,
|
|
489
|
+
)
|
|
490
|
+
except Exception as exc:
|
|
491
|
+
log.warning("trace_evidence_unavailable", error=str(exc))
|
|
492
|
+
|
|
493
|
+
return events, trace_evidence
|
|
494
|
+
|
|
495
|
+
events, trace_evidence = asyncio.run(_gather())
|
|
496
|
+
|
|
497
|
+
# Close trace backend client to release connections
|
|
498
|
+
if trace_backend is not None and hasattr(trace_backend, "aclose"):
|
|
499
|
+
asyncio.run(trace_backend.aclose())
|
|
500
|
+
|
|
501
|
+
log.info("Gathered events", count=len(events))
|
|
502
|
+
|
|
503
|
+
# Compute topology divergence (declared vs observed) for reasoning enrichment
|
|
504
|
+
if trace_evidence and dep_graph:
|
|
505
|
+
from nthlayer_workers.correlate.traces.topology import detect_topology_divergence
|
|
506
|
+
trace_evidence.topology_divergence = detect_topology_divergence(trace_evidence, dep_graph)
|
|
507
|
+
|
|
508
|
+
if not events:
|
|
509
|
+
log.info("No correlated events found, no correlation verdict needed")
|
|
510
|
+
return 0
|
|
511
|
+
|
|
512
|
+
# Insert events into temp store and run correlation engine
|
|
513
|
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
514
|
+
tmp_store = SQLiteEventStore(os.path.join(tmp_dir, "correlation.db"))
|
|
515
|
+
tmp_store.insert_batch(events)
|
|
516
|
+
|
|
517
|
+
engine = CorrelationEngine()
|
|
518
|
+
topology = dep_graph if dep_graph else None
|
|
519
|
+
groups = engine.correlate(tmp_store, window_minutes=30, topology=topology)
|
|
520
|
+
tmp_store.close()
|
|
521
|
+
|
|
522
|
+
if not groups:
|
|
523
|
+
log.info("No correlation groups formed")
|
|
524
|
+
return 0
|
|
525
|
+
|
|
526
|
+
log.info("Correlation groups", count=len(groups))
|
|
527
|
+
for g in groups:
|
|
528
|
+
log.info(
|
|
529
|
+
"Group",
|
|
530
|
+
priority=g.priority,
|
|
531
|
+
services=g.services,
|
|
532
|
+
event_count=g.event_count,
|
|
533
|
+
summary=g.summary,
|
|
534
|
+
)
|
|
535
|
+
|
|
536
|
+
# Reasoning layer: model-based causal analysis if enabled, else heuristic
|
|
537
|
+
reasoning_result = None
|
|
538
|
+
reasoning_mode = "heuristic"
|
|
539
|
+
|
|
540
|
+
if reasoning:
|
|
541
|
+
from nthlayer_workers.correlate.reasoning import reason_about_correlations, reasoning_available
|
|
542
|
+
|
|
543
|
+
if reasoning_available():
|
|
544
|
+
kwargs = {}
|
|
545
|
+
if reasoning_model:
|
|
546
|
+
kwargs["model"] = reasoning_model
|
|
547
|
+
if trace_evidence:
|
|
548
|
+
kwargs["trace_evidence"] = trace_evidence
|
|
549
|
+
reasoning_result = asyncio.run(
|
|
550
|
+
reason_about_correlations(groups, dep_graph, **kwargs)
|
|
551
|
+
)
|
|
552
|
+
if not reasoning_result.get("degraded", False):
|
|
553
|
+
reasoning_mode = "model"
|
|
554
|
+
log.info("Reasoning complete", mode=reasoning_mode,
|
|
555
|
+
overall_confidence=reasoning_result.get("overall_confidence"))
|
|
556
|
+
else:
|
|
557
|
+
log.info("Reasoning degraded, falling back to heuristic",
|
|
558
|
+
reason=reasoning_result.get("overall_assessment"))
|
|
559
|
+
reasoning_result = None
|
|
560
|
+
else:
|
|
561
|
+
log.info("Reasoning skipped: no LLM API key set (ANTHROPIC_API_KEY or OPENAI_API_KEY)")
|
|
562
|
+
else:
|
|
563
|
+
log.info("Reasoning disabled via --no-reasoning")
|
|
564
|
+
|
|
565
|
+
# Build root causes and confidence from reasoning or heuristic
|
|
566
|
+
reasoning_by_group = {}
|
|
567
|
+
if reasoning_result and reasoning_mode == "model":
|
|
568
|
+
for ga in reasoning_result.get("groups", []):
|
|
569
|
+
reasoning_by_group[ga["group_id"]] = ga
|
|
570
|
+
|
|
571
|
+
root_causes = []
|
|
572
|
+
for g in groups:
|
|
573
|
+
ga = reasoning_by_group.get(g.id)
|
|
574
|
+
if ga and ga.get("root_cause"):
|
|
575
|
+
# Model-provided root cause
|
|
576
|
+
root_causes.append({
|
|
577
|
+
"service": g.services[0] if g.services else trigger_service,
|
|
578
|
+
"type": ga["root_cause"],
|
|
579
|
+
"confidence": ga["confidence"],
|
|
580
|
+
"evidence": ga.get("reasoning", ""),
|
|
581
|
+
"recommended_actions": ga.get("recommended_actions", []),
|
|
582
|
+
})
|
|
583
|
+
else:
|
|
584
|
+
# Heuristic fallback: temporal proximity
|
|
585
|
+
for cc in g.change_candidates:
|
|
586
|
+
root_causes.append({
|
|
587
|
+
"service": cc.change.service,
|
|
588
|
+
"type": cc.change.payload.get("change_type", "unknown"),
|
|
589
|
+
"confidence": _proximity_confidence(cc.temporal_proximity_seconds),
|
|
590
|
+
"evidence": cc.change.payload.get("detail", ""),
|
|
591
|
+
})
|
|
592
|
+
|
|
593
|
+
blast_list = [
|
|
594
|
+
{
|
|
595
|
+
"service": svc,
|
|
596
|
+
"impact": "direct" if svc == trigger_service else "downstream",
|
|
597
|
+
"slo_breached": any(
|
|
598
|
+
getattr(e, "service", "") == svc
|
|
599
|
+
for e in events
|
|
600
|
+
if getattr(e, "type", None) == EventType.METRIC_BREACH
|
|
601
|
+
),
|
|
602
|
+
}
|
|
603
|
+
for svc in sorted(affected)
|
|
604
|
+
]
|
|
605
|
+
|
|
606
|
+
# Overall confidence: model reasoning or peak severity heuristic
|
|
607
|
+
if reasoning_result and reasoning_mode == "model":
|
|
608
|
+
overall_confidence = reasoning_result["overall_confidence"]
|
|
609
|
+
else:
|
|
610
|
+
overall_confidence = min(1.0, max(
|
|
611
|
+
max(s.peak_severity for s in g.signals) for g in groups
|
|
612
|
+
)) if groups else 0.5
|
|
613
|
+
|
|
614
|
+
# Build verdict summary: prefer model assessment over template
|
|
615
|
+
if reasoning_result and reasoning_mode == "model" and reasoning_result.get("overall_assessment"):
|
|
616
|
+
verdict_summary = reasoning_result["overall_assessment"]
|
|
617
|
+
elif root_causes:
|
|
618
|
+
verdict_summary = (
|
|
619
|
+
f"{root_causes[0].get('service', trigger_service)} "
|
|
620
|
+
f"{root_causes[0].get('type', 'incident')} — "
|
|
621
|
+
f"{len(blast_list)} services in blast radius"
|
|
622
|
+
)
|
|
623
|
+
else:
|
|
624
|
+
verdict_summary = f"{trigger_service} incident — {len(blast_list)} services affected"
|
|
625
|
+
|
|
626
|
+
corr_verdict = verdict_create(
|
|
627
|
+
subject={
|
|
628
|
+
"type": "correlation",
|
|
629
|
+
"ref": trigger_service,
|
|
630
|
+
"summary": verdict_summary,
|
|
631
|
+
},
|
|
632
|
+
judgment={
|
|
633
|
+
"action": "flag" if any(g.priority <= 1 for g in groups) else "escalate",
|
|
634
|
+
"confidence": overall_confidence,
|
|
635
|
+
},
|
|
636
|
+
producer={"system": "nthlayer-correlate"},
|
|
637
|
+
metadata={"custom": {
|
|
638
|
+
"trigger_verdict": trigger_verdict_id,
|
|
639
|
+
"root_causes": root_causes,
|
|
640
|
+
"blast_radius": blast_list,
|
|
641
|
+
"groups": len(groups),
|
|
642
|
+
"events_gathered": len(events),
|
|
643
|
+
"reasoning_mode": reasoning_mode,
|
|
644
|
+
"reasoning": reasoning_result if reasoning_mode == "model" else None,
|
|
645
|
+
"evidence_sources": {
|
|
646
|
+
"prometheus": True,
|
|
647
|
+
"verdict_store": True,
|
|
648
|
+
"trace_backend": trace_evidence.backend if trace_evidence else None,
|
|
649
|
+
},
|
|
650
|
+
"trace_query_time_ms": trace_evidence.query_time_ms if trace_evidence else None,
|
|
651
|
+
}},
|
|
652
|
+
)
|
|
653
|
+
verdict_link(corr_verdict, context=[trigger_verdict_id])
|
|
654
|
+
verdict_store.put(corr_verdict)
|
|
655
|
+
|
|
656
|
+
# Write content-addressed decision record
|
|
657
|
+
if decision_store_path:
|
|
658
|
+
from nthlayer_common.records.sqlite_store import SQLiteDecisionRecordStore
|
|
659
|
+
from nthlayer_common.records.verdict_bridge import write_decision_verdict
|
|
660
|
+
|
|
661
|
+
ds = SQLiteDecisionRecordStore(decision_store_path)
|
|
662
|
+
write_decision_verdict(
|
|
663
|
+
ds,
|
|
664
|
+
agent="correlate",
|
|
665
|
+
incident_id=getattr(corr_verdict.subject, "ref", "") or trigger_service,
|
|
666
|
+
timestamp=corr_verdict.timestamp,
|
|
667
|
+
model=reasoning_model or os.environ.get("NTHLAYER_MODEL", "heuristic"),
|
|
668
|
+
reasoning=getattr(corr_verdict.judgment, "reasoning", "") or verdict_summary,
|
|
669
|
+
action={
|
|
670
|
+
"root_causes": root_causes[:3],
|
|
671
|
+
"blast_radius_count": len(blast_list),
|
|
672
|
+
"trace_backend": trace_evidence.backend if trace_evidence else None,
|
|
673
|
+
"trace_services_count": len(trace_evidence.services) if trace_evidence else 0,
|
|
674
|
+
},
|
|
675
|
+
prompt_text=f"correlate {trigger_service} mode={reasoning_mode}",
|
|
676
|
+
response_text=str(reasoning_result) if reasoning_result else "heuristic",
|
|
677
|
+
summaries_technical=(
|
|
678
|
+
f"Correlation: {trigger_service}, {len(groups)} groups, {len(blast_list)} blast radius"
|
|
679
|
+
+ (f", trace evidence from {trace_evidence.backend}" if trace_evidence else "")
|
|
680
|
+
),
|
|
681
|
+
summaries_plain=verdict_summary[:280],
|
|
682
|
+
summaries_executive=(
|
|
683
|
+
f"{trigger_service} correlation — {reasoning_mode}"
|
|
684
|
+
+ (" + traces" if trace_evidence else "")
|
|
685
|
+
),
|
|
686
|
+
)
|
|
687
|
+
|
|
688
|
+
# Slack notification for correlation verdict
|
|
689
|
+
slack_url = os.environ.get("SLACK_WEBHOOK_URL", "")
|
|
690
|
+
if slack_url:
|
|
691
|
+
from nthlayer_common.slack import SlackNotifier
|
|
692
|
+
from nthlayer_workers.correlate.notifications import build_correlation_blocks, find_slack_thread_ts
|
|
693
|
+
|
|
694
|
+
thread_ts = find_slack_thread_ts(verdict_store, [trigger_verdict_id])
|
|
695
|
+
blocks, text = build_correlation_blocks(corr_verdict)
|
|
696
|
+
notifier = SlackNotifier(slack_url)
|
|
697
|
+
new_ts = asyncio.run(notifier.send(blocks, text, thread_ts=thread_ts))
|
|
698
|
+
if new_ts and not thread_ts:
|
|
699
|
+
corr_verdict.metadata.custom["slack_thread_ts"] = new_ts
|
|
700
|
+
verdict_store.put(corr_verdict)
|
|
701
|
+
|
|
702
|
+
print(f"Correlation verdict: {corr_verdict.id}")
|
|
703
|
+
print(f" Groups: {len(groups)}, Events: {len(events)}, Blast radius: {len(affected)} services")
|
|
704
|
+
|
|
705
|
+
# Forward to nthlayer-respond if respond_args is set
|
|
706
|
+
if respond_args:
|
|
707
|
+
import json
|
|
708
|
+
import subprocess
|
|
709
|
+
|
|
710
|
+
try:
|
|
711
|
+
args_dict = json.loads(respond_args)
|
|
712
|
+
except json.JSONDecodeError:
|
|
713
|
+
log.error("Invalid --respond-args JSON", raw=respond_args)
|
|
714
|
+
return 1
|
|
715
|
+
|
|
716
|
+
# Only allow known respond flags to prevent injection
|
|
717
|
+
allowed_keys = {"specs-dir", "config", "notify"}
|
|
718
|
+
for key in args_dict:
|
|
719
|
+
if key not in allowed_keys:
|
|
720
|
+
log.warning("Ignoring unknown respond arg", key=key)
|
|
721
|
+
|
|
722
|
+
cmd = [
|
|
723
|
+
"nthlayer-respond", "respond",
|
|
724
|
+
"--trigger-verdict", corr_verdict.id,
|
|
725
|
+
"--verdict-store", verdict_store_path,
|
|
726
|
+
]
|
|
727
|
+
for key, value in args_dict.items():
|
|
728
|
+
if key in allowed_keys:
|
|
729
|
+
cmd.extend([f"--{key}", str(value)])
|
|
730
|
+
|
|
731
|
+
log.info("Invoking nthlayer-respond", cmd=cmd)
|
|
732
|
+
try:
|
|
733
|
+
result = subprocess.run(cmd, capture_output=True, text=True)
|
|
734
|
+
if result.returncode != 0:
|
|
735
|
+
log.error("nthlayer-respond failed", returncode=result.returncode, stderr=result.stderr)
|
|
736
|
+
else:
|
|
737
|
+
print(result.stdout)
|
|
738
|
+
except FileNotFoundError:
|
|
739
|
+
log.error("nthlayer-respond not found on PATH")
|
|
740
|
+
|
|
741
|
+
return 0
|
|
742
|
+
|
|
743
|
+
|
|
744
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Construct the ``nthlayer-correlate`` command-line parser.

    Four subcommands are registered: ``serve``, ``status``, ``replay`` and
    ``correlate``. The chosen subcommand name is stored on the ``command``
    attribute of the parsed namespace (``None`` when none is given).

    Returns:
        The configured top-level ``argparse.ArgumentParser``.
    """
    root = argparse.ArgumentParser(
        prog="nthlayer-correlate",
        description="SitRep — Situational awareness through automated signal correlation",
    )
    commands = root.add_subparsers(dest="command", help="Available commands")

    def add_config_option(target: argparse.ArgumentParser) -> None:
        # serve, status and replay all accept the same optional --config flag.
        target.add_argument(
            "--config", default=None, help="Path to sitrep.yaml config file"
        )

    # serve
    add_config_option(
        commands.add_parser("serve", help="Start the full SitRep pipeline")
    )

    # status
    add_config_option(
        commands.add_parser("status", help="Show current SitRep status")
    )

    # replay
    replay = commands.add_parser("replay", help="Replay a scenario fixture")
    replay.add_argument(
        "--scenario", required=True, help="Path to scenario YAML file"
    )
    add_config_option(replay)
    replay.add_argument(
        "--no-model", action="store_true", help="Skip model calls"
    )

    # correlate (live data — trigger from verdict)
    correlate = commands.add_parser(
        "correlate", help="Correlate signals from a trigger verdict"
    )
    input_options = (
        ("--trigger-verdict", {
            "required": True,
            "help": "Evaluation verdict ID that triggered correlation",
        }),
        ("--prometheus-url", {
            "required": True,
            "help": "Prometheus base URL",
        }),
        ("--specs-dir", {
            "required": True,
            "help": "Directory of OpenSRM spec YAMLs",
        }),
        ("--verdict-store", {
            "default": "verdicts.db",
            "help": "Path to verdict SQLite DB",
        }),
    )
    for flag, options in input_options:
        correlate.add_argument(flag, **options)

    # Reasoning layer flags. --reasoning is store_true with default=True, so
    # the parsed ``reasoning`` value is True whether or not the flag is given;
    # only --no-reasoning changes what the namespace carries.
    reasoning_flags = correlate.add_mutually_exclusive_group()
    reasoning_flags.add_argument(
        "--reasoning",
        action="store_true",
        default=True,
        help="Enable model-based causal reasoning (default if API key set)",
    )
    reasoning_flags.add_argument(
        "--no-reasoning",
        action="store_true",
        help="Disable model reasoning, use heuristic only",
    )

    trailing_options = (
        ("--model", {
            "default": None,
            "help": "Model for reasoning (e.g. 'openai/gpt-4o', 'anthropic/claude-sonnet-4-20250514')",
        }),
        # Forward flags for downstream components (passed through, not parsed by correlate)
        ("--respond-args", {
            "default": None,
            "help": "JSON-encoded args to forward to nthlayer-respond",
        }),
        ("--decision-store", {
            "default": None,
            "help": "Path to decision record SQLite DB for content-addressed records",
        }),
        # Trace backend flags
        ("--trace-backend", {
            "default": None,
            "choices": ["tempo"],
            "help": "Trace backend to query for evidence (e.g. 'tempo')",
        }),
        ("--tempo-endpoint", {
            "default": None,
            "help": "Tempo query endpoint (e.g. 'http://tempo:3200')",
        }),
        ("--trace-detail", {
            "default": "full",
            "choices": ["summary", "full"],
            "help": "Trace evidence detail level",
        }),
    )
    for flag, options in trailing_options:
        correlate.add_argument(flag, **options)

    return root
|
|
796
|
+
|
|
797
|
+
|
|
798
|
+
def main() -> None:
    """Parse the command line and run the selected subcommand.

    With no subcommand, the help text is printed and the process exits with
    status 2. Each recognised subcommand exits the process with the value
    returned by its handler.
    """
    cli = _build_parser()
    opts = cli.parse_args()
    command = opts.command

    if command is None:
        cli.print_help()
        sys.exit(2)

    if command == "serve":
        sys.exit(serve_command(opts.config))

    if command == "status":
        sys.exit(status_command(opts.config))

    if command == "replay":
        # The replay store lives in a scratch directory that is removed when
        # the ``with`` block unwinds, including via the SystemExit below.
        with tempfile.TemporaryDirectory() as scratch_dir:
            exit_code = replay_command(
                scenario_path=opts.scenario,
                config_path=opts.config,
                no_model=opts.no_model,
                store_dir=scratch_dir,
            )
            sys.exit(exit_code)

    if command == "correlate":
        # Construct trace backend if requested; imported lazily so the Tempo
        # module is only loaded when it is actually selected.
        backend = None
        if opts.trace_backend == "tempo":
            from nthlayer_workers.correlate.traces.tempo import TempoTraceBackend

            backend = TempoTraceBackend(
                endpoint=opts.tempo_endpoint,
                use_service_graphs=True,
            )

        # NOTE(review): the parsed --trace-detail value is not forwarded
        # anywhere in this function — confirm whether that is intentional.
        exit_code = correlate_command(
            trigger_verdict_id=opts.trigger_verdict,
            prometheus_url=opts.prometheus_url,
            specs_dir=opts.specs_dir,
            verdict_store_path=opts.verdict_store,
            respond_args=opts.respond_args,
            # Reasoning stays on unless --no-reasoning was passed.
            reasoning=not opts.no_reasoning,
            reasoning_model=opts.model,
            decision_store_path=getattr(opts, "decision_store", None),
            trace_backend=backend,
        )
        sys.exit(exit_code)
|
|
844
|
+
|
|
845
|
+
|
|
846
|
+
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()
|