nthlayer-workers 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. nthlayer_workers/__init__.py +5 -0
  2. nthlayer_workers/cli.py +234 -0
  3. nthlayer_workers/correlate/__init__.py +1 -0
  4. nthlayer_workers/correlate/cli.py +847 -0
  5. nthlayer_workers/correlate/config.py +111 -0
  6. nthlayer_workers/correlate/correlation/__init__.py +1 -0
  7. nthlayer_workers/correlate/correlation/changes.py +87 -0
  8. nthlayer_workers/correlate/correlation/dedup.py +62 -0
  9. nthlayer_workers/correlate/correlation/engine.py +244 -0
  10. nthlayer_workers/correlate/correlation/temporal.py +79 -0
  11. nthlayer_workers/correlate/correlation/topology.py +104 -0
  12. nthlayer_workers/correlate/ingestion/__init__.py +1 -0
  13. nthlayer_workers/correlate/ingestion/protocol.py +10 -0
  14. nthlayer_workers/correlate/ingestion/severity.py +18 -0
  15. nthlayer_workers/correlate/ingestion/webhook.py +197 -0
  16. nthlayer_workers/correlate/notifications.py +85 -0
  17. nthlayer_workers/correlate/prometheus.py +234 -0
  18. nthlayer_workers/correlate/reasoning.py +375 -0
  19. nthlayer_workers/correlate/session.py +189 -0
  20. nthlayer_workers/correlate/snapshot/__init__.py +1 -0
  21. nthlayer_workers/correlate/snapshot/generator.py +170 -0
  22. nthlayer_workers/correlate/snapshot/model.py +177 -0
  23. nthlayer_workers/correlate/snapshot/token.py +14 -0
  24. nthlayer_workers/correlate/state.py +88 -0
  25. nthlayer_workers/correlate/store/__init__.py +5 -0
  26. nthlayer_workers/correlate/store/protocol.py +48 -0
  27. nthlayer_workers/correlate/store/sqlite.py +443 -0
  28. nthlayer_workers/correlate/summary.py +180 -0
  29. nthlayer_workers/correlate/traces/__init__.py +1 -0
  30. nthlayer_workers/correlate/traces/protocol.py +120 -0
  31. nthlayer_workers/correlate/traces/tempo.py +667 -0
  32. nthlayer_workers/correlate/traces/topology.py +39 -0
  33. nthlayer_workers/correlate/types.py +77 -0
  34. nthlayer_workers/correlate/worker.py +630 -0
  35. nthlayer_workers/learn/__init__.py +5 -0
  36. nthlayer_workers/learn/__main__.py +5 -0
  37. nthlayer_workers/learn/cli.py +164 -0
  38. nthlayer_workers/learn/retrospective.py +381 -0
  39. nthlayer_workers/learn/trends.py +102 -0
  40. nthlayer_workers/learn/worker.py +366 -0
  41. nthlayer_workers/measure/__init__.py +3 -0
  42. nthlayer_workers/measure/__main__.py +5 -0
  43. nthlayer_workers/measure/_parsing.py +15 -0
  44. nthlayer_workers/measure/adapters/__init__.py +0 -0
  45. nthlayer_workers/measure/adapters/_util.py +24 -0
  46. nthlayer_workers/measure/adapters/devin.py +119 -0
  47. nthlayer_workers/measure/adapters/gastown.py +88 -0
  48. nthlayer_workers/measure/adapters/prometheus.py +277 -0
  49. nthlayer_workers/measure/adapters/protocol.py +20 -0
  50. nthlayer_workers/measure/adapters/webhook.py +161 -0
  51. nthlayer_workers/measure/api/__init__.py +0 -0
  52. nthlayer_workers/measure/api/normalise.py +50 -0
  53. nthlayer_workers/measure/api/queue.py +243 -0
  54. nthlayer_workers/measure/api/response.py +51 -0
  55. nthlayer_workers/measure/api/server.py +504 -0
  56. nthlayer_workers/measure/calibration/__init__.py +0 -0
  57. nthlayer_workers/measure/calibration/loop.py +62 -0
  58. nthlayer_workers/measure/calibration/slos.py +212 -0
  59. nthlayer_workers/measure/calibration/verdict_calibration.py +31 -0
  60. nthlayer_workers/measure/cli.py +753 -0
  61. nthlayer_workers/measure/config.py +191 -0
  62. nthlayer_workers/measure/detection/__init__.py +6 -0
  63. nthlayer_workers/measure/detection/detector.py +82 -0
  64. nthlayer_workers/measure/detection/protocol.py +29 -0
  65. nthlayer_workers/measure/governance/__init__.py +0 -0
  66. nthlayer_workers/measure/governance/engine.py +163 -0
  67. nthlayer_workers/measure/manifest.py +77 -0
  68. nthlayer_workers/measure/notifications.py +53 -0
  69. nthlayer_workers/measure/pipeline/__init__.py +0 -0
  70. nthlayer_workers/measure/pipeline/evaluator.py +155 -0
  71. nthlayer_workers/measure/pipeline/router.py +160 -0
  72. nthlayer_workers/measure/store/__init__.py +0 -0
  73. nthlayer_workers/measure/store/protocol.py +38 -0
  74. nthlayer_workers/measure/store/sqlite.py +276 -0
  75. nthlayer_workers/measure/telemetry.py +116 -0
  76. nthlayer_workers/measure/tiering/__init__.py +0 -0
  77. nthlayer_workers/measure/tiering/classifier.py +58 -0
  78. nthlayer_workers/measure/tiering/promotion.py +118 -0
  79. nthlayer_workers/measure/trends/__init__.py +0 -0
  80. nthlayer_workers/measure/trends/tracker.py +72 -0
  81. nthlayer_workers/measure/types.py +75 -0
  82. nthlayer_workers/measure/worker.py +439 -0
  83. nthlayer_workers/observe/__init__.py +25 -0
  84. nthlayer_workers/observe/__main__.py +5 -0
  85. nthlayer_workers/observe/api/__init__.py +1 -0
  86. nthlayer_workers/observe/assessment.py +95 -0
  87. nthlayer_workers/observe/cli.py +737 -0
  88. nthlayer_workers/observe/config.py +11 -0
  89. nthlayer_workers/observe/db/__init__.py +1 -0
  90. nthlayer_workers/observe/decision_records.py +220 -0
  91. nthlayer_workers/observe/dependencies/__init__.py +18 -0
  92. nthlayer_workers/observe/dependencies/discovery.py +294 -0
  93. nthlayer_workers/observe/dependencies/providers/__init__.py +48 -0
  94. nthlayer_workers/observe/dependencies/providers/backstage.py +467 -0
  95. nthlayer_workers/observe/dependencies/providers/base.py +76 -0
  96. nthlayer_workers/observe/dependencies/providers/consul.py +518 -0
  97. nthlayer_workers/observe/dependencies/providers/etcd.py +360 -0
  98. nthlayer_workers/observe/dependencies/providers/kubernetes.py +682 -0
  99. nthlayer_workers/observe/dependencies/providers/prometheus.py +368 -0
  100. nthlayer_workers/observe/dependencies/providers/zookeeper.py +399 -0
  101. nthlayer_workers/observe/deployments/__init__.py +1 -0
  102. nthlayer_workers/observe/discovery/__init__.py +14 -0
  103. nthlayer_workers/observe/discovery/classifier.py +66 -0
  104. nthlayer_workers/observe/discovery/client.py +189 -0
  105. nthlayer_workers/observe/discovery/models.py +53 -0
  106. nthlayer_workers/observe/drift/__init__.py +26 -0
  107. nthlayer_workers/observe/drift/analyzer.py +383 -0
  108. nthlayer_workers/observe/drift/models.py +174 -0
  109. nthlayer_workers/observe/drift/patterns.py +88 -0
  110. nthlayer_workers/observe/explanation.py +118 -0
  111. nthlayer_workers/observe/gate/__init__.py +39 -0
  112. nthlayer_workers/observe/gate/conditions.py +92 -0
  113. nthlayer_workers/observe/gate/correlator.py +154 -0
  114. nthlayer_workers/observe/gate/evaluator.py +192 -0
  115. nthlayer_workers/observe/gate/policies.py +226 -0
  116. nthlayer_workers/observe/gate_adapter.py +40 -0
  117. nthlayer_workers/observe/incident.py +36 -0
  118. nthlayer_workers/observe/portfolio/__init__.py +17 -0
  119. nthlayer_workers/observe/portfolio/aggregator.py +168 -0
  120. nthlayer_workers/observe/portfolio/scorer.py +13 -0
  121. nthlayer_workers/observe/slo/__init__.py +19 -0
  122. nthlayer_workers/observe/slo/collector.py +235 -0
  123. nthlayer_workers/observe/slo/spec_loader.py +40 -0
  124. nthlayer_workers/observe/sqlite_store.py +152 -0
  125. nthlayer_workers/observe/store.py +92 -0
  126. nthlayer_workers/observe/verification/__init__.py +22 -0
  127. nthlayer_workers/observe/verification/exporter_guidance.py +146 -0
  128. nthlayer_workers/observe/verification/extractor.py +127 -0
  129. nthlayer_workers/observe/verification/models.py +101 -0
  130. nthlayer_workers/observe/verification/verifier.py +111 -0
  131. nthlayer_workers/observe/worker.py +332 -0
  132. nthlayer_workers/respond/__init__.py +2 -0
  133. nthlayer_workers/respond/__main__.py +4 -0
  134. nthlayer_workers/respond/agents/__init__.py +0 -0
  135. nthlayer_workers/respond/agents/base.py +556 -0
  136. nthlayer_workers/respond/agents/communication.py +115 -0
  137. nthlayer_workers/respond/agents/investigation.py +124 -0
  138. nthlayer_workers/respond/agents/remediation.py +219 -0
  139. nthlayer_workers/respond/agents/triage.py +132 -0
  140. nthlayer_workers/respond/cli.py +772 -0
  141. nthlayer_workers/respond/config.py +135 -0
  142. nthlayer_workers/respond/context_store.py +256 -0
  143. nthlayer_workers/respond/coordinator.py +487 -0
  144. nthlayer_workers/respond/metrics.py +104 -0
  145. nthlayer_workers/respond/notification_backends/__init__.py +1 -0
  146. nthlayer_workers/respond/notification_backends/ntfy_backend.py +158 -0
  147. nthlayer_workers/respond/notification_backends/protocol.py +59 -0
  148. nthlayer_workers/respond/notification_backends/slack_backend.py +203 -0
  149. nthlayer_workers/respond/notification_backends/stdout_backend.py +56 -0
  150. nthlayer_workers/respond/notifications.py +247 -0
  151. nthlayer_workers/respond/oncall/__init__.py +1 -0
  152. nthlayer_workers/respond/oncall/escalation.py +103 -0
  153. nthlayer_workers/respond/oncall/runner.py +193 -0
  154. nthlayer_workers/respond/oncall/schedule.py +243 -0
  155. nthlayer_workers/respond/safe_actions/__init__.py +0 -0
  156. nthlayer_workers/respond/safe_actions/actions.py +139 -0
  157. nthlayer_workers/respond/safe_actions/registry.py +171 -0
  158. nthlayer_workers/respond/safe_actions/webhook.py +194 -0
  159. nthlayer_workers/respond/server.py +357 -0
  160. nthlayer_workers/respond/sre/__init__.py +1 -0
  161. nthlayer_workers/respond/sre/brief.py +175 -0
  162. nthlayer_workers/respond/sre/delegation.py +101 -0
  163. nthlayer_workers/respond/sre/post_incident.py +146 -0
  164. nthlayer_workers/respond/sre/shift_report.py +129 -0
  165. nthlayer_workers/respond/sre/suppression.py +91 -0
  166. nthlayer_workers/respond/types.py +109 -0
  167. nthlayer_workers/respond/verdict_submission.py +56 -0
  168. nthlayer_workers/respond/worker.py +533 -0
  169. nthlayer_workers/respond/worker_helpers.py +140 -0
  170. nthlayer_workers/runner.py +198 -0
  171. nthlayer_workers-1.0.0.dist-info/METADATA +19 -0
  172. nthlayer_workers-1.0.0.dist-info/RECORD +175 -0
  173. nthlayer_workers-1.0.0.dist-info/WHEEL +5 -0
  174. nthlayer_workers-1.0.0.dist-info/entry_points.txt +2 -0
  175. nthlayer_workers-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,847 @@
1
+ """SitRep CLI — serve, status, replay."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import asyncio
6
+ import os
7
+ import re
8
+ import signal
9
+ import sys
10
+ import tempfile
11
+ from dataclasses import replace
12
+ from datetime import datetime, timedelta, timezone
13
+ from typing import Any
14
+
15
+ import structlog
16
+ import yaml
17
+
18
+ from nthlayer_workers.correlate.config import SitRepConfig, load_config
19
+ from nthlayer_workers.correlate.correlation.changes import find_change_candidates
20
+ from nthlayer_workers.correlate.correlation.dedup import deduplicate
21
+ from nthlayer_workers.correlate.correlation.engine import CorrelationEngine
22
+ from nthlayer_workers.correlate.correlation.temporal import group_temporal
23
+ from nthlayer_workers.correlate.correlation.topology import group_topology
24
+ from nthlayer_workers.correlate.ingestion.severity import pre_score
25
+ from nthlayer_workers.correlate.snapshot.generator import SnapshotBudget, SnapshotGenerator
26
+ from nthlayer_workers.correlate.snapshot.model import ModelInterface
27
+ from nthlayer_workers.correlate.state import StateMachine
28
+ from nthlayer_workers.correlate.store.sqlite import SQLiteEventStore
29
+ from nthlayer_workers.correlate.types import AgentState, EventType, SitRepEvent
30
+
31
+ logger = structlog.get_logger()
32
+
33
+ REFERENCE_TIME = datetime(2026, 1, 1, tzinfo=timezone.utc)
34
+
35
+ # Confidence decay window for temporal proximity heuristic (30 minutes)
36
+ _PROXIMITY_WINDOW_SECONDS = 1800.0
37
+
38
+ _DURATION_UNITS = {"ms": 0.001, "s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800}
39
+
40
+
41
+ def _parse_duration(value: str) -> timedelta:
42
+ """Parse a duration string (e.g. '1h', '30m', '2h') to timedelta."""
43
+ for suffix, multiplier in sorted(_DURATION_UNITS.items(), key=lambda x: -len(x[0])):
44
+ if value.endswith(suffix):
45
+ return timedelta(seconds=float(value[:-len(suffix)]) * multiplier)
46
+ return timedelta(hours=1) # default fallback
47
+
48
+
49
+ def _proximity_confidence(seconds: float | None) -> float:
50
+ """Heuristic confidence from temporal proximity of a change to a signal.
51
+
52
+ Decays linearly from 1.0 (simultaneous) to 0.0 (30 minutes apart).
53
+ Returns 0.5 for unknown proximity.
54
+ """
55
+ if seconds is None:
56
+ return 0.5
57
+ from nthlayer_common.parsing import clamp
58
+ return clamp(1.0 - seconds / _PROXIMITY_WINDOW_SECONDS)
59
+
60
+
61
def parse_relative_time(at_str: str) -> str:
    """Convert a scenario offset of the form ``T+<N>m`` to an ISO 8601 string.

    The offset is ``N`` minutes after ``REFERENCE_TIME``. The whole string
    must match (ignoring surrounding whitespace), so inputs such as
    ``"T+5m30s"`` are rejected rather than silently read as ``T+5m``.

    Args:
        at_str: Relative time text, e.g. ``"T+15m"``.

    Returns:
        ``REFERENCE_TIME + N minutes`` formatted with ``isoformat()``.

    Raises:
        ValueError: If ``at_str`` is not a string, does not match
            ``T+<N>m`` exactly, or the offset is too large to represent.
    """
    # A YAML scalar such as `at: 5` arrives as an int; report it as the same
    # ValueError callers already handle instead of a TypeError from `re`.
    if not isinstance(at_str, str):
        raise ValueError(f"Invalid time format: {at_str}")
    match = re.fullmatch(r"T\+(\d+)m", at_str.strip())
    if match is None:
        raise ValueError(f"Invalid time format: {at_str}")
    try:
        offset = timedelta(minutes=int(match.group(1)))
        moment = REFERENCE_TIME + offset
    except OverflowError as exc:
        raise ValueError(f"Invalid time format: {at_str}") from exc
    return moment.isoformat()
69
+
70
+
71
def scenario_event_to_sitrep(evt_data: dict, index: int) -> SitRepEvent:
    """Build a ``SitRepEvent`` from one entry of a scenario's event list.

    Reads the ``at``, ``payload`` and ``type`` keys of ``evt_data``. The
    event id is derived from ``index``; source, environment and severity
    are fixed to ``"scenario"``, ``"production"`` and ``0.5``. The service
    comes from the payload and defaults to ``"unknown"``.
    """
    occurred_at = parse_relative_time(evt_data["at"])
    body = evt_data["payload"]
    service_name = body.get("service", "unknown")
    fields = {
        "id": f"scenario-evt-{index:04d}",
        "timestamp": occurred_at,
        "source": "scenario",
        "type": EventType(evt_data["type"]),
        "service": service_name,
        "environment": "production",
        "severity": 0.5,
        "payload": body,
    }
    return SitRepEvent(**fields)
86
+
87
+
88
+ def _build_topology_dict(scenario: dict) -> dict[str, Any] | None:
89
+ """Build the topology dict expected by the correlation engine from scenario YAML."""
90
+ topo_section = scenario.get("topology")
91
+ if not topo_section:
92
+ return None
93
+ services = topo_section.get("services", [])
94
+ if not services:
95
+ return None
96
+ result: dict[str, Any] = {}
97
+ for svc in services:
98
+ name = svc["name"]
99
+ result[name] = {
100
+ "tier": svc.get("tier", "standard"),
101
+ "dependencies": svc.get("dependencies", []),
102
+ "dependents": svc.get("dependents", []),
103
+ }
104
+ return result
105
+
106
+
107
def replay_command(
    scenario_path: str,
    config_path: str | None,
    no_model: bool,
    store_dir: str,
) -> int:
    """Replay a scenario fixture through the correlation steps and print a report.

    Args:
        scenario_path: Path of the scenario YAML file. The scenario may be
            the document itself or nested under a top-level ``scenario`` key.
        config_path: Optional config file, read only when the model step runs.
        no_model: When true, skip the snapshot/model step.
        store_dir: Directory in which the ``replay.db`` event store is opened.

    Returns:
        0 on success, 2 when the scenario file cannot be read, is not valid
        YAML, or does not contain a mapping.
    """
    # Load scenario. Binary mode hands encoding detection to PyYAML, so a bad
    # encoding surfaces as a YAMLError instead of depending on the locale.
    try:
        with open(scenario_path, "rb") as f:
            raw = yaml.safe_load(f)
    except (OSError, yaml.YAMLError) as exc:
        logger.error("scenario_load_failed", path=scenario_path, error=str(exc))
        return 2

    # An empty file loads as None and a list/scalar document has no .get();
    # report both as load failures instead of crashing below.
    scenario = raw.get("scenario", raw) if isinstance(raw, dict) else None
    if not isinstance(scenario, dict):
        logger.error(
            "scenario_load_failed",
            path=scenario_path,
            error="scenario document is not a mapping",
        )
        return 2

    # Parse events. An "events:" key with no value loads as None, hence `or []`.
    events: list[SitRepEvent] = []
    for i, evt_data in enumerate(scenario.get("events") or []):
        try:
            events.append(scenario_event_to_sitrep(evt_data, i))
        except (ValueError, KeyError, TypeError, AttributeError) as exc:
            # Malformed entries (wrong shape or wrong value) are skipped.
            logger.warning("event_parse_failed", index=i, error=str(exc))

    # Build topology
    topology = _build_topology_dict(scenario)

    # Open the replay store
    db_path = os.path.join(store_dir, "replay.db")
    store = SQLiteEventStore(db_path)

    try:
        groups = []

        if events:
            store.insert_batch(events)

            # Compute the actual time window from events for correlation
            timestamps = [e.timestamp for e in events]
            start_ts = min(timestamps)
            end_ts = max(timestamps)
            # Add a buffer to ensure the window captures all events
            end_dt = datetime.fromisoformat(end_ts.replace("Z", "+00:00"))
            end_buffered = (end_dt + timedelta(minutes=1)).isoformat()

            # Run correlation sub-steps manually (since engine uses datetime.now())
            all_events = store.get_by_time_window(start_ts, end_buffered)

            if all_events:
                # Deduplicate
                deduped = deduplicate(all_events)

                # Severity enrichment: only copy events whose score changed
                enriched = []
                for event in deduped:
                    new_severity = pre_score(event, None)
                    if new_severity != event.severity:
                        event = replace(event, severity=new_severity)
                    enriched.append(event)

                # Compute window minutes from the event spread (at least 5)
                start_dt = datetime.fromisoformat(start_ts.replace("Z", "+00:00"))
                window_minutes = max(
                    int((end_dt - start_dt).total_seconds() / 60) + 1,
                    5,
                )

                # Temporal grouping
                temporal_groups = group_temporal(enriched, window_minutes=window_minutes)

                # Topology grouping
                topology_correlations = group_topology(temporal_groups, topology)

                # Change candidate indexing over a widened window
                # (event spread plus 30 minutes)
                change_candidates_map = find_change_candidates(
                    store, temporal_groups, topology=topology,
                    window_minutes=window_minutes + 30,
                )

                # Assemble correlation groups using the engine helper
                engine = CorrelationEngine()
                groups = engine.assemble_groups(
                    temporal_groups, topology_correlations,
                    change_candidates_map, topology,
                )

        # Report
        scenario_id = scenario.get("id", "unknown")
        print(f"\n=== Replay: {scenario_id} ===")
        print(f"Events inserted: {len(events)}")
        print(f"Correlation groups found: {len(groups)}")

        services_affected: set[str] = set()
        total_changes = 0
        for g in groups:
            services_affected.update(g.services)
            total_changes += len(g.change_candidates)
            print(f"  [{g.id}] P{g.priority}: {g.summary}")

        print(f"Services affected: {sorted(services_affected)}")
        print(f"Change candidates: {total_changes}")

        if not no_model:
            # Model-enabled path
            config = load_config(config_path) if config_path else SitRepConfig()
            generator = SnapshotGenerator(SnapshotBudget(config.token_budget))
            model = ModelInterface(config.model_name, config.model_max_tokens)

            if groups:
                # The cache-hit flag is irrelevant for a one-shot replay.
                prompt, _ = generator.generate(groups, AgentState.WATCHING)
                try:
                    verdicts = asyncio.run(model.interpret(prompt, groups))
                    print(f"Verdicts created: {len(verdicts)}")
                except Exception as exc:
                    # Best effort: a failed model call must not fail the replay.
                    logger.warning("model_call_failed", error=str(exc))
                    print("Model call failed, skipping verdicts")
        else:
            print("Model: skipped (--no-model)")

        print()
        return 0

    finally:
        store.close()
240
+
241
+
242
+
243
def status_command(config_path: str | None, store_dir: str | None = None) -> int:
    """Print event-store statistics and the database file size. Returns 0.

    The store location comes from the config unless ``store_dir`` is given,
    in which case ``sitrep-events.db`` inside that directory is used. The
    agent-state line is the constant ``AgentState.WATCHING``, not a value
    read from the store.
    """
    if config_path:
        config = load_config(config_path)
    else:
        config = SitRepConfig()

    # An explicit store_dir (for testing) takes precedence over the config path
    db_path = (
        os.path.join(store_dir, "sitrep-events.db") if store_dir else config.store_path
    )

    store = SQLiteEventStore(db_path)
    try:
        stats = store.get_stats()

        print("\n=== SitRep Status ===")
        print(f"Agent state: {AgentState.WATCHING.value}")
        print(f"Event count: {stats['event_count']}")
        print(f"Oldest event: {stats['min_timestamp'] or 'none'}")
        print(f"Newest event: {stats['max_timestamp'] or 'none'}")

        # Human-readable size of the database file
        if not os.path.exists(db_path):
            print("DB size: 0 B")
        else:
            size_bytes = os.path.getsize(db_path)
            if size_bytes >= 1024 * 1024:
                size_str = f"{size_bytes / (1024 * 1024):.1f} MB"
            elif size_bytes >= 1024:
                size_str = f"{size_bytes / 1024:.1f} KB"
            else:
                size_str = f"{size_bytes} B"
            print(f"DB size: {size_str}")

        print()
        return 0
    finally:
        store.close()
281
+
282
+
283
async def _serve_loop(config: SitRepConfig) -> None:
    """Run the serve pipeline until SIGTERM/SIGINT requests shutdown.

    Each cycle waits for the state machine's interval, moves queued webhook
    events into the store, runs correlation, and calls the model when the
    snapshot is not a cache hit and there are groups to interpret.

    On exit the ingester is stopped, events still waiting in the queue are
    written to the store, and the store is closed — also when start-up or
    shutdown raises.

    Args:
        config: Settings for the store path, ingestion host/port, token
            budget, model, correlation window and verdict store path.
    """
    from nthlayer_workers.correlate.ingestion.webhook import WebhookIngester

    store = SQLiteEventStore(config.store_path)
    try:
        ingester = WebhookIngester(config.ingestion_host, config.ingestion_port)
        engine = CorrelationEngine()
        generator = SnapshotGenerator(SnapshotBudget(config.token_budget))
        model = ModelInterface(config.model_name, config.model_max_tokens)
        state_machine = StateMachine()

        # Buffer events via queue to avoid concurrent SQLite writes
        event_queue: asyncio.Queue = asyncio.Queue(maxsize=10000)
        ingester.on_event(lambda event: event_queue.put_nowait(event))

        def _drain_queue() -> None:
            """Move every buffered event into the store."""
            while True:
                try:
                    event = event_queue.get_nowait()
                except asyncio.QueueEmpty:
                    return
                store.insert(event)

        # Start ingester
        await ingester.start()
        try:
            logger.info(
                "sitrep_started",
                host=config.ingestion_host,
                port=config.ingestion_port,
            )

            # Shutdown event
            shutdown = asyncio.Event()

            def _handle_signal() -> None:
                logger.info("shutdown_requested")
                shutdown.set()

            loop = asyncio.get_running_loop()
            for sig in (signal.SIGTERM, signal.SIGINT):
                try:
                    loop.add_signal_handler(sig, _handle_signal)
                except NotImplementedError:
                    # Some event loops do not support signal handlers; Ctrl+C
                    # then reaches the caller as KeyboardInterrupt instead.
                    logger.warning("signal_handler_unavailable", signal=sig.name)

            # Optionally open verdict store
            # NOTE(review): the verdict store is never closed here — confirm
            # whether VerdictStore needs an explicit close.
            verdict_store = None
            try:
                from nthlayer_learn.store import VerdictStore
                verdict_store = VerdictStore(config.verdict_store_path)
            except Exception:
                logger.info("verdict_store_not_available")

            while not shutdown.is_set():
                interval = state_machine.get_interval()
                try:
                    await asyncio.wait_for(shutdown.wait(), timeout=interval)
                    break  # shutdown requested
                except asyncio.TimeoutError:
                    pass  # normal cycle

                # Drain buffered events into store (single-threaded, safe)
                _drain_queue()

                # Run correlation
                try:
                    groups = engine.correlate(
                        store,
                        config.correlation_window_minutes,
                        topology=None,  # No manifests in Tier 1
                    )

                    model_healthy = True
                    state_machine.update(groups, model_healthy)

                    prompt, cache_hit = generator.generate(groups, state_machine.state)

                    if not cache_hit and groups:
                        try:
                            await model.interpret(prompt, groups, verdict_store)
                        except Exception as exc:
                            logger.warning("model_call_failed", error=str(exc))
                            state_machine.update(groups, model_healthy=False)

                    logger.info(
                        "correlation_cycle",
                        state=state_machine.state.value,
                        groups=len(groups),
                        cache_hit=cache_hit,
                    )

                except Exception as exc:
                    # Keep serving: one failed cycle must not stop the loop.
                    logger.error("correlation_error", error=str(exc))

        finally:
            try:
                await ingester.stop()
            finally:
                # Persist events accepted since the last cycle so a shutdown
                # does not silently drop them.
                _drain_queue()
    finally:
        store.close()
        logger.info("sitrep_stopped")
376
+
377
+
378
def serve_command(config_path: str | None) -> int:
    """Run the serve loop to completion and translate the outcome to an exit code.

    Returns 0 when the loop ends normally or is interrupted from the
    keyboard, and 2 (after logging ``serve_failed``) on any other exception.
    """
    if config_path:
        config = load_config(config_path)
    else:
        config = SitRepConfig()

    exit_code = 0
    try:
        asyncio.run(_serve_loop(config))
    except KeyboardInterrupt:
        # Ctrl+C is a normal way to stop the server.
        pass
    except Exception as exc:
        logger.error("serve_failed", error=str(exc))
        exit_code = 2
    return exit_code
389
+
390
+
391
+ def correlate_command(
392
+ trigger_verdict_id: str,
393
+ prometheus_url: str,
394
+ specs_dir: str,
395
+ verdict_store_path: str,
396
+ respond_args: str | None = None,
397
+ reasoning: bool = True,
398
+ reasoning_model: str | None = None,
399
+ decision_store_path: str | None = None,
400
+ trace_backend: object | None = None, # TraceBackend Protocol (not runtime_checkable)
401
+ trace_baseline_window: str = "1h",
402
+ ) -> int:
403
+ """Correlate signals from a trigger evaluation verdict.
404
+
405
+ Reads the trigger verdict from the store, queries Prometheus for correlated
406
+ signals across the blast radius, runs the correlation engine, and writes
407
+ a correlation verdict.
408
+ """
409
+ from nthlayer_common.verdicts import (
410
+ SQLiteVerdictStore,
411
+ VerdictFilter,
412
+ create as verdict_create,
413
+ link as verdict_link,
414
+ )
415
+
416
+ from nthlayer_workers.correlate.prometheus import (
417
+ blast_radius_services,
418
+ fetch_alerts,
419
+ fetch_metric_breaches,
420
+ load_dependency_graph,
421
+ verdict_to_event,
422
+ )
423
+
424
+ log = structlog.get_logger("correlate_command")
425
+
426
+ # Open verdict store
427
+ verdict_store = SQLiteVerdictStore(verdict_store_path)
428
+
429
+ # Read trigger verdict
430
+ trigger = verdict_store.get(trigger_verdict_id)
431
+ if trigger is None:
432
+ log.error("Trigger verdict not found", verdict_id=trigger_verdict_id)
433
+ return 1
434
+
435
+ trigger_service = trigger.subject.ref or "unknown"
436
+ trigger_custom = getattr(trigger.metadata, "custom", {}) or {}
437
+ log.info(
438
+ "Trigger verdict loaded",
439
+ service=trigger_service,
440
+ slo_name=trigger_custom.get("slo_name"),
441
+ breach=trigger_custom.get("breach"),
442
+ )
443
+
444
+ # Load dependency graph from specs
445
+ dep_graph = load_dependency_graph(specs_dir)
446
+
447
+ # Compute blast radius
448
+ affected = blast_radius_services(trigger_service, dep_graph)
449
+ log.info("Blast radius computed", affected_services=sorted(affected))
450
+
451
+ # Gather events from Prometheus and verdict store (+ optional trace evidence)
452
+ async def _gather():
453
+ import httpx
454
+
455
+ events: list[SitRepEvent] = []
456
+ trace_evidence = None
457
+
458
+ async with httpx.AsyncClient() as client:
459
+ # 1. Prometheus alerts on affected services
460
+ alerts = await fetch_alerts(client, prometheus_url, affected)
461
+ events.extend(alerts)
462
+
463
+ # 2. Prometheus metric breaches on affected services
464
+ breaches = await fetch_metric_breaches(client, prometheus_url, affected)
465
+ events.extend(breaches)
466
+
467
+ # 3. Recent evaluation verdicts from store as events
468
+ recent = verdict_store.query(VerdictFilter(
469
+ producer_system="nthlayer-measure",
470
+ subject_type="evaluation",
471
+ limit=50,
472
+ ))
473
+ for v in recent:
474
+ svc = v.subject.ref or v.subject.service or ""
475
+ if svc in affected:
476
+ events.append(verdict_to_event(v))
477
+
478
+ # 4. Trace evidence (optional, graceful degradation)
479
+ if trace_backend is not None:
480
+ try:
481
+ baseline_td = _parse_duration(trace_baseline_window)
482
+ end = datetime.now(tz=timezone.utc)
483
+ start = end - timedelta(minutes=30)
484
+ trace_evidence = await trace_backend.get_trace_evidence(
485
+ services=sorted(affected),
486
+ start=start,
487
+ end=end,
488
+ baseline_window=baseline_td,
489
+ )
490
+ except Exception as exc:
491
+ log.warning("trace_evidence_unavailable", error=str(exc))
492
+
493
+ return events, trace_evidence
494
+
495
+ events, trace_evidence = asyncio.run(_gather())
496
+
497
+ # Close trace backend client to release connections
498
+ if trace_backend is not None and hasattr(trace_backend, "aclose"):
499
+ asyncio.run(trace_backend.aclose())
500
+
501
+ log.info("Gathered events", count=len(events))
502
+
503
+ # Compute topology divergence (declared vs observed) for reasoning enrichment
504
+ if trace_evidence and dep_graph:
505
+ from nthlayer_workers.correlate.traces.topology import detect_topology_divergence
506
+ trace_evidence.topology_divergence = detect_topology_divergence(trace_evidence, dep_graph)
507
+
508
+ if not events:
509
+ log.info("No correlated events found, no correlation verdict needed")
510
+ return 0
511
+
512
+ # Insert events into temp store and run correlation engine
513
+ with tempfile.TemporaryDirectory() as tmp_dir:
514
+ tmp_store = SQLiteEventStore(os.path.join(tmp_dir, "correlation.db"))
515
+ tmp_store.insert_batch(events)
516
+
517
+ engine = CorrelationEngine()
518
+ topology = dep_graph if dep_graph else None
519
+ groups = engine.correlate(tmp_store, window_minutes=30, topology=topology)
520
+ tmp_store.close()
521
+
522
+ if not groups:
523
+ log.info("No correlation groups formed")
524
+ return 0
525
+
526
+ log.info("Correlation groups", count=len(groups))
527
+ for g in groups:
528
+ log.info(
529
+ "Group",
530
+ priority=g.priority,
531
+ services=g.services,
532
+ event_count=g.event_count,
533
+ summary=g.summary,
534
+ )
535
+
536
+ # Reasoning layer: model-based causal analysis if enabled, else heuristic
537
+ reasoning_result = None
538
+ reasoning_mode = "heuristic"
539
+
540
+ if reasoning:
541
+ from nthlayer_workers.correlate.reasoning import reason_about_correlations, reasoning_available
542
+
543
+ if reasoning_available():
544
+ kwargs = {}
545
+ if reasoning_model:
546
+ kwargs["model"] = reasoning_model
547
+ if trace_evidence:
548
+ kwargs["trace_evidence"] = trace_evidence
549
+ reasoning_result = asyncio.run(
550
+ reason_about_correlations(groups, dep_graph, **kwargs)
551
+ )
552
+ if not reasoning_result.get("degraded", False):
553
+ reasoning_mode = "model"
554
+ log.info("Reasoning complete", mode=reasoning_mode,
555
+ overall_confidence=reasoning_result.get("overall_confidence"))
556
+ else:
557
+ log.info("Reasoning degraded, falling back to heuristic",
558
+ reason=reasoning_result.get("overall_assessment"))
559
+ reasoning_result = None
560
+ else:
561
+ log.info("Reasoning skipped: no LLM API key set (ANTHROPIC_API_KEY or OPENAI_API_KEY)")
562
+ else:
563
+ log.info("Reasoning disabled via --no-reasoning")
564
+
565
+ # Build root causes and confidence from reasoning or heuristic
566
+ reasoning_by_group = {}
567
+ if reasoning_result and reasoning_mode == "model":
568
+ for ga in reasoning_result.get("groups", []):
569
+ reasoning_by_group[ga["group_id"]] = ga
570
+
571
+ root_causes = []
572
+ for g in groups:
573
+ ga = reasoning_by_group.get(g.id)
574
+ if ga and ga.get("root_cause"):
575
+ # Model-provided root cause
576
+ root_causes.append({
577
+ "service": g.services[0] if g.services else trigger_service,
578
+ "type": ga["root_cause"],
579
+ "confidence": ga["confidence"],
580
+ "evidence": ga.get("reasoning", ""),
581
+ "recommended_actions": ga.get("recommended_actions", []),
582
+ })
583
+ else:
584
+ # Heuristic fallback: temporal proximity
585
+ for cc in g.change_candidates:
586
+ root_causes.append({
587
+ "service": cc.change.service,
588
+ "type": cc.change.payload.get("change_type", "unknown"),
589
+ "confidence": _proximity_confidence(cc.temporal_proximity_seconds),
590
+ "evidence": cc.change.payload.get("detail", ""),
591
+ })
592
+
593
+ blast_list = [
594
+ {
595
+ "service": svc,
596
+ "impact": "direct" if svc == trigger_service else "downstream",
597
+ "slo_breached": any(
598
+ getattr(e, "service", "") == svc
599
+ for e in events
600
+ if getattr(e, "type", None) == EventType.METRIC_BREACH
601
+ ),
602
+ }
603
+ for svc in sorted(affected)
604
+ ]
605
+
606
+ # Overall confidence: model reasoning or peak severity heuristic
607
+ if reasoning_result and reasoning_mode == "model":
608
+ overall_confidence = reasoning_result["overall_confidence"]
609
+ else:
610
+ overall_confidence = min(1.0, max(
611
+ max(s.peak_severity for s in g.signals) for g in groups
612
+ )) if groups else 0.5
613
+
614
+ # Build verdict summary: prefer model assessment over template
615
+ if reasoning_result and reasoning_mode == "model" and reasoning_result.get("overall_assessment"):
616
+ verdict_summary = reasoning_result["overall_assessment"]
617
+ elif root_causes:
618
+ verdict_summary = (
619
+ f"{root_causes[0].get('service', trigger_service)} "
620
+ f"{root_causes[0].get('type', 'incident')} — "
621
+ f"{len(blast_list)} services in blast radius"
622
+ )
623
+ else:
624
+ verdict_summary = f"{trigger_service} incident — {len(blast_list)} services affected"
625
+
626
+ corr_verdict = verdict_create(
627
+ subject={
628
+ "type": "correlation",
629
+ "ref": trigger_service,
630
+ "summary": verdict_summary,
631
+ },
632
+ judgment={
633
+ "action": "flag" if any(g.priority <= 1 for g in groups) else "escalate",
634
+ "confidence": overall_confidence,
635
+ },
636
+ producer={"system": "nthlayer-correlate"},
637
+ metadata={"custom": {
638
+ "trigger_verdict": trigger_verdict_id,
639
+ "root_causes": root_causes,
640
+ "blast_radius": blast_list,
641
+ "groups": len(groups),
642
+ "events_gathered": len(events),
643
+ "reasoning_mode": reasoning_mode,
644
+ "reasoning": reasoning_result if reasoning_mode == "model" else None,
645
+ "evidence_sources": {
646
+ "prometheus": True,
647
+ "verdict_store": True,
648
+ "trace_backend": trace_evidence.backend if trace_evidence else None,
649
+ },
650
+ "trace_query_time_ms": trace_evidence.query_time_ms if trace_evidence else None,
651
+ }},
652
+ )
653
+ verdict_link(corr_verdict, context=[trigger_verdict_id])
654
+ verdict_store.put(corr_verdict)
655
+
656
+ # Write content-addressed decision record
657
+ if decision_store_path:
658
+ from nthlayer_common.records.sqlite_store import SQLiteDecisionRecordStore
659
+ from nthlayer_common.records.verdict_bridge import write_decision_verdict
660
+
661
+ ds = SQLiteDecisionRecordStore(decision_store_path)
662
+ write_decision_verdict(
663
+ ds,
664
+ agent="correlate",
665
+ incident_id=getattr(corr_verdict.subject, "ref", "") or trigger_service,
666
+ timestamp=corr_verdict.timestamp,
667
+ model=reasoning_model or os.environ.get("NTHLAYER_MODEL", "heuristic"),
668
+ reasoning=getattr(corr_verdict.judgment, "reasoning", "") or verdict_summary,
669
+ action={
670
+ "root_causes": root_causes[:3],
671
+ "blast_radius_count": len(blast_list),
672
+ "trace_backend": trace_evidence.backend if trace_evidence else None,
673
+ "trace_services_count": len(trace_evidence.services) if trace_evidence else 0,
674
+ },
675
+ prompt_text=f"correlate {trigger_service} mode={reasoning_mode}",
676
+ response_text=str(reasoning_result) if reasoning_result else "heuristic",
677
+ summaries_technical=(
678
+ f"Correlation: {trigger_service}, {len(groups)} groups, {len(blast_list)} blast radius"
679
+ + (f", trace evidence from {trace_evidence.backend}" if trace_evidence else "")
680
+ ),
681
+ summaries_plain=verdict_summary[:280],
682
+ summaries_executive=(
683
+ f"{trigger_service} correlation — {reasoning_mode}"
684
+ + (" + traces" if trace_evidence else "")
685
+ ),
686
+ )
687
+
688
+ # Slack notification for correlation verdict
689
+ slack_url = os.environ.get("SLACK_WEBHOOK_URL", "")
690
+ if slack_url:
691
+ from nthlayer_common.slack import SlackNotifier
692
+ from nthlayer_workers.correlate.notifications import build_correlation_blocks, find_slack_thread_ts
693
+
694
+ thread_ts = find_slack_thread_ts(verdict_store, [trigger_verdict_id])
695
+ blocks, text = build_correlation_blocks(corr_verdict)
696
+ notifier = SlackNotifier(slack_url)
697
+ new_ts = asyncio.run(notifier.send(blocks, text, thread_ts=thread_ts))
698
+ if new_ts and not thread_ts:
699
+ corr_verdict.metadata.custom["slack_thread_ts"] = new_ts
700
+ verdict_store.put(corr_verdict)
701
+
702
+ print(f"Correlation verdict: {corr_verdict.id}")
703
+ print(f" Groups: {len(groups)}, Events: {len(events)}, Blast radius: {len(affected)} services")
704
+
705
+ # Forward to nthlayer-respond if respond_args is set
706
+ if respond_args:
707
+ import json
708
+ import subprocess
709
+
710
+ try:
711
+ args_dict = json.loads(respond_args)
712
+ except json.JSONDecodeError:
713
+ log.error("Invalid --respond-args JSON", raw=respond_args)
714
+ return 1
715
+
716
+ # Only allow known respond flags to prevent injection
717
+ allowed_keys = {"specs-dir", "config", "notify"}
718
+ for key in args_dict:
719
+ if key not in allowed_keys:
720
+ log.warning("Ignoring unknown respond arg", key=key)
721
+
722
+ cmd = [
723
+ "nthlayer-respond", "respond",
724
+ "--trigger-verdict", corr_verdict.id,
725
+ "--verdict-store", verdict_store_path,
726
+ ]
727
+ for key, value in args_dict.items():
728
+ if key in allowed_keys:
729
+ cmd.extend([f"--{key}", str(value)])
730
+
731
+ log.info("Invoking nthlayer-respond", cmd=cmd)
732
+ try:
733
+ result = subprocess.run(cmd, capture_output=True, text=True)
734
+ if result.returncode != 0:
735
+ log.error("nthlayer-respond failed", returncode=result.returncode, stderr=result.stderr)
736
+ else:
737
+ print(result.stdout)
738
+ except FileNotFoundError:
739
+ log.error("nthlayer-respond not found on PATH")
740
+
741
+ return 0
742
+
743
+
744
+ def _build_parser() -> argparse.ArgumentParser:
745
+ """Build the CLI argument parser."""
746
+ parser = argparse.ArgumentParser(
747
+ prog="nthlayer-correlate",
748
+ description="SitRep — Situational awareness through automated signal correlation",
749
+ )
750
+ subparsers = parser.add_subparsers(dest="command", help="Available commands")
751
+
752
+ # serve
753
+ serve_parser = subparsers.add_parser("serve", help="Start the full SitRep pipeline")
754
+ serve_parser.add_argument(
755
+ "--config", default=None, help="Path to sitrep.yaml config file"
756
+ )
757
+
758
+ # status
759
+ status_parser = subparsers.add_parser("status", help="Show current SitRep status")
760
+ status_parser.add_argument(
761
+ "--config", default=None, help="Path to sitrep.yaml config file"
762
+ )
763
+
764
+ # replay
765
+ replay_parser = subparsers.add_parser("replay", help="Replay a scenario fixture")
766
+ replay_parser.add_argument(
767
+ "--scenario", required=True, help="Path to scenario YAML file"
768
+ )
769
+ replay_parser.add_argument(
770
+ "--config", default=None, help="Path to sitrep.yaml config file"
771
+ )
772
+ replay_parser.add_argument(
773
+ "--no-model", action="store_true", help="Skip model calls"
774
+ )
775
+
776
+ # correlate (live data — trigger from verdict)
777
+ corr_parser = subparsers.add_parser("correlate", help="Correlate signals from a trigger verdict")
778
+ corr_parser.add_argument("--trigger-verdict", required=True, help="Evaluation verdict ID that triggered correlation")
779
+ corr_parser.add_argument("--prometheus-url", required=True, help="Prometheus base URL")
780
+ corr_parser.add_argument("--specs-dir", required=True, help="Directory of OpenSRM spec YAMLs")
781
+ corr_parser.add_argument("--verdict-store", default="verdicts.db", help="Path to verdict SQLite DB")
782
+ # Reasoning layer flags
783
+ reasoning_group = corr_parser.add_mutually_exclusive_group()
784
+ reasoning_group.add_argument("--reasoning", action="store_true", default=True, help="Enable model-based causal reasoning (default if API key set)")
785
+ reasoning_group.add_argument("--no-reasoning", action="store_true", help="Disable model reasoning, use heuristic only")
786
+ corr_parser.add_argument("--model", default=None, help="Model for reasoning (e.g. 'openai/gpt-4o', 'anthropic/claude-sonnet-4-20250514')")
787
+ # Forward flags for downstream components (passed through, not parsed by correlate)
788
+ corr_parser.add_argument("--respond-args", default=None, help="JSON-encoded args to forward to nthlayer-respond")
789
+ corr_parser.add_argument("--decision-store", default=None, help="Path to decision record SQLite DB for content-addressed records")
790
+ # Trace backend flags
791
+ corr_parser.add_argument("--trace-backend", default=None, choices=["tempo"], help="Trace backend to query for evidence (e.g. 'tempo')")
792
+ corr_parser.add_argument("--tempo-endpoint", default=None, help="Tempo query endpoint (e.g. 'http://tempo:3200')")
793
+ corr_parser.add_argument("--trace-detail", default="full", choices=["summary", "full"], help="Trace evidence detail level")
794
+
795
+ return parser
796
+
797
+
798
def main() -> None:
    """Parse the command line and run the selected subcommand.

    Every recognised subcommand terminates the process via ``sys.exit`` with
    the integer returned by its handler. When no subcommand is given, the
    help text is printed and the process exits with status 2.
    """
    cli = _build_parser()
    opts = cli.parse_args()
    command = opts.command

    # No subcommand supplied: show usage and signal a usage error.
    if command is None:
        cli.print_help()
        sys.exit(2)

    if command == "serve":
        sys.exit(serve_command(opts.config))

    if command == "status":
        sys.exit(status_command(opts.config))

    if command == "replay":
        # The replay store lives in a throwaway directory; exiting from inside
        # the ``with`` still lets the context manager clean it up.
        with tempfile.TemporaryDirectory() as scratch_dir:
            exit_code = replay_command(
                scenario_path=opts.scenario,
                config_path=opts.config,
                no_model=opts.no_model,
                store_dir=scratch_dir,
            )
            sys.exit(exit_code)

    if command == "correlate":
        # Build the trace backend only when one was requested; the import is
        # deferred so it is loaded solely on this path.
        tracer = None
        if opts.trace_backend == "tempo":
            from nthlayer_workers.correlate.traces.tempo import TempoTraceBackend

            tracer = TempoTraceBackend(
                endpoint=opts.tempo_endpoint,
                use_service_graphs=True,
            )

        exit_code = correlate_command(
            trigger_verdict_id=opts.trigger_verdict,
            prometheus_url=opts.prometheus_url,
            specs_dir=opts.specs_dir,
            verdict_store_path=opts.verdict_store,
            respond_args=opts.respond_args,
            reasoning=not opts.no_reasoning,
            reasoning_model=opts.model,
            decision_store_path=getattr(opts, "decision_store", None),
            trace_backend=tracer,
        )
        sys.exit(exit_code)
844
+
845
+
846
+ if __name__ == "__main__":
847
+ main()