nthlayer-workers 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. nthlayer_workers/__init__.py +5 -0
  2. nthlayer_workers/cli.py +234 -0
  3. nthlayer_workers/correlate/__init__.py +1 -0
  4. nthlayer_workers/correlate/cli.py +847 -0
  5. nthlayer_workers/correlate/config.py +111 -0
  6. nthlayer_workers/correlate/correlation/__init__.py +1 -0
  7. nthlayer_workers/correlate/correlation/changes.py +87 -0
  8. nthlayer_workers/correlate/correlation/dedup.py +62 -0
  9. nthlayer_workers/correlate/correlation/engine.py +244 -0
  10. nthlayer_workers/correlate/correlation/temporal.py +79 -0
  11. nthlayer_workers/correlate/correlation/topology.py +104 -0
  12. nthlayer_workers/correlate/ingestion/__init__.py +1 -0
  13. nthlayer_workers/correlate/ingestion/protocol.py +10 -0
  14. nthlayer_workers/correlate/ingestion/severity.py +18 -0
  15. nthlayer_workers/correlate/ingestion/webhook.py +197 -0
  16. nthlayer_workers/correlate/notifications.py +85 -0
  17. nthlayer_workers/correlate/prometheus.py +234 -0
  18. nthlayer_workers/correlate/reasoning.py +375 -0
  19. nthlayer_workers/correlate/session.py +189 -0
  20. nthlayer_workers/correlate/snapshot/__init__.py +1 -0
  21. nthlayer_workers/correlate/snapshot/generator.py +170 -0
  22. nthlayer_workers/correlate/snapshot/model.py +177 -0
  23. nthlayer_workers/correlate/snapshot/token.py +14 -0
  24. nthlayer_workers/correlate/state.py +88 -0
  25. nthlayer_workers/correlate/store/__init__.py +5 -0
  26. nthlayer_workers/correlate/store/protocol.py +48 -0
  27. nthlayer_workers/correlate/store/sqlite.py +443 -0
  28. nthlayer_workers/correlate/summary.py +180 -0
  29. nthlayer_workers/correlate/traces/__init__.py +1 -0
  30. nthlayer_workers/correlate/traces/protocol.py +120 -0
  31. nthlayer_workers/correlate/traces/tempo.py +667 -0
  32. nthlayer_workers/correlate/traces/topology.py +39 -0
  33. nthlayer_workers/correlate/types.py +77 -0
  34. nthlayer_workers/correlate/worker.py +630 -0
  35. nthlayer_workers/learn/__init__.py +5 -0
  36. nthlayer_workers/learn/__main__.py +5 -0
  37. nthlayer_workers/learn/cli.py +164 -0
  38. nthlayer_workers/learn/retrospective.py +381 -0
  39. nthlayer_workers/learn/trends.py +102 -0
  40. nthlayer_workers/learn/worker.py +366 -0
  41. nthlayer_workers/measure/__init__.py +3 -0
  42. nthlayer_workers/measure/__main__.py +5 -0
  43. nthlayer_workers/measure/_parsing.py +15 -0
  44. nthlayer_workers/measure/adapters/__init__.py +0 -0
  45. nthlayer_workers/measure/adapters/_util.py +24 -0
  46. nthlayer_workers/measure/adapters/devin.py +119 -0
  47. nthlayer_workers/measure/adapters/gastown.py +88 -0
  48. nthlayer_workers/measure/adapters/prometheus.py +277 -0
  49. nthlayer_workers/measure/adapters/protocol.py +20 -0
  50. nthlayer_workers/measure/adapters/webhook.py +161 -0
  51. nthlayer_workers/measure/api/__init__.py +0 -0
  52. nthlayer_workers/measure/api/normalise.py +50 -0
  53. nthlayer_workers/measure/api/queue.py +243 -0
  54. nthlayer_workers/measure/api/response.py +51 -0
  55. nthlayer_workers/measure/api/server.py +504 -0
  56. nthlayer_workers/measure/calibration/__init__.py +0 -0
  57. nthlayer_workers/measure/calibration/loop.py +62 -0
  58. nthlayer_workers/measure/calibration/slos.py +212 -0
  59. nthlayer_workers/measure/calibration/verdict_calibration.py +31 -0
  60. nthlayer_workers/measure/cli.py +753 -0
  61. nthlayer_workers/measure/config.py +191 -0
  62. nthlayer_workers/measure/detection/__init__.py +6 -0
  63. nthlayer_workers/measure/detection/detector.py +82 -0
  64. nthlayer_workers/measure/detection/protocol.py +29 -0
  65. nthlayer_workers/measure/governance/__init__.py +0 -0
  66. nthlayer_workers/measure/governance/engine.py +163 -0
  67. nthlayer_workers/measure/manifest.py +77 -0
  68. nthlayer_workers/measure/notifications.py +53 -0
  69. nthlayer_workers/measure/pipeline/__init__.py +0 -0
  70. nthlayer_workers/measure/pipeline/evaluator.py +155 -0
  71. nthlayer_workers/measure/pipeline/router.py +160 -0
  72. nthlayer_workers/measure/store/__init__.py +0 -0
  73. nthlayer_workers/measure/store/protocol.py +38 -0
  74. nthlayer_workers/measure/store/sqlite.py +276 -0
  75. nthlayer_workers/measure/telemetry.py +116 -0
  76. nthlayer_workers/measure/tiering/__init__.py +0 -0
  77. nthlayer_workers/measure/tiering/classifier.py +58 -0
  78. nthlayer_workers/measure/tiering/promotion.py +118 -0
  79. nthlayer_workers/measure/trends/__init__.py +0 -0
  80. nthlayer_workers/measure/trends/tracker.py +72 -0
  81. nthlayer_workers/measure/types.py +75 -0
  82. nthlayer_workers/measure/worker.py +439 -0
  83. nthlayer_workers/observe/__init__.py +25 -0
  84. nthlayer_workers/observe/__main__.py +5 -0
  85. nthlayer_workers/observe/api/__init__.py +1 -0
  86. nthlayer_workers/observe/assessment.py +95 -0
  87. nthlayer_workers/observe/cli.py +737 -0
  88. nthlayer_workers/observe/config.py +11 -0
  89. nthlayer_workers/observe/db/__init__.py +1 -0
  90. nthlayer_workers/observe/decision_records.py +220 -0
  91. nthlayer_workers/observe/dependencies/__init__.py +18 -0
  92. nthlayer_workers/observe/dependencies/discovery.py +294 -0
  93. nthlayer_workers/observe/dependencies/providers/__init__.py +48 -0
  94. nthlayer_workers/observe/dependencies/providers/backstage.py +467 -0
  95. nthlayer_workers/observe/dependencies/providers/base.py +76 -0
  96. nthlayer_workers/observe/dependencies/providers/consul.py +518 -0
  97. nthlayer_workers/observe/dependencies/providers/etcd.py +360 -0
  98. nthlayer_workers/observe/dependencies/providers/kubernetes.py +682 -0
  99. nthlayer_workers/observe/dependencies/providers/prometheus.py +368 -0
  100. nthlayer_workers/observe/dependencies/providers/zookeeper.py +399 -0
  101. nthlayer_workers/observe/deployments/__init__.py +1 -0
  102. nthlayer_workers/observe/discovery/__init__.py +14 -0
  103. nthlayer_workers/observe/discovery/classifier.py +66 -0
  104. nthlayer_workers/observe/discovery/client.py +189 -0
  105. nthlayer_workers/observe/discovery/models.py +53 -0
  106. nthlayer_workers/observe/drift/__init__.py +26 -0
  107. nthlayer_workers/observe/drift/analyzer.py +383 -0
  108. nthlayer_workers/observe/drift/models.py +174 -0
  109. nthlayer_workers/observe/drift/patterns.py +88 -0
  110. nthlayer_workers/observe/explanation.py +118 -0
  111. nthlayer_workers/observe/gate/__init__.py +39 -0
  112. nthlayer_workers/observe/gate/conditions.py +92 -0
  113. nthlayer_workers/observe/gate/correlator.py +154 -0
  114. nthlayer_workers/observe/gate/evaluator.py +192 -0
  115. nthlayer_workers/observe/gate/policies.py +226 -0
  116. nthlayer_workers/observe/gate_adapter.py +40 -0
  117. nthlayer_workers/observe/incident.py +36 -0
  118. nthlayer_workers/observe/portfolio/__init__.py +17 -0
  119. nthlayer_workers/observe/portfolio/aggregator.py +168 -0
  120. nthlayer_workers/observe/portfolio/scorer.py +13 -0
  121. nthlayer_workers/observe/slo/__init__.py +19 -0
  122. nthlayer_workers/observe/slo/collector.py +235 -0
  123. nthlayer_workers/observe/slo/spec_loader.py +40 -0
  124. nthlayer_workers/observe/sqlite_store.py +152 -0
  125. nthlayer_workers/observe/store.py +92 -0
  126. nthlayer_workers/observe/verification/__init__.py +22 -0
  127. nthlayer_workers/observe/verification/exporter_guidance.py +146 -0
  128. nthlayer_workers/observe/verification/extractor.py +127 -0
  129. nthlayer_workers/observe/verification/models.py +101 -0
  130. nthlayer_workers/observe/verification/verifier.py +111 -0
  131. nthlayer_workers/observe/worker.py +332 -0
  132. nthlayer_workers/respond/__init__.py +2 -0
  133. nthlayer_workers/respond/__main__.py +4 -0
  134. nthlayer_workers/respond/agents/__init__.py +0 -0
  135. nthlayer_workers/respond/agents/base.py +556 -0
  136. nthlayer_workers/respond/agents/communication.py +115 -0
  137. nthlayer_workers/respond/agents/investigation.py +124 -0
  138. nthlayer_workers/respond/agents/remediation.py +219 -0
  139. nthlayer_workers/respond/agents/triage.py +132 -0
  140. nthlayer_workers/respond/cli.py +772 -0
  141. nthlayer_workers/respond/config.py +135 -0
  142. nthlayer_workers/respond/context_store.py +256 -0
  143. nthlayer_workers/respond/coordinator.py +487 -0
  144. nthlayer_workers/respond/metrics.py +104 -0
  145. nthlayer_workers/respond/notification_backends/__init__.py +1 -0
  146. nthlayer_workers/respond/notification_backends/ntfy_backend.py +158 -0
  147. nthlayer_workers/respond/notification_backends/protocol.py +59 -0
  148. nthlayer_workers/respond/notification_backends/slack_backend.py +203 -0
  149. nthlayer_workers/respond/notification_backends/stdout_backend.py +56 -0
  150. nthlayer_workers/respond/notifications.py +247 -0
  151. nthlayer_workers/respond/oncall/__init__.py +1 -0
  152. nthlayer_workers/respond/oncall/escalation.py +103 -0
  153. nthlayer_workers/respond/oncall/runner.py +193 -0
  154. nthlayer_workers/respond/oncall/schedule.py +243 -0
  155. nthlayer_workers/respond/safe_actions/__init__.py +0 -0
  156. nthlayer_workers/respond/safe_actions/actions.py +139 -0
  157. nthlayer_workers/respond/safe_actions/registry.py +171 -0
  158. nthlayer_workers/respond/safe_actions/webhook.py +194 -0
  159. nthlayer_workers/respond/server.py +357 -0
  160. nthlayer_workers/respond/sre/__init__.py +1 -0
  161. nthlayer_workers/respond/sre/brief.py +175 -0
  162. nthlayer_workers/respond/sre/delegation.py +101 -0
  163. nthlayer_workers/respond/sre/post_incident.py +146 -0
  164. nthlayer_workers/respond/sre/shift_report.py +129 -0
  165. nthlayer_workers/respond/sre/suppression.py +91 -0
  166. nthlayer_workers/respond/types.py +109 -0
  167. nthlayer_workers/respond/verdict_submission.py +56 -0
  168. nthlayer_workers/respond/worker.py +533 -0
  169. nthlayer_workers/respond/worker_helpers.py +140 -0
  170. nthlayer_workers/runner.py +198 -0
  171. nthlayer_workers-1.0.0.dist-info/METADATA +19 -0
  172. nthlayer_workers-1.0.0.dist-info/RECORD +175 -0
  173. nthlayer_workers-1.0.0.dist-info/WHEEL +5 -0
  174. nthlayer_workers-1.0.0.dist-info/entry_points.txt +2 -0
  175. nthlayer_workers-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,375 @@
1
+ """Reasoning layer for correlation groups.
2
+
3
+ Used by: the `correlate` CLI subcommand (live Prometheus-triggered correlation).
4
+ See also: snapshot/model.py which serves the `serve` and `replay` subcommands.
5
+
6
+ Sits between CorrelationEngine.correlate() group assembly and verdict creation.
7
+ Calls an LLM to assess causal relationships, root causes, and recommended actions.
8
+ Provider-agnostic via nthlayer-common wrapper (Anthropic, OpenAI, Ollama, etc.).
9
+ Additive: --no-reasoning produces identical output to pre-reasoning behavior.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import asyncio
14
+ import json
15
+ import os
16
+ from pathlib import Path
17
+
18
+ import structlog
19
+
20
+ from nthlayer_common.prompts import load_prompt
21
+ from nthlayer_workers.correlate.types import CorrelationGroup
22
+
23
+ _PROMPT_PATH = Path(__file__).parent.parent.parent / "prompts" / "reasoning.yaml"
24
+
25
+ logger = structlog.get_logger(__name__)
26
+
27
+
28
def reasoning_available() -> bool:
    """Report whether an LLM provider appears to be configured.

    Only environment variables are inspected; no request is sent to the
    provider. The result is True when any of NTHLAYER_MODEL (which also
    covers keyless providers such as Ollama), ANTHROPIC_API_KEY or
    OPENAI_API_KEY is set to a non-empty value.
    """
    provider_vars = ("NTHLAYER_MODEL", "ANTHROPIC_API_KEY", "OPENAI_API_KEY")
    # An empty string counts as "not set", exactly like a missing variable.
    return any(os.environ.get(name) for name in provider_vars)
39
+
40
+
41
async def reason_about_correlations(
    groups: list[CorrelationGroup],
    dependency_graph: dict,
    slo_targets: dict | None = None,
    trace_evidence: object | None = None,
    model: str | None = None,
    max_tokens: int = 4096,
    timeout: int = 30,
) -> dict:
    """Call an LLM to reason about pre-correlated groups.

    Args:
        groups: Correlation groups to assess.
        dependency_graph: Mapping of service name to its graph entry; rendered
            into the prompt.
        slo_targets: Optional per-service SLO targets rendered into the prompt.
        trace_evidence: Optional trace evidence rendered into the prompt.
        model: Model identifier forwarded to the LLM wrapper (None lets the
            wrapper decide).
        max_tokens: Response token limit forwarded to the LLM wrapper.
        timeout: Timeout forwarded to the LLM wrapper.

    Returns:
        Structured reasoning dict with keys:
        - groups: list of per-group reasoning (root_cause, confidence,
          reasoning, recommended_actions)
        - overall_assessment: str
        - overall_confidence: float

        Falls back to _degraded_reasoning() on any failure, including failures
        while loading or rendering the prompts.
    """
    if not groups:
        return _degraded_reasoning(groups, reason="no correlation groups to assess")

    try:
        # Prompt construction lives inside the guarded region: loading the
        # prompt file or rendering a malformed group can raise, and the
        # contract is that any failure degrades instead of propagating.
        system_prompt = _build_system_prompt()
        user_prompt = _build_user_prompt(
            groups, dependency_graph, slo_targets, trace_evidence=trace_evidence
        )
        response_text = await _call_model(
            system_prompt, user_prompt, model, max_tokens, timeout
        )
        result = _parse_reasoning_response(response_text, groups)
        logger.info(
            "reasoning_complete",
            groups=len(groups),
            overall_confidence=result.get("overall_confidence", 0.0),
        )
        return result
    except Exception as exc:
        logger.warning("reasoning_failed", error=str(exc))
        return _degraded_reasoning(groups, reason=str(exc))
79
+
80
+
81
def _degraded_reasoning(
    groups: list[CorrelationGroup],
    reason: str = "model unavailable",
) -> dict:
    """Build the placeholder result used when no model judgment is available.

    Every group gets a zero-confidence entry flagged ``degraded`` with no root
    cause and no recommended actions; the overall result is flagged the same
    way. ``reason`` is embedded in the per-group and overall text.
    """
    per_group = [
        {
            "group_id": group.id,
            "root_cause": None,
            "confidence": 0.0,
            "reasoning": f"Degraded mode: {reason}",
            "recommended_actions": [],
            "is_causal": None,
            "degraded": True,
        }
        for group in groups
    ]

    fallback = {
        "groups": per_group,
        "overall_assessment": f"Reasoning unavailable: {reason}",
        "overall_confidence": 0.0,
        "degraded": True,
    }
    return fallback
107
+
108
+
109
async def _call_model(
    system_prompt: str,
    user_prompt: str,
    model: str | None,
    max_tokens: int,
    timeout: int,
) -> str:
    """Run the shared nthlayer-common ``llm_call`` and return its text.

    ``llm_call`` is invoked through ``asyncio.to_thread`` so the event loop is
    not blocked while waiting for it.
    """
    from nthlayer_common.llm import llm_call

    call_kwargs = {
        "system": system_prompt,
        "user": user_prompt,
        "model": model,
        "max_tokens": max_tokens,
        "timeout": timeout,
    }
    completion = await asyncio.to_thread(llm_call, **call_kwargs)
    return completion.text
128
+
129
+
130
def _build_system_prompt() -> str:
    """Return the ``system`` text of the prompt spec loaded from _PROMPT_PATH."""
    return load_prompt(_PROMPT_PATH).system
133
+
134
+
135
def _build_user_prompt(
    groups: list[CorrelationGroup],
    dependency_graph: dict,
    slo_targets: dict | None,
    *,
    trace_evidence: object | None = None,
) -> str:
    """Render the user prompt describing the correlation groups.

    The text consists of blank-line-separated blocks, in this order: the
    dependency graph (pruned to services mentioned by the groups plus one
    hop), the SLO targets, one block per correlation group, and the optional
    trace-evidence section. Blocks with nothing to show are omitted.
    """
    # Token budget: keep at most 10 groups, lowest priority number first.
    group_limit = 10
    if len(groups) > group_limit:
        groups = sorted(groups, key=lambda grp: grp.priority)[:group_limit]

    # Services named by the groups themselves or by their change candidates.
    mentioned = set()
    for grp in groups:
        mentioned.update(grp.services)
        mentioned.update(cand.change.service for cand in grp.change_candidates)

    blocks = []

    if dependency_graph:
        # Widen the set by one hop in both directions.
        in_scope = set(mentioned)
        for name in mentioned:
            node = dependency_graph.get(name, {})
            in_scope.update(node.get("dependencies", []))
            in_scope.update(node.get("dependents", []))

        graph_lines = []
        for name, node in sorted(dependency_graph.items()):
            if name in in_scope:
                upstream = node.get("dependencies", [])
                downstream = node.get("dependents", [])
                level = node.get("tier", "standard")
                graph_lines.append(
                    f" {name} (tier={level}): depends_on={upstream}, depended_by={downstream}"
                )
        if graph_lines:
            blocks.append("DEPENDENCY GRAPH:\n" + "\n".join(graph_lines))

    if slo_targets:
        slo_text = "\n".join(
            f" {name}: {targets}" for name, targets in sorted(slo_targets.items())
        )
        blocks.append("SLO TARGETS:\n" + slo_text)

    for grp in groups:
        out = [
            f"GROUP {grp.id} (P{grp.priority}):",
            f" Services: {', '.join(grp.services)}",
            f" Event count: {grp.event_count}",
            f" Time range: {grp.first_seen} to {grp.last_updated}",
            f" Summary: {grp.summary}",
        ]

        topo = grp.topology
        if topo:
            out.append(
                f" Topology: primary={topo.primary_service}, "
                f"related={topo.related_services}, "
                f"path={topo.topology_path}"
            )

        for signal in grp.signals:
            out.append(
                f" Signal: {signal.service} — {signal.count} event(s), "
                f"peak_severity={signal.peak_severity:.2f}, "
                f"duration={signal.duration_seconds:.0f}s, "
                f"window={signal.time_window[0]} to {signal.time_window[1]}"
            )
            # Only the first five events per signal are rendered (budget cap).
            out.extend(
                f" Event: type={event.type.value}, source={event.source}, "
                f"severity={event.severity:.2f}, payload={_compact_payload(event.payload)}"
                for event in signal.events[:5]
            )

        out.extend(
            f" Change candidate: service={cand.change.service}, "
            f"proximity={cand.temporal_proximity_seconds:.0f}s, "
            f"same_service={cand.same_service}, "
            f"dependency_related={cand.dependency_related}, "
            f"payload={_compact_payload(cand.change.payload)}"
            for cand in grp.change_candidates
        )

        blocks.append("\n".join(out))

    trace_block = _build_trace_evidence_section(trace_evidence)
    if trace_block:
        blocks.append(trace_block)

    return "\n\n".join(blocks)
233
+
234
+
235
def _build_trace_evidence_section(trace_evidence: object | None) -> str:
    """Render trace evidence as flat text for the reasoning prompt.

    Returns an empty string when there is nothing to render: the argument is
    None, is not a TraceEvidence instance, or lists no services.

    NOTE: v0 output is a single flat text block. Future: multi-register
    summaries (span-level detail for the investigation agent, high-level for
    the communication agent) via the Summaries(technical, plain, executive)
    pattern from decision records.
    """
    if trace_evidence is None:
        return ""

    from nthlayer_workers.correlate.traces.protocol import TraceEvidence

    if not isinstance(trace_evidence, TraceEvidence) or not trace_evidence.services:
        return ""

    parts = ["TRACE EVIDENCE:"]

    for entry in trace_evidence.services:
        body = [f" {entry.service}:"]

        # Latency: a shift is only reported when it exceeds 10% either way.
        shift = entry.latency_change_pct
        if shift is not None and abs(shift) > 10:
            verdict = f"({shift:+.0f}% vs baseline)"
        else:
            verdict = "(within baseline)"
        body.append(
            f" Latency: p50={entry.p50_latency_ms:.0f}ms, "
            f"p99={entry.p99_latency_ms:.0f}ms {verdict}"
        )

        # Errors: shown only above a 1% error rate, with up to three messages.
        if entry.error_rate > 0.01:
            body.append(
                f" Errors: {entry.error_rate:.1%} error rate "
                f"({entry.error_count}/{entry.total_request_count} requests)"
            )
            body.extend(
                f' - "{failure.error_message}" (x{failure.count})'
                for failure in entry.top_errors[:3]
            )

        # Slow operations: first three candidates, kept when >20% slower.
        body.extend(
            f" Slow operation: {oper.operation} "
            f"p99={oper.p99_ms:.0f}ms ({oper.change_pct:+.0f}% vs baseline)"
            for oper in entry.slow_operations[:3]
            if oper.change_pct is not None and oper.change_pct > 20
        )

        # Call edges: at most three in each direction.
        if entry.callers:
            inbound = [
                f"{edge.source_service} ({edge.request_count} reqs, p99={edge.p99_latency_ms:.0f}ms)"
                for edge in entry.callers[:3]
            ]
            body.append(f" Called by: {', '.join(inbound)}")

        if entry.callees:
            outbound = [
                f"{edge.target_service} ({edge.request_count} reqs, p99={edge.p99_latency_ms:.0f}ms, "
                f"err={edge.error_count}/{edge.request_count})"
                for edge in entry.callees[:3]
            ]
            body.append(f" Calls: {', '.join(outbound)}")

        parts.append("\n".join(body))

    divergence = trace_evidence.topology_divergence
    if divergence:
        headed_pairs = (
            (
                " Undeclared Dependencies (observed in traces, not in specs):\n",
                divergence.observed_not_declared,
            ),
            (
                " Declared but Unobserved Dependencies (in specs, no traces):\n",
                divergence.declared_not_observed,
            ),
        )
        for heading, pairs in headed_pairs:
            if pairs:
                parts.append(heading + "\n".join(f" - {a} → {b}" for a, b in pairs))

    return "\n\n".join(parts)
320
+
321
+
322
def _compact_payload(payload: dict) -> str:
    """Return a compact JSON rendering of ``payload`` for the prompt.

    Nested dicts whose ``str()`` form is longer than 100 characters become
    ``"{...}"`` and lists with more than five items become ``"[N items]"``;
    all other values are kept. Values JSON cannot encode are stringified.

    A missing payload (None) renders as ``{}`` and a non-dict payload is
    serialised as-is, so one odd event cannot abort prompt construction.
    """
    if payload is None:
        return "{}"
    if not isinstance(payload, dict):
        return json.dumps(payload, default=str, separators=(",", ":"))

    compact = {}
    for key, value in payload.items():
        if isinstance(value, dict) and len(str(value)) > 100:
            compact[key] = "{...}"
        elif isinstance(value, list) and len(value) > 5:
            compact[key] = f"[{len(value)} items]"
        else:
            compact[key] = value
    return json.dumps(compact, default=str, separators=(",", ":"))
333
+
334
+
335
def _parse_reasoning_response(
    response_text: str,
    groups: list[CorrelationGroup],
) -> dict:
    """Parse the model's JSON response into structured reasoning.

    Args:
        response_text: Raw model output; markdown fences are stripped first.
        groups: The groups that were sent to the model. Assessments whose
            ``group_id`` is not one of these groups' ids are dropped.

    Returns:
        Dict with ``groups`` (normalised per-group assessments),
        ``overall_assessment``, ``overall_confidence`` and ``degraded=False``.

    Raises:
        json.JSONDecodeError: The response is not valid JSON.
        ValueError: The response is valid JSON but not a JSON object.

    Damage in a single field is tolerated rather than discarding the whole
    response: non-dict group entries are skipped, a null or non-numeric
    confidence falls back to the same default as a missing one, and a null
    ``recommended_actions`` becomes an empty list.
    """
    from nthlayer_common.parsing import clamp, strip_markdown_fences

    def _confidence(raw: object, default: float) -> float:
        # float() rejects None (TypeError) and non-numeric text (ValueError).
        try:
            return clamp(float(raw))
        except (TypeError, ValueError):
            return clamp(default)

    data = json.loads(strip_markdown_fences(response_text))
    if not isinstance(data, dict):
        raise ValueError(
            f"expected a JSON object from the model, got {type(data).__name__}"
        )

    group_assessments = data.get("groups")
    if not isinstance(group_assessments, list):
        group_assessments = []
    valid_group_ids = {g.id for g in groups}

    normalized = []
    for ga in group_assessments:
        if not isinstance(ga, dict):
            logger.debug("reasoning_malformed_group", entry_type=type(ga).__name__)
            continue

        gid = ga.get("group_id", "")
        if gid not in valid_group_ids:
            logger.debug("reasoning_unknown_group_id", group_id=gid)
            continue

        normalized.append({
            "group_id": gid,
            "root_cause": ga.get("root_cause"),
            "confidence": _confidence(ga.get("confidence", 0.5), 0.5),
            "reasoning": ga.get("reasoning", ""),
            "recommended_actions": ga.get("recommended_actions") or [],
            "is_causal": ga.get("is_causal"),
            "degraded": False,
        })

    return {
        "groups": normalized,
        "overall_assessment": data.get("overall_assessment", ""),
        "overall_confidence": _confidence(data.get("overall_confidence", 0.0), 0.0),
        "degraded": False,
    }
@@ -0,0 +1,189 @@
1
+ """Session window logic for the correlate worker module.
2
+
3
+ Events are grouped by correlation domain (service + environment) into
4
+ session windows. Windows close on three conditions:
5
+ - Temporal gap: no new events for gap_seconds (default 60s)
6
+ - Max duration: window has been open for max_duration_seconds (default 15m)
7
+ - Trigger: a quality_breach verdict arrives in the domain
8
+
9
+ This is the architectural core of the correlate module — session windows
10
+ are what make correlation novel vs. simple alert aggregation.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from dataclasses import dataclass, field
16
+ from datetime import datetime, timezone
17
+
18
+ from nthlayer_workers.correlate.types import SitRepEvent
19
+
20
+
21
@dataclass(frozen=True)
class CorrelationDomain:
    """Hashable (service, environment) pair used as the session-window key."""

    service: str
    environment: str

    @classmethod
    def from_event(cls, event: SitRepEvent) -> CorrelationDomain:
        """Build the domain from an event's ``service`` and ``environment``."""
        return cls(event.service, event.environment)

    def __str__(self) -> str:
        """Render as ``service:environment``."""
        return "{}:{}".format(self.service, self.environment)
34
+
35
+
36
@dataclass
class SessionWindow:
    """An open session window collecting the events of one correlation domain."""

    domain: CorrelationDomain
    events: list[SitRepEvent] = field(default_factory=list)
    opened_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    last_event_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    has_trigger: bool = False

    def _close_flags(
        self,
        now: datetime,
        gap_seconds: float = 60.0,
        max_duration_seconds: float = 900.0,
    ) -> tuple[bool, bool, bool]:
        """Evaluate the close conditions as ``(gap, max_duration, trigger)``."""
        idle_for = (now - self.last_event_at).total_seconds()
        open_for = (now - self.opened_at).total_seconds()
        return (
            idle_for >= gap_seconds,
            open_for >= max_duration_seconds,
            self.has_trigger,
        )

    def should_close(
        self,
        now: datetime,
        gap_seconds: float = 60.0,
        max_duration_seconds: float = 900.0,
    ) -> bool:
        """Tell whether the window should close at ``now``.

        True as soon as one condition holds:
        - gap: at least gap_seconds have passed since last_event_at
        - max duration: the window has been open for at least
          max_duration_seconds (keeps a window from staying open forever
          under continuous load)
        - trigger: the window was marked as triggered
        """
        idle, expired, triggered = self._close_flags(
            now, gap_seconds, max_duration_seconds
        )
        return bool(idle or expired or triggered)

    def close_reason(
        self,
        now: datetime,
        gap_seconds: float = 60.0,
        max_duration_seconds: float = 900.0,
    ) -> str:
        """Name the close condition that applies at ``now``.

        Precedence is trigger, then max_duration, then gap; "unknown" means
        no condition holds.
        """
        idle, expired, triggered = self._close_flags(
            now, gap_seconds, max_duration_seconds
        )
        ranked = (("trigger", triggered), ("max_duration", expired), ("gap", idle))
        for label, holds in ranked:
            if holds:
                return label
        return "unknown"

    def add_event(self, event: SitRepEvent) -> None:
        """Append ``event``; last_event_at only ever moves forward."""
        self.events.append(event)
        # max() keeps the current value on ties and for older timestamps.
        self.last_event_at = max(self.last_event_at, _parse_event_ts(event))
95
+
96
+
97
class SessionWindowManager:
    """Manages open session windows across correlation domains.

    At most one open window exists per CorrelationDomain. ``gap_seconds`` and
    ``max_duration_seconds`` are the close thresholds handed to every window.
    """

    def __init__(
        self,
        gap_seconds: float = 60.0,
        max_duration_seconds: float = 900.0,
    ):
        self.gap_seconds = gap_seconds
        self.max_duration_seconds = max_duration_seconds
        self._windows: dict[CorrelationDomain, SessionWindow] = {}

    @property
    def open_windows(self) -> dict[CorrelationDomain, SessionWindow]:
        """Shallow copy of the open windows, keyed by domain."""
        return dict(self._windows)

    def ingest(self, event: SitRepEvent) -> None:
        """Assign an event to its domain's window, opening one if needed."""
        domain = CorrelationDomain.from_event(event)
        ts = _parse_event_ts(event)

        if domain not in self._windows:
            # min() clamps a future-dated event timestamp to the current time;
            # an older timestamp is kept unchanged.
            # NOTE(review): the previous comment said a stale backlogged event
            # must not push opened_at into the past (to avoid an immediate
            # max_duration close), but min() does not prevent that — confirm
            # which behaviour is intended before changing the expression.
            now = datetime.now(timezone.utc)
            self._windows[domain] = SessionWindow(
                domain=domain,
                events=[],
                opened_at=min(ts, now),
                last_event_at=ts,
            )

        window = self._windows[domain]
        window.add_event(event)

        # A quality_breach marks the window as triggered; it is recognised by
        # the source prefix or by a dict payload with type == "quality_breach".
        payload = getattr(event, "payload", None)
        if event.source.startswith("verdict:quality_breach") or (
            isinstance(payload, dict) and payload.get("type") == "quality_breach"
        ):
            window.has_trigger = True

    def close_ready(self, now: datetime | None = None) -> list[SessionWindow]:
        """Remove and return every window whose close condition holds.

        ``now`` defaults to the current UTC time.
        """
        if now is None:
            now = datetime.now(timezone.utc)

        # Decide first, remove afterwards, so the dict is never mutated
        # while it is being iterated.
        ready = [
            domain
            for domain, window in self._windows.items()
            if window.should_close(now, self.gap_seconds, self.max_duration_seconds)
        ]
        return [self._windows.pop(domain) for domain in ready]

    @staticmethod
    def _aware_utc(value: str | datetime) -> datetime:
        """Coerce a persisted timestamp to a timezone-aware datetime.

        Accepts an ISO-8601 string (a trailing ``Z`` is understood) or a
        datetime; a value without timezone information is assumed to be UTC.
        """
        if isinstance(value, datetime):
            parsed = value
        else:
            parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    def restore_window(self, domain: CorrelationDomain, metadata: dict) -> None:
        """Restore a window from persisted metadata (crash recovery).

        Events are re-fetched separately; this only restores window state.
        Timestamps are normalised to timezone-aware values so the restored
        window can be compared against an aware ``now`` when close conditions
        are evaluated.

        Raises:
            KeyError: ``metadata`` lacks ``opened_at`` or ``last_event_at``.
        """
        self._windows[domain] = SessionWindow(
            domain=domain,
            events=[],
            opened_at=self._aware_utc(metadata["opened_at"]),
            last_event_at=self._aware_utc(metadata["last_event_at"]),
            has_trigger=metadata.get("has_trigger", False),
        )

    def to_state(self) -> dict:
        """Serialise open windows to a dict for component_state persistence.

        Keys are ``str(domain)``; each value holds the ISO-formatted
        timestamps, the event count and the trigger flag. Events themselves
        are not included.
        """
        return {
            str(domain): {
                "opened_at": window.opened_at.isoformat(),
                "last_event_at": window.last_event_at.isoformat(),
                "event_count": len(window.events),
                "has_trigger": window.has_trigger,
            }
            for domain, window in self._windows.items()
        }
179
+
180
+
181
def _parse_event_ts(event: SitRepEvent) -> datetime:
    """Return the event's timestamp as a timezone-aware datetime.

    ``event.timestamp`` may be a datetime or an ISO-8601 string (a ``Z``
    suffix is accepted). A value carrying no timezone is tagged as UTC; a
    value that already has one is returned with that timezone unchanged.
    """
    raw = event.timestamp

    if not isinstance(raw, datetime):
        parsed = datetime.fromisoformat(raw.replace("Z", "+00:00"))
        return parsed.replace(tzinfo=timezone.utc) if parsed.tzinfo is None else parsed

    if raw.tzinfo:
        return raw
    return raw.replace(tzinfo=timezone.utc)
@@ -0,0 +1 @@
1
+ """Snapshot generation for SitRep."""