nthlayer-workers 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nthlayer_workers/__init__.py +5 -0
- nthlayer_workers/cli.py +234 -0
- nthlayer_workers/correlate/__init__.py +1 -0
- nthlayer_workers/correlate/cli.py +847 -0
- nthlayer_workers/correlate/config.py +111 -0
- nthlayer_workers/correlate/correlation/__init__.py +1 -0
- nthlayer_workers/correlate/correlation/changes.py +87 -0
- nthlayer_workers/correlate/correlation/dedup.py +62 -0
- nthlayer_workers/correlate/correlation/engine.py +244 -0
- nthlayer_workers/correlate/correlation/temporal.py +79 -0
- nthlayer_workers/correlate/correlation/topology.py +104 -0
- nthlayer_workers/correlate/ingestion/__init__.py +1 -0
- nthlayer_workers/correlate/ingestion/protocol.py +10 -0
- nthlayer_workers/correlate/ingestion/severity.py +18 -0
- nthlayer_workers/correlate/ingestion/webhook.py +197 -0
- nthlayer_workers/correlate/notifications.py +85 -0
- nthlayer_workers/correlate/prometheus.py +234 -0
- nthlayer_workers/correlate/reasoning.py +375 -0
- nthlayer_workers/correlate/session.py +189 -0
- nthlayer_workers/correlate/snapshot/__init__.py +1 -0
- nthlayer_workers/correlate/snapshot/generator.py +170 -0
- nthlayer_workers/correlate/snapshot/model.py +177 -0
- nthlayer_workers/correlate/snapshot/token.py +14 -0
- nthlayer_workers/correlate/state.py +88 -0
- nthlayer_workers/correlate/store/__init__.py +5 -0
- nthlayer_workers/correlate/store/protocol.py +48 -0
- nthlayer_workers/correlate/store/sqlite.py +443 -0
- nthlayer_workers/correlate/summary.py +180 -0
- nthlayer_workers/correlate/traces/__init__.py +1 -0
- nthlayer_workers/correlate/traces/protocol.py +120 -0
- nthlayer_workers/correlate/traces/tempo.py +667 -0
- nthlayer_workers/correlate/traces/topology.py +39 -0
- nthlayer_workers/correlate/types.py +77 -0
- nthlayer_workers/correlate/worker.py +630 -0
- nthlayer_workers/learn/__init__.py +5 -0
- nthlayer_workers/learn/__main__.py +5 -0
- nthlayer_workers/learn/cli.py +164 -0
- nthlayer_workers/learn/retrospective.py +381 -0
- nthlayer_workers/learn/trends.py +102 -0
- nthlayer_workers/learn/worker.py +366 -0
- nthlayer_workers/measure/__init__.py +3 -0
- nthlayer_workers/measure/__main__.py +5 -0
- nthlayer_workers/measure/_parsing.py +15 -0
- nthlayer_workers/measure/adapters/__init__.py +0 -0
- nthlayer_workers/measure/adapters/_util.py +24 -0
- nthlayer_workers/measure/adapters/devin.py +119 -0
- nthlayer_workers/measure/adapters/gastown.py +88 -0
- nthlayer_workers/measure/adapters/prometheus.py +277 -0
- nthlayer_workers/measure/adapters/protocol.py +20 -0
- nthlayer_workers/measure/adapters/webhook.py +161 -0
- nthlayer_workers/measure/api/__init__.py +0 -0
- nthlayer_workers/measure/api/normalise.py +50 -0
- nthlayer_workers/measure/api/queue.py +243 -0
- nthlayer_workers/measure/api/response.py +51 -0
- nthlayer_workers/measure/api/server.py +504 -0
- nthlayer_workers/measure/calibration/__init__.py +0 -0
- nthlayer_workers/measure/calibration/loop.py +62 -0
- nthlayer_workers/measure/calibration/slos.py +212 -0
- nthlayer_workers/measure/calibration/verdict_calibration.py +31 -0
- nthlayer_workers/measure/cli.py +753 -0
- nthlayer_workers/measure/config.py +191 -0
- nthlayer_workers/measure/detection/__init__.py +6 -0
- nthlayer_workers/measure/detection/detector.py +82 -0
- nthlayer_workers/measure/detection/protocol.py +29 -0
- nthlayer_workers/measure/governance/__init__.py +0 -0
- nthlayer_workers/measure/governance/engine.py +163 -0
- nthlayer_workers/measure/manifest.py +77 -0
- nthlayer_workers/measure/notifications.py +53 -0
- nthlayer_workers/measure/pipeline/__init__.py +0 -0
- nthlayer_workers/measure/pipeline/evaluator.py +155 -0
- nthlayer_workers/measure/pipeline/router.py +160 -0
- nthlayer_workers/measure/store/__init__.py +0 -0
- nthlayer_workers/measure/store/protocol.py +38 -0
- nthlayer_workers/measure/store/sqlite.py +276 -0
- nthlayer_workers/measure/telemetry.py +116 -0
- nthlayer_workers/measure/tiering/__init__.py +0 -0
- nthlayer_workers/measure/tiering/classifier.py +58 -0
- nthlayer_workers/measure/tiering/promotion.py +118 -0
- nthlayer_workers/measure/trends/__init__.py +0 -0
- nthlayer_workers/measure/trends/tracker.py +72 -0
- nthlayer_workers/measure/types.py +75 -0
- nthlayer_workers/measure/worker.py +439 -0
- nthlayer_workers/observe/__init__.py +25 -0
- nthlayer_workers/observe/__main__.py +5 -0
- nthlayer_workers/observe/api/__init__.py +1 -0
- nthlayer_workers/observe/assessment.py +95 -0
- nthlayer_workers/observe/cli.py +737 -0
- nthlayer_workers/observe/config.py +11 -0
- nthlayer_workers/observe/db/__init__.py +1 -0
- nthlayer_workers/observe/decision_records.py +220 -0
- nthlayer_workers/observe/dependencies/__init__.py +18 -0
- nthlayer_workers/observe/dependencies/discovery.py +294 -0
- nthlayer_workers/observe/dependencies/providers/__init__.py +48 -0
- nthlayer_workers/observe/dependencies/providers/backstage.py +467 -0
- nthlayer_workers/observe/dependencies/providers/base.py +76 -0
- nthlayer_workers/observe/dependencies/providers/consul.py +518 -0
- nthlayer_workers/observe/dependencies/providers/etcd.py +360 -0
- nthlayer_workers/observe/dependencies/providers/kubernetes.py +682 -0
- nthlayer_workers/observe/dependencies/providers/prometheus.py +368 -0
- nthlayer_workers/observe/dependencies/providers/zookeeper.py +399 -0
- nthlayer_workers/observe/deployments/__init__.py +1 -0
- nthlayer_workers/observe/discovery/__init__.py +14 -0
- nthlayer_workers/observe/discovery/classifier.py +66 -0
- nthlayer_workers/observe/discovery/client.py +189 -0
- nthlayer_workers/observe/discovery/models.py +53 -0
- nthlayer_workers/observe/drift/__init__.py +26 -0
- nthlayer_workers/observe/drift/analyzer.py +383 -0
- nthlayer_workers/observe/drift/models.py +174 -0
- nthlayer_workers/observe/drift/patterns.py +88 -0
- nthlayer_workers/observe/explanation.py +118 -0
- nthlayer_workers/observe/gate/__init__.py +39 -0
- nthlayer_workers/observe/gate/conditions.py +92 -0
- nthlayer_workers/observe/gate/correlator.py +154 -0
- nthlayer_workers/observe/gate/evaluator.py +192 -0
- nthlayer_workers/observe/gate/policies.py +226 -0
- nthlayer_workers/observe/gate_adapter.py +40 -0
- nthlayer_workers/observe/incident.py +36 -0
- nthlayer_workers/observe/portfolio/__init__.py +17 -0
- nthlayer_workers/observe/portfolio/aggregator.py +168 -0
- nthlayer_workers/observe/portfolio/scorer.py +13 -0
- nthlayer_workers/observe/slo/__init__.py +19 -0
- nthlayer_workers/observe/slo/collector.py +235 -0
- nthlayer_workers/observe/slo/spec_loader.py +40 -0
- nthlayer_workers/observe/sqlite_store.py +152 -0
- nthlayer_workers/observe/store.py +92 -0
- nthlayer_workers/observe/verification/__init__.py +22 -0
- nthlayer_workers/observe/verification/exporter_guidance.py +146 -0
- nthlayer_workers/observe/verification/extractor.py +127 -0
- nthlayer_workers/observe/verification/models.py +101 -0
- nthlayer_workers/observe/verification/verifier.py +111 -0
- nthlayer_workers/observe/worker.py +332 -0
- nthlayer_workers/respond/__init__.py +2 -0
- nthlayer_workers/respond/__main__.py +4 -0
- nthlayer_workers/respond/agents/__init__.py +0 -0
- nthlayer_workers/respond/agents/base.py +556 -0
- nthlayer_workers/respond/agents/communication.py +115 -0
- nthlayer_workers/respond/agents/investigation.py +124 -0
- nthlayer_workers/respond/agents/remediation.py +219 -0
- nthlayer_workers/respond/agents/triage.py +132 -0
- nthlayer_workers/respond/cli.py +772 -0
- nthlayer_workers/respond/config.py +135 -0
- nthlayer_workers/respond/context_store.py +256 -0
- nthlayer_workers/respond/coordinator.py +487 -0
- nthlayer_workers/respond/metrics.py +104 -0
- nthlayer_workers/respond/notification_backends/__init__.py +1 -0
- nthlayer_workers/respond/notification_backends/ntfy_backend.py +158 -0
- nthlayer_workers/respond/notification_backends/protocol.py +59 -0
- nthlayer_workers/respond/notification_backends/slack_backend.py +203 -0
- nthlayer_workers/respond/notification_backends/stdout_backend.py +56 -0
- nthlayer_workers/respond/notifications.py +247 -0
- nthlayer_workers/respond/oncall/__init__.py +1 -0
- nthlayer_workers/respond/oncall/escalation.py +103 -0
- nthlayer_workers/respond/oncall/runner.py +193 -0
- nthlayer_workers/respond/oncall/schedule.py +243 -0
- nthlayer_workers/respond/safe_actions/__init__.py +0 -0
- nthlayer_workers/respond/safe_actions/actions.py +139 -0
- nthlayer_workers/respond/safe_actions/registry.py +171 -0
- nthlayer_workers/respond/safe_actions/webhook.py +194 -0
- nthlayer_workers/respond/server.py +357 -0
- nthlayer_workers/respond/sre/__init__.py +1 -0
- nthlayer_workers/respond/sre/brief.py +175 -0
- nthlayer_workers/respond/sre/delegation.py +101 -0
- nthlayer_workers/respond/sre/post_incident.py +146 -0
- nthlayer_workers/respond/sre/shift_report.py +129 -0
- nthlayer_workers/respond/sre/suppression.py +91 -0
- nthlayer_workers/respond/types.py +109 -0
- nthlayer_workers/respond/verdict_submission.py +56 -0
- nthlayer_workers/respond/worker.py +533 -0
- nthlayer_workers/respond/worker_helpers.py +140 -0
- nthlayer_workers/runner.py +198 -0
- nthlayer_workers-1.0.0.dist-info/METADATA +19 -0
- nthlayer_workers-1.0.0.dist-info/RECORD +175 -0
- nthlayer_workers-1.0.0.dist-info/WHEEL +5 -0
- nthlayer_workers-1.0.0.dist-info/entry_points.txt +2 -0
- nthlayer_workers-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
"""Reasoning layer for correlation groups.
|
|
2
|
+
|
|
3
|
+
Used by: the `correlate` CLI subcommand (live Prometheus-triggered correlation).
|
|
4
|
+
See also: snapshot/model.py which serves the `serve` and `replay` subcommands.
|
|
5
|
+
|
|
6
|
+
Sits between CorrelationEngine.correlate() group assembly and verdict creation.
|
|
7
|
+
Calls an LLM to assess causal relationships, root causes, and recommended actions.
|
|
8
|
+
Provider-agnostic via nthlayer-common wrapper (Anthropic, OpenAI, Ollama, etc.).
|
|
9
|
+
Additive: --no-reasoning produces identical output to pre-reasoning behavior.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import asyncio
|
|
14
|
+
import json
|
|
15
|
+
import os
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
import structlog
|
|
19
|
+
|
|
20
|
+
from nthlayer_common.prompts import load_prompt
|
|
21
|
+
from nthlayer_workers.correlate.types import CorrelationGroup
|
|
22
|
+
|
|
23
|
+
_PROMPT_PATH = Path(__file__).parent.parent.parent / "prompts" / "reasoning.yaml"
|
|
24
|
+
|
|
25
|
+
logger = structlog.get_logger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def reasoning_available() -> bool:
    """Report whether an LLM provider appears to be configured.

    Only the process environment is inspected; no network call is made, so
    a True result does not prove the provider is actually reachable.

    Returns:
        True when any of NTHLAYER_MODEL, ANTHROPIC_API_KEY or OPENAI_API_KEY
        is set to a non-empty value, otherwise False.
    """
    candidate_vars = ("NTHLAYER_MODEL", "ANTHROPIC_API_KEY", "OPENAI_API_KEY")
    # An empty string counts as "not configured", same as an unset variable.
    return any(os.environ.get(name) for name in candidate_vars)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
async def reason_about_correlations(
    groups: list[CorrelationGroup],
    dependency_graph: dict,
    slo_targets: dict | None = None,
    trace_evidence: object | None = None,
    model: str | None = None,
    max_tokens: int = 4096,
    timeout: int = 30,
) -> dict:
    """Call an LLM to reason about pre-correlated groups.

    Args:
        groups: Correlation groups to assess.
        dependency_graph: Service dependency mapping passed to the prompt builder.
        slo_targets: Optional per-service SLO targets for the prompt.
        trace_evidence: Optional trace evidence rendered into the prompt.
        model: Model identifier forwarded to the LLM wrapper (None = wrapper default).
        max_tokens: Response token budget forwarded to the LLM wrapper.
        timeout: Timeout forwarded to the LLM wrapper.

    Returns:
        Structured reasoning dict with keys:
        - groups: list of per-group reasoning (root_cause, confidence,
          reasoning, recommended_actions)
        - overall_assessment: str
        - overall_confidence: float

    Falls back to _degraded_reasoning() on any failure, including failures
    while loading or assembling the prompts.
    """
    if not groups:
        return _degraded_reasoning(groups, reason="no correlation groups to assess")

    try:
        # Prompt assembly is inside the guarded region on purpose: a missing
        # prompt file or a malformed group must degrade the result, not
        # propagate to the caller, to honour the documented contract.
        system_prompt = _build_system_prompt()
        user_prompt = _build_user_prompt(
            groups, dependency_graph, slo_targets, trace_evidence=trace_evidence
        )
        response_text = await _call_model(
            system_prompt, user_prompt, model, max_tokens, timeout
        )
        result = _parse_reasoning_response(response_text, groups)
        logger.info(
            "reasoning_complete",
            groups=len(groups),
            overall_confidence=result.get("overall_confidence", 0.0),
        )
        return result
    except Exception as exc:
        # Some exceptions (e.g. a bare TimeoutError()) stringify to "";
        # fall back to the class name so the degraded reason is never blank.
        reason = str(exc) or type(exc).__name__
        logger.warning("reasoning_failed", error=reason)
        return _degraded_reasoning(groups, reason=reason)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _degraded_reasoning(
    groups: list[CorrelationGroup],
    reason: str = "model unavailable",
) -> dict:
    """Build the placeholder result used when real reasoning cannot be produced.

    Every group receives a zero-confidence assessment flagged as degraded, so
    the result keeps the shape of a real one while carrying no judgment.

    Args:
        groups: Groups that would have been assessed; only ``id`` is read.
        reason: Human-readable explanation embedded in the output text.

    Returns:
        Dict with ``groups``, ``overall_assessment``, ``overall_confidence``
        (always 0.0) and ``degraded`` (always True).
    """
    note = f"Degraded mode: {reason}"
    per_group = [
        {
            "group_id": group.id,
            "root_cause": None,
            "confidence": 0.0,
            "reasoning": note,
            "recommended_actions": [],
            "is_causal": None,
            "degraded": True,
        }
        for group in groups
    ]
    return {
        "groups": per_group,
        "overall_assessment": f"Reasoning unavailable: {reason}",
        "overall_confidence": 0.0,
        "degraded": True,
    }
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
async def _call_model(
    system_prompt: str,
    user_prompt: str,
    model: str | None,
    max_tokens: int,
    timeout: int,
) -> str:
    """Run the shared nthlayer-common ``llm_call`` in a worker thread.

    ``llm_call`` is invoked through ``asyncio.to_thread`` so the event loop
    is not blocked while the call is in flight.

    Returns:
        The ``text`` attribute of whatever ``llm_call`` returns.
    """
    # Imported lazily, at call time rather than module import time.
    from nthlayer_common.llm import llm_call

    call_kwargs = {
        "system": system_prompt,
        "user": user_prompt,
        "model": model,
        "max_tokens": max_tokens,
        "timeout": timeout,
    }
    response = await asyncio.to_thread(llm_call, **call_kwargs)
    return response.text
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _build_system_prompt() -> str:
    """Return the system prompt text loaded from the prompt spec at _PROMPT_PATH."""
    return load_prompt(_PROMPT_PATH).system
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _build_user_prompt(
    groups: list[CorrelationGroup],
    dependency_graph: dict,
    slo_targets: dict | None,
    *,
    trace_evidence: object | None = None,
) -> str:
    """Render the user prompt describing the correlation groups.

    The prompt is a sequence of text sections separated by blank lines:
    a pruned dependency graph, SLO targets, one section per correlation
    group, and (optionally) a trace-evidence section.

    Args:
        groups: Correlation groups to describe. At most 10 are rendered;
            when there are more, the 10 with the smallest ``priority``
            value are kept.
        dependency_graph: Mapping of service name to a dict that may hold
            ``dependencies``, ``dependents`` and ``tier``.
        slo_targets: Optional mapping of service name to its targets.
        trace_evidence: Optional evidence handed to
            ``_build_trace_evidence_section``.

    Returns:
        The assembled prompt text.
    """
    sections = []

    # Keep the prompt within budget: past the cap, retain the groups with
    # the lowest priority number (P3 entries are dropped first).
    group_cap = 10
    if len(groups) > group_cap:
        groups = sorted(groups, key=lambda g: g.priority)[:group_cap]

    # Services named by any retained group or by its change candidates.
    in_scope = set()
    for grp in groups:
        in_scope.update(grp.services)
        in_scope.update(cand.change.service for cand in grp.change_candidates)

    # Dependency graph, restricted to in-scope services plus one hop.
    if dependency_graph:
        neighbourhood = set(in_scope)
        for name in in_scope:
            node = dependency_graph.get(name, {})
            neighbourhood.update(node.get("dependencies", []))
            neighbourhood.update(node.get("dependents", []))

        dep_lines = []
        for name, node in sorted(dependency_graph.items()):
            if name in neighbourhood:
                deps = node.get("dependencies", [])
                dependents = node.get("dependents", [])
                tier = node.get("tier", "standard")
                dep_lines.append(
                    f" {name} (tier={tier}): depends_on={deps}, depended_by={dependents}"
                )
        if dep_lines:
            sections.append("DEPENDENCY GRAPH:\n" + "\n".join(dep_lines))

    # SLO targets (not pruned; every entry is listed in sorted order).
    if slo_targets:
        slo_body = "\n".join(
            f" {name}: {targets}" for name, targets in sorted(slo_targets.items())
        )
        sections.append("SLO TARGETS:\n" + slo_body)

    # One section per correlation group.
    for grp in groups:
        block = [
            f"GROUP {grp.id} (P{grp.priority}):",
            f" Services: {', '.join(grp.services)}",
            f" Event count: {grp.event_count}",
            f" Time range: {grp.first_seen} to {grp.last_updated}",
            f" Summary: {grp.summary}",
        ]

        topo = grp.topology
        if topo:
            block.append(
                f" Topology: primary={topo.primary_service}, "
                f"related={topo.related_services}, "
                f"path={topo.topology_path}"
            )

        for signal in grp.signals:
            block.append(
                f" Signal: {signal.service} — {signal.count} event(s), "
                f"peak_severity={signal.peak_severity:.2f}, "
                f"duration={signal.duration_seconds:.0f}s, "
                f"window={signal.time_window[0]} to {signal.time_window[1]}"
            )
            # Only the first five events per signal are rendered (budget).
            block.extend(
                f" Event: type={event.type.value}, source={event.source}, "
                f"severity={event.severity:.2f}, payload={_compact_payload(event.payload)}"
                for event in signal.events[:5]
            )

        block.extend(
            f" Change candidate: service={cand.change.service}, "
            f"proximity={cand.temporal_proximity_seconds:.0f}s, "
            f"same_service={cand.same_service}, "
            f"dependency_related={cand.dependency_related}, "
            f"payload={_compact_payload(cand.change.payload)}"
            for cand in grp.change_candidates
        )

        sections.append("\n".join(block))

    # Optional trace evidence; an empty string means "nothing to add".
    trace_block = _build_trace_evidence_section(trace_evidence)
    if trace_block:
        sections.append(trace_block)

    return "\n\n".join(sections)
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _build_trace_evidence_section(trace_evidence: object | None) -> str:
    """Render trace evidence as flat text for the reasoning prompt.

    Returns an empty string when ``trace_evidence`` is None, is not a
    ``TraceEvidence`` instance, or lists no services. Otherwise the result
    starts with a ``TRACE EVIDENCE:`` header followed by one block per
    service (latency, errors, slow operations, callers, callees) and, when
    present, topology-divergence blocks, all separated by blank lines.

    NOTE: v0 output is a single flat rendering. A possible future direction
    is multi-register summaries (span-level detail for an investigation
    agent, high-level for a communication agent) following the
    Summaries(technical, plain, executive) pattern from decision records.
    """
    if trace_evidence is None:
        return ""

    # Imported lazily so the None fast-path above needs no trace module.
    from nthlayer_workers.correlate.traces.protocol import TraceEvidence

    if not isinstance(trace_evidence, TraceEvidence):
        return ""
    if not trace_evidence.services:
        return ""

    parts = ["TRACE EVIDENCE:"]

    for service in trace_evidence.services:
        body = [f" {service.service}:"]

        # Latency: the delta is shown only when it moved by more than 10%.
        change = service.latency_change_pct
        notable = change is not None and abs(change) > 10
        latency = (
            f" Latency: p50={service.p50_latency_ms:.0f}ms, "
            f"p99={service.p99_latency_ms:.0f}ms "
        )
        if notable:
            body.append(latency + f"({change:+.0f}% vs baseline)")
        else:
            body.append(latency + "(within baseline)")

        # Errors: reported only above a 1% error rate, with up to 3 samples.
        if service.error_rate > 0.01:
            body.append(
                f" Errors: {service.error_rate:.1%} error rate "
                f"({service.error_count}/{service.total_request_count} requests)"
            )
            body.extend(
                f' - "{err.error_message}" (x{err.count})'
                for err in service.top_errors[:3]
            )

        # Slow operations: first 3 entries, kept only if >20% over baseline.
        body.extend(
            f" Slow operation: {op.operation} "
            f"p99={op.p99_ms:.0f}ms ({op.change_pct:+.0f}% vs baseline)"
            for op in service.slow_operations[:3]
            if op.change_pct is not None and op.change_pct > 20
        )

        # Call edges: at most 3 callers and 3 callees.
        if service.callers:
            inbound = [
                f"{edge.source_service} ({edge.request_count} reqs, "
                f"p99={edge.p99_latency_ms:.0f}ms)"
                for edge in service.callers[:3]
            ]
            body.append(" Called by: " + ", ".join(inbound))

        if service.callees:
            outbound = [
                f"{edge.target_service} ({edge.request_count} reqs, "
                f"p99={edge.p99_latency_ms:.0f}ms, "
                f"err={edge.error_count}/{edge.request_count})"
                for edge in service.callees[:3]
            ]
            body.append(" Calls: " + ", ".join(outbound))

        parts.append("\n".join(body))

    # Topology divergence between observed traces and declared specs.
    divergence = trace_evidence.topology_divergence
    if divergence:
        headed_edges = (
            (
                " Undeclared Dependencies (observed in traces, not in specs):\n",
                divergence.observed_not_declared,
            ),
            (
                " Declared but Unobserved Dependencies (in specs, no traces):\n",
                divergence.declared_not_observed,
            ),
        )
        for heading, edges in headed_edges:
            if edges:
                parts.append(
                    heading + "\n".join(f" - {a} → {b}" for a, b in edges)
                )

    return "\n\n".join(parts)
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def _compact_payload(payload: dict) -> str:
    """Serialise a payload to compact JSON, eliding bulky nested values.

    Top-level values are shortened as follows:
    - a dict whose ``str()`` form exceeds 100 characters becomes ``"{...}"``
    - a list with more than 5 elements becomes ``"[N items]"``
    - anything else is kept as-is

    Values that JSON cannot encode natively are stringified via ``str``.

    Returns:
        JSON text with no whitespace after separators.
    """

    def shrink(value):
        # Only the top level is inspected; nested containers are untouched.
        if isinstance(value, dict):
            return "{...}" if len(str(value)) > 100 else value
        if isinstance(value, list):
            return f"[{len(value)} items]" if len(value) > 5 else value
        return value

    trimmed = {key: shrink(value) for key, value in payload.items()}
    return json.dumps(trimmed, default=str, separators=(",", ":"))
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def _parse_reasoning_response(
    response_text: str,
    groups: list[CorrelationGroup],
) -> dict:
    """Turn the model's JSON reply into the normalised reasoning dict.

    Markdown fences are stripped before JSON decoding. Assessments whose
    ``group_id`` does not match one of ``groups`` are skipped (logged at
    debug level). Confidence values are converted to float and passed
    through ``clamp``.

    Args:
        response_text: Raw text returned by the model.
        groups: The groups that were sent for assessment; only ``id`` is read.

    Returns:
        Dict with ``groups``, ``overall_assessment``, ``overall_confidence``
        and ``degraded`` set to False.

    Raises:
        Whatever ``json.loads`` / ``float`` raise on malformed input; the
        function itself performs no recovery.
    """
    from nthlayer_common.parsing import clamp, strip_markdown_fences

    payload = json.loads(strip_markdown_fences(response_text))

    raw_assessments = payload.get("groups", [])
    known_ids = {grp.id for grp in groups}

    accepted = []
    for entry in raw_assessments:
        group_id = entry.get("group_id", "")
        if group_id in known_ids:
            # A missing per-group confidence defaults to 0.5 before clamping.
            score = clamp(float(entry.get("confidence", 0.5)))
            accepted.append(
                {
                    "group_id": group_id,
                    "root_cause": entry.get("root_cause"),
                    "confidence": score,
                    "reasoning": entry.get("reasoning", ""),
                    "recommended_actions": entry.get("recommended_actions", []),
                    "is_causal": entry.get("is_causal"),
                    "degraded": False,
                }
            )
        else:
            logger.debug("reasoning_unknown_group_id", group_id=group_id)

    # A missing overall confidence defaults to 0.0 (unlike the per-group 0.5).
    overall_score = clamp(float(payload.get("overall_confidence", 0.0)))

    return {
        "groups": accepted,
        "overall_assessment": payload.get("overall_assessment", ""),
        "overall_confidence": overall_score,
        "degraded": False,
    }
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""Session window logic for the correlate worker module.
|
|
2
|
+
|
|
3
|
+
Events are grouped by correlation domain (service + environment) into
|
|
4
|
+
session windows. Windows close on three conditions:
|
|
5
|
+
- Temporal gap: no new events for gap_seconds (default 60s)
|
|
6
|
+
- Max duration: window has been open for max_duration_seconds (default 15m)
|
|
7
|
+
- Trigger: a quality_breach verdict arrives in the domain
|
|
8
|
+
|
|
9
|
+
This is the architectural core of the correlate module — session windows
|
|
10
|
+
are what make correlation novel vs. simple alert aggregation.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from datetime import datetime, timezone
|
|
17
|
+
|
|
18
|
+
from nthlayer_workers.correlate.types import SitRepEvent
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(frozen=True)
class CorrelationDomain:
    """Immutable (service, environment) pair used to key session windows."""

    # Service name copied from the event.
    service: str
    # Environment name copied from the event.
    environment: str

    @classmethod
    def from_event(cls, event: SitRepEvent) -> CorrelationDomain:
        """Build the domain from an event's ``service`` and ``environment``."""
        return cls(event.service, event.environment)

    def __str__(self) -> str:
        """Render as ``service:environment``."""
        return "{}:{}".format(self.service, self.environment)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class SessionWindow:
    """Mutable accumulator of events for one correlation domain.

    A window is ready to close when it has been idle for ``gap_seconds``,
    has been open for ``max_duration_seconds``, or has been marked with a
    trigger.
    """

    # Domain this window belongs to.
    domain: CorrelationDomain
    # Events collected so far, in arrival order.
    events: list[SitRepEvent] = field(default_factory=list)
    # When the window was opened (defaults to the current UTC time).
    opened_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    # Latest event timestamp seen (defaults to the current UTC time).
    last_event_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    # Set externally when a trigger event has been observed.
    has_trigger: bool = False

    def _close_flags(
        self,
        now: datetime,
        gap_seconds: float = 60.0,
        max_duration_seconds: float = 900.0,
    ) -> tuple[bool, bool, bool]:
        """Evaluate the close conditions as ``(gap, max_duration, trigger)``."""
        idle_for = (now - self.last_event_at).total_seconds()
        open_for = (now - self.opened_at).total_seconds()
        return (
            idle_for >= gap_seconds,
            open_for >= max_duration_seconds,
            self.has_trigger,
        )

    def should_close(
        self,
        now: datetime,
        gap_seconds: float = 60.0,
        max_duration_seconds: float = 900.0,
    ) -> bool:
        """Tell whether at least one close condition holds at ``now``.

        Conditions:
        - gap: ``gap_seconds`` or more have passed since ``last_event_at``
        - max duration: ``max_duration_seconds`` or more have passed since
          ``opened_at`` (bounds window lifetime under continuous load)
        - trigger: ``has_trigger`` is set
        """
        flags = self._close_flags(now, gap_seconds, max_duration_seconds)
        return any(flags)

    def close_reason(
        self,
        now: datetime,
        gap_seconds: float = 60.0,
        max_duration_seconds: float = 900.0,
    ) -> str:
        """Name the winning close condition.

        Precedence is trigger, then max_duration, then gap. Returns
        ``"unknown"`` when no condition holds.
        """
        gap, duration, trigger = self._close_flags(now, gap_seconds, max_duration_seconds)
        ranked = ((trigger, "trigger"), (duration, "max_duration"), (gap, "gap"))
        for hit, label in ranked:
            if hit:
                return label
        return "unknown"

    def add_event(self, event: SitRepEvent) -> None:
        """Append ``event`` and advance ``last_event_at`` if it is newer."""
        self.events.append(event)
        event_time = _parse_event_ts(event)
        # Out-of-order (older) events never move last_event_at backwards.
        if event_time > self.last_event_at:
            self.last_event_at = event_time
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class SessionWindowManager:
    """Tracks at most one open session window per correlation domain."""

    def __init__(
        self,
        gap_seconds: float = 60.0,
        max_duration_seconds: float = 900.0,
    ):
        """Store the close thresholds and start with no open windows."""
        self.gap_seconds = gap_seconds
        self.max_duration_seconds = max_duration_seconds
        self._windows: dict[CorrelationDomain, SessionWindow] = {}

    @property
    def open_windows(self) -> dict[CorrelationDomain, SessionWindow]:
        """Shallow copy of the open-window mapping (safe to mutate)."""
        return self._windows.copy()

    def ingest(self, event: SitRepEvent) -> None:
        """Route ``event`` to its domain's window, opening one if absent.

        The window is additionally marked as triggered when the event's
        source starts with ``verdict:quality_breach`` or its dict payload
        has ``type == "quality_breach"``.
        """
        domain = CorrelationDomain.from_event(event)
        event_time = _parse_event_ts(event)

        window = self._windows.get(domain)
        if window is None:
            # opened_at is capped at the current time, so a future-dated
            # event cannot open a window "in the future".
            # NOTE(review): min() does not stop a stale event from
            # back-dating opened_at; confirm this is the intended behaviour.
            current = datetime.now(timezone.utc)
            window = SessionWindow(
                domain=domain,
                events=[],
                opened_at=min(event_time, current),
                last_event_at=event_time,
            )
            self._windows[domain] = window

        window.add_event(event)

        # Trigger detection: source prefix first, payload type as fallback.
        breached = event.source.startswith("verdict:quality_breach")
        if not breached:
            payload = getattr(event, "payload", None)
            breached = isinstance(payload, dict) and payload.get("type") == "quality_breach"
        if breached:
            window.has_trigger = True

    def close_ready(self, now: datetime | None = None) -> list[SessionWindow]:
        """Remove and return every window whose close condition holds.

        Args:
            now: Evaluation time; defaults to the current UTC time.

        Returns:
            The closed windows, in the mapping's iteration order.
        """
        if now is None:
            now = datetime.now(timezone.utc)

        # Decide first, then remove, so the mapping is not mutated mid-scan.
        ready = [
            domain
            for domain, window in self._windows.items()
            if window.should_close(now, self.gap_seconds, self.max_duration_seconds)
        ]
        return [self._windows.pop(domain) for domain in ready]

    def restore_window(self, domain: CorrelationDomain, metadata: dict) -> None:
        """Recreate a window from persisted metadata (crash recovery).

        Only window state is restored (``opened_at``, ``last_event_at`` and
        ``has_trigger``); the event list starts empty. Any existing window
        for ``domain`` is replaced.
        """
        opened = datetime.fromisoformat(metadata["opened_at"])
        last_seen = datetime.fromisoformat(metadata["last_event_at"])
        self._windows[domain] = SessionWindow(
            domain=domain,
            events=[],
            opened_at=opened,
            last_event_at=last_seen,
            has_trigger=metadata.get("has_trigger", False),
        )

    def to_state(self) -> dict:
        """Serialise open windows into a plain dict for persistence.

        Keys are ``str(domain)``; values hold ISO-formatted timestamps, the
        event count and the trigger flag.
        """
        state = {}
        for domain, window in self._windows.items():
            state[str(domain)] = {
                "opened_at": window.opened_at.isoformat(),
                "last_event_at": window.last_event_at.isoformat(),
                "event_count": len(window.events),
                "has_trigger": window.has_trigger,
            }
        return state
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _parse_event_ts(event: SitRepEvent) -> datetime:
    """Return the event's timestamp as a timezone-aware datetime.

    ``event.timestamp`` may be a datetime or an ISO-8601 string (a ``Z``
    suffix is accepted). Naive values are interpreted as UTC; values that
    already carry an offset are returned with that offset unchanged.
    """
    raw = event.timestamp

    if not isinstance(raw, datetime):
        # "Z" is rewritten because older fromisoformat() versions reject it.
        parsed = datetime.fromisoformat(raw.replace("Z", "+00:00"))
        return parsed.replace(tzinfo=timezone.utc) if parsed.tzinfo is None else parsed

    if raw.tzinfo:
        return raw
    return raw.replace(tzinfo=timezone.utc)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Snapshot generation for SitRep."""
|