nthlayer-workers 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nthlayer_workers/__init__.py +5 -0
- nthlayer_workers/cli.py +234 -0
- nthlayer_workers/correlate/__init__.py +1 -0
- nthlayer_workers/correlate/cli.py +847 -0
- nthlayer_workers/correlate/config.py +111 -0
- nthlayer_workers/correlate/correlation/__init__.py +1 -0
- nthlayer_workers/correlate/correlation/changes.py +87 -0
- nthlayer_workers/correlate/correlation/dedup.py +62 -0
- nthlayer_workers/correlate/correlation/engine.py +244 -0
- nthlayer_workers/correlate/correlation/temporal.py +79 -0
- nthlayer_workers/correlate/correlation/topology.py +104 -0
- nthlayer_workers/correlate/ingestion/__init__.py +1 -0
- nthlayer_workers/correlate/ingestion/protocol.py +10 -0
- nthlayer_workers/correlate/ingestion/severity.py +18 -0
- nthlayer_workers/correlate/ingestion/webhook.py +197 -0
- nthlayer_workers/correlate/notifications.py +85 -0
- nthlayer_workers/correlate/prometheus.py +234 -0
- nthlayer_workers/correlate/reasoning.py +375 -0
- nthlayer_workers/correlate/session.py +189 -0
- nthlayer_workers/correlate/snapshot/__init__.py +1 -0
- nthlayer_workers/correlate/snapshot/generator.py +170 -0
- nthlayer_workers/correlate/snapshot/model.py +177 -0
- nthlayer_workers/correlate/snapshot/token.py +14 -0
- nthlayer_workers/correlate/state.py +88 -0
- nthlayer_workers/correlate/store/__init__.py +5 -0
- nthlayer_workers/correlate/store/protocol.py +48 -0
- nthlayer_workers/correlate/store/sqlite.py +443 -0
- nthlayer_workers/correlate/summary.py +180 -0
- nthlayer_workers/correlate/traces/__init__.py +1 -0
- nthlayer_workers/correlate/traces/protocol.py +120 -0
- nthlayer_workers/correlate/traces/tempo.py +667 -0
- nthlayer_workers/correlate/traces/topology.py +39 -0
- nthlayer_workers/correlate/types.py +77 -0
- nthlayer_workers/correlate/worker.py +630 -0
- nthlayer_workers/learn/__init__.py +5 -0
- nthlayer_workers/learn/__main__.py +5 -0
- nthlayer_workers/learn/cli.py +164 -0
- nthlayer_workers/learn/retrospective.py +381 -0
- nthlayer_workers/learn/trends.py +102 -0
- nthlayer_workers/learn/worker.py +366 -0
- nthlayer_workers/measure/__init__.py +3 -0
- nthlayer_workers/measure/__main__.py +5 -0
- nthlayer_workers/measure/_parsing.py +15 -0
- nthlayer_workers/measure/adapters/__init__.py +0 -0
- nthlayer_workers/measure/adapters/_util.py +24 -0
- nthlayer_workers/measure/adapters/devin.py +119 -0
- nthlayer_workers/measure/adapters/gastown.py +88 -0
- nthlayer_workers/measure/adapters/prometheus.py +277 -0
- nthlayer_workers/measure/adapters/protocol.py +20 -0
- nthlayer_workers/measure/adapters/webhook.py +161 -0
- nthlayer_workers/measure/api/__init__.py +0 -0
- nthlayer_workers/measure/api/normalise.py +50 -0
- nthlayer_workers/measure/api/queue.py +243 -0
- nthlayer_workers/measure/api/response.py +51 -0
- nthlayer_workers/measure/api/server.py +504 -0
- nthlayer_workers/measure/calibration/__init__.py +0 -0
- nthlayer_workers/measure/calibration/loop.py +62 -0
- nthlayer_workers/measure/calibration/slos.py +212 -0
- nthlayer_workers/measure/calibration/verdict_calibration.py +31 -0
- nthlayer_workers/measure/cli.py +753 -0
- nthlayer_workers/measure/config.py +191 -0
- nthlayer_workers/measure/detection/__init__.py +6 -0
- nthlayer_workers/measure/detection/detector.py +82 -0
- nthlayer_workers/measure/detection/protocol.py +29 -0
- nthlayer_workers/measure/governance/__init__.py +0 -0
- nthlayer_workers/measure/governance/engine.py +163 -0
- nthlayer_workers/measure/manifest.py +77 -0
- nthlayer_workers/measure/notifications.py +53 -0
- nthlayer_workers/measure/pipeline/__init__.py +0 -0
- nthlayer_workers/measure/pipeline/evaluator.py +155 -0
- nthlayer_workers/measure/pipeline/router.py +160 -0
- nthlayer_workers/measure/store/__init__.py +0 -0
- nthlayer_workers/measure/store/protocol.py +38 -0
- nthlayer_workers/measure/store/sqlite.py +276 -0
- nthlayer_workers/measure/telemetry.py +116 -0
- nthlayer_workers/measure/tiering/__init__.py +0 -0
- nthlayer_workers/measure/tiering/classifier.py +58 -0
- nthlayer_workers/measure/tiering/promotion.py +118 -0
- nthlayer_workers/measure/trends/__init__.py +0 -0
- nthlayer_workers/measure/trends/tracker.py +72 -0
- nthlayer_workers/measure/types.py +75 -0
- nthlayer_workers/measure/worker.py +439 -0
- nthlayer_workers/observe/__init__.py +25 -0
- nthlayer_workers/observe/__main__.py +5 -0
- nthlayer_workers/observe/api/__init__.py +1 -0
- nthlayer_workers/observe/assessment.py +95 -0
- nthlayer_workers/observe/cli.py +737 -0
- nthlayer_workers/observe/config.py +11 -0
- nthlayer_workers/observe/db/__init__.py +1 -0
- nthlayer_workers/observe/decision_records.py +220 -0
- nthlayer_workers/observe/dependencies/__init__.py +18 -0
- nthlayer_workers/observe/dependencies/discovery.py +294 -0
- nthlayer_workers/observe/dependencies/providers/__init__.py +48 -0
- nthlayer_workers/observe/dependencies/providers/backstage.py +467 -0
- nthlayer_workers/observe/dependencies/providers/base.py +76 -0
- nthlayer_workers/observe/dependencies/providers/consul.py +518 -0
- nthlayer_workers/observe/dependencies/providers/etcd.py +360 -0
- nthlayer_workers/observe/dependencies/providers/kubernetes.py +682 -0
- nthlayer_workers/observe/dependencies/providers/prometheus.py +368 -0
- nthlayer_workers/observe/dependencies/providers/zookeeper.py +399 -0
- nthlayer_workers/observe/deployments/__init__.py +1 -0
- nthlayer_workers/observe/discovery/__init__.py +14 -0
- nthlayer_workers/observe/discovery/classifier.py +66 -0
- nthlayer_workers/observe/discovery/client.py +189 -0
- nthlayer_workers/observe/discovery/models.py +53 -0
- nthlayer_workers/observe/drift/__init__.py +26 -0
- nthlayer_workers/observe/drift/analyzer.py +383 -0
- nthlayer_workers/observe/drift/models.py +174 -0
- nthlayer_workers/observe/drift/patterns.py +88 -0
- nthlayer_workers/observe/explanation.py +118 -0
- nthlayer_workers/observe/gate/__init__.py +39 -0
- nthlayer_workers/observe/gate/conditions.py +92 -0
- nthlayer_workers/observe/gate/correlator.py +154 -0
- nthlayer_workers/observe/gate/evaluator.py +192 -0
- nthlayer_workers/observe/gate/policies.py +226 -0
- nthlayer_workers/observe/gate_adapter.py +40 -0
- nthlayer_workers/observe/incident.py +36 -0
- nthlayer_workers/observe/portfolio/__init__.py +17 -0
- nthlayer_workers/observe/portfolio/aggregator.py +168 -0
- nthlayer_workers/observe/portfolio/scorer.py +13 -0
- nthlayer_workers/observe/slo/__init__.py +19 -0
- nthlayer_workers/observe/slo/collector.py +235 -0
- nthlayer_workers/observe/slo/spec_loader.py +40 -0
- nthlayer_workers/observe/sqlite_store.py +152 -0
- nthlayer_workers/observe/store.py +92 -0
- nthlayer_workers/observe/verification/__init__.py +22 -0
- nthlayer_workers/observe/verification/exporter_guidance.py +146 -0
- nthlayer_workers/observe/verification/extractor.py +127 -0
- nthlayer_workers/observe/verification/models.py +101 -0
- nthlayer_workers/observe/verification/verifier.py +111 -0
- nthlayer_workers/observe/worker.py +332 -0
- nthlayer_workers/respond/__init__.py +2 -0
- nthlayer_workers/respond/__main__.py +4 -0
- nthlayer_workers/respond/agents/__init__.py +0 -0
- nthlayer_workers/respond/agents/base.py +556 -0
- nthlayer_workers/respond/agents/communication.py +115 -0
- nthlayer_workers/respond/agents/investigation.py +124 -0
- nthlayer_workers/respond/agents/remediation.py +219 -0
- nthlayer_workers/respond/agents/triage.py +132 -0
- nthlayer_workers/respond/cli.py +772 -0
- nthlayer_workers/respond/config.py +135 -0
- nthlayer_workers/respond/context_store.py +256 -0
- nthlayer_workers/respond/coordinator.py +487 -0
- nthlayer_workers/respond/metrics.py +104 -0
- nthlayer_workers/respond/notification_backends/__init__.py +1 -0
- nthlayer_workers/respond/notification_backends/ntfy_backend.py +158 -0
- nthlayer_workers/respond/notification_backends/protocol.py +59 -0
- nthlayer_workers/respond/notification_backends/slack_backend.py +203 -0
- nthlayer_workers/respond/notification_backends/stdout_backend.py +56 -0
- nthlayer_workers/respond/notifications.py +247 -0
- nthlayer_workers/respond/oncall/__init__.py +1 -0
- nthlayer_workers/respond/oncall/escalation.py +103 -0
- nthlayer_workers/respond/oncall/runner.py +193 -0
- nthlayer_workers/respond/oncall/schedule.py +243 -0
- nthlayer_workers/respond/safe_actions/__init__.py +0 -0
- nthlayer_workers/respond/safe_actions/actions.py +139 -0
- nthlayer_workers/respond/safe_actions/registry.py +171 -0
- nthlayer_workers/respond/safe_actions/webhook.py +194 -0
- nthlayer_workers/respond/server.py +357 -0
- nthlayer_workers/respond/sre/__init__.py +1 -0
- nthlayer_workers/respond/sre/brief.py +175 -0
- nthlayer_workers/respond/sre/delegation.py +101 -0
- nthlayer_workers/respond/sre/post_incident.py +146 -0
- nthlayer_workers/respond/sre/shift_report.py +129 -0
- nthlayer_workers/respond/sre/suppression.py +91 -0
- nthlayer_workers/respond/types.py +109 -0
- nthlayer_workers/respond/verdict_submission.py +56 -0
- nthlayer_workers/respond/worker.py +533 -0
- nthlayer_workers/respond/worker_helpers.py +140 -0
- nthlayer_workers/runner.py +198 -0
- nthlayer_workers-1.0.0.dist-info/METADATA +19 -0
- nthlayer_workers-1.0.0.dist-info/RECORD +175 -0
- nthlayer_workers-1.0.0.dist-info/WHEEL +5 -0
- nthlayer_workers-1.0.0.dist-info/entry_points.txt +2 -0
- nthlayer_workers-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""SitRep configuration."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class SitRepConfig:
    """Flat settings object for SitRep; every field has a default."""

    # Event store and webhook ingestion listener.
    store_path: str = "sitrep-events.db"
    ingestion_host: str = "127.0.0.1"
    ingestion_port: int = 8081

    # Correlation and deduplication.
    correlation_window_minutes: int = 5
    dedup_key_fields: list[str] = field(
        default_factory=lambda: ["source", "service", "type", "environment"]
    )

    # Snapshot generation.
    token_budget: int = 4000
    cache_ttl_minutes: int = 15

    # Model selection.
    model_name: str = "claude-sonnet-4-20250514"
    model_max_tokens: int = 4096

    # Verdict store and topology manifests.
    verdict_store_path: str = "verdicts.db"
    manifests_dir: str | None = None

    # Per-state intervals (load_config fills these from "*_interval_seconds" keys).
    watching_interval: int = 300
    alert_interval: int = 60
    incident_interval: int = 30
    degraded_interval: int = 120

    # Trace backend configuration.
    trace_backend: str | None = None
    trace_detail: str = "full"
    trace_baseline_window: str = "1h"
    tempo_endpoint: str = "http://localhost:3200"
    tempo_org_id: str = ""
    tempo_timeout: int = 30
    tempo_use_service_graphs: bool = True
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def load_config(path: str | None = None) -> SitRepConfig:
    """Load a SitRepConfig from a nested YAML file.

    The YAML document is organised in sections (``store``, ``ingestion``,
    ``correlation``, ``snapshot``, ``model``, ``verdict.store``, ``topology``,
    ``state``, ``traces`` and ``traces.tempo``) which are flattened onto the
    SitRepConfig fields. Any setting absent from the file keeps its default.

    Args:
        path: Path to the YAML file, or None to return an all-defaults config.

    Returns:
        The populated SitRepConfig.
    """
    if path is None:
        return SitRepConfig()

    # Decode as UTF-8 explicitly instead of relying on the platform locale.
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}

    def section(parent: dict[str, Any], name: str) -> dict[str, Any]:
        # An empty YAML section ("store:") parses to None rather than {};
        # treat it like a missing section instead of failing on `in None`.
        return parent.get(name) or {}

    store = section(data, "store")
    ingestion = section(data, "ingestion")
    correlation = section(data, "correlation")
    snapshot = section(data, "snapshot")
    model = section(data, "model")
    verdict_store = section(section(data, "verdict"), "store")
    topology = section(data, "topology")
    state = section(data, "state")
    traces = section(data, "traces")
    tempo = section(traces, "tempo")

    # (source section, YAML key, SitRepConfig field name)
    mapping: list[tuple[dict[str, Any], str, str]] = [
        (store, "path", "store_path"),
        (ingestion, "host", "ingestion_host"),
        (ingestion, "port", "ingestion_port"),
        (correlation, "window_minutes", "correlation_window_minutes"),
        (correlation, "dedup_key_fields", "dedup_key_fields"),
        (snapshot, "token_budget", "token_budget"),
        (snapshot, "cache_ttl_minutes", "cache_ttl_minutes"),
        (model, "model", "model_name"),
        (model, "max_tokens", "model_max_tokens"),
        (verdict_store, "path", "verdict_store_path"),
        (topology, "manifests_dir", "manifests_dir"),
        # State intervals drop the "_seconds" suffix on the config object.
        (state, "watching_interval_seconds", "watching_interval"),
        (state, "alert_interval_seconds", "alert_interval"),
        (state, "incident_interval_seconds", "incident_interval"),
        (state, "degraded_interval_seconds", "degraded_interval"),
        (traces, "backend", "trace_backend"),
        (traces, "detail", "trace_detail"),
        (traces, "baseline_window", "trace_baseline_window"),
        (tempo, "endpoint", "tempo_endpoint"),
        (tempo, "org_id", "tempo_org_id"),
        (tempo, "timeout_seconds", "tempo_timeout"),
        (tempo, "use_service_graphs", "tempo_use_service_graphs"),
    ]

    kwargs: dict[str, Any] = {
        field_name: source[key]
        for source, key, field_name in mapping
        if key in source
    }
    return SitRepConfig(**kwargs)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Signal correlation for SitRep. All operations are deterministic transport."""
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Change candidate indexing. Deterministic transport."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
|
|
6
|
+
from nthlayer_workers.correlate.store.protocol import EventStore
|
|
7
|
+
from nthlayer_workers.correlate.types import ChangeCandidate, TemporalGroup
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _parse_ts(ts: str) -> datetime:
    """Turn an ISO 8601 string into a timezone-aware datetime.

    "Z" is rewritten to "+00:00" before parsing; a value that carries no
    offset at all is taken to be UTC.
    """
    parsed = datetime.fromisoformat(ts.replace("Z", "+00:00"))
    return parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def find_change_candidates(
    store: EventStore,
    temporal_groups: list[TemporalGroup],
    topology: dict | None = None,
    window_minutes: int = 30,
) -> dict[str, list[ChangeCandidate]]:
    """Index recent changes that may explain each temporal group.

    For every group the store is queried for changes on the group's own
    service and, when a topology is given, on each service listed under that
    service's "dependencies".

    Returns:
        Mapping from service name to its change candidates, ordered by
        temporal proximity (closest first). Services without candidates are
        omitted.
    """

    def lookup_targets(service: str):
        # Yields (service to query, whether it is the affected service itself).
        # Lazy on purpose: the topology is only consulted after the
        # same-service changes have been processed.
        yield service, True
        if topology is not None:
            for upstream in topology.get(service, {}).get("dependencies", []):
                yield upstream, False

    by_service: dict[str, list[ChangeCandidate]] = {}

    for group in temporal_groups:
        affected = group.service

        # The group's first event anchors both the proximity calculation and
        # the "recent" lookback. Passing it as reference_time matters for
        # replay with historical timestamps, where a lookback relative to the
        # current time would find nothing.
        anchor_raw = group.time_window[0]
        anchor = _parse_ts(anchor_raw)

        found: list[ChangeCandidate] = []
        for target, is_same in lookup_targets(affected):
            recent = store.get_recent_changes(
                target, window_minutes, reference_time=anchor_raw
            )
            for change in recent:
                distance = abs((anchor - _parse_ts(change.timestamp)).total_seconds())
                found.append(
                    ChangeCandidate(
                        change=change,
                        affected_service=affected,
                        temporal_proximity_seconds=distance,
                        same_service=is_same,
                        dependency_related=not is_same,
                    )
                )

        # Closest change first; the sort is stable, so ties keep query order.
        found.sort(key=lambda candidate: candidate.temporal_proximity_seconds)

        # NOTE(review): a later group for the same service replaces the entry
        # written by an earlier one — confirm that is intended.
        if found:
            by_service[affected] = found

    return by_service
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Signal deduplication. Deterministic transport."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from dataclasses import replace
|
|
5
|
+
|
|
6
|
+
from nthlayer_workers.correlate.types import SitRepEvent
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _dedup_key(event: SitRepEvent) -> str:
    """Return the "|"-joined identity string used to spot duplicate events.

    The key is source, service, event type value and environment, followed by
    the payload's "alert_name" (or, failing that, "metric") when one is set.
    """
    fields = [event.source, event.service, event.type.value, event.environment]
    payload = event.payload
    discriminator = payload.get("alert_name") or payload.get("metric")
    suffix = [str(discriminator)] if discriminator else []
    return "|".join(fields + suffix)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def deduplicate(events: list[SitRepEvent]) -> list[SitRepEvent]:
    """Collapse events that share a dedup key.

    The first occurrence of each key (in input order) is kept. When a key was
    seen more than once, the kept event is replaced by a copy whose payload
    additionally carries ``_dedup_count`` (number of occurrences) and
    ``_dedup_duration_seconds`` (span between the earliest and latest
    occurrence). Neither the original event nor its payload is mutated.

    Timestamps are ordered as parsed instants rather than raw strings, so
    values that mix "Z" with numeric offsets are compared correctly; values
    without an offset are taken to be UTC. Occurrences whose timestamp cannot
    be parsed still count but do not contribute to the duration, which is 0.0
    when no occurrence of a key has a parseable timestamp.

    Args:
        events: Events to deduplicate.

    Returns:
        One event per distinct dedup key, in first-seen order.
    """
    from datetime import datetime, timezone

    def to_instant(ts):
        # Aware datetime for an ISO 8601 string, or None when unparseable.
        try:
            parsed = datetime.fromisoformat(ts.replace("Z", "+00:00"))
        except (AttributeError, TypeError, ValueError):
            return None
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    kept: dict[str, SitRepEvent] = {}
    counts: dict[str, int] = {}
    earliest: dict[str, datetime] = {}
    latest: dict[str, datetime] = {}

    for event in events:
        key = _dedup_key(event)
        if key not in kept:
            kept[key] = event
            counts[key] = 0
        counts[key] += 1

        instant = to_instant(event.timestamp)
        if instant is None:
            continue
        if key not in earliest or instant < earliest[key]:
            earliest[key] = instant
        if key not in latest or instant > latest[key]:
            latest[key] = instant

    result = []
    for key, event in kept.items():
        if counts[key] > 1:
            if key in earliest:
                duration = (latest[key] - earliest[key]).total_seconds()
            else:
                duration = 0.0

            # Copy the payload so the stored event is left untouched.
            new_payload = dict(event.payload)
            new_payload["_dedup_count"] = counts[key]
            new_payload["_dedup_duration_seconds"] = duration
            event = replace(event, payload=new_payload)
        result.append(event)

    return result
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
"""Pre-correlation engine. Orchestrates all correlation steps."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import uuid
|
|
5
|
+
from dataclasses import replace
|
|
6
|
+
from datetime import datetime, timedelta, timezone
|
|
7
|
+
|
|
8
|
+
from nthlayer_workers.correlate.correlation.changes import find_change_candidates
|
|
9
|
+
from nthlayer_workers.correlate.correlation.dedup import deduplicate
|
|
10
|
+
from nthlayer_workers.correlate.correlation.temporal import group_temporal
|
|
11
|
+
from nthlayer_workers.correlate.correlation.topology import group_topology
|
|
12
|
+
from nthlayer_workers.correlate.ingestion.severity import pre_score
|
|
13
|
+
from nthlayer_workers.correlate.store.protocol import EventStore
|
|
14
|
+
from nthlayer_workers.correlate.types import ChangeCandidate, CorrelationGroup, TemporalGroup
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class CorrelationEngine:
    """Orchestrates the full pre-correlation pipeline. All deterministic transport."""

    def correlate(
        self,
        store: EventStore,
        window_minutes: int = 5,
        topology: dict | None = None,
        slo_targets: dict | None = None,
    ) -> list[CorrelationGroup]:
        """Run the full pre-correlation pipeline over the most recent window.

        Steps:
        1. Query store for events in the last ``window_minutes``
        2. Deduplicate
        3. Severity enrichment (second-pass for events without SLO context)
        4. Temporal grouping
        5. Topology-aware grouping
        6. Change candidate indexing
        7. Assemble CorrelationGroups with priority scoring

        Args:
            store: Event store queried for events and recent changes.
            window_minutes: Size of the event window ending at the current time.
            topology: Optional mapping of service name to service info
                (this class reads the "tier" entry).
            slo_targets: Optional SLO context forwarded to ``pre_score``.

        Returns:
            Correlation groups sorted by priority; an empty list when the
            window holds no events or no temporal groups result.
        """
        # Step 1: Query store for events in window
        now = datetime.now(timezone.utc)
        start = (now - timedelta(minutes=window_minutes)).isoformat()
        end = now.isoformat()

        events = store.get_by_time_window(start, end)
        if not events:
            return []

        # Step 2: Deduplicate
        deduped = deduplicate(events)

        # Step 3: Severity enrichment; only copy events whose score changed.
        enriched = []
        for event in deduped:
            new_severity = pre_score(event, slo_targets)
            if new_severity != event.severity:
                event = replace(event, severity=new_severity)
            enriched.append(event)

        # Step 4: Temporal grouping
        temporal_groups = group_temporal(enriched, window_minutes=window_minutes)
        if not temporal_groups:
            return []

        # Step 5: Topology-aware grouping
        topology_correlations = group_topology(temporal_groups, topology)

        # Step 6: Change candidate indexing, looking back at least 30 minutes.
        change_candidates_map = find_change_candidates(
            store, temporal_groups, topology=topology, window_minutes=max(window_minutes, 30)
        )

        # Step 7: Assemble CorrelationGroups with priority scoring
        return self.assemble_groups(
            temporal_groups, topology_correlations, change_candidates_map, topology
        )

    def assemble_groups(
        self,
        temporal_groups: list[TemporalGroup],
        topology_correlations: list,
        change_candidates_map: dict[str, list[ChangeCandidate]],
        topology: dict | None,
    ) -> list[CorrelationGroup]:
        """Assemble CorrelationGroups from pre-computed correlation sub-step outputs.

        Two-pass assembly: topology-linked groups first, then one group per
        temporal group that no topology correlation claimed. Each temporal
        group ends up in exactly one correlation group.
        Used by both correlate() and CLI replay.

        Change candidates of a topology-linked group are collected in sorted
        service order and then ordered closest-first, so the result does not
        depend on set iteration order.

        Returns:
            Correlation groups sorted by priority, then by descending peak
            severity.
        """
        assigned: set[int] = set()
        correlation_groups: list[CorrelationGroup] = []

        # First pass: create groups from topology correlations
        for tc in topology_correlations:
            involved_services = self._linked_services(tc)

            # Claim every not-yet-assigned temporal group on an involved service.
            signals: list[TemporalGroup] = []
            for gi, tg in enumerate(temporal_groups):
                if tg.service in involved_services and gi not in assigned:
                    signals.append(tg)
                    assigned.add(gi)

            if not signals:
                continue

            service_names = sorted(involved_services)

            all_changes: list[ChangeCandidate] = []
            for svc in service_names:
                all_changes.extend(change_candidates_map.get(svc, []))
            # Stable sort: equal proximities keep the sorted-service order.
            all_changes.sort(key=lambda c: c.temporal_proximity_seconds)

            peak_severity = max(s.peak_severity for s in signals)
            priority = self._compute_priority(
                peak_severity, service_names, topology, topology_correlations
            )

            total_events = sum(s.count for s in signals)
            primary_type = self._dominant_event_type(signals)
            services_str = ", ".join(service_names)
            summary = (
                f"{total_events} {primary_type}(s) on {services_str} "
                f"with {len(all_changes)} recent change(s)"
            )

            all_timestamps = [
                bound
                for s in signals
                for bound in (s.time_window[0], s.time_window[1])
            ]

            correlation_groups.append(
                CorrelationGroup(
                    id=f"cg-{uuid.uuid4().hex[:8]}",
                    priority=priority,
                    summary=summary,
                    services=service_names,
                    signals=signals,
                    topology=tc,
                    change_candidates=all_changes,
                    first_seen=min(all_timestamps),
                    last_updated=max(all_timestamps),
                    event_count=total_events,
                )
            )

        # Second pass: create groups for unassigned temporal groups
        for gi, tg in enumerate(temporal_groups):
            if gi in assigned:
                continue

            service = tg.service
            changes = change_candidates_map.get(service, [])
            priority = self._compute_priority(
                tg.peak_severity, [service], topology, topology_correlations
            )

            primary_type = self._dominant_event_type([tg])
            summary = (
                f"{tg.count} {primary_type}(s) on {service} "
                f"with {len(changes)} recent change(s)"
            )

            correlation_groups.append(
                CorrelationGroup(
                    id=f"cg-{uuid.uuid4().hex[:8]}",
                    priority=priority,
                    summary=summary,
                    services=[service],
                    signals=[tg],
                    topology=None,
                    change_candidates=changes,
                    first_seen=tg.time_window[0],
                    last_updated=tg.time_window[1],
                    event_count=tg.count,
                )
            )

        # Lowest priority number first; within a tier, highest severity first.
        correlation_groups.sort(
            key=lambda g: (g.priority, -max(s.peak_severity for s in g.signals))
        )
        return correlation_groups

    @staticmethod
    def _linked_services(tc) -> set[str]:
        """Return the primary and related service names of a topology correlation."""
        return {tc.primary_service, *(rs["service"] for rs in tc.related_services)}

    def _compute_priority(
        self,
        peak_severity: float,
        services: list[str],
        topology: dict | None,
        topology_correlations: list,
    ) -> int:
        """Compute priority tier for a correlation group.

        P0 (0): peak_severity > 0.8 AND one of ``services`` has tier "critical"
        P1 (1): peak_severity > 0.6, OR a topology correlation that includes
                one of ``services`` also includes a "critical"-tier service
        P2 (2): peak_severity > 0.3
        P3 (3): everything else

        Without a topology no service counts as critical, so P0 is never
        returned.
        """

        def is_critical(svc: str) -> bool:
            return topology.get(svc, {}).get("tier") == "critical"

        has_critical = topology is not None and any(is_critical(svc) for svc in services)

        if peak_severity > 0.8 and has_critical:
            return 0  # P0

        if peak_severity > 0.6:
            return 1  # P1

        # P1 if linked through a topology correlation to a critical service
        if topology is not None and topology_correlations:
            wanted = set(services)
            for tc in topology_correlations:
                linked_services = self._linked_services(tc)
                if linked_services & wanted and any(
                    is_critical(svc) for svc in linked_services
                ):
                    return 1  # P1 — topology link to critical service

        if peak_severity > 0.3:
            return 2  # P2

        return 3  # P3

    @staticmethod
    def _dominant_event_type(signals: list[TemporalGroup]) -> str:
        """Find the most common event type value across all signals.

        Returns "event" when the signals contain no events. Ties go to the
        type that was encountered first.
        """
        type_counts: dict[str, int] = {}
        for signal in signals:
            for event in signal.events:
                t = event.type.value
                type_counts[t] = type_counts.get(t, 0) + 1

        if not type_counts:
            return "event"

        return max(type_counts, key=type_counts.get)  # type: ignore[arg-type]
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Temporal grouping — windowed aggregation by service. Deterministic transport."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
|
|
6
|
+
from nthlayer_workers.correlate.types import SitRepEvent, TemporalGroup
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _parse_ts(ts: str) -> datetime:
    """Parse an ISO 8601 timestamp into an aware datetime (UTC when no offset)."""
    # "Z" is rewritten to an explicit "+00:00" offset before parsing.
    normalized = ts.replace("Z", "+00:00")
    moment = datetime.fromisoformat(normalized)
    if moment.tzinfo is not None:
        return moment
    return moment.replace(tzinfo=timezone.utc)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def group_temporal(
    events: list[SitRepEvent], window_minutes: int = 5
) -> list[TemporalGroup]:
    """Group events by service within time windows.

    Events are ordered chronologically and split per service. Within a
    service, an event joins the current group while it lies no more than
    ``window_minutes`` after that group's earliest event; otherwise it opens
    a new group.

    Ordering uses the parsed instant of each timestamp rather than the raw
    string, so timestamps that mix "Z" with numeric offsets are still
    windowed correctly.

    Args:
        events: Events to group; each needs ``service`` and an ISO 8601
            ``timestamp``.
        window_minutes: Maximum distance from a group's first event.

    Returns:
        Temporal groups, services in order of first appearance and each
        service's groups in chronological order. Empty when ``events`` is
        empty.

    Raises:
        ValueError: If a timestamp is not valid ISO 8601.
    """
    if not events:
        return []

    # Parse every timestamp once and sort by the resulting instant. The sort
    # is stable, so simultaneous events keep their input order.
    stamped = sorted(
        ((_parse_ts(event.timestamp), event) for event in events),
        key=lambda pair: pair[0],
    )

    by_service: dict[str, list[tuple[datetime, SitRepEvent]]] = {}
    for stamp, event in stamped:
        by_service.setdefault(event.service, []).append((stamp, event))

    window_seconds = window_minutes * 60
    groups: list[TemporalGroup] = []

    for service, svc_events in by_service.items():
        window_start, first_event = svc_events[0]
        current_window: list[SitRepEvent] = [first_event]

        for stamp, event in svc_events[1:]:
            if (stamp - window_start).total_seconds() <= window_seconds:
                current_window.append(event)
            else:
                # Emit current window, start new one anchored at this event.
                groups.append(_make_group(service, current_window))
                current_window = [event]
                window_start = stamp

        # The last window always holds at least one event.
        groups.append(_make_group(service, current_window))

    return groups
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _make_group(service: str, events: list[SitRepEvent]) -> TemporalGroup:
    """Build the TemporalGroup that summarises one window of events.

    The window bounds are the raw timestamp strings of the first and last
    entries of ``events`` as given. The duration is the parsed difference
    between those two bounds, falling back to ``0.0`` when either one
    cannot be parsed. Peak severity is the maximum ``severity`` among the
    events.
    """
    opened_at = events[0].timestamp
    closed_at = events[-1].timestamp

    try:
        elapsed = (_parse_ts(closed_at) - _parse_ts(opened_at)).total_seconds()
    except (ValueError, TypeError):
        # Best effort: an unparseable bound must not prevent the group
        # from being built.
        elapsed = 0.0

    worst = max(item.severity for item in events)

    return TemporalGroup(
        service=service,
        time_window=(opened_at, closed_at),
        events=events,
        count=len(events),
        peak_severity=worst,
        duration_seconds=elapsed,
    )
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Topology-aware grouping. Deterministic transport."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from nthlayer_workers.correlate.types import TemporalGroup, TopologyCorrelation
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _get_tier_rank(topology: dict, service: str) -> int:
|
|
8
|
+
"""Return numeric rank for service tier. Higher = more critical."""
|
|
9
|
+
tier_ranks = {"critical": 3, "standard": 2, "low": 1}
|
|
10
|
+
svc_info = topology.get(service, {})
|
|
11
|
+
tier = svc_info.get("tier", "standard")
|
|
12
|
+
return tier_ranks.get(tier, 1)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _depends_on(topology: dict, service_a: str, service_b: str) -> bool:
|
|
16
|
+
"""Check if service_a depends on service_b."""
|
|
17
|
+
svc_info = topology.get(service_a, {})
|
|
18
|
+
return service_b in svc_info.get("dependencies", [])
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def group_topology(
    temporal_groups: list[TemporalGroup],
    topology: dict | None,
) -> list[TopologyCorrelation]:
    """Correlate temporal groups whose services depend on one another.

    Every unordered pair of groups belonging to different services is
    examined once. A pair is linked when either service lists the other in
    its dependencies, and each pair of services is linked at most once.

    Args:
        temporal_groups: Groups from temporal.group_temporal()
        topology: Dict mapping service names to
            {dependencies: [...], dependents: [...], tier: ...}.
            Typically from OpenSRM manifests. None = skip topology grouping.

    Returns:
        List of TopologyCorrelations linking related service groups. Empty
        when ``topology`` is None or fewer than two groups are supplied.
    """
    if topology is None or len(temporal_groups) < 2:
        return []

    # Service pairs already emitted, stored order-independently.
    seen_pairs: set[tuple[str, str]] = set()
    correlations: list[TopologyCorrelation] = []

    for idx, left in enumerate(temporal_groups):
        # Pair `left` only with groups that come after it.
        for right in temporal_groups[idx + 1:]:
            if left.service == right.service:
                continue

            key = tuple(sorted((left.service, right.service)))
            if key in seen_pairs:
                continue

            left_needs_right = _depends_on(topology, left.service, right.service)
            right_needs_left = _depends_on(topology, right.service, left.service)
            if not (left_needs_right or right_needs_left):
                continue

            seen_pairs.add(key)

            # Primary selection: higher peak severity wins; on a severity
            # tie the higher tier wins, and a full tie favours `left`.
            left_rank = _get_tier_rank(topology, left.service)
            right_rank = _get_tier_rank(topology, right.service)
            left_is_primary = left.peak_severity > right.peak_severity or (
                left.peak_severity == right.peak_severity and left_rank >= right_rank
            )

            if left_is_primary:
                primary, secondary = left, right
                primary_depends = left_needs_right
            else:
                primary, secondary = right, left
                primary_depends = right_needs_left

            # Relationship is expressed from the primary toward the secondary.
            relationship = "depends_on" if primary_depends else "depended_by"

            correlations.append(
                TopologyCorrelation(
                    primary_service=primary.service,
                    related_services=[
                        {
                            "service": secondary.service,
                            "relationship": relationship,
                            "events": secondary.events,
                        }
                    ],
                    topology_path=[primary.service, secondary.service],
                )
            )

    return correlations
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Event ingestion for SitRep."""
|