nthlayer-workers 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. nthlayer_workers/__init__.py +5 -0
  2. nthlayer_workers/cli.py +234 -0
  3. nthlayer_workers/correlate/__init__.py +1 -0
  4. nthlayer_workers/correlate/cli.py +847 -0
  5. nthlayer_workers/correlate/config.py +111 -0
  6. nthlayer_workers/correlate/correlation/__init__.py +1 -0
  7. nthlayer_workers/correlate/correlation/changes.py +87 -0
  8. nthlayer_workers/correlate/correlation/dedup.py +62 -0
  9. nthlayer_workers/correlate/correlation/engine.py +244 -0
  10. nthlayer_workers/correlate/correlation/temporal.py +79 -0
  11. nthlayer_workers/correlate/correlation/topology.py +104 -0
  12. nthlayer_workers/correlate/ingestion/__init__.py +1 -0
  13. nthlayer_workers/correlate/ingestion/protocol.py +10 -0
  14. nthlayer_workers/correlate/ingestion/severity.py +18 -0
  15. nthlayer_workers/correlate/ingestion/webhook.py +197 -0
  16. nthlayer_workers/correlate/notifications.py +85 -0
  17. nthlayer_workers/correlate/prometheus.py +234 -0
  18. nthlayer_workers/correlate/reasoning.py +375 -0
  19. nthlayer_workers/correlate/session.py +189 -0
  20. nthlayer_workers/correlate/snapshot/__init__.py +1 -0
  21. nthlayer_workers/correlate/snapshot/generator.py +170 -0
  22. nthlayer_workers/correlate/snapshot/model.py +177 -0
  23. nthlayer_workers/correlate/snapshot/token.py +14 -0
  24. nthlayer_workers/correlate/state.py +88 -0
  25. nthlayer_workers/correlate/store/__init__.py +5 -0
  26. nthlayer_workers/correlate/store/protocol.py +48 -0
  27. nthlayer_workers/correlate/store/sqlite.py +443 -0
  28. nthlayer_workers/correlate/summary.py +180 -0
  29. nthlayer_workers/correlate/traces/__init__.py +1 -0
  30. nthlayer_workers/correlate/traces/protocol.py +120 -0
  31. nthlayer_workers/correlate/traces/tempo.py +667 -0
  32. nthlayer_workers/correlate/traces/topology.py +39 -0
  33. nthlayer_workers/correlate/types.py +77 -0
  34. nthlayer_workers/correlate/worker.py +630 -0
  35. nthlayer_workers/learn/__init__.py +5 -0
  36. nthlayer_workers/learn/__main__.py +5 -0
  37. nthlayer_workers/learn/cli.py +164 -0
  38. nthlayer_workers/learn/retrospective.py +381 -0
  39. nthlayer_workers/learn/trends.py +102 -0
  40. nthlayer_workers/learn/worker.py +366 -0
  41. nthlayer_workers/measure/__init__.py +3 -0
  42. nthlayer_workers/measure/__main__.py +5 -0
  43. nthlayer_workers/measure/_parsing.py +15 -0
  44. nthlayer_workers/measure/adapters/__init__.py +0 -0
  45. nthlayer_workers/measure/adapters/_util.py +24 -0
  46. nthlayer_workers/measure/adapters/devin.py +119 -0
  47. nthlayer_workers/measure/adapters/gastown.py +88 -0
  48. nthlayer_workers/measure/adapters/prometheus.py +277 -0
  49. nthlayer_workers/measure/adapters/protocol.py +20 -0
  50. nthlayer_workers/measure/adapters/webhook.py +161 -0
  51. nthlayer_workers/measure/api/__init__.py +0 -0
  52. nthlayer_workers/measure/api/normalise.py +50 -0
  53. nthlayer_workers/measure/api/queue.py +243 -0
  54. nthlayer_workers/measure/api/response.py +51 -0
  55. nthlayer_workers/measure/api/server.py +504 -0
  56. nthlayer_workers/measure/calibration/__init__.py +0 -0
  57. nthlayer_workers/measure/calibration/loop.py +62 -0
  58. nthlayer_workers/measure/calibration/slos.py +212 -0
  59. nthlayer_workers/measure/calibration/verdict_calibration.py +31 -0
  60. nthlayer_workers/measure/cli.py +753 -0
  61. nthlayer_workers/measure/config.py +191 -0
  62. nthlayer_workers/measure/detection/__init__.py +6 -0
  63. nthlayer_workers/measure/detection/detector.py +82 -0
  64. nthlayer_workers/measure/detection/protocol.py +29 -0
  65. nthlayer_workers/measure/governance/__init__.py +0 -0
  66. nthlayer_workers/measure/governance/engine.py +163 -0
  67. nthlayer_workers/measure/manifest.py +77 -0
  68. nthlayer_workers/measure/notifications.py +53 -0
  69. nthlayer_workers/measure/pipeline/__init__.py +0 -0
  70. nthlayer_workers/measure/pipeline/evaluator.py +155 -0
  71. nthlayer_workers/measure/pipeline/router.py +160 -0
  72. nthlayer_workers/measure/store/__init__.py +0 -0
  73. nthlayer_workers/measure/store/protocol.py +38 -0
  74. nthlayer_workers/measure/store/sqlite.py +276 -0
  75. nthlayer_workers/measure/telemetry.py +116 -0
  76. nthlayer_workers/measure/tiering/__init__.py +0 -0
  77. nthlayer_workers/measure/tiering/classifier.py +58 -0
  78. nthlayer_workers/measure/tiering/promotion.py +118 -0
  79. nthlayer_workers/measure/trends/__init__.py +0 -0
  80. nthlayer_workers/measure/trends/tracker.py +72 -0
  81. nthlayer_workers/measure/types.py +75 -0
  82. nthlayer_workers/measure/worker.py +439 -0
  83. nthlayer_workers/observe/__init__.py +25 -0
  84. nthlayer_workers/observe/__main__.py +5 -0
  85. nthlayer_workers/observe/api/__init__.py +1 -0
  86. nthlayer_workers/observe/assessment.py +95 -0
  87. nthlayer_workers/observe/cli.py +737 -0
  88. nthlayer_workers/observe/config.py +11 -0
  89. nthlayer_workers/observe/db/__init__.py +1 -0
  90. nthlayer_workers/observe/decision_records.py +220 -0
  91. nthlayer_workers/observe/dependencies/__init__.py +18 -0
  92. nthlayer_workers/observe/dependencies/discovery.py +294 -0
  93. nthlayer_workers/observe/dependencies/providers/__init__.py +48 -0
  94. nthlayer_workers/observe/dependencies/providers/backstage.py +467 -0
  95. nthlayer_workers/observe/dependencies/providers/base.py +76 -0
  96. nthlayer_workers/observe/dependencies/providers/consul.py +518 -0
  97. nthlayer_workers/observe/dependencies/providers/etcd.py +360 -0
  98. nthlayer_workers/observe/dependencies/providers/kubernetes.py +682 -0
  99. nthlayer_workers/observe/dependencies/providers/prometheus.py +368 -0
  100. nthlayer_workers/observe/dependencies/providers/zookeeper.py +399 -0
  101. nthlayer_workers/observe/deployments/__init__.py +1 -0
  102. nthlayer_workers/observe/discovery/__init__.py +14 -0
  103. nthlayer_workers/observe/discovery/classifier.py +66 -0
  104. nthlayer_workers/observe/discovery/client.py +189 -0
  105. nthlayer_workers/observe/discovery/models.py +53 -0
  106. nthlayer_workers/observe/drift/__init__.py +26 -0
  107. nthlayer_workers/observe/drift/analyzer.py +383 -0
  108. nthlayer_workers/observe/drift/models.py +174 -0
  109. nthlayer_workers/observe/drift/patterns.py +88 -0
  110. nthlayer_workers/observe/explanation.py +118 -0
  111. nthlayer_workers/observe/gate/__init__.py +39 -0
  112. nthlayer_workers/observe/gate/conditions.py +92 -0
  113. nthlayer_workers/observe/gate/correlator.py +154 -0
  114. nthlayer_workers/observe/gate/evaluator.py +192 -0
  115. nthlayer_workers/observe/gate/policies.py +226 -0
  116. nthlayer_workers/observe/gate_adapter.py +40 -0
  117. nthlayer_workers/observe/incident.py +36 -0
  118. nthlayer_workers/observe/portfolio/__init__.py +17 -0
  119. nthlayer_workers/observe/portfolio/aggregator.py +168 -0
  120. nthlayer_workers/observe/portfolio/scorer.py +13 -0
  121. nthlayer_workers/observe/slo/__init__.py +19 -0
  122. nthlayer_workers/observe/slo/collector.py +235 -0
  123. nthlayer_workers/observe/slo/spec_loader.py +40 -0
  124. nthlayer_workers/observe/sqlite_store.py +152 -0
  125. nthlayer_workers/observe/store.py +92 -0
  126. nthlayer_workers/observe/verification/__init__.py +22 -0
  127. nthlayer_workers/observe/verification/exporter_guidance.py +146 -0
  128. nthlayer_workers/observe/verification/extractor.py +127 -0
  129. nthlayer_workers/observe/verification/models.py +101 -0
  130. nthlayer_workers/observe/verification/verifier.py +111 -0
  131. nthlayer_workers/observe/worker.py +332 -0
  132. nthlayer_workers/respond/__init__.py +2 -0
  133. nthlayer_workers/respond/__main__.py +4 -0
  134. nthlayer_workers/respond/agents/__init__.py +0 -0
  135. nthlayer_workers/respond/agents/base.py +556 -0
  136. nthlayer_workers/respond/agents/communication.py +115 -0
  137. nthlayer_workers/respond/agents/investigation.py +124 -0
  138. nthlayer_workers/respond/agents/remediation.py +219 -0
  139. nthlayer_workers/respond/agents/triage.py +132 -0
  140. nthlayer_workers/respond/cli.py +772 -0
  141. nthlayer_workers/respond/config.py +135 -0
  142. nthlayer_workers/respond/context_store.py +256 -0
  143. nthlayer_workers/respond/coordinator.py +487 -0
  144. nthlayer_workers/respond/metrics.py +104 -0
  145. nthlayer_workers/respond/notification_backends/__init__.py +1 -0
  146. nthlayer_workers/respond/notification_backends/ntfy_backend.py +158 -0
  147. nthlayer_workers/respond/notification_backends/protocol.py +59 -0
  148. nthlayer_workers/respond/notification_backends/slack_backend.py +203 -0
  149. nthlayer_workers/respond/notification_backends/stdout_backend.py +56 -0
  150. nthlayer_workers/respond/notifications.py +247 -0
  151. nthlayer_workers/respond/oncall/__init__.py +1 -0
  152. nthlayer_workers/respond/oncall/escalation.py +103 -0
  153. nthlayer_workers/respond/oncall/runner.py +193 -0
  154. nthlayer_workers/respond/oncall/schedule.py +243 -0
  155. nthlayer_workers/respond/safe_actions/__init__.py +0 -0
  156. nthlayer_workers/respond/safe_actions/actions.py +139 -0
  157. nthlayer_workers/respond/safe_actions/registry.py +171 -0
  158. nthlayer_workers/respond/safe_actions/webhook.py +194 -0
  159. nthlayer_workers/respond/server.py +357 -0
  160. nthlayer_workers/respond/sre/__init__.py +1 -0
  161. nthlayer_workers/respond/sre/brief.py +175 -0
  162. nthlayer_workers/respond/sre/delegation.py +101 -0
  163. nthlayer_workers/respond/sre/post_incident.py +146 -0
  164. nthlayer_workers/respond/sre/shift_report.py +129 -0
  165. nthlayer_workers/respond/sre/suppression.py +91 -0
  166. nthlayer_workers/respond/types.py +109 -0
  167. nthlayer_workers/respond/verdict_submission.py +56 -0
  168. nthlayer_workers/respond/worker.py +533 -0
  169. nthlayer_workers/respond/worker_helpers.py +140 -0
  170. nthlayer_workers/runner.py +198 -0
  171. nthlayer_workers-1.0.0.dist-info/METADATA +19 -0
  172. nthlayer_workers-1.0.0.dist-info/RECORD +175 -0
  173. nthlayer_workers-1.0.0.dist-info/WHEEL +5 -0
  174. nthlayer_workers-1.0.0.dist-info/entry_points.txt +2 -0
  175. nthlayer_workers-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,111 @@
1
+ """SitRep configuration."""
2
+ from __future__ import annotations
3
+
4
+ from dataclasses import dataclass, field
5
+ from typing import Any
6
+
7
+ import yaml
8
+
9
+
10
@dataclass
class SitRepConfig:
    """Flat settings container for SitRep.

    Every field carries a default, so ``SitRepConfig()`` is a complete
    configuration. ``load_config`` overrides individual fields from a nested
    YAML document.
    """

    # Event store
    store_path: str = "sitrep-events.db"

    # Ingestion endpoint
    ingestion_host: str = "127.0.0.1"
    ingestion_port: int = 8081

    # Correlation
    correlation_window_minutes: int = 5
    # A factory is required so instances never share one mutable list.
    dedup_key_fields: list[str] = field(
        default_factory=lambda: ["source", "service", "type", "environment"]
    )

    # Snapshot
    token_budget: int = 4000
    cache_ttl_minutes: int = 15

    # Model
    model_name: str = "claude-sonnet-4-20250514"
    model_max_tokens: int = 4096

    # Verdict store
    verdict_store_path: str = "verdicts.db"

    # Topology manifests (None means no directory configured)
    manifests_dir: str | None = None

    # Per-state intervals (populated from the YAML ``*_interval_seconds`` keys)
    watching_interval: int = 300
    alert_interval: int = 60
    incident_interval: int = 30
    degraded_interval: int = 120

    # Trace backend configuration
    trace_backend: str | None = None
    trace_detail: str = "full"
    trace_baseline_window: str = "1h"
    tempo_endpoint: str = "http://localhost:3200"
    tempo_org_id: str = ""
    tempo_timeout: int = 30
    tempo_use_service_graphs: bool = True
37
+
38
+
39
def load_config(path: str | None = None) -> SitRepConfig:
    """Load config from a YAML file. Missing fields use defaults.

    The YAML document is nested (``store.path``, ``traces.tempo.endpoint``,
    ...) and is flattened onto the corresponding ``SitRepConfig`` fields.
    A section that is absent, null (e.g. ``store:`` with every child commented
    out), or not a mapping is treated as empty, so its fields keep their
    defaults instead of raising.

    Args:
        path: Path to the YAML file. ``None`` returns an all-defaults config
            without touching the filesystem.

    Returns:
        A ``SitRepConfig`` with every key found in the file applied.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the top level of the document is not a mapping.
    """
    if path is None:
        return SitRepConfig()

    # Explicit encoding: do not let the platform locale decide how YAML is decoded.
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}

    if not isinstance(data, dict):
        raise ValueError(
            f"Config file {path!r} must contain a YAML mapping at the top level, "
            f"got {type(data).__name__}"
        )

    def section(parent: dict[str, Any], name: str) -> dict[str, Any]:
        """Return the child mapping *name*, or {} when missing/null/non-mapping."""
        child = parent.get(name)
        return child if isinstance(child, dict) else {}

    # (path of nested sections, YAML key, SitRepConfig field name)
    mapping: tuple[tuple[tuple[str, ...], str, str], ...] = (
        (("store",), "path", "store_path"),
        (("ingestion",), "host", "ingestion_host"),
        (("ingestion",), "port", "ingestion_port"),
        (("correlation",), "window_minutes", "correlation_window_minutes"),
        (("correlation",), "dedup_key_fields", "dedup_key_fields"),
        (("snapshot",), "token_budget", "token_budget"),
        (("snapshot",), "cache_ttl_minutes", "cache_ttl_minutes"),
        (("model",), "model", "model_name"),
        (("model",), "max_tokens", "model_max_tokens"),
        (("verdict", "store"), "path", "verdict_store_path"),
        (("topology",), "manifests_dir", "manifests_dir"),
        # The YAML keys carry a "_seconds" suffix that the config fields drop.
        (("state",), "watching_interval_seconds", "watching_interval"),
        (("state",), "alert_interval_seconds", "alert_interval"),
        (("state",), "incident_interval_seconds", "incident_interval"),
        (("state",), "degraded_interval_seconds", "degraded_interval"),
        (("traces",), "backend", "trace_backend"),
        (("traces",), "detail", "trace_detail"),
        (("traces",), "baseline_window", "trace_baseline_window"),
        (("traces", "tempo"), "endpoint", "tempo_endpoint"),
        (("traces", "tempo"), "org_id", "tempo_org_id"),
        (("traces", "tempo"), "timeout_seconds", "tempo_timeout"),
        (("traces", "tempo"), "use_service_graphs", "tempo_use_service_graphs"),
    )

    kwargs: dict[str, Any] = {}
    for section_path, yaml_key, field_name in mapping:
        node = data
        for name in section_path:
            node = section(node, name)
        # Presence, not truthiness: an explicit null/0/"" in the file is applied.
        if yaml_key in node:
            kwargs[field_name] = node[yaml_key]

    return SitRepConfig(**kwargs)
@@ -0,0 +1 @@
1
+ """Signal correlation for SitRep. All operations are deterministic transport."""
@@ -0,0 +1,87 @@
1
+ """Change candidate indexing. Deterministic transport."""
2
+ from __future__ import annotations
3
+
4
+ from datetime import datetime, timezone
5
+
6
+ from nthlayer_workers.correlate.store.protocol import EventStore
7
+ from nthlayer_workers.correlate.types import ChangeCandidate, TemporalGroup
8
+
9
+
10
+ def _parse_ts(ts: str) -> datetime:
11
+ """Parse ISO 8601 timestamp. Always returns timezone-aware (UTC default)."""
12
+ dt = datetime.fromisoformat(ts.replace("Z", "+00:00"))
13
+ if dt.tzinfo is None:
14
+ dt = dt.replace(tzinfo=timezone.utc)
15
+ return dt
16
+
17
+
18
def find_change_candidates(
    store: EventStore,
    temporal_groups: list[TemporalGroup],
    topology: dict | None = None,
    window_minutes: int = 30,
) -> dict[str, list[ChangeCandidate]]:
    """Index recent changes that may explain each temporal group.

    For every group, the store is asked for changes on the group's own service
    and, when a topology is supplied, on each service listed under that
    service's ``"dependencies"`` entry.

    Args:
        store: Source of change records, queried via ``get_recent_changes``.
        temporal_groups: Groups to find candidates for.
        topology: Mapping of service name to a dict that may hold a
            ``"dependencies"`` list. ``None`` skips the dependency lookups.
        window_minutes: Lookback passed through to the store.

    Returns:
        Mapping from service name to its candidates, closest in time first.
        Services with no candidates are omitted. If several groups share a
        service, the last group's candidates are the ones kept.
    """

    def lookup(
        changed_service: str,
        affected_service: str,
        anchor_raw: str,
        anchor: datetime,
        is_same_service: bool,
    ) -> list[ChangeCandidate]:
        """Fetch changes on *changed_service* and wrap each as a candidate."""
        found = store.get_recent_changes(
            changed_service, window_minutes, reference_time=anchor_raw
        )
        return [
            ChangeCandidate(
                change=change,
                affected_service=affected_service,
                temporal_proximity_seconds=abs(
                    (anchor - _parse_ts(change.timestamp)).total_seconds()
                ),
                same_service=is_same_service,
                dependency_related=not is_same_service,
            )
            for change in found
        ]

    by_service: dict[str, list[ChangeCandidate]] = {}

    for group in temporal_groups:
        affected = group.service

        # The group's first timestamp anchors both the proximity maths and the
        # store lookback. Anchoring on the group rather than on wall-clock time
        # is what lets replay of historical events find matching changes.
        anchor_raw = group.time_window[0]
        anchor = _parse_ts(anchor_raw)

        # Changes on the affected service itself come first...
        ranked = lookup(affected, affected, anchor_raw, anchor, True)

        # ...followed by changes on each declared dependency.
        if topology is not None:
            for upstream in topology.get(affected, {}).get("dependencies", []):
                ranked.extend(lookup(upstream, affected, anchor_raw, anchor, False))

        # Closest in time first; the sort is stable, so ties keep lookup order.
        ranked.sort(key=lambda candidate: candidate.temporal_proximity_seconds)

        if ranked:
            by_service[affected] = ranked

    return by_service
@@ -0,0 +1,62 @@
1
+ """Signal deduplication. Deterministic transport."""
2
+ from __future__ import annotations
3
+
4
+ from dataclasses import replace
5
+
6
+ from nthlayer_workers.correlate.types import SitRepEvent
7
+
8
+
9
def _dedup_key(event: SitRepEvent) -> str:
    """Build the dedup key for an event.

    The key joins source, service, event type value and environment with
    ``|``. When the payload carries a truthy ``alert_name`` (or, failing that,
    a truthy ``metric``), its string form is appended so distinct alerts on the
    same service do not collapse into one.
    """
    parts = [event.source, event.service, event.type.value, event.environment]
    alert_name = event.payload.get("alert_name") or event.payload.get("metric")
    if alert_name:
        parts.append(str(alert_name))
    return "|".join(parts)


def deduplicate(events: list[SitRepEvent]) -> list[SitRepEvent]:
    """Collapse events with the same dedup key.

    The first occurrence of each key is kept, in first-seen order. When a key
    occurred more than once, the kept event is replaced by a copy whose payload
    additionally holds ``_dedup_count`` (number of occurrences) and
    ``_dedup_duration_seconds`` (time between the earliest and latest
    occurrence). Input events and their payloads are never mutated.

    Timestamps are compared as parsed instants rather than as raw strings, so
    mixed spellings of the same moment (``Z`` vs ``+00:00``, differing UTC
    offsets, with/without fractional seconds) order correctly and the duration
    can never come out negative. Timestamps without an offset are read as UTC.
    Unparseable timestamps are ignored for the duration; if none of a key's
    timestamps parse, the duration is ``0.0``.

    Args:
        events: Events to deduplicate.

    Returns:
        One event per distinct dedup key.
    """
    # Imported here because the enclosing module only imports ``replace``.
    from datetime import datetime, timezone

    def to_utc(ts: str) -> datetime | None:
        """Parse an ISO 8601 timestamp to an aware datetime, or None if invalid."""
        try:
            parsed = datetime.fromisoformat(ts.replace("Z", "+00:00"))
        except (AttributeError, TypeError, ValueError):
            return None
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    first_seen: dict[str, SitRepEvent] = {}
    counts: dict[str, int] = {}
    spans: dict[str, tuple[datetime, datetime]] = {}  # key -> (earliest, latest)

    for event in events:
        key = _dedup_key(event)
        if key not in first_seen:
            first_seen[key] = event
            counts[key] = 0
        counts[key] += 1

        moment = to_utc(event.timestamp)
        if moment is None:
            continue
        if key in spans:
            earliest, latest = spans[key]
            spans[key] = (min(earliest, moment), max(latest, moment))
        else:
            spans[key] = (moment, moment)

    result = []
    for key, event in first_seen.items():
        if counts[key] > 1:
            duration = 0.0
            if key in spans:
                earliest, latest = spans[key]
                duration = (latest - earliest).total_seconds()

            # Copy the payload so the caller's event is left untouched.
            new_payload = dict(event.payload)
            new_payload["_dedup_count"] = counts[key]
            new_payload["_dedup_duration_seconds"] = duration
            event = replace(event, payload=new_payload)
        result.append(event)

    return result
@@ -0,0 +1,244 @@
1
+ """Pre-correlation engine. Orchestrates all correlation steps."""
2
+ from __future__ import annotations
3
+
4
+ import uuid
5
+ from dataclasses import replace
6
+ from datetime import datetime, timedelta, timezone
7
+
8
+ from nthlayer_workers.correlate.correlation.changes import find_change_candidates
9
+ from nthlayer_workers.correlate.correlation.dedup import deduplicate
10
+ from nthlayer_workers.correlate.correlation.temporal import group_temporal
11
+ from nthlayer_workers.correlate.correlation.topology import group_topology
12
+ from nthlayer_workers.correlate.ingestion.severity import pre_score
13
+ from nthlayer_workers.correlate.store.protocol import EventStore
14
+ from nthlayer_workers.correlate.types import ChangeCandidate, CorrelationGroup, TemporalGroup
15
+
16
+
17
class CorrelationEngine:
    """Orchestrates the full pre-correlation pipeline. All deterministic transport."""

    def correlate(
        self,
        store: EventStore,
        window_minutes: int = 5,
        topology: dict | None = None,
        slo_targets: dict | None = None,
    ) -> list[CorrelationGroup]:
        """Run the full pre-correlation pipeline over the most recent window.

        Steps:
        1. Query store for events in window
        2. Deduplicate
        3. Severity enrichment (second-pass for events without SLO context)
        4. Temporal grouping
        5. Topology-aware grouping
        6. Change candidate indexing
        7. Assemble CorrelationGroups with priority scoring

        Args:
            store: Event store queried for events and recent changes.
            window_minutes: Size of the lookback window ending at the current
                UTC time; also used as the temporal grouping window.
            topology: Mapping of service name to a dict with optional
                ``"dependencies"`` and ``"tier"`` entries. ``None`` disables
                the topology-dependent steps.
            slo_targets: Passed through to ``pre_score``.

        Returns:
            Correlation groups ordered by priority, or ``[]`` when the window
            holds no events or no temporal groups.
        """
        # Step 1: Query store for events in window
        now = datetime.now(timezone.utc)
        start = (now - timedelta(minutes=window_minutes)).isoformat()
        end = now.isoformat()

        events = store.get_by_time_window(start, end)
        if not events:
            return []

        # Step 2: Deduplicate
        deduped = deduplicate(events)

        # Step 3: Severity enrichment — copy an event only when its score changes
        enriched = []
        for event in deduped:
            new_severity = pre_score(event, slo_targets)
            if new_severity != event.severity:
                event = replace(event, severity=new_severity)
            enriched.append(event)

        # Step 4: Temporal grouping
        temporal_groups = group_temporal(enriched, window_minutes=window_minutes)
        if not temporal_groups:
            return []

        # Step 5: Topology-aware grouping
        topology_correlations = group_topology(temporal_groups, topology)

        # Step 6: Change candidate indexing (lookback is never shorter than 30 minutes)
        change_candidates_map = find_change_candidates(
            store, temporal_groups, topology=topology, window_minutes=max(window_minutes, 30)
        )

        # Step 7: Assemble CorrelationGroups with priority scoring
        return self.assemble_groups(
            temporal_groups, topology_correlations, change_candidates_map, topology
        )

    def assemble_groups(
        self,
        temporal_groups: list[TemporalGroup],
        topology_correlations: list,
        change_candidates_map: dict[str, list[ChangeCandidate]],
        topology: dict | None,
    ) -> list[CorrelationGroup]:
        """Assemble CorrelationGroups from pre-computed correlation sub-step outputs.

        Two-pass assembly: topology-linked groups first, then unassigned temporal groups.
        Used by both correlate() and CLI replay.

        Args:
            temporal_groups: Per-service temporal groups.
            topology_correlations: Links between services; each exposes
                ``primary_service`` and ``related_services`` (dicts with a
                ``"service"`` key).
            change_candidates_map: Change candidates keyed by service name.
            topology: Service topology used for priority scoring, or ``None``.

        Returns:
            Correlation groups sorted by ascending priority number, then by
            descending peak severity. Each temporal group ends up in exactly
            one correlation group.
        """
        assigned: set[int] = set()
        correlation_groups: list[CorrelationGroup] = []

        # First pass: create groups from topology correlations
        for tc in topology_correlations:
            involved_services = self._linked_services(tc)

            # Claim every not-yet-assigned temporal group on an involved service.
            signals: list[TemporalGroup] = []
            for gi, tg in enumerate(temporal_groups):
                if tg.service in involved_services and gi not in assigned:
                    signals.append(tg)
                    assigned.add(gi)

            if not signals:
                continue

            # Iterate services in sorted order: set iteration order for strings
            # varies between interpreter runs (hash randomization), which would
            # make the candidate order non-reproducible.
            all_changes: list[ChangeCandidate] = []
            for svc in sorted(involved_services):
                all_changes.extend(change_candidates_map.get(svc, []))
            # Keep the merged list closest-first, matching the per-service
            # lists used for unassigned groups. Stable sort keeps ties ordered
            # by service name.
            all_changes.sort(key=lambda c: c.temporal_proximity_seconds)

            peak_severity = max(s.peak_severity for s in signals)
            priority = self._compute_priority(
                peak_severity, list(involved_services), topology, topology_correlations
            )

            total_events = sum(s.count for s in signals)
            primary_type = self._dominant_event_type(signals)
            services_str = ", ".join(sorted(involved_services))
            summary = (
                f"{total_events} {primary_type}(s) on {services_str} "
                f"with {len(all_changes)} recent change(s)"
            )

            all_timestamps = []
            for s in signals:
                all_timestamps.append(s.time_window[0])
                all_timestamps.append(s.time_window[1])
            first_seen = min(all_timestamps)
            last_updated = max(all_timestamps)

            group_id = f"cg-{uuid.uuid4().hex[:8]}"
            correlation_groups.append(
                CorrelationGroup(
                    id=group_id,
                    priority=priority,
                    summary=summary,
                    services=sorted(involved_services),
                    signals=signals,
                    topology=tc,
                    change_candidates=all_changes,
                    first_seen=first_seen,
                    last_updated=last_updated,
                    event_count=total_events,
                )
            )

        # Second pass: create groups for unassigned temporal groups
        for gi, tg in enumerate(temporal_groups):
            if gi in assigned:
                continue

            service = tg.service
            changes = change_candidates_map.get(service, [])
            peak_severity = tg.peak_severity
            priority = self._compute_priority(
                peak_severity, [service], topology, topology_correlations
            )

            primary_type = self._dominant_event_type([tg])
            summary = (
                f"{tg.count} {primary_type}(s) on {service} "
                f"with {len(changes)} recent change(s)"
            )

            group_id = f"cg-{uuid.uuid4().hex[:8]}"
            correlation_groups.append(
                CorrelationGroup(
                    id=group_id,
                    priority=priority,
                    summary=summary,
                    services=[service],
                    signals=[tg],
                    topology=None,
                    change_candidates=changes,
                    first_seen=tg.time_window[0],
                    last_updated=tg.time_window[1],
                    event_count=tg.count,
                )
            )

        # Lowest priority number (most urgent) first; higher severity breaks ties.
        correlation_groups.sort(
            key=lambda g: (g.priority, -max(s.peak_severity for s in g.signals))
        )
        return correlation_groups

    @staticmethod
    def _linked_services(tc) -> set[str]:
        """Return the primary service of a topology correlation plus all related ones."""
        linked = {tc.primary_service}
        for rs in tc.related_services:
            linked.add(rs["service"])
        return linked

    def _compute_priority(
        self,
        peak_severity: float,
        services: list[str],
        topology: dict | None,
        topology_correlations: list,
    ) -> int:
        """Compute priority tier for a correlation group.

        P0 (0): peak_severity > 0.8 AND one of the services has tier "critical"
        P1 (1): peak_severity > 0.6, OR one of the services takes part in a
                topology correlation that includes a "critical"-tier service
        P2 (2): peak_severity > 0.3
        P3 (3): everything else

        Tier information comes from ``topology``; without it, no service is
        considered critical and the topology rule for P1 never applies.
        """
        # Check if any service is critical tier
        has_critical = False
        if topology is not None:
            has_critical = any(
                topology.get(svc, {}).get("tier") == "critical" for svc in services
            )

        if peak_severity > 0.8 and has_critical:
            return 0  # P0

        if peak_severity > 0.6:
            return 1  # P1

        # P1 if has topology correlation with a P0-eligible group
        if topology is not None and topology_correlations:
            own_services = set(services)
            for tc in topology_correlations:
                linked_services = self._linked_services(tc)
                if not (linked_services & own_services):
                    continue
                # Check if any linked service is P0-eligible
                for svc in linked_services:
                    if topology.get(svc, {}).get("tier") == "critical":
                        return 1  # P1 — topology link to critical service

        if peak_severity > 0.3:
            return 2  # P2

        return 3  # P3

    @staticmethod
    def _dominant_event_type(signals: list[TemporalGroup]) -> str:
        """Find the most common event type across all signals.

        Returns the literal ``"event"`` when the signals hold no events. On a
        tie, the type encountered first wins.
        """
        type_counts: dict[str, int] = {}
        for signal in signals:
            for event in signal.events:
                t = event.type.value
                type_counts[t] = type_counts.get(t, 0) + 1

        if not type_counts:
            return "event"

        return max(type_counts, key=type_counts.get)  # type: ignore[arg-type]
@@ -0,0 +1,79 @@
1
+ """Temporal grouping — windowed aggregation by service. Deterministic transport."""
2
+ from __future__ import annotations
3
+
4
+ from datetime import datetime, timezone
5
+
6
+ from nthlayer_workers.correlate.types import SitRepEvent, TemporalGroup
7
+
8
+
9
+ def _parse_ts(ts: str) -> datetime:
10
+ """Parse ISO 8601 timestamp. Always returns timezone-aware (UTC default)."""
11
+ dt = datetime.fromisoformat(ts.replace("Z", "+00:00"))
12
+ if dt.tzinfo is None:
13
+ dt = dt.replace(tzinfo=timezone.utc)
14
+ return dt
15
+
16
+
17
def group_temporal(
    events: list[SitRepEvent], window_minutes: int = 5
) -> list[TemporalGroup]:
    """Group events by service within time windows.

    Events on the same service within ``window_minutes`` of the earliest
    event in the group are grouped together; the first event that falls
    outside the window starts a new group for that service.

    Events are ordered by their parsed instant, not by the raw timestamp
    string, so timestamps that spell the offset differently (``Z`` vs
    ``+00:00``, other UTC offsets) or differ in fractional-second precision
    are still processed chronologically.

    Args:
        events: Events to group. Each timestamp must be ISO 8601.
        window_minutes: Maximum distance, in minutes, between a group's first
            event and any other event in that group.

    Returns:
        Temporal groups, services in order of their earliest event and each
        service's groups in chronological order. ``[]`` for empty input.

    Raises:
        ValueError: If an event timestamp is not valid ISO 8601.
    """
    if not events:
        return []

    # Pair each event with its parsed instant once, then sort on the instant.
    # The explicit key keeps ties from falling through to comparing events,
    # and the stable sort preserves input order for equal instants.
    stamped = sorted(
        ((_parse_ts(event.timestamp), event) for event in events),
        key=lambda pair: pair[0],
    )

    # Group by service
    by_service: dict[str, list[tuple[datetime, SitRepEvent]]] = {}
    for pair in stamped:
        by_service.setdefault(pair[1].service, []).append(pair)

    groups: list[TemporalGroup] = []
    window_seconds = window_minutes * 60

    for service, svc_events in by_service.items():
        # Split into windows anchored on the first event of each window
        window_start, first_event = svc_events[0]
        current_window: list[SitRepEvent] = [first_event]

        for event_ts, event in svc_events[1:]:
            if (event_ts - window_start).total_seconds() <= window_seconds:
                current_window.append(event)
            else:
                # Emit current window, start new one
                groups.append(_make_group(service, current_window))
                current_window = [event]
                window_start = event_ts

        # Emit last window (never empty: it always holds at least one event)
        groups.append(_make_group(service, current_window))

    return groups
59
+
60
+
61
def _make_group(service: str, events: list[SitRepEvent]) -> TemporalGroup:
    """Build the TemporalGroup describing *events* on *service*.

    The time window is taken from the first and last event in list order, so
    callers are expected to pass events already sorted by time. The duration
    falls back to ``0.0`` when either boundary timestamp cannot be parsed.
    *events* must be non-empty.
    """
    earliest, latest = events[0].timestamp, events[-1].timestamp

    try:
        opened = _parse_ts(earliest)
        closed = _parse_ts(latest)
    except (ValueError, TypeError):
        span_seconds = 0.0
    else:
        span_seconds = (closed - opened).total_seconds()

    severities = [event.severity for event in events]

    return TemporalGroup(
        service=service,
        time_window=(earliest, latest),
        events=events,
        count=len(events),
        peak_severity=max(severities),
        duration_seconds=span_seconds,
    )
@@ -0,0 +1,104 @@
1
+ """Topology-aware grouping. Deterministic transport."""
2
+ from __future__ import annotations
3
+
4
+ from nthlayer_workers.correlate.types import TemporalGroup, TopologyCorrelation
5
+
6
+
7
+ def _get_tier_rank(topology: dict, service: str) -> int:
8
+ """Return numeric rank for service tier. Higher = more critical."""
9
+ tier_ranks = {"critical": 3, "standard": 2, "low": 1}
10
+ svc_info = topology.get(service, {})
11
+ tier = svc_info.get("tier", "standard")
12
+ return tier_ranks.get(tier, 1)
13
+
14
+
15
+ def _depends_on(topology: dict, service_a: str, service_b: str) -> bool:
16
+ """Check if service_a depends on service_b."""
17
+ svc_info = topology.get(service_a, {})
18
+ return service_b in svc_info.get("dependencies", [])
19
+
20
+
21
def group_topology(
    temporal_groups: list[TemporalGroup],
    topology: dict | None,
) -> list[TopologyCorrelation]:
    """Link temporal groups whose services depend on one another.

    Every pair of groups on different services is examined once. A pair is
    linked when either service lists the other under ``"dependencies"``. Each
    unordered service pair yields at most one correlation, built from the
    first pair of groups encountered for it.

    Args:
        temporal_groups: Groups from temporal.group_temporal()
        topology: Dict mapping service names to {dependencies: [...], dependents: [...], tier: ...}
            Typically from OpenSRM manifests. None = skip topology grouping.

    Returns:
        List of TopologyCorrelations linking related service groups. Empty
        when no topology is given or fewer than two groups are supplied.
    """
    if topology is None or len(temporal_groups) < 2:
        return []

    # Unordered service pairs already linked, so each pair is reported once.
    seen_pairs: set[tuple[str, str]] = set()
    correlations: list[TopologyCorrelation] = []

    for idx, left in enumerate(temporal_groups):
        # Only look ahead: every unordered pair of groups is visited once.
        for right in temporal_groups[idx + 1:]:
            if left.service == right.service:
                continue

            pair = tuple(sorted((left.service, right.service)))
            if pair in seen_pairs:
                continue

            left_needs_right = _depends_on(topology, left.service, right.service)
            right_needs_left = _depends_on(topology, right.service, left.service)
            if not (left_needs_right or right_needs_left):
                continue

            seen_pairs.add(pair)

            # Primary: higher severity wins; on equal severity the higher tier
            # wins, and a full tie goes to the earlier group.
            left_rank = _get_tier_rank(topology, left.service)
            right_rank = _get_tier_rank(topology, right.service)
            left_is_primary = left.peak_severity > right.peak_severity or (
                left.peak_severity == right.peak_severity and left_rank >= right_rank
            )

            if left_is_primary:
                primary, secondary = left, right
                primary_needs_secondary = left_needs_right
            else:
                primary, secondary = right, left
                primary_needs_secondary = right_needs_left

            # Relationship is expressed from the primary toward the secondary.
            relationship = "depends_on" if primary_needs_secondary else "depended_by"

            correlations.append(
                TopologyCorrelation(
                    primary_service=primary.service,
                    related_services=[
                        {
                            "service": secondary.service,
                            "relationship": relationship,
                            "events": secondary.events,
                        }
                    ],
                    topology_path=[primary.service, secondary.service],
                )
            )

    return correlations
@@ -0,0 +1 @@
1
+ """Event ingestion for SitRep."""