nthlayer-workers 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nthlayer_workers/__init__.py +5 -0
- nthlayer_workers/cli.py +234 -0
- nthlayer_workers/correlate/__init__.py +1 -0
- nthlayer_workers/correlate/cli.py +847 -0
- nthlayer_workers/correlate/config.py +111 -0
- nthlayer_workers/correlate/correlation/__init__.py +1 -0
- nthlayer_workers/correlate/correlation/changes.py +87 -0
- nthlayer_workers/correlate/correlation/dedup.py +62 -0
- nthlayer_workers/correlate/correlation/engine.py +244 -0
- nthlayer_workers/correlate/correlation/temporal.py +79 -0
- nthlayer_workers/correlate/correlation/topology.py +104 -0
- nthlayer_workers/correlate/ingestion/__init__.py +1 -0
- nthlayer_workers/correlate/ingestion/protocol.py +10 -0
- nthlayer_workers/correlate/ingestion/severity.py +18 -0
- nthlayer_workers/correlate/ingestion/webhook.py +197 -0
- nthlayer_workers/correlate/notifications.py +85 -0
- nthlayer_workers/correlate/prometheus.py +234 -0
- nthlayer_workers/correlate/reasoning.py +375 -0
- nthlayer_workers/correlate/session.py +189 -0
- nthlayer_workers/correlate/snapshot/__init__.py +1 -0
- nthlayer_workers/correlate/snapshot/generator.py +170 -0
- nthlayer_workers/correlate/snapshot/model.py +177 -0
- nthlayer_workers/correlate/snapshot/token.py +14 -0
- nthlayer_workers/correlate/state.py +88 -0
- nthlayer_workers/correlate/store/__init__.py +5 -0
- nthlayer_workers/correlate/store/protocol.py +48 -0
- nthlayer_workers/correlate/store/sqlite.py +443 -0
- nthlayer_workers/correlate/summary.py +180 -0
- nthlayer_workers/correlate/traces/__init__.py +1 -0
- nthlayer_workers/correlate/traces/protocol.py +120 -0
- nthlayer_workers/correlate/traces/tempo.py +667 -0
- nthlayer_workers/correlate/traces/topology.py +39 -0
- nthlayer_workers/correlate/types.py +77 -0
- nthlayer_workers/correlate/worker.py +630 -0
- nthlayer_workers/learn/__init__.py +5 -0
- nthlayer_workers/learn/__main__.py +5 -0
- nthlayer_workers/learn/cli.py +164 -0
- nthlayer_workers/learn/retrospective.py +381 -0
- nthlayer_workers/learn/trends.py +102 -0
- nthlayer_workers/learn/worker.py +366 -0
- nthlayer_workers/measure/__init__.py +3 -0
- nthlayer_workers/measure/__main__.py +5 -0
- nthlayer_workers/measure/_parsing.py +15 -0
- nthlayer_workers/measure/adapters/__init__.py +0 -0
- nthlayer_workers/measure/adapters/_util.py +24 -0
- nthlayer_workers/measure/adapters/devin.py +119 -0
- nthlayer_workers/measure/adapters/gastown.py +88 -0
- nthlayer_workers/measure/adapters/prometheus.py +277 -0
- nthlayer_workers/measure/adapters/protocol.py +20 -0
- nthlayer_workers/measure/adapters/webhook.py +161 -0
- nthlayer_workers/measure/api/__init__.py +0 -0
- nthlayer_workers/measure/api/normalise.py +50 -0
- nthlayer_workers/measure/api/queue.py +243 -0
- nthlayer_workers/measure/api/response.py +51 -0
- nthlayer_workers/measure/api/server.py +504 -0
- nthlayer_workers/measure/calibration/__init__.py +0 -0
- nthlayer_workers/measure/calibration/loop.py +62 -0
- nthlayer_workers/measure/calibration/slos.py +212 -0
- nthlayer_workers/measure/calibration/verdict_calibration.py +31 -0
- nthlayer_workers/measure/cli.py +753 -0
- nthlayer_workers/measure/config.py +191 -0
- nthlayer_workers/measure/detection/__init__.py +6 -0
- nthlayer_workers/measure/detection/detector.py +82 -0
- nthlayer_workers/measure/detection/protocol.py +29 -0
- nthlayer_workers/measure/governance/__init__.py +0 -0
- nthlayer_workers/measure/governance/engine.py +163 -0
- nthlayer_workers/measure/manifest.py +77 -0
- nthlayer_workers/measure/notifications.py +53 -0
- nthlayer_workers/measure/pipeline/__init__.py +0 -0
- nthlayer_workers/measure/pipeline/evaluator.py +155 -0
- nthlayer_workers/measure/pipeline/router.py +160 -0
- nthlayer_workers/measure/store/__init__.py +0 -0
- nthlayer_workers/measure/store/protocol.py +38 -0
- nthlayer_workers/measure/store/sqlite.py +276 -0
- nthlayer_workers/measure/telemetry.py +116 -0
- nthlayer_workers/measure/tiering/__init__.py +0 -0
- nthlayer_workers/measure/tiering/classifier.py +58 -0
- nthlayer_workers/measure/tiering/promotion.py +118 -0
- nthlayer_workers/measure/trends/__init__.py +0 -0
- nthlayer_workers/measure/trends/tracker.py +72 -0
- nthlayer_workers/measure/types.py +75 -0
- nthlayer_workers/measure/worker.py +439 -0
- nthlayer_workers/observe/__init__.py +25 -0
- nthlayer_workers/observe/__main__.py +5 -0
- nthlayer_workers/observe/api/__init__.py +1 -0
- nthlayer_workers/observe/assessment.py +95 -0
- nthlayer_workers/observe/cli.py +737 -0
- nthlayer_workers/observe/config.py +11 -0
- nthlayer_workers/observe/db/__init__.py +1 -0
- nthlayer_workers/observe/decision_records.py +220 -0
- nthlayer_workers/observe/dependencies/__init__.py +18 -0
- nthlayer_workers/observe/dependencies/discovery.py +294 -0
- nthlayer_workers/observe/dependencies/providers/__init__.py +48 -0
- nthlayer_workers/observe/dependencies/providers/backstage.py +467 -0
- nthlayer_workers/observe/dependencies/providers/base.py +76 -0
- nthlayer_workers/observe/dependencies/providers/consul.py +518 -0
- nthlayer_workers/observe/dependencies/providers/etcd.py +360 -0
- nthlayer_workers/observe/dependencies/providers/kubernetes.py +682 -0
- nthlayer_workers/observe/dependencies/providers/prometheus.py +368 -0
- nthlayer_workers/observe/dependencies/providers/zookeeper.py +399 -0
- nthlayer_workers/observe/deployments/__init__.py +1 -0
- nthlayer_workers/observe/discovery/__init__.py +14 -0
- nthlayer_workers/observe/discovery/classifier.py +66 -0
- nthlayer_workers/observe/discovery/client.py +189 -0
- nthlayer_workers/observe/discovery/models.py +53 -0
- nthlayer_workers/observe/drift/__init__.py +26 -0
- nthlayer_workers/observe/drift/analyzer.py +383 -0
- nthlayer_workers/observe/drift/models.py +174 -0
- nthlayer_workers/observe/drift/patterns.py +88 -0
- nthlayer_workers/observe/explanation.py +118 -0
- nthlayer_workers/observe/gate/__init__.py +39 -0
- nthlayer_workers/observe/gate/conditions.py +92 -0
- nthlayer_workers/observe/gate/correlator.py +154 -0
- nthlayer_workers/observe/gate/evaluator.py +192 -0
- nthlayer_workers/observe/gate/policies.py +226 -0
- nthlayer_workers/observe/gate_adapter.py +40 -0
- nthlayer_workers/observe/incident.py +36 -0
- nthlayer_workers/observe/portfolio/__init__.py +17 -0
- nthlayer_workers/observe/portfolio/aggregator.py +168 -0
- nthlayer_workers/observe/portfolio/scorer.py +13 -0
- nthlayer_workers/observe/slo/__init__.py +19 -0
- nthlayer_workers/observe/slo/collector.py +235 -0
- nthlayer_workers/observe/slo/spec_loader.py +40 -0
- nthlayer_workers/observe/sqlite_store.py +152 -0
- nthlayer_workers/observe/store.py +92 -0
- nthlayer_workers/observe/verification/__init__.py +22 -0
- nthlayer_workers/observe/verification/exporter_guidance.py +146 -0
- nthlayer_workers/observe/verification/extractor.py +127 -0
- nthlayer_workers/observe/verification/models.py +101 -0
- nthlayer_workers/observe/verification/verifier.py +111 -0
- nthlayer_workers/observe/worker.py +332 -0
- nthlayer_workers/respond/__init__.py +2 -0
- nthlayer_workers/respond/__main__.py +4 -0
- nthlayer_workers/respond/agents/__init__.py +0 -0
- nthlayer_workers/respond/agents/base.py +556 -0
- nthlayer_workers/respond/agents/communication.py +115 -0
- nthlayer_workers/respond/agents/investigation.py +124 -0
- nthlayer_workers/respond/agents/remediation.py +219 -0
- nthlayer_workers/respond/agents/triage.py +132 -0
- nthlayer_workers/respond/cli.py +772 -0
- nthlayer_workers/respond/config.py +135 -0
- nthlayer_workers/respond/context_store.py +256 -0
- nthlayer_workers/respond/coordinator.py +487 -0
- nthlayer_workers/respond/metrics.py +104 -0
- nthlayer_workers/respond/notification_backends/__init__.py +1 -0
- nthlayer_workers/respond/notification_backends/ntfy_backend.py +158 -0
- nthlayer_workers/respond/notification_backends/protocol.py +59 -0
- nthlayer_workers/respond/notification_backends/slack_backend.py +203 -0
- nthlayer_workers/respond/notification_backends/stdout_backend.py +56 -0
- nthlayer_workers/respond/notifications.py +247 -0
- nthlayer_workers/respond/oncall/__init__.py +1 -0
- nthlayer_workers/respond/oncall/escalation.py +103 -0
- nthlayer_workers/respond/oncall/runner.py +193 -0
- nthlayer_workers/respond/oncall/schedule.py +243 -0
- nthlayer_workers/respond/safe_actions/__init__.py +0 -0
- nthlayer_workers/respond/safe_actions/actions.py +139 -0
- nthlayer_workers/respond/safe_actions/registry.py +171 -0
- nthlayer_workers/respond/safe_actions/webhook.py +194 -0
- nthlayer_workers/respond/server.py +357 -0
- nthlayer_workers/respond/sre/__init__.py +1 -0
- nthlayer_workers/respond/sre/brief.py +175 -0
- nthlayer_workers/respond/sre/delegation.py +101 -0
- nthlayer_workers/respond/sre/post_incident.py +146 -0
- nthlayer_workers/respond/sre/shift_report.py +129 -0
- nthlayer_workers/respond/sre/suppression.py +91 -0
- nthlayer_workers/respond/types.py +109 -0
- nthlayer_workers/respond/verdict_submission.py +56 -0
- nthlayer_workers/respond/worker.py +533 -0
- nthlayer_workers/respond/worker_helpers.py +140 -0
- nthlayer_workers/runner.py +198 -0
- nthlayer_workers-1.0.0.dist-info/METADATA +19 -0
- nthlayer_workers-1.0.0.dist-info/RECORD +175 -0
- nthlayer_workers-1.0.0.dist-info/WHEEL +5 -0
- nthlayer_workers-1.0.0.dist-info/entry_points.txt +2 -0
- nthlayer_workers-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,443 @@
|
|
|
1
|
+
"""SQLite FTS5 implementation of EventStore."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import hashlib
|
|
5
|
+
import json
|
|
6
|
+
import sqlite3
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from nthlayer_workers.correlate.types import EventType, SitRepEvent
|
|
10
|
+
|
|
11
|
+
# DDL for the primary ``events`` table and its indexes: by timestamp, by
# (service, timestamp), by (type, timestamp), a partial index restricted to
# type = 'change', and (created_at, ttl). Every statement uses IF NOT EXISTS,
# so running the script against an existing database is a no-op.
_SCHEMA_SQL = """\
CREATE TABLE IF NOT EXISTS events (
id TEXT PRIMARY KEY,
timestamp TEXT NOT NULL,
source TEXT NOT NULL,
type TEXT NOT NULL,
service TEXT NOT NULL,
environment TEXT NOT NULL,
severity REAL NOT NULL DEFAULT 0.5,
payload TEXT NOT NULL,
dependencies TEXT,
dependents TEXT,
ttl INTEGER NOT NULL DEFAULT 86400,
created_at TEXT NOT NULL DEFAULT (datetime('now'))
);

CREATE INDEX IF NOT EXISTS idx_events_timestamp
ON events(timestamp DESC);
CREATE INDEX IF NOT EXISTS idx_events_service_time
ON events(service, timestamp DESC);
CREATE INDEX IF NOT EXISTS idx_events_type_time
ON events(type, timestamp DESC);
CREATE INDEX IF NOT EXISTS idx_events_changes
ON events(type, service, timestamp DESC) WHERE type = 'change';
CREATE INDEX IF NOT EXISTS idx_events_expiry
ON events(created_at, ttl);
"""

# DDL for the FTS5 index, declared as an external-content table over
# ``events`` keyed by rowid and tokenized with the porter stemmer.
# NOTE(review): ``payload_text`` is indexed here but is not a column of
# ``events``; presumably this is only safe while nothing reads column values
# back out of events_fts (or runs a 'rebuild') — confirm against the FTS5 docs.
_FTS_SCHEMA_SQL = """\
CREATE VIRTUAL TABLE IF NOT EXISTS events_fts USING fts5(
id, service, source, type, payload_text,
content=events, content_rowid=rowid,
tokenize='porter'
);
"""
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class SQLiteEventStore:
    """SQLite FTS5 event store with WAL mode and BM25 ranking.

    Events are stored in the ``events`` table. A parallel ``events_fts``
    FTS5 index holds the searchable metadata plus a flattened text rendering
    of each payload. Every mutation touches both tables inside a single
    transaction, so a failure rolls back both instead of leaving them out of
    step.
    """

    # Bind order matches _event_params().
    _INSERT_EVENT_SQL = (
        "INSERT INTO events"
        " (id, timestamp, source, type, service, environment,"
        " severity, payload, dependencies, dependents, ttl)"
        " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
    )
    # The FTS row reuses the rowid of the event row that was just written.
    _INSERT_FTS_SQL = (
        "INSERT INTO events_fts"
        " (rowid, id, service, source, type, payload_text)"
        " SELECT rowid, id, service, source, type, ?"
        " FROM events WHERE id = ?"
    )
    # FTS5 'delete' command: takes the values that were originally indexed.
    _DELETE_FTS_SQL = (
        "INSERT INTO events_fts"
        "(events_fts, rowid, id, service, source, type, payload_text)"
        " VALUES ('delete', ?, ?, ?, ?, ?, ?)"
    )

    def __init__(self, db_path: str) -> None:
        """Open ``db_path``, enable WAL, and create the schema if missing.

        Raises:
            sqlite3.Error: if the database cannot be opened or initialised.
                The connection is closed before the error propagates.
        """
        self._db_path = db_path
        self._conn = sqlite3.connect(db_path)
        try:
            self._conn.row_factory = sqlite3.Row
            self._conn.execute("PRAGMA journal_mode=WAL")
            self._conn.execute("PRAGMA busy_timeout=5000")
            self._conn.executescript(_SCHEMA_SQL)
            self._conn.executescript(_FTS_SCHEMA_SQL)
            self._conn.commit()
        except Exception:
            # Do not leak the handle when schema setup fails.
            self._conn.close()
            raise

    # -- context manager ------------------------------------------------

    def __enter__(self) -> SQLiteEventStore:
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        self.close()

    def close(self) -> None:
        """Close the database connection."""
        self._conn.close()

    # -- helpers --------------------------------------------------------

    @staticmethod
    def _flatten_payload(payload: dict[str, Any]) -> str:
        """Recursively flatten a dict to searchable text.

        Dict keys and leaf values are emitted in traversal order, separated
        by single spaces; lists contribute only their items.

        Example: {"alert_name": "latency_breach", "value": 0.5}
        -> "alert_name latency_breach value 0.5"
        """
        parts: list[str] = []

        def _walk(obj: Any) -> None:
            if isinstance(obj, dict):
                for key, value in obj.items():
                    parts.append(str(key))
                    _walk(value)
            elif isinstance(obj, list):
                for item in obj:
                    _walk(item)
            else:
                parts.append(str(obj))

        _walk(payload)
        return " ".join(parts)

    @staticmethod
    def _row_to_event(row: sqlite3.Row) -> SitRepEvent:
        """Deserialize a database row into a SitRepEvent.

        NULL or empty ``dependencies`` / ``dependents`` columns become
        empty lists.
        """
        deps_raw = row["dependencies"]
        depts_raw = row["dependents"]
        return SitRepEvent(
            id=row["id"],
            timestamp=row["timestamp"],
            source=row["source"],
            type=EventType(row["type"]),
            service=row["service"],
            environment=row["environment"],
            severity=row["severity"],
            payload=json.loads(row["payload"]),
            dependencies=json.loads(deps_raw) if deps_raw else [],
            dependents=json.loads(depts_raw) if depts_raw else [],
            ttl=row["ttl"],
        )

    @staticmethod
    def _event_params(event: SitRepEvent) -> tuple[Any, ...]:
        """Return the bind parameters for ``_INSERT_EVENT_SQL``."""
        return (
            event.id,
            event.timestamp,
            event.source,
            event.type.value,
            event.service,
            event.environment,
            event.severity,
            json.dumps(event.payload),
            # Empty topology lists are stored as NULL, not "[]".
            json.dumps(event.dependencies) if event.dependencies else None,
            json.dumps(event.dependents) if event.dependents else None,
            event.ttl,
        )

    def _insert_one(self, event: SitRepEvent) -> None:
        """Write one event and its FTS row; the caller owns the transaction."""
        params = self._event_params(event)
        payload_text = self._flatten_payload(event.payload)
        self._conn.execute(self._INSERT_EVENT_SQL, params)
        self._conn.execute(self._INSERT_FTS_SQL, (payload_text, event.id))

    @staticmethod
    def _decode_names(raw: str | None) -> list[Any]:
        """Decode a stored dependencies/dependents column (NULL -> [])."""
        return json.loads(raw) if raw else []

    @staticmethod
    def _like_pattern(service: str) -> str:
        """Build a LIKE pattern matching ``service`` as a JSON string token.

        The name is JSON-encoded the same way the columns were written, then
        ``\\``, ``%`` and ``_`` are escaped so they match literally when the
        statement uses ``ESCAPE '\\'``.
        """
        needle = json.dumps(service)
        for special in ("\\", "%", "_"):
            needle = needle.replace(special, "\\" + special)
        return f"%{needle}%"

    def _rows_referencing(self, service: str) -> list[sqlite3.Row]:
        """Return event rows listing ``service`` as a dependency or dependent.

        LIKE is only a prefilter (it is ASCII case-insensitive and matches
        substrings); each candidate is confirmed by decoding the stored JSON
        and testing exact membership.
        """
        pattern = self._like_pattern(service)
        candidates = self._conn.execute(
            "SELECT * FROM events"
            " WHERE dependencies LIKE ? ESCAPE '\\'"
            " OR dependents LIKE ? ESCAPE '\\'"
            " ORDER BY timestamp DESC",
            (pattern, pattern),
        ).fetchall()
        return [
            row
            for row in candidates
            if service in self._decode_names(row["dependencies"])
            or service in self._decode_names(row["dependents"])
        ]

    def _related_services(self, service: str, hops: int) -> set[str]:
        """Expand ``service`` through recorded topology for up to ``hops`` steps.

        Each step follows both directions: services named by the frontier's
        own events, and services whose events name a frontier member.
        The starting service is always part of the result.
        """
        visited: set[str] = {service}
        frontier: set[str] = {service}

        for _ in range(hops):
            discovered: set[str] = set()
            for svc in frontier:
                own_rows = self._conn.execute(
                    "SELECT dependencies, dependents FROM events WHERE service = ?",
                    (svc,),
                ).fetchall()
                for row in own_rows:
                    discovered.update(self._decode_names(row["dependencies"]))
                    discovered.update(self._decode_names(row["dependents"]))
                discovered.update(
                    row["service"] for row in self._rows_referencing(svc)
                )

            discovered -= visited
            if not discovered:
                break
            visited |= discovered
            frontier = discovered

        return visited

    # -- mutations ------------------------------------------------------

    def insert(self, event: SitRepEvent) -> None:
        """Insert a single event into the store.

        The event row and its FTS row are committed together; if either
        statement fails the transaction is rolled back and the error
        propagates (e.g. sqlite3.IntegrityError for a duplicate id).
        """
        with self._conn:
            self._insert_one(event)

    def insert_batch(self, events: list[SitRepEvent]) -> None:
        """Insert multiple events in a single transaction.

        All-or-nothing: a failure on any event rolls back the whole batch.
        An empty list is a no-op.
        """
        if not events:
            return
        with self._conn:
            for event in events:
                self._insert_one(event)

    # -- queries --------------------------------------------------------

    def get_by_time_window(
        self,
        start: str,
        end: str,
        *,
        service: str | None = None,
        event_type: EventType | None = None,
        min_severity: float | None = None,
    ) -> list[SitRepEvent]:
        """Query events within a time window with optional filters.

        Both bounds are inclusive and compared as strings against the stored
        ``timestamp`` column. Results are ordered newest first.
        """
        filters: list[tuple[str, Any]] = [
            ("timestamp >= ?", start),
            ("timestamp <= ?", end),
        ]
        if service is not None:
            filters.append(("service = ?", service))
        if event_type is not None:
            filters.append(("type = ?", event_type.value))
        if min_severity is not None:
            filters.append(("severity >= ?", min_severity))

        # Only fixed clause text is interpolated; values are always bound.
        where = " AND ".join(clause for clause, _ in filters)
        params = [value for _, value in filters]
        rows = self._conn.execute(
            f"SELECT * FROM events WHERE {where} ORDER BY timestamp DESC", params
        ).fetchall()
        return [self._row_to_event(row) for row in rows]

    def search(
        self,
        query: str,
        *,
        limit: int = 100,
        time_window: tuple[str, str] | None = None,
        service: str | None = None,
    ) -> list[SitRepEvent]:
        """Full-text search using FTS5 with BM25 ranking.

        ``query`` is handed to ``MATCH`` unmodified, so it is interpreted as
        FTS5 query syntax. Results are ordered by ``bm25(events_fts)``
        ascending and capped at ``limit``; ``time_window`` (inclusive) and
        ``service`` narrow the match further.
        """
        clauses = ["events_fts MATCH ?"]
        params: list[Any] = [query]

        if time_window is not None:
            clauses += ["e.timestamp >= ?", "e.timestamp <= ?"]
            params.extend(time_window)
        if service is not None:
            clauses.append("e.service = ?")
            params.append(service)
        params.append(limit)

        where = " AND ".join(clauses)
        sql = (
            "SELECT e.* FROM events e"
            " JOIN events_fts ON e.rowid = events_fts.rowid"
            f" WHERE {where}"
            " ORDER BY bm25(events_fts)"
            " LIMIT ?"
        )
        rows = self._conn.execute(sql, params).fetchall()
        return [self._row_to_event(row) for row in rows]

    def get_by_topology(
        self, service: str, hops: int = 1
    ) -> list[SitRepEvent]:
        """Query events related to a service via topology (deps/dependents).

        For hops > 1, iteratively expand the set of related services.

        Returns the events of every related service (newest first), followed
        by events of other services that name a related service in their
        dependencies or dependents. Each event appears once.
        """
        visited = self._related_services(service, hops)
        # Sorted so the bind order and the order of appended rows are stable.
        names = sorted(visited)

        placeholders = ",".join("?" for _ in names)
        rows = self._conn.execute(
            f"SELECT * FROM events WHERE service IN ({placeholders})"
            " ORDER BY timestamp DESC",
            names,
        ).fetchall()

        seen_ids: set[str] = {row["id"] for row in rows}
        for svc in names:
            for row in self._rows_referencing(svc):
                if row["service"] in visited or row["id"] in seen_ids:
                    continue
                seen_ids.add(row["id"])
                rows.append(row)

        return [self._row_to_event(row) for row in rows]

    def get_recent_changes(
        self, service: str, window_minutes: int = 30,
        reference_time: str | None = None,
    ) -> list[SitRepEvent]:
        """Get recent change events for a service.

        Args:
            service: Service name to query changes for.
            window_minutes: How far back to look.
            reference_time: ISO 8601 timestamp to use as "now".
                Defaults to actual current time if None.
                Essential for replay with historical timestamps.

        Returns:
            Matching ``type = 'change'`` events, newest first. When
            ``reference_time`` is given it is also the inclusive upper bound.
        """
        # NOTE(review): the lower bound is compared as a string against the
        # stored timestamp; presumably stored values use the same
        # 'YYYY-MM-DDTHH:MM:SS.sssZ' UTC layout — confirm with the writers.
        offset = f"-{window_minutes}"
        if reference_time is None:
            anchor_sql, upper_sql = "'now'", ""
            params: tuple[Any, ...] = (service, offset)
        else:
            anchor_sql, upper_sql = "?", " AND timestamp <= ?"
            params = (service, reference_time, offset, reference_time)

        sql = (
            "SELECT * FROM events"
            " WHERE type = 'change'"
            " AND service = ?"
            " AND timestamp >= strftime('%Y-%m-%dT%H:%M:%fZ', "
            f"{anchor_sql}, ? || ' minutes')"
            f"{upper_sql}"
            " ORDER BY timestamp DESC"
        )
        rows = self._conn.execute(sql, params).fetchall()
        return [self._row_to_event(row) for row in rows]

    def expire_old(self) -> int:
        """Delete events whose TTL has been exceeded. Return count deleted.

        An event is expired once more than ``ttl`` seconds have passed since
        its ``created_at``. FTS rows and event rows are removed in one
        transaction, which is rolled back if any statement fails.
        """
        with self._conn:
            expired = self._conn.execute(
                "SELECT rowid, id, service, source, type, payload FROM events"
                " WHERE (julianday('now') - julianday(created_at)) * 86400 > ttl"
            ).fetchall()
            if not expired:
                return 0

            # Delete from FTS5 first — must pass exact original values, so the
            # payload text is rebuilt from the stored JSON.
            self._conn.executemany(
                self._DELETE_FTS_SQL,
                [
                    (
                        row["rowid"],
                        row["id"],
                        row["service"],
                        row["source"],
                        row["type"],
                        self._flatten_payload(json.loads(row["payload"])),
                    )
                    for row in expired
                ],
            )
            # One bound variable per statement: no limit on how many events
            # can expire in a single call.
            self._conn.executemany(
                "DELETE FROM events WHERE rowid = ?",
                [(row["rowid"],) for row in expired],
            )

        return len(expired)

    def get_state_hash(self, time_window: tuple[str, str]) -> str:
        """SHA256 hash of sorted event IDs in the time window.

        IDs are ordered by the database, joined with commas, and hashed;
        an empty window hashes the empty string.
        """
        start, end = time_window[0], time_window[1]
        rows = self._conn.execute(
            "SELECT id FROM events WHERE timestamp >= ? AND timestamp <= ? ORDER BY id",
            (start, end),
        ).fetchall()
        content = ",".join(row["id"] for row in rows)
        return hashlib.sha256(content.encode()).hexdigest()

    def get_stats(self) -> dict[str, Any]:
        """Return basic store statistics.

        Keys: ``event_count``, ``min_timestamp``, ``max_timestamp`` (the
        timestamps are None when the store is empty).
        """
        row = self._conn.execute(
            "SELECT"
            " COUNT(*) as event_count,"
            " MIN(timestamp) as min_timestamp,"
            " MAX(timestamp) as max_timestamp"
            " FROM events"
        ).fetchone()
        return {
            key: row[key]
            for key in ("event_count", "min_timestamp", "max_timestamp")
        }
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"""NL summary generation for correlation snapshots.
|
|
2
|
+
|
|
3
|
+
Uses structured_call() from nthlayer-common with Instructor for validated
|
|
4
|
+
Pydantic output. LLM failure is non-blocking — returns None on timeout,
|
|
5
|
+
validation error, or LLM unavailability.
|
|
6
|
+
|
|
7
|
+
structured_call() is synchronous in v1.5; wrapped with asyncio.to_thread().
|
|
8
|
+
v2 LLM class refactor (V2-F deferred) makes this native-async.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import asyncio
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
import structlog
|
|
17
|
+
from pydantic import BaseModel, Field
|
|
18
|
+
|
|
19
|
+
from nthlayer_common.llm_structured import structured_call
|
|
20
|
+
from nthlayer_workers.correlate.types import SitRepEvent
|
|
21
|
+
|
|
22
|
+
logger = structlog.get_logger()
|
|
23
|
+
|
|
24
|
+
# System prompt sent with every summary request: asks for a 2-4 sentence,
# observation-only description and directs missing context into
# ``notable_omissions``.
SYSTEM_PROMPT = (
    "You are summarizing a correlation window for an on-call SRE operator. "
    "Describe what happened in 2-4 sentences. Be specific about services, "
    "metrics, and timeline. Do NOT speculate about root cause — describe "
    "observations only. If you lack context to be specific, say what's "
    "missing in notable_omissions."
)

# Timeout handed to structured_call(); generate_summary() wraps the call in an
# outer asyncio timeout one second longer than this.
SUMMARY_TIMEOUT = 5.0 # seconds
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class SnapshotSummary(BaseModel):
    """Operator-legible summary of a correlation window.

    2-4 sentences describing what happened. No root-cause speculation —
    summary describes observations, not conclusions.
    """

    # Free-text summary; the field is declared with max_length=500.
    summary: str = Field(max_length=500)
    # Context reported as missing; defaults to a fresh empty list.
    notable_omissions: list[str] = Field(default_factory=list)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
async def generate_summary(
    snapshot_data: dict,
    events: list[SitRepEvent],
    model: str | None = None,
) -> dict | None:
    """Generate NL summary for a correlation snapshot.

    Args:
        snapshot_data: Snapshot fields used to build the prompt and to label
            failures (``domain.service`` is read for the failure record).
        events: Events from the window; a sample of them goes into the prompt.
        model: Model identifier forwarded to ``structured_call``; None leaves
            the choice to that function.

    Returns:
        {"summary": "...", "notable_omissions": [...]} on success,
        or None on any failure (prompt construction, timeout, LLM
        unavailable, validation error). Failures are recorded via
        ``_record_failure`` and never raised to the caller.
    """
    try:
        # Built inside the try so malformed snapshot data degrades to
        # "no summary" like every other failure instead of raising.
        user_prompt = _build_user_prompt(snapshot_data, events)

        result = await asyncio.wait_for(
            asyncio.to_thread(
                structured_call,
                system=SYSTEM_PROMPT,
                user=user_prompt,
                response_model=SnapshotSummary,
                model=model,
                timeout=SUMMARY_TIMEOUT,
                max_retries=1,  # single attempt — retry not worth operator latency
            ),
            # Outer timeout is safety net — inner structured_call timeout
            # should fire first so Instructor can surface a clean LLMError.
            timeout=SUMMARY_TIMEOUT + 1.0,
        )
        return {
            "summary": result.summary,
            "notable_omissions": result.notable_omissions,
        }
    except Exception as e:
        # The failure path must not raise itself: tolerate a missing or
        # non-dict "domain" when picking the service label.
        domain = snapshot_data.get("domain") if isinstance(snapshot_data, dict) else None
        service = domain.get("service", "unknown") if isinstance(domain, dict) else "unknown"
        _record_failure(e, service, model)
        return None
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _build_user_prompt(snapshot_data: dict, events: list[SitRepEvent]) -> str:
    """Render the snapshot header and sampled events as the LLM user prompt.

    The result is newline-joined text: six header lines taken from
    ``snapshot_data`` (with placeholder defaults for absent keys), a blank
    line, a "Sample events:" heading, then one line per sampled event.
    """
    domain = snapshot_data.get("domain", {})
    window = snapshot_data.get("window", {})
    samples = _select_sample_events(events)

    header = [
        f"Service: {domain.get('service', 'unknown')} ({domain.get('environment', 'unknown')})",
        f"Window: {window.get('duration_seconds', 0):.0f}s, closed by {window.get('close_reason', 'unknown')}",
        f"Events: {snapshot_data.get('event_count', 0)} total",
        f"Event types: {snapshot_data.get('event_types', {})}",
        f"Peak severity: {snapshot_data.get('peak_severity', 0.0)}",
        f"Affected services: {', '.join(snapshot_data.get('affected_services', []))}",
        "",
        "Sample events:",
    ]
    sample_lines = [
        f" [{s['timestamp']}] {s['service']} {s['type']} severity={s['severity']} source={s['source']}"
        for s in samples
    ]
    return "\n".join(header + sample_lines)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _select_sample_events(events: list[SitRepEvent], max_samples: int = 10) -> list[dict]:
|
|
107
|
+
"""Select representative events for the LLM prompt.
|
|
108
|
+
|
|
109
|
+
Includes: first event, most severe event, most recent per service.
|
|
110
|
+
Deduplicates by ID. Caps at max_samples.
|
|
111
|
+
"""
|
|
112
|
+
if not events:
|
|
113
|
+
return []
|
|
114
|
+
|
|
115
|
+
samples: dict[str, SitRepEvent] = {}
|
|
116
|
+
|
|
117
|
+
# First event (chronologically)
|
|
118
|
+
sorted_events = sorted(events, key=lambda e: e.timestamp)
|
|
119
|
+
samples[sorted_events[0].id] = sorted_events[0]
|
|
120
|
+
|
|
121
|
+
# Most severe event
|
|
122
|
+
most_severe = max(events, key=lambda e: e.severity)
|
|
123
|
+
samples[most_severe.id] = most_severe
|
|
124
|
+
|
|
125
|
+
# Most recent per service
|
|
126
|
+
by_service: dict[str, SitRepEvent] = {}
|
|
127
|
+
for e in sorted_events:
|
|
128
|
+
by_service[e.service] = e # last wins = most recent
|
|
129
|
+
for e in by_service.values():
|
|
130
|
+
samples[e.id] = e
|
|
131
|
+
|
|
132
|
+
result = list(samples.values())[:max_samples]
|
|
133
|
+
return [
|
|
134
|
+
{
|
|
135
|
+
"service": e.service,
|
|
136
|
+
"type": e.type.value,
|
|
137
|
+
"severity": e.severity,
|
|
138
|
+
"timestamp": e.timestamp,
|
|
139
|
+
"source": e.source,
|
|
140
|
+
}
|
|
141
|
+
for e in result
|
|
142
|
+
]
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _record_failure(error: Exception, snapshot_service: str, model: str | None = None) -> None:
    """Report a failed summary attempt to the log, metrics and telemetry.

    Always logs a ``correlate_summary_failed`` warning. The metrics counter
    and the LLM telemetry event are best-effort: any exception raised while
    importing or calling them is swallowed.
    """
    failure_kind = _classify_failure(error)
    model_label = model if model else "default"

    logger.warning(
        "correlate_summary_failed",
        reason=failure_kind,
        service=snapshot_service,
        error=str(error),
    )

    # Counter increment — skipped silently when metrics are not set up.
    try:
        from nthlayer_common.metrics import errors_total

        counter = errors_total.labels(
            component="correlate", error_type=f"summary_{failure_kind}"
        )
        counter.inc()
    except Exception:
        pass  # metrics infra may not be configured

    # Telemetry event — skipped silently when OTel is not set up.
    try:
        from nthlayer_common.telemetry import emit_llm_event

        emit_llm_event(
            model=model_label,
            provider="unknown",
            caller="correlate.generate_summary",
            success=False,
            error=str(error),
        )
    except Exception:
        pass  # OTel may not be configured
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _classify_failure(error: Exception) -> str:
|
|
174
|
+
"""Classify a summary generation failure for metrics."""
|
|
175
|
+
if isinstance(error, asyncio.TimeoutError):
|
|
176
|
+
return "timeout"
|
|
177
|
+
error_str = type(error).__name__.lower()
|
|
178
|
+
if "validation" in error_str or "pydantic" in error_str:
|
|
179
|
+
return "validation_error"
|
|
180
|
+
return "llm_unavailable"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Trace backend adapters for nthlayer-correlate."""
|