nthlayer-workers 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. nthlayer_workers/__init__.py +5 -0
  2. nthlayer_workers/cli.py +234 -0
  3. nthlayer_workers/correlate/__init__.py +1 -0
  4. nthlayer_workers/correlate/cli.py +847 -0
  5. nthlayer_workers/correlate/config.py +111 -0
  6. nthlayer_workers/correlate/correlation/__init__.py +1 -0
  7. nthlayer_workers/correlate/correlation/changes.py +87 -0
  8. nthlayer_workers/correlate/correlation/dedup.py +62 -0
  9. nthlayer_workers/correlate/correlation/engine.py +244 -0
  10. nthlayer_workers/correlate/correlation/temporal.py +79 -0
  11. nthlayer_workers/correlate/correlation/topology.py +104 -0
  12. nthlayer_workers/correlate/ingestion/__init__.py +1 -0
  13. nthlayer_workers/correlate/ingestion/protocol.py +10 -0
  14. nthlayer_workers/correlate/ingestion/severity.py +18 -0
  15. nthlayer_workers/correlate/ingestion/webhook.py +197 -0
  16. nthlayer_workers/correlate/notifications.py +85 -0
  17. nthlayer_workers/correlate/prometheus.py +234 -0
  18. nthlayer_workers/correlate/reasoning.py +375 -0
  19. nthlayer_workers/correlate/session.py +189 -0
  20. nthlayer_workers/correlate/snapshot/__init__.py +1 -0
  21. nthlayer_workers/correlate/snapshot/generator.py +170 -0
  22. nthlayer_workers/correlate/snapshot/model.py +177 -0
  23. nthlayer_workers/correlate/snapshot/token.py +14 -0
  24. nthlayer_workers/correlate/state.py +88 -0
  25. nthlayer_workers/correlate/store/__init__.py +5 -0
  26. nthlayer_workers/correlate/store/protocol.py +48 -0
  27. nthlayer_workers/correlate/store/sqlite.py +443 -0
  28. nthlayer_workers/correlate/summary.py +180 -0
  29. nthlayer_workers/correlate/traces/__init__.py +1 -0
  30. nthlayer_workers/correlate/traces/protocol.py +120 -0
  31. nthlayer_workers/correlate/traces/tempo.py +667 -0
  32. nthlayer_workers/correlate/traces/topology.py +39 -0
  33. nthlayer_workers/correlate/types.py +77 -0
  34. nthlayer_workers/correlate/worker.py +630 -0
  35. nthlayer_workers/learn/__init__.py +5 -0
  36. nthlayer_workers/learn/__main__.py +5 -0
  37. nthlayer_workers/learn/cli.py +164 -0
  38. nthlayer_workers/learn/retrospective.py +381 -0
  39. nthlayer_workers/learn/trends.py +102 -0
  40. nthlayer_workers/learn/worker.py +366 -0
  41. nthlayer_workers/measure/__init__.py +3 -0
  42. nthlayer_workers/measure/__main__.py +5 -0
  43. nthlayer_workers/measure/_parsing.py +15 -0
  44. nthlayer_workers/measure/adapters/__init__.py +0 -0
  45. nthlayer_workers/measure/adapters/_util.py +24 -0
  46. nthlayer_workers/measure/adapters/devin.py +119 -0
  47. nthlayer_workers/measure/adapters/gastown.py +88 -0
  48. nthlayer_workers/measure/adapters/prometheus.py +277 -0
  49. nthlayer_workers/measure/adapters/protocol.py +20 -0
  50. nthlayer_workers/measure/adapters/webhook.py +161 -0
  51. nthlayer_workers/measure/api/__init__.py +0 -0
  52. nthlayer_workers/measure/api/normalise.py +50 -0
  53. nthlayer_workers/measure/api/queue.py +243 -0
  54. nthlayer_workers/measure/api/response.py +51 -0
  55. nthlayer_workers/measure/api/server.py +504 -0
  56. nthlayer_workers/measure/calibration/__init__.py +0 -0
  57. nthlayer_workers/measure/calibration/loop.py +62 -0
  58. nthlayer_workers/measure/calibration/slos.py +212 -0
  59. nthlayer_workers/measure/calibration/verdict_calibration.py +31 -0
  60. nthlayer_workers/measure/cli.py +753 -0
  61. nthlayer_workers/measure/config.py +191 -0
  62. nthlayer_workers/measure/detection/__init__.py +6 -0
  63. nthlayer_workers/measure/detection/detector.py +82 -0
  64. nthlayer_workers/measure/detection/protocol.py +29 -0
  65. nthlayer_workers/measure/governance/__init__.py +0 -0
  66. nthlayer_workers/measure/governance/engine.py +163 -0
  67. nthlayer_workers/measure/manifest.py +77 -0
  68. nthlayer_workers/measure/notifications.py +53 -0
  69. nthlayer_workers/measure/pipeline/__init__.py +0 -0
  70. nthlayer_workers/measure/pipeline/evaluator.py +155 -0
  71. nthlayer_workers/measure/pipeline/router.py +160 -0
  72. nthlayer_workers/measure/store/__init__.py +0 -0
  73. nthlayer_workers/measure/store/protocol.py +38 -0
  74. nthlayer_workers/measure/store/sqlite.py +276 -0
  75. nthlayer_workers/measure/telemetry.py +116 -0
  76. nthlayer_workers/measure/tiering/__init__.py +0 -0
  77. nthlayer_workers/measure/tiering/classifier.py +58 -0
  78. nthlayer_workers/measure/tiering/promotion.py +118 -0
  79. nthlayer_workers/measure/trends/__init__.py +0 -0
  80. nthlayer_workers/measure/trends/tracker.py +72 -0
  81. nthlayer_workers/measure/types.py +75 -0
  82. nthlayer_workers/measure/worker.py +439 -0
  83. nthlayer_workers/observe/__init__.py +25 -0
  84. nthlayer_workers/observe/__main__.py +5 -0
  85. nthlayer_workers/observe/api/__init__.py +1 -0
  86. nthlayer_workers/observe/assessment.py +95 -0
  87. nthlayer_workers/observe/cli.py +737 -0
  88. nthlayer_workers/observe/config.py +11 -0
  89. nthlayer_workers/observe/db/__init__.py +1 -0
  90. nthlayer_workers/observe/decision_records.py +220 -0
  91. nthlayer_workers/observe/dependencies/__init__.py +18 -0
  92. nthlayer_workers/observe/dependencies/discovery.py +294 -0
  93. nthlayer_workers/observe/dependencies/providers/__init__.py +48 -0
  94. nthlayer_workers/observe/dependencies/providers/backstage.py +467 -0
  95. nthlayer_workers/observe/dependencies/providers/base.py +76 -0
  96. nthlayer_workers/observe/dependencies/providers/consul.py +518 -0
  97. nthlayer_workers/observe/dependencies/providers/etcd.py +360 -0
  98. nthlayer_workers/observe/dependencies/providers/kubernetes.py +682 -0
  99. nthlayer_workers/observe/dependencies/providers/prometheus.py +368 -0
  100. nthlayer_workers/observe/dependencies/providers/zookeeper.py +399 -0
  101. nthlayer_workers/observe/deployments/__init__.py +1 -0
  102. nthlayer_workers/observe/discovery/__init__.py +14 -0
  103. nthlayer_workers/observe/discovery/classifier.py +66 -0
  104. nthlayer_workers/observe/discovery/client.py +189 -0
  105. nthlayer_workers/observe/discovery/models.py +53 -0
  106. nthlayer_workers/observe/drift/__init__.py +26 -0
  107. nthlayer_workers/observe/drift/analyzer.py +383 -0
  108. nthlayer_workers/observe/drift/models.py +174 -0
  109. nthlayer_workers/observe/drift/patterns.py +88 -0
  110. nthlayer_workers/observe/explanation.py +118 -0
  111. nthlayer_workers/observe/gate/__init__.py +39 -0
  112. nthlayer_workers/observe/gate/conditions.py +92 -0
  113. nthlayer_workers/observe/gate/correlator.py +154 -0
  114. nthlayer_workers/observe/gate/evaluator.py +192 -0
  115. nthlayer_workers/observe/gate/policies.py +226 -0
  116. nthlayer_workers/observe/gate_adapter.py +40 -0
  117. nthlayer_workers/observe/incident.py +36 -0
  118. nthlayer_workers/observe/portfolio/__init__.py +17 -0
  119. nthlayer_workers/observe/portfolio/aggregator.py +168 -0
  120. nthlayer_workers/observe/portfolio/scorer.py +13 -0
  121. nthlayer_workers/observe/slo/__init__.py +19 -0
  122. nthlayer_workers/observe/slo/collector.py +235 -0
  123. nthlayer_workers/observe/slo/spec_loader.py +40 -0
  124. nthlayer_workers/observe/sqlite_store.py +152 -0
  125. nthlayer_workers/observe/store.py +92 -0
  126. nthlayer_workers/observe/verification/__init__.py +22 -0
  127. nthlayer_workers/observe/verification/exporter_guidance.py +146 -0
  128. nthlayer_workers/observe/verification/extractor.py +127 -0
  129. nthlayer_workers/observe/verification/models.py +101 -0
  130. nthlayer_workers/observe/verification/verifier.py +111 -0
  131. nthlayer_workers/observe/worker.py +332 -0
  132. nthlayer_workers/respond/__init__.py +2 -0
  133. nthlayer_workers/respond/__main__.py +4 -0
  134. nthlayer_workers/respond/agents/__init__.py +0 -0
  135. nthlayer_workers/respond/agents/base.py +556 -0
  136. nthlayer_workers/respond/agents/communication.py +115 -0
  137. nthlayer_workers/respond/agents/investigation.py +124 -0
  138. nthlayer_workers/respond/agents/remediation.py +219 -0
  139. nthlayer_workers/respond/agents/triage.py +132 -0
  140. nthlayer_workers/respond/cli.py +772 -0
  141. nthlayer_workers/respond/config.py +135 -0
  142. nthlayer_workers/respond/context_store.py +256 -0
  143. nthlayer_workers/respond/coordinator.py +487 -0
  144. nthlayer_workers/respond/metrics.py +104 -0
  145. nthlayer_workers/respond/notification_backends/__init__.py +1 -0
  146. nthlayer_workers/respond/notification_backends/ntfy_backend.py +158 -0
  147. nthlayer_workers/respond/notification_backends/protocol.py +59 -0
  148. nthlayer_workers/respond/notification_backends/slack_backend.py +203 -0
  149. nthlayer_workers/respond/notification_backends/stdout_backend.py +56 -0
  150. nthlayer_workers/respond/notifications.py +247 -0
  151. nthlayer_workers/respond/oncall/__init__.py +1 -0
  152. nthlayer_workers/respond/oncall/escalation.py +103 -0
  153. nthlayer_workers/respond/oncall/runner.py +193 -0
  154. nthlayer_workers/respond/oncall/schedule.py +243 -0
  155. nthlayer_workers/respond/safe_actions/__init__.py +0 -0
  156. nthlayer_workers/respond/safe_actions/actions.py +139 -0
  157. nthlayer_workers/respond/safe_actions/registry.py +171 -0
  158. nthlayer_workers/respond/safe_actions/webhook.py +194 -0
  159. nthlayer_workers/respond/server.py +357 -0
  160. nthlayer_workers/respond/sre/__init__.py +1 -0
  161. nthlayer_workers/respond/sre/brief.py +175 -0
  162. nthlayer_workers/respond/sre/delegation.py +101 -0
  163. nthlayer_workers/respond/sre/post_incident.py +146 -0
  164. nthlayer_workers/respond/sre/shift_report.py +129 -0
  165. nthlayer_workers/respond/sre/suppression.py +91 -0
  166. nthlayer_workers/respond/types.py +109 -0
  167. nthlayer_workers/respond/verdict_submission.py +56 -0
  168. nthlayer_workers/respond/worker.py +533 -0
  169. nthlayer_workers/respond/worker_helpers.py +140 -0
  170. nthlayer_workers/runner.py +198 -0
  171. nthlayer_workers-1.0.0.dist-info/METADATA +19 -0
  172. nthlayer_workers-1.0.0.dist-info/RECORD +175 -0
  173. nthlayer_workers-1.0.0.dist-info/WHEEL +5 -0
  174. nthlayer_workers-1.0.0.dist-info/entry_points.txt +2 -0
  175. nthlayer_workers-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,443 @@
1
+ """SQLite FTS5 implementation of EventStore."""
2
+ from __future__ import annotations
3
+
4
+ import hashlib
5
+ import json
6
+ import sqlite3
7
+ from typing import Any
8
+
9
+ from nthlayer_workers.correlate.types import EventType, SitRepEvent
10
+
11
# DDL for the primary ``events`` table and its secondary indexes.  Every
# statement uses IF NOT EXISTS, so the script can be re-run against an
# existing database.  ``dependencies`` / ``dependents`` are nullable TEXT
# columns (the store writes JSON arrays into them); ``ttl`` is compared, in
# seconds, against the age derived from ``created_at`` when expiring rows.
_SCHEMA_SQL = """\
CREATE TABLE IF NOT EXISTS events (
id TEXT PRIMARY KEY,
timestamp TEXT NOT NULL,
source TEXT NOT NULL,
type TEXT NOT NULL,
service TEXT NOT NULL,
environment TEXT NOT NULL,
severity REAL NOT NULL DEFAULT 0.5,
payload TEXT NOT NULL,
dependencies TEXT,
dependents TEXT,
ttl INTEGER NOT NULL DEFAULT 86400,
created_at TEXT NOT NULL DEFAULT (datetime('now'))
);

CREATE INDEX IF NOT EXISTS idx_events_timestamp
ON events(timestamp DESC);
CREATE INDEX IF NOT EXISTS idx_events_service_time
ON events(service, timestamp DESC);
CREATE INDEX IF NOT EXISTS idx_events_type_time
ON events(type, timestamp DESC);
CREATE INDEX IF NOT EXISTS idx_events_changes
ON events(type, service, timestamp DESC) WHERE type = 'change';
CREATE INDEX IF NOT EXISTS idx_events_expiry
ON events(created_at, ttl);
"""

# DDL for the FTS5 index.  It is declared as an external-content table over
# ``events`` keyed by rowid, with the porter tokenizer.  ``payload_text`` is
# not a column of ``events``: the store supplies it explicitly on insert and
# again when issuing the FTS5 'delete' command.
_FTS_SCHEMA_SQL = """\
CREATE VIRTUAL TABLE IF NOT EXISTS events_fts USING fts5(
id, service, source, type, payload_text,
content=events, content_rowid=rowid,
tokenize='porter'
);
"""
46
+
47
+
48
class SQLiteEventStore:
    """SQLite FTS5 event store with WAL mode and BM25 ranking.

    Events are stored in the ``events`` table; a companion external-content
    FTS5 table (``events_fts``) indexes a flattened text rendering of each
    payload.  The two tables have to stay in step, so every mutation that
    touches both runs inside a single transaction.
    """

    _INSERT_EVENT_SQL = (
        "INSERT INTO events "
        "(id, timestamp, source, type, service, environment, "
        "severity, payload, dependencies, dependents, ttl) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
    )
    _INSERT_FTS_SQL = (
        "INSERT INTO events_fts "
        "(rowid, id, service, source, type, payload_text) "
        "SELECT rowid, id, service, source, type, ? "
        "FROM events WHERE id = ?"
    )
    # FTS5 'delete' command: removes an index entry for an external-content
    # table.  The values passed must equal the ones originally indexed.
    _DELETE_FTS_SQL = (
        "INSERT INTO events_fts"
        "(events_fts, rowid, id, service, source, type, payload_text) "
        "VALUES ('delete', ?, ?, ?, ?, ?, ?)"
    )
    # Coarse pre-filter for rows whose JSON dependency lists mention a service.
    _MENTIONS_SQL = (
        "SELECT * FROM events "
        "WHERE (dependencies LIKE ? ESCAPE '\\' OR dependents LIKE ? ESCAPE '\\')"
    )

    def __init__(self, db_path: str) -> None:
        """Open (or create) the database at *db_path* and ensure the schema.

        Raises:
            sqlite3.Error: if the connection or schema setup fails.  The
                connection is closed before the error propagates.
        """
        self._db_path = db_path
        self._conn = sqlite3.connect(db_path)
        try:
            self._conn.row_factory = sqlite3.Row
            self._conn.execute("PRAGMA journal_mode=WAL")
            self._conn.execute("PRAGMA busy_timeout=5000")
            self._conn.executescript(_SCHEMA_SQL)
            self._conn.executescript(_FTS_SCHEMA_SQL)
            self._conn.commit()
        except Exception:
            # Do not leak the handle when setup fails (e.g. FTS5 unavailable).
            self._conn.close()
            raise

    # -- context manager ------------------------------------------------

    def __enter__(self) -> SQLiteEventStore:
        """Return the store itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Close the connection on leaving the ``with`` block."""
        self.close()

    def close(self) -> None:
        """Close the database connection."""
        self._conn.close()

    # -- helpers --------------------------------------------------------

    @staticmethod
    def _flatten_payload(payload: dict[str, Any]) -> str:
        """Recursively flatten a dict to searchable text.

        Dict keys and leaf values are emitted in traversal order, separated
        by single spaces; lists contribute only their items.

        Example: {"alert_name": "latency_breach", "value": 0.5}
                 -> "alert_name latency_breach value 0.5"
        """
        parts: list[str] = []

        def _walk(obj: Any) -> None:
            if isinstance(obj, dict):
                for key, value in obj.items():
                    parts.append(str(key))
                    _walk(value)
            elif isinstance(obj, list):
                for item in obj:
                    _walk(item)
            else:
                parts.append(str(obj))

        _walk(payload)
        return " ".join(parts)

    @staticmethod
    def _row_to_event(row: sqlite3.Row) -> SitRepEvent:
        """Deserialize a database row into a SitRepEvent.

        NULL/empty ``dependencies`` and ``dependents`` become empty lists.
        """
        deps_raw = row["dependencies"]
        depts_raw = row["dependents"]
        return SitRepEvent(
            id=row["id"],
            timestamp=row["timestamp"],
            source=row["source"],
            type=EventType(row["type"]),
            service=row["service"],
            environment=row["environment"],
            severity=row["severity"],
            payload=json.loads(row["payload"]),
            dependencies=json.loads(deps_raw) if deps_raw else [],
            dependents=json.loads(depts_raw) if depts_raw else [],
            ttl=row["ttl"],
        )

    @staticmethod
    def _like_pattern(service: str) -> str:
        """Build a LIKE pattern that matches *service* as a JSON string token.

        The name is rendered with ``json.dumps`` (the same encoding used when
        the lists are written) and LIKE metacharacters are escaped with a
        backslash, so ``_`` and ``%`` in service names are matched literally.
        """
        token = json.dumps(service)
        for special in ("\\", "%", "_"):  # backslash first: it is the escape
            token = token.replace(special, "\\" + special)
        return f"%{token}%"

    @staticmethod
    def _linked_services(row: sqlite3.Row) -> set[str]:
        """Return every service named in the row's dependency lists."""
        linked: set[str] = set()
        for column in ("dependencies", "dependents"):
            raw = row[column]
            if raw:
                linked.update(json.loads(raw))
        return linked

    def _rows_mentioning(
        self, service: str, excluded_services: list[str] | None = None
    ) -> list[sqlite3.Row]:
        """Return rows that list *service* in dependencies or dependents.

        LIKE is only a pre-filter (it is case-insensitive for ASCII and works
        on the serialized text); each candidate is confirmed by parsing the
        JSON and testing exact membership.

        Args:
            service: Service name to look for.
            excluded_services: Rows whose own ``service`` is in this list are
                skipped.
        """
        pattern = self._like_pattern(service)
        sql = self._MENTIONS_SQL
        params: list[Any] = [pattern, pattern]
        if excluded_services:
            placeholders = ",".join("?" for _ in excluded_services)
            sql += f" AND service NOT IN ({placeholders})"
            params.extend(excluded_services)
        candidates = self._conn.execute(sql, params).fetchall()
        return [row for row in candidates if service in self._linked_services(row)]

    def _insert_event(self, event: SitRepEvent) -> None:
        """Write one event to ``events`` and ``events_fts`` (no commit).

        Callers are responsible for wrapping this in a transaction.
        """
        # Empty dependency lists are stored as NULL rather than "[]".
        deps_json = json.dumps(event.dependencies) if event.dependencies else None
        depts_json = json.dumps(event.dependents) if event.dependents else None

        self._conn.execute(
            self._INSERT_EVENT_SQL,
            (
                event.id,
                event.timestamp,
                event.source,
                event.type.value,
                event.service,
                event.environment,
                event.severity,
                json.dumps(event.payload),
                deps_json,
                depts_json,
                event.ttl,
            ),
        )
        # Index the row just written; rowid is looked up by event id.
        self._conn.execute(
            self._INSERT_FTS_SQL,
            (self._flatten_payload(event.payload), event.id),
        )

    # -- mutations ------------------------------------------------------

    def insert(self, event: SitRepEvent) -> None:
        """Insert a single event into the store.

        Both the row and its FTS entry are written in one transaction, which
        is rolled back if either statement fails.

        Raises:
            sqlite3.IntegrityError: if an event with the same id exists.
        """
        with self._conn:
            self._insert_event(event)

    def insert_batch(self, events: list[SitRepEvent]) -> None:
        """Insert multiple events in a single transaction.

        Either all events are stored or, on error, none are.  An empty list
        is a no-op.
        """
        if not events:
            return
        with self._conn:
            for event in events:
                self._insert_event(event)

    # -- queries --------------------------------------------------------

    def get_by_time_window(
        self,
        start: str,
        end: str,
        *,
        service: str | None = None,
        event_type: EventType | None = None,
        min_severity: float | None = None,
    ) -> list[SitRepEvent]:
        """Query events within a time window with optional filters.

        Args:
            start: Inclusive lower bound, compared as text with ``timestamp``.
            end: Inclusive upper bound, compared as text with ``timestamp``.
            service: Restrict to this service when given.
            event_type: Restrict to this event type when given.
            min_severity: Restrict to events at or above this severity.

        Returns:
            Matching events, newest timestamp first.
        """
        clauses = ["timestamp >= ?", "timestamp <= ?"]
        params: list[Any] = [start, end]

        if service is not None:
            clauses.append("service = ?")
            params.append(service)
        if event_type is not None:
            clauses.append("type = ?")
            params.append(event_type.value)
        if min_severity is not None:
            clauses.append("severity >= ?")
            params.append(min_severity)

        # Only fixed clause fragments are interpolated; values stay bound.
        where = " AND ".join(clauses)
        sql = f"SELECT * FROM events WHERE {where} ORDER BY timestamp DESC"
        rows = self._conn.execute(sql, params).fetchall()
        return [self._row_to_event(row) for row in rows]

    def search(
        self,
        query: str,
        *,
        limit: int = 100,
        time_window: tuple[str, str] | None = None,
        service: str | None = None,
    ) -> list[SitRepEvent]:
        """Full-text search using FTS5 with BM25 ranking.

        Args:
            query: FTS5 MATCH expression, passed to SQLite as-is.
            limit: Maximum number of events returned.
            time_window: Optional inclusive ``(start, end)`` timestamp bounds.
            service: Restrict to this service when given.

        Returns:
            Matching events ordered by ``bm25(events_fts)``.
        """
        clauses = ["events_fts MATCH ?"]
        params: list[Any] = [query]

        if time_window is not None:
            clauses.append("e.timestamp >= ?")
            clauses.append("e.timestamp <= ?")
            params.extend(time_window)
        if service is not None:
            clauses.append("e.service = ?")
            params.append(service)

        params.append(limit)

        where = " AND ".join(clauses)
        sql = f"""\
            SELECT e.* FROM events e
            JOIN events_fts ON e.rowid = events_fts.rowid
            WHERE {where}
            ORDER BY bm25(events_fts)
            LIMIT ?
        """
        rows = self._conn.execute(sql, params).fetchall()
        return [self._row_to_event(row) for row in rows]

    def get_by_topology(
        self, service: str, hops: int = 1
    ) -> list[SitRepEvent]:
        """Query events related to a service via topology (deps/dependents).

        Starting from *service*, the set of related services is expanded
        *hops* times: each round follows the dependency lists recorded on a
        service's own events, plus any service whose events list it.

        Service names are matched exactly; ``_``/``%`` in a name and case
        differences do not produce false matches.

        Returns:
            Events of every reached service (newest first), followed by
            events of other services that mention a reached service.
        """
        visited: set[str] = {service}
        frontier: set[str] = {service}

        for _ in range(hops):
            next_frontier: set[str] = set()
            for svc in frontier:
                # Outgoing edges: lists recorded on this service's events.
                own_rows = self._conn.execute(
                    "SELECT dependencies, dependents FROM events WHERE service = ?",
                    (svc,),
                ).fetchall()
                for row in own_rows:
                    next_frontier.update(self._linked_services(row) - visited)

                # Incoming edges: services whose events list this one.
                for row in self._rows_mentioning(svc):
                    if row["service"] not in visited:
                        next_frontier.add(row["service"])

            visited.update(next_frontier)
            frontier = next_frontier
            if not frontier:
                break

        # Fetch all events owned by the reached services.
        reached = list(visited)
        placeholders = ",".join("?" for _ in reached)
        rows = self._conn.execute(
            f"""\
            SELECT DISTINCT e.* FROM events e
            WHERE e.service IN ({placeholders})
            ORDER BY e.timestamp DESC
            """,
            reached,
        ).fetchall()

        seen_ids: set[str] = {row["id"] for row in rows}
        result = [self._row_to_event(row) for row in rows]

        # Also include events of other services that mention a reached one.
        for svc in visited:
            for row in self._rows_mentioning(svc, reached):
                if row["id"] not in seen_ids:
                    seen_ids.add(row["id"])
                    result.append(self._row_to_event(row))

        return result

    def get_recent_changes(
        self, service: str, window_minutes: int = 30,
        reference_time: str | None = None,
    ) -> list[SitRepEvent]:
        """Get recent change events for a service.

        Args:
            service: Service name to query changes for.
            window_minutes: How far back to look.
            reference_time: ISO 8601 timestamp to use as "now".
                Defaults to actual current time if None.
                Essential for replay with historical timestamps.

        Returns:
            Change events inside the window, newest first.
        """
        # NOTE(review): the lower bound is rendered by SQLite as
        # '%Y-%m-%dT%H:%M:%fZ' and compared as text; this presumably matches
        # the stored timestamp format -- confirm against the writers.
        offset = f"-{window_minutes}"
        if reference_time is not None:
            sql = """\
                SELECT * FROM events
                WHERE type = 'change'
                AND service = ?
                AND timestamp >= strftime('%Y-%m-%dT%H:%M:%fZ', ?, ? || ' minutes')
                AND timestamp <= ?
                ORDER BY timestamp DESC
            """
            rows = self._conn.execute(
                sql, (service, reference_time, offset, reference_time)
            ).fetchall()
        else:
            sql = """\
                SELECT * FROM events
                WHERE type = 'change'
                AND service = ?
                AND timestamp >= strftime('%Y-%m-%dT%H:%M:%fZ', 'now', ? || ' minutes')
                ORDER BY timestamp DESC
            """
            rows = self._conn.execute(sql, (service, offset)).fetchall()
        return [self._row_to_event(row) for row in rows]

    def expire_old(self) -> int:
        """Delete events whose TTL has been exceeded. Return count deleted.

        An event is expired when the seconds elapsed since ``created_at``
        exceed its ``ttl``.  Index entries and rows are removed in one
        transaction, so a failure leaves both tables untouched.
        """
        # Fetch everything the FTS5 'delete' command needs in one query.
        expired = self._conn.execute(
            """\
            SELECT rowid, id, service, source, type, payload FROM events
            WHERE (julianday('now') - julianday(created_at)) * 86400 > ttl
            """
        ).fetchall()

        if not expired:
            return 0

        with self._conn:
            # Delete from FTS5 first -- must pass exact original values.
            self._conn.executemany(
                self._DELETE_FTS_SQL,
                [
                    (
                        row["rowid"],
                        row["id"],
                        row["service"],
                        row["source"],
                        row["type"],
                        self._flatten_payload(json.loads(row["payload"])),
                    )
                    for row in expired
                ],
            )
            # One bound parameter per statement: no limit on batch size.
            self._conn.executemany(
                "DELETE FROM events WHERE rowid = ?",
                [(row["rowid"],) for row in expired],
            )

        return len(expired)

    def get_state_hash(self, time_window: tuple[str, str]) -> str:
        """SHA256 hash of sorted event IDs in the time window.

        The IDs are sorted by the database, joined with commas and hashed;
        an empty window yields the hash of the empty string.
        """
        rows = self._conn.execute(
            "SELECT id FROM events WHERE timestamp >= ? AND timestamp <= ? ORDER BY id",
            (time_window[0], time_window[1]),
        ).fetchall()
        content = ",".join(row["id"] for row in rows)
        return hashlib.sha256(content.encode()).hexdigest()

    def get_stats(self) -> dict[str, Any]:
        """Return basic store statistics.

        Returns:
            ``event_count`` plus ``min_timestamp`` / ``max_timestamp`` (both
            None when the table is empty).
        """
        row = self._conn.execute(
            """\
            SELECT
                COUNT(*) as event_count,
                MIN(timestamp) as min_timestamp,
                MAX(timestamp) as max_timestamp
            FROM events
            """
        ).fetchone()

        return {
            "event_count": row["event_count"],
            "min_timestamp": row["min_timestamp"],
            "max_timestamp": row["max_timestamp"],
        }
@@ -0,0 +1,180 @@
1
+ """NL summary generation for correlation snapshots.
2
+
3
+ Uses structured_call() from nthlayer-common with Instructor for validated
4
+ Pydantic output. LLM failure is non-blocking — returns None on timeout,
5
+ validation error, or LLM unavailability.
6
+
7
+ structured_call() is synchronous in v1.5; wrapped with asyncio.to_thread().
8
+ v2 LLM class refactor (V2-F deferred) makes this native-async.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import asyncio
14
+ from typing import Any
15
+
16
+ import structlog
17
+ from pydantic import BaseModel, Field
18
+
19
+ from nthlayer_common.llm_structured import structured_call
20
+ from nthlayer_workers.correlate.types import SitRepEvent
21
+
22
+ logger = structlog.get_logger()
23
+
24
# System prompt sent with every summary request.  It asks the model for a
# 2-4 sentence, observation-only description of the correlation window and
# tells it to report missing context via ``notable_omissions``.
SYSTEM_PROMPT = (
    "You are summarizing a correlation window for an on-call SRE operator. "
    "Describe what happened in 2-4 sentences. Be specific about services, "
    "metrics, and timeline. Do NOT speculate about root cause — describe "
    "observations only. If you lack context to be specific, say what's "
    "missing in notable_omissions."
)

# Timeout handed to structured_call(); generate_summary() wraps the call in
# an outer asyncio timeout one second longer than this.
SUMMARY_TIMEOUT = 5.0  # seconds
33
+
34
+
35
class SnapshotSummary(BaseModel):
    """Operator-legible summary of a correlation window.

    2-4 sentences describing what happened. No root-cause speculation —
    summary describes observations, not conclusions.

    Used as the ``response_model`` for the structured LLM call, so a
    response that violates these constraints fails validation.
    """

    # Free-text summary; validation rejects anything over 500 characters.
    summary: str = Field(max_length=500)
    # Context the model reported as missing; defaults to an empty list.
    notable_omissions: list[str] = Field(default_factory=list)
44
+
45
+
46
async def generate_summary(
    snapshot_data: dict,
    events: list[SitRepEvent],
    model: str | None = None,
) -> dict | None:
    """Produce a natural-language summary for a correlation snapshot.

    Args:
        snapshot_data: Snapshot dict used to build the prompt; its
            ``domain.service`` entry labels any recorded failure.
        events: Events of the window; a sample of them goes into the prompt.
        model: Model identifier forwarded to ``structured_call``.

    Returns:
        ``{"summary": "...", "notable_omissions": [...]}`` on success, or
        None when the LLM call fails for any reason (timeout, LLM
        unavailable, validation error).  Failures are recorded, not raised.
    """
    prompt = _build_user_prompt(snapshot_data, events)

    try:
        # structured_call() is synchronous, so it runs in a worker thread.
        llm_call = asyncio.to_thread(
            structured_call,
            system=SYSTEM_PROMPT,
            user=prompt,
            response_model=SnapshotSummary,
            model=model,
            timeout=SUMMARY_TIMEOUT,
            max_retries=1,  # one attempt only: a retry costs operator latency
        )
        # The outer timeout is a safety net; the inner one is meant to fire
        # first so the failure surfaces as a clean LLM error.
        parsed = await asyncio.wait_for(llm_call, timeout=SUMMARY_TIMEOUT + 1.0)
        return {
            "summary": parsed.summary,
            "notable_omissions": parsed.notable_omissions,
        }
    except Exception as exc:
        domain = snapshot_data.get("domain", {})
        _record_failure(exc, domain.get("service", "unknown"), model)
        return None
80
+
81
+
82
def _build_user_prompt(snapshot_data: dict, events: list[SitRepEvent]) -> str:
    """Render the user prompt: snapshot header lines plus sample events.

    Missing snapshot keys fall back to ``'unknown'``, ``0``, ``0.0`` or an
    empty collection.  One line is added per event chosen by
    ``_select_sample_events``.
    """
    domain = snapshot_data.get("domain", {})
    window = snapshot_data.get("window", {})
    samples = _select_sample_events(events)

    header = [
        f"Service: {domain.get('service', 'unknown')} ({domain.get('environment', 'unknown')})",
        f"Window: {window.get('duration_seconds', 0):.0f}s, closed by {window.get('close_reason', 'unknown')}",
        f"Events: {snapshot_data.get('event_count', 0)} total",
        f"Event types: {snapshot_data.get('event_types', {})}",
        f"Peak severity: {snapshot_data.get('peak_severity', 0.0)}",
        f"Affected services: {', '.join(snapshot_data.get('affected_services', []))}",
        "",
        "Sample events:",
    ]
    sample_lines = [
        f" [{s['timestamp']}] {s['service']} {s['type']} severity={s['severity']} source={s['source']}"
        for s in samples
    ]
    return "\n".join(header + sample_lines)
104
+
105
+
106
def _select_sample_events(events: list[SitRepEvent], max_samples: int = 10) -> list[dict]:
    """Pick a small, representative set of events for the LLM prompt.

    Candidates, in order: the chronologically first event, the most severe
    event, then the most recent event of each service.  Duplicates (by event
    id) are collapsed and the list is cut to *max_samples*.

    Returns:
        One dict per chosen event with ``service``, ``type``, ``severity``,
        ``timestamp`` and ``source``; an empty list for no events.
    """
    if not events:
        return []

    chronological = sorted(events, key=lambda e: e.timestamp)
    # Later entries overwrite earlier ones, leaving the newest per service.
    latest_by_service = {e.service: e for e in chronological}

    candidates = (
        chronological[0],
        max(events, key=lambda e: e.severity),
        *latest_by_service.values(),
    )

    # Keyed by id: dedupes while keeping first-seen order.
    picked: dict[str, SitRepEvent] = {}
    for candidate in candidates:
        picked[candidate.id] = candidate

    summaries = []
    for chosen in list(picked.values())[:max_samples]:
        summaries.append(
            {
                "service": chosen.service,
                "type": chosen.type.value,
                "severity": chosen.severity,
                "timestamp": chosen.timestamp,
                "source": chosen.source,
            }
        )
    return summaries
143
+
144
+
145
def _record_failure(error: Exception, snapshot_service: str, model: str | None = None) -> None:
    """Report a failed summary generation through logs, metrics and telemetry.

    A warning is always logged.  The metrics counter and the LLM telemetry
    event are best-effort: any error raised while importing or calling them
    is ignored so that reporting can never fail the caller.

    Args:
        error: The exception that aborted summary generation.
        snapshot_service: Service name attached to the log entry.
        model: Model that was requested; reported as ``"default"`` if unset.
    """
    reason = _classify_failure(error)
    message = str(error)
    model_label = model or "default"

    logger.warning(
        "correlate_summary_failed",
        reason=reason,
        service=snapshot_service,
        error=message,
    )

    def _bump_error_counter() -> None:
        from nthlayer_common.metrics import errors_total
        errors_total.labels(component="correlate", error_type=f"summary_{reason}").inc()

    def _emit_telemetry() -> None:
        from nthlayer_common.telemetry import emit_llm_event
        emit_llm_event(
            model=model_label,
            provider="unknown",
            caller="correlate.generate_summary",
            success=False,
            error=message,
        )

    # Metrics infra / OTel may not be configured, so each reporter is
    # attempted independently and its failure swallowed.
    for report in (_bump_error_counter, _emit_telemetry):
        try:
            report()
        except Exception:
            pass
171
+
172
+
173
def _classify_failure(error: Exception) -> str:
    """Classify a summary generation failure for metrics.

    Returns:
        ``"timeout"`` for asyncio or builtin timeout errors,
        ``"validation_error"`` when the exception class name contains
        "validation" or "pydantic", and ``"llm_unavailable"`` otherwise.
    """
    # Before Python 3.11 asyncio.TimeoutError and the builtin TimeoutError
    # are distinct classes; accept both so the label is version-independent.
    if isinstance(error, (asyncio.TimeoutError, TimeoutError)):
        return "timeout"
    type_name = type(error).__name__.lower()
    if "validation" in type_name or "pydantic" in type_name:
        return "validation_error"
    return "llm_unavailable"
@@ -0,0 +1 @@
1
+ """Trace backend adapters for nthlayer-correlate."""