cortexdb-mcp 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cortexdb_mcp/__init__.py +3 -0
- cortexdb_mcp/api.py +132 -0
- cortexdb_mcp/config.py +40 -0
- cortexdb_mcp/insights.py +640 -0
- cortexdb_mcp/server.py +1085 -0
- cortexdb_mcp-0.2.0.dist-info/METADATA +276 -0
- cortexdb_mcp-0.2.0.dist-info/RECORD +9 -0
- cortexdb_mcp-0.2.0.dist-info/WHEEL +4 -0
- cortexdb_mcp-0.2.0.dist-info/entry_points.txt +2 -0
cortexdb_mcp/insights.py
ADDED
|
@@ -0,0 +1,640 @@
|
|
|
1
|
+
"""Proactive insights engine that analyzes CortexDB episodes to generate actionable intelligence.
|
|
2
|
+
|
|
3
|
+
Runs heuristic-based analysis over episodes and entities stored in CortexDB to
|
|
4
|
+
surface patterns such as incident spikes, new dependencies, knowledge gaps, and
|
|
5
|
+
deployment risks. All detection is done with simple temporal and co-occurrence
|
|
6
|
+
analysis -- no LLM calls on the hot path -- so insights can be generated in
|
|
7
|
+
sub-second time.
|
|
8
|
+
|
|
9
|
+
Typical output examples:
|
|
10
|
+
|
|
11
|
+
- "payments-service had 3 incidents this week (up from 0 last week)"
|
|
12
|
+
- "New dependency detected: checkout -> stripe-gateway-v2 (since March 10)"
|
|
13
|
+
- "Knowledge gap: nobody documented the billing reconciliation migration"
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import hashlib
|
|
19
|
+
import logging
|
|
20
|
+
import time
|
|
21
|
+
from collections import Counter, defaultdict
|
|
22
|
+
from dataclasses import dataclass, field
|
|
23
|
+
from datetime import datetime, timedelta, timezone
|
|
24
|
+
from enum import Enum
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
import httpx
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger("cortexdb_mcp.insights")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# Data model
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class InsightType(str, Enum):
    """Closed set of insight categories.

    Members mix in ``str`` and each member's value equals its name, so the
    value can be emitted directly when an insight is serialized.
    """

    # Incident counts for a service rose week over week.
    incident_spike = "incident_spike"
    # Two entities started appearing together only recently.
    new_dependency = "new_dependency"
    # A service has incidents but no documentation episodes.
    knowledge_gap = "knowledge_gap"
    ownership_change = "ownership_change"
    # A service was deployed and had incidents in the same window.
    deployment_risk = "deployment_risk"
    # An active service whose newest documentation is old.
    stale_documentation = "stale_documentation"
    # Several incidents for one service share content keywords.
    recurring_issue = "recurring_issue"
    team_bottleneck = "team_bottleneck"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class Severity(str, Enum):
    """Severity attached to an insight.

    Members mix in ``str`` and each member's value equals its name.
    """

    info = "info"
    warning = "warning"
    critical = "critical"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class Insight:
    """A single actionable insight generated by the engine."""

    # Short identifier string for the insight.
    id: str
    # Category of the insight.
    insight_type: InsightType
    # One-line headline.
    title: str
    # Human-readable explanation of the finding.
    description: str
    severity: Severity
    # Names of the entities the insight is about.
    entities: list[str]
    # Identifiers of the episodes backing the insight.
    evidence: list[str]
    # Moment the insight was produced.
    generated_at: datetime
    # Heuristic confidence score (presumably in [0, 1] -- confirm with callers).
    confidence: float

    def to_dict(self) -> dict[str, Any]:
        """Serialize the insight to a JSON-safe dictionary.

        Enum fields are emitted as their string values and ``generated_at``
        as an ISO-8601 string.

        Returns
        -------
        dict[str, Any]
            A new dictionary. ``entities`` and ``evidence`` are shallow
            copies, so mutating the result never alters this insight.
        """
        return {
            "id": self.id,
            "insight_type": self.insight_type.value,
            "title": self.title,
            "description": self.description,
            "severity": self.severity.value,
            # Copy the lists: handing out the live lists would let callers
            # that post-process the payload corrupt the insight itself.
            "entities": list(self.entities),
            "evidence": list(self.evidence),
            "generated_at": self.generated_at.isoformat(),
            "confidence": self.confidence,
        }
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# ---------------------------------------------------------------------------
|
|
88
|
+
# Helpers
|
|
89
|
+
# ---------------------------------------------------------------------------
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _make_id(*parts: str) -> str:
    """Build a deterministic short insight ID from its constituent parts.

    The parts are joined with ``":"`` and hashed with SHA-256; the ID is
    ``"ins_"`` followed by the first 12 hex digits of the digest.
    """
    digest = hashlib.sha256(":".join(parts).encode()).hexdigest()
    return f"ins_{digest[:12]}"
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _now() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(tz=timezone.utc)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _week_ago(weeks: int = 1) -> datetime:
    """Return the UTC datetime that lies ``weeks`` weeks before now."""
    offset = timedelta(weeks=weeks)
    return _now() - offset
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# ---------------------------------------------------------------------------
|
|
109
|
+
# Engine
|
|
110
|
+
# ---------------------------------------------------------------------------
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class InsightsEngine:
    """Analyze CortexDB episodes and generate proactive insights.

    Every generator fetches episodes over HTTP, applies a simple temporal or
    co-occurrence heuristic, and returns a list of ``Insight`` objects.
    Fetch failures are logged and treated as "no data", so a generator
    returns an empty list rather than raising when CortexDB is unreachable.

    Parameters
    ----------
    cortex_url:
        Base URL of the CortexDB HTTP API.
    api_key:
        Optional bearer token for authenticated access.
    tenant_id:
        Tenant scope used when querying CortexDB.
    """

    # Episode types used in heuristics.
    _INCIDENT_TYPES = {"incident", "alert", "outage", "error", "failure"}
    _DOCUMENT_TYPES = {"document", "doc", "runbook", "wiki", "documentation", "readme"}
    _DEPLOY_TYPES = {"deploy", "deployment", "release", "rollout"}
    _DEPENDENCY_TYPES = {"dependency", "integration", "api_call", "import"}

    def __init__(
        self,
        cortex_url: str = "http://localhost:3141",
        api_key: str | None = None,
        tenant_id: str | None = None,
    ) -> None:
        # Strip trailing slashes so request paths can always start with "/".
        self.cortex_url = cortex_url.rstrip("/")
        self.api_key = api_key
        self.tenant_id = tenant_id

    # -- HTTP helpers -------------------------------------------------------

    def _headers(self) -> dict[str, str]:
        """Build HTTP headers for CortexDB requests."""
        headers: dict[str, str] = {"Content-Type": "application/json"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        return headers

    async def _get(self, path: str, params: dict[str, Any] | None = None) -> Any:
        """Perform a GET request against CortexDB and return parsed JSON.

        A new client is opened for each call. Raises the ``httpx`` error for
        transport failures and non-success status codes.
        """
        async with httpx.AsyncClient(
            base_url=self.cortex_url,
            headers=self._headers(),
            timeout=30.0,
        ) as client:
            resp = await client.get(path, params=params or {})
            resp.raise_for_status()
            return resp.json()

    @staticmethod
    def _unwrap_list(data: Any, *keys: str) -> list[dict[str, Any]]:
        """Return the list carried by an API payload.

        ``data`` may be a bare list, or a dict holding the list under one of
        ``keys`` (tried in order). Any other shape yields an empty list.
        """
        if isinstance(data, list):
            return data
        if isinstance(data, dict):
            for key in keys:
                value = data.get(key)
                if isinstance(value, list):
                    return value
        return []

    async def _fetch_list(
        self,
        path: str,
        params: dict[str, Any],
        *,
        keys: tuple[str, ...],
        label: str,
    ) -> list[dict[str, Any]]:
        """GET ``path`` and unwrap the list payload; return ``[]`` on failure."""
        try:
            data = await self._get(path, params=params)
        except (httpx.HTTPStatusError, httpx.RequestError, ValueError) as exc:
            # ValueError covers a response body that is not valid JSON.
            logger.warning("Failed to fetch %s: %s", label, exc)
            return []
        return self._unwrap_list(data, *keys)

    async def _fetch_episodes(
        self,
        *,
        episode_type: str | None = None,
        since: datetime | None = None,
        until: datetime | None = None,
    ) -> list[dict[str, Any]]:
        """Fetch episodes from CortexDB with optional type and time filters.

        Returns a list of episode dicts as returned by the ``GET /v1/episodes``
        endpoint, or an empty list when the request fails or the payload has
        an unexpected shape.
        """
        params: dict[str, Any] = {}
        if episode_type is not None:
            params["episode_type"] = episode_type
        if since is not None:
            params["since"] = since.isoformat()
        if until is not None:
            params["until"] = until.isoformat()
        if self.tenant_id is not None:
            params["tenant_id"] = self.tenant_id

        return await self._fetch_list(
            "/v1/episodes", params, keys=("episodes", "items"), label="episodes"
        )

    async def _fetch_entities(self) -> list[dict[str, Any]]:
        """Fetch the entity list from CortexDB (empty list on failure)."""
        params: dict[str, Any] = {}
        if self.tenant_id is not None:
            params["tenant_id"] = self.tenant_id

        return await self._fetch_list(
            "/v1/entities", params, keys=("entities", "items"), label="entities"
        )

    # -- Utility extractors -------------------------------------------------

    @staticmethod
    def _extract_entities(episode: dict[str, Any]) -> list[str]:
        """Return the distinct entity names mentioned in an episode.

        Entries may be plain values or dicts carrying a ``"name"`` key.
        Entries without a usable name are skipped, and duplicates are dropped
        (first occurrence wins) so one episode counts at most once per entity.
        """
        entities = episode.get("entities", [])
        if not isinstance(entities, list):
            return []

        names: dict[str, None] = {}  # dict keeps insertion order
        for entry in entities:
            name = entry.get("name") if isinstance(entry, dict) else entry
            if name is None:
                continue
            text = str(name)
            if text:
                names.setdefault(text, None)
        return list(names)

    @staticmethod
    def _episode_type(episode: dict[str, Any]) -> str:
        """Return the normalised (lower-cased) episode type string."""
        raw = episode.get("episode_type") or episode.get("type") or ""
        return str(raw).lower()

    @staticmethod
    def _episode_time(episode: dict[str, Any]) -> datetime | None:
        """Parse the episode timestamp into an aware datetime, or None on failure.

        Looks at ``timestamp``, ``created_at`` and ``occurred_at`` in that
        order. Strings are parsed as ISO-8601 (a trailing ``Z`` is accepted).
        """
        raw = episode.get("timestamp") or episode.get("created_at") or episode.get("occurred_at")
        if raw is None:
            return None
        if isinstance(raw, datetime):
            parsed = raw
        else:
            try:
                parsed = datetime.fromisoformat(str(raw).replace("Z", "+00:00"))
            except (ValueError, TypeError):
                return None
        if parsed.tzinfo is None:
            # NOTE: naive timestamps are assumed to be UTC. Without this they
            # cannot be compared with the aware cut-offs used by the
            # generators (Python raises TypeError).
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    @staticmethod
    def _episode_id(episode: dict[str, Any]) -> str:
        """Return a stable identifier for an episode (``"unknown"`` if absent)."""
        return str(
            episode.get("episode_id")
            or episode.get("event_id")
            or episode.get("id")
            or "unknown"
        )

    # -- Insight generators -------------------------------------------------

    async def generate_all(self) -> list[Insight]:
        """Run every insight generator and return the combined results.

        Generators run one after another. A generator that raises is logged
        and skipped so the remaining generators still contribute.
        """
        generators = [
            self.incident_spike_detection,
            self.new_dependency_detection,
            self.knowledge_gap_detection,
            self.deployment_risk_assessment,
            self.stale_documentation_detection,
            self.recurring_issue_detection,
        ]
        results: list[Insight] = []
        for gen in generators:
            try:
                results.extend(await gen())
            except Exception:
                # Top-level boundary: isolate failures per generator.
                logger.exception("Insight generator %s failed", gen.__name__)
        return results

    # 1. Incident spike -------------------------------------------------------

    async def incident_spike_detection(self) -> list[Insight]:
        """Compare incident counts this week vs last week per service.

        Generates a warning if a service has more incidents this week than
        last. The insight is critical if the count grew 3x or more, or, when
        last week had none, if there are at least 3 incidents this week.
        Episodes without a parseable timestamp are ignored.
        """
        now = _now()
        this_week_start = now - timedelta(weeks=1)
        last_week_start = now - timedelta(weeks=2)

        episodes = await self._fetch_episodes(since=last_week_start)

        this_week: Counter[str] = Counter()
        last_week: Counter[str] = Counter()
        # Evidence is gathered while counting so it always matches the
        # counted episodes and needs no second scan per service.
        evidence_by_service: dict[str, list[str]] = defaultdict(list)

        for ep in episodes:
            if self._episode_type(ep) not in self._INCIDENT_TYPES:
                continue
            ts = self._episode_time(ep)
            if ts is None or ts < last_week_start:
                continue
            services = self._extract_entities(ep)
            if ts >= this_week_start:
                episode_id = self._episode_id(ep)
                for svc in services:
                    this_week[svc] += 1
                    evidence_by_service[svc].append(episode_id)
            else:
                last_week.update(services)

        insights: list[Insight] = []
        # Only services with incidents this week can exceed last week's count.
        for svc in sorted(this_week):
            cur = this_week[svc]
            prev = last_week.get(svc, 0)
            if cur <= prev:
                continue

            if prev == 0:
                severity = Severity.critical if cur >= 3 else Severity.warning
                desc = (
                    f"{svc} had {cur} incident(s) this week (up from 0 last week)."
                )
            else:
                ratio = cur / prev
                severity = Severity.critical if ratio >= 3 else Severity.warning
                desc = (
                    f"{svc} had {cur} incident(s) this week "
                    f"(up from {prev} last week, {ratio:.1f}x increase)."
                )

            insights.append(
                Insight(
                    id=_make_id("incident_spike", svc, str(cur)),
                    insight_type=InsightType.incident_spike,
                    title=f"Incident spike: {svc}",
                    description=desc,
                    severity=severity,
                    entities=[svc],
                    evidence=evidence_by_service[svc],
                    generated_at=now,
                    confidence=0.85,
                )
            )

        return insights

    # 2. New dependency detection ---------------------------------------------

    async def new_dependency_detection(self) -> list[Insight]:
        """Find entity pairs that appear together for the first time recently.

        Compares co-occurrences in the last 7 days against co-occurrences in
        older episodes (the fetch looks back 90 days) to surface
        newly-discovered dependencies. Pairs are ordered alphabetically, so
        the ``a -> b`` arrow does not imply a direction.
        """
        now = _now()
        recent_cutoff = now - timedelta(days=7)
        older_cutoff = now - timedelta(days=90)

        episodes = await self._fetch_episodes(since=older_cutoff)

        recent_pairs: dict[tuple[str, str], list[str]] = defaultdict(list)
        first_seen: dict[tuple[str, str], datetime] = {}
        older_pairs: set[tuple[str, str]] = set()

        for ep in episodes:
            ts = self._episode_time(ep)
            if ts is None:
                continue
            names = sorted(set(self._extract_entities(ep)))
            is_recent = ts >= recent_cutoff
            episode_id = self._episode_id(ep)
            for i, first in enumerate(names):
                for second in names[i + 1:]:
                    pair = (first, second)
                    if not is_recent:
                        older_pairs.add(pair)
                        continue
                    recent_pairs[pair].append(episode_id)
                    # Track the earliest sighting here instead of rescanning
                    # every episode for every reported pair.
                    if pair not in first_seen or ts < first_seen[pair]:
                        first_seen[pair] = ts

        insights: list[Insight] = []
        for pair, evidence in sorted(recent_pairs.items()):
            if pair in older_pairs:
                continue
            a, b = pair
            first_ts = first_seen.get(pair)
            since_str = first_ts.strftime("%B %d") if first_ts else "recently"
            insights.append(
                Insight(
                    id=_make_id("new_dep", a, b),
                    insight_type=InsightType.new_dependency,
                    title=f"New dependency detected: {a} -> {b}",
                    description=(
                        f"New dependency detected: {a} -> {b} (since {since_str}). "
                        f"These entities were never seen together before the last 7 days."
                    ),
                    severity=Severity.info,
                    entities=[a, b],
                    evidence=evidence,
                    generated_at=now,
                    confidence=0.7,
                )
            )

        return insights

    # 3. Knowledge gap detection ----------------------------------------------

    async def knowledge_gap_detection(self) -> list[Insight]:
        """Find services with many incidents but no document-type episodes.

        A service with at least 2 incidents and no documentation episode in
        the fetched 30-day window is flagged; 5 or more incidents make the
        insight critical. Results are ordered by incident count (highest
        first), then by service name.
        """
        now = _now()
        since = now - timedelta(days=30)
        episodes = await self._fetch_episodes(since=since)

        incident_evidence: dict[str, list[str]] = defaultdict(list)
        documented: set[str] = set()

        for ep in episodes:
            ep_type = self._episode_type(ep)
            is_incident = ep_type in self._INCIDENT_TYPES
            is_document = ep_type in self._DOCUMENT_TYPES
            if not (is_incident or is_document):
                continue
            services = self._extract_entities(ep)
            if is_incident:
                episode_id = self._episode_id(ep)
                for svc in services:
                    incident_evidence[svc].append(episode_id)
            if is_document:
                documented.update(services)

        ranked = sorted(
            incident_evidence.items(),
            key=lambda item: (-len(item[1]), item[0]),
        )

        insights: list[Insight] = []
        for svc, evidence in ranked:
            incidents = len(evidence)
            if incidents < 2 or svc in documented:
                continue
            severity = Severity.critical if incidents >= 5 else Severity.warning
            insights.append(
                Insight(
                    id=_make_id("knowledge_gap", svc),
                    insight_type=InsightType.knowledge_gap,
                    title=f"Knowledge gap: {svc}",
                    description=(
                        f"Knowledge gap: nobody documented {svc}. "
                        f"It had {incidents} incident(s) in the last 30 days "
                        f"but 0 documentation episodes."
                    ),
                    severity=severity,
                    entities=[svc],
                    evidence=evidence,
                    generated_at=now,
                    confidence=0.75,
                )
            )

        return insights

    # 4. Deployment risk assessment -------------------------------------------

    async def deployment_risk_assessment(self) -> list[Insight]:
        """Find services with recent deployments and recent incidents.

        A service that has both a deploy-type and an incident-type episode in
        the fetched 7-day window is flagged as a deployment risk (critical
        from 3 incidents). The relative order of deploys and incidents is
        not examined.
        """
        now = _now()
        since = now - timedelta(days=7)
        episodes = await self._fetch_episodes(since=since)

        deployed: dict[str, list[str]] = defaultdict(list)
        incident_svcs: dict[str, list[str]] = defaultdict(list)

        for ep in episodes:
            ep_type = self._episode_type(ep)
            is_deploy = ep_type in self._DEPLOY_TYPES
            is_incident = ep_type in self._INCIDENT_TYPES
            if not (is_deploy or is_incident):
                continue
            eid = self._episode_id(ep)
            for entity in self._extract_entities(ep):
                if is_deploy:
                    deployed[entity].append(eid)
                if is_incident:
                    incident_svcs[entity].append(eid)

        insights: list[Insight] = []
        for svc in sorted(set(deployed) & set(incident_svcs)):
            deploy_count = len(deployed[svc])
            incident_count = len(incident_svcs[svc])
            severity = Severity.critical if incident_count >= 3 else Severity.warning
            insights.append(
                Insight(
                    id=_make_id("deploy_risk", svc),
                    insight_type=InsightType.deployment_risk,
                    title=f"Deployment risk: {svc}",
                    description=(
                        f"{svc} had {deploy_count} deployment(s) and "
                        f"{incident_count} incident(s) in the last 7 days. "
                        f"Recent deploys may be correlated with failures."
                    ),
                    severity=severity,
                    entities=[svc],
                    evidence=deployed[svc] + incident_svcs[svc],
                    generated_at=now,
                    confidence=0.8,
                )
            )

        return insights

    # 5. Stale documentation detection ----------------------------------------

    async def stale_documentation_detection(self) -> list[Insight]:
        """Find document episodes older than 90 days for services that are still active.

        An active service (has any episode in the last 30 days) whose most
        recent documentation is older than 90 days is flagged. Only the last
        365 days are fetched, so a service whose newest documentation is
        older than that looks undocumented here and is skipped. The evidence
        is the ID of the newest documentation episode.
        """
        now = _now()
        stale_threshold = now - timedelta(days=90)
        active_since = now - timedelta(days=30)

        # Fetch a wide window to capture old docs.
        episodes = await self._fetch_episodes(since=now - timedelta(days=365))

        # service -> (timestamp, episode id) of its newest documentation.
        latest_doc: dict[str, tuple[datetime, str]] = {}
        active_services: set[str] = set()

        for ep in episodes:
            ts = self._episode_time(ep)
            if ts is None:
                continue
            is_document = self._episode_type(ep) in self._DOCUMENT_TYPES
            is_active = ts >= active_since
            if not (is_document or is_active):
                continue
            episode_id = self._episode_id(ep)
            for entity in self._extract_entities(ep):
                if is_active:
                    active_services.add(entity)
                if is_document and (
                    entity not in latest_doc or ts > latest_doc[entity][0]
                ):
                    latest_doc[entity] = (ts, episode_id)

        insights: list[Insight] = []
        for svc in sorted(active_services):
            newest = latest_doc.get(svc)
            if newest is None:
                continue  # knowledge_gap_detection handles missing docs
            last_doc, doc_episode_id = newest
            if last_doc >= stale_threshold:
                continue
            days_stale = (now - last_doc).days
            insights.append(
                Insight(
                    id=_make_id("stale_doc", svc),
                    insight_type=InsightType.stale_documentation,
                    title=f"Stale documentation: {svc}",
                    description=(
                        f"Documentation for {svc} was last updated {days_stale} days ago. "
                        f"The service is still active -- consider refreshing its docs."
                    ),
                    severity=Severity.warning,
                    entities=[svc],
                    evidence=[doc_episode_id],
                    generated_at=now,
                    confidence=0.65,
                )
            )

        return insights

    # 6. Recurring issue detection --------------------------------------------

    async def recurring_issue_detection(self) -> list[Insight]:
        """Find similar incident patterns repeating for the same service.

        If a service has three or more incidents in the fetched 30-day window
        with overlapping content tokens, the pattern is flagged as recurring.
        Keywords are ranked by frequency, then alphabetically, so both the
        description and the insight ID are reproducible across runs.
        """
        now = _now()
        since = now - timedelta(days=30)
        episodes = await self._fetch_episodes(since=since)

        service_incidents: dict[str, list[dict[str, Any]]] = defaultdict(list)

        for ep in episodes:
            if self._episode_type(ep) not in self._INCIDENT_TYPES:
                continue
            for entity in self._extract_entities(ep):
                service_incidents[entity].append(ep)

        insights: list[Insight] = []
        for svc, incidents in sorted(service_incidents.items()):
            if len(incidents) < 3:
                continue

            # Simple token-overlap heuristic: count in how many incidents
            # each content word occurs.
            token_counts: Counter[str] = Counter()
            for ep in incidents:
                content = ep.get("content") or ep.get("description") or ""
                # Drop very short words; a set counts each word once per incident.
                token_counts.update(
                    {tok for tok in str(content).lower().split() if len(tok) > 3}
                )

            # Keep tokens that appear in at least half the incidents (minimum 2),
            # chosen from the ten most frequent. Ties are broken
            # alphabetically: set iteration order varies with hash
            # randomisation and would otherwise leak into the insight ID.
            threshold = max(2, len(incidents) // 2)
            ranked = sorted(token_counts.items(), key=lambda item: (-item[1], item[0]))
            common = [tok for tok, count in ranked[:10] if count >= threshold]

            if not common:
                continue

            evidence = [self._episode_id(ep) for ep in incidents]
            pattern_hint = ", ".join(common[:5])
            insights.append(
                Insight(
                    id=_make_id("recurring", svc, pattern_hint),
                    insight_type=InsightType.recurring_issue,
                    title=f"Recurring issue: {svc}",
                    description=(
                        f"{svc} has {len(incidents)} similar incidents in the last 30 days. "
                        f"Common keywords: {pattern_hint}."
                    ),
                    severity=Severity.warning,
                    entities=[svc],
                    evidence=evidence,
                    generated_at=now,
                    confidence=0.6,
                )
            )

        return insights
|