@smilintux/skcapstone 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1104 @@
1
+ """
2
+ SKCapstone ITIL Service Management — Incident, Problem, and Change Management.
3
+
4
+ Conflict-free design: each ITIL record has a ``managed_by`` field — only that
5
+ agent writes to the file. CAB votes use per-agent files to avoid conflicts.
6
+
7
+ Directory layout:
8
+ ~/.skcapstone/coordination/itil/
9
+ ├── incidents/ # One JSON per incident (managed_by agent owns it)
10
+ ├── problems/ # One JSON per problem
11
+ ├── changes/ # One JSON per RFC
12
+ ├── kedb/ # Known Error Database entries
13
+ ├── cab-decisions/ # Per-agent CAB vote files (conflict-free)
14
+ └── ITIL-BOARD.md # Auto-generated overview
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ import logging
21
+ import re
22
+ import uuid
23
+ from datetime import datetime, timezone
24
+ from enum import Enum
25
+ from pathlib import Path
26
+ from typing import Any, Optional
27
+
28
+ from pydantic import BaseModel, Field
29
+
30
+ logger = logging.getLogger("skcapstone.itil")
31
+
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # Enums
35
+ # ---------------------------------------------------------------------------
36
+
37
+
38
class Severity(str, Enum):
    """Incident severity level; SEV1 is the most urgent, SEV4 the least."""

    SEV1 = "sev1"
    SEV2 = "sev2"
    SEV3 = "sev3"
    SEV4 = "sev4"
43
+
44
+
45
class IncidentStatus(str, Enum):
    """Lifecycle states of an incident (see ``_INCIDENT_TRANSITIONS``)."""

    DETECTED = "detected"
    ACKNOWLEDGED = "acknowledged"
    INVESTIGATING = "investigating"
    ESCALATED = "escalated"
    RESOLVED = "resolved"
    CLOSED = "closed"
52
+
53
+
54
class ProblemStatus(str, Enum):
    """Lifecycle states of a problem (see ``_PROBLEM_TRANSITIONS``)."""

    IDENTIFIED = "identified"
    ANALYZING = "analyzing"
    KNOWN_ERROR = "known_error"
    RESOLVED = "resolved"
59
+
60
+
61
class ChangeType(str, Enum):
    """Category of a change request.

    ``propose_change`` auto-approves STANDARD changes and marks every other
    type as requiring CAB review.
    """

    STANDARD = "standard"
    NORMAL = "normal"
    EMERGENCY = "emergency"
65
+
66
+
67
class ChangeStatus(str, Enum):
    """Lifecycle states of a change request (see ``_CHANGE_TRANSITIONS``)."""

    PROPOSED = "proposed"
    REVIEWING = "reviewing"
    APPROVED = "approved"
    REJECTED = "rejected"
    IMPLEMENTING = "implementing"
    DEPLOYED = "deployed"
    VERIFIED = "verified"
    FAILED = "failed"
    CLOSED = "closed"
77
+
78
+
79
class Risk(str, Enum):
    """Risk rating attached to a change request."""

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
83
+
84
+
85
class CABDecisionValue(str, Enum):
    """Possible outcomes of a single CAB member's vote."""

    APPROVED = "approved"
    REJECTED = "rejected"
    ABSTAIN = "abstain"
89
+
90
+
91
+ # ---------------------------------------------------------------------------
92
+ # Lifecycle state machines — valid transitions
93
+ # ---------------------------------------------------------------------------
94
+
95
+ _INCIDENT_TRANSITIONS: dict[str, set[str]] = {
96
+ "detected": {"acknowledged", "escalated", "resolved"},
97
+ "acknowledged": {"investigating", "escalated", "resolved"},
98
+ "investigating": {"escalated", "resolved"},
99
+ "escalated": {"investigating", "resolved"},
100
+ "resolved": {"closed"},
101
+ "closed": set(),
102
+ }
103
+
104
+ _PROBLEM_TRANSITIONS: dict[str, set[str]] = {
105
+ "identified": {"analyzing"},
106
+ "analyzing": {"known_error", "resolved"},
107
+ "known_error": {"resolved"},
108
+ "resolved": set(),
109
+ }
110
+
111
+ _CHANGE_TRANSITIONS: dict[str, set[str]] = {
112
+ "proposed": {"reviewing", "approved", "rejected"},
113
+ "reviewing": {"approved", "rejected"},
114
+ "approved": {"implementing", "rejected"},
115
+ "rejected": {"closed"},
116
+ "implementing": {"deployed", "failed"},
117
+ "deployed": {"verified", "failed"},
118
+ "verified": {"closed"},
119
+ "failed": {"implementing", "closed"},
120
+ "closed": set(),
121
+ }
122
+
123
+
124
+ # ---------------------------------------------------------------------------
125
+ # Pydantic models
126
+ # ---------------------------------------------------------------------------
127
+
128
+
129
class TimelineEntry(BaseModel):
    """Typed form of a timeline event.

    Carries the same four keys as the dicts built by ``_make_timeline_entry``.
    """

    # UTC ISO-8601 timestamp, generated when the entry is constructed.
    ts: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat(),
    )
    agent: str
    action: str
    note: str = ""
134
+
135
+
136
class Incident(BaseModel):
    """Persisted incident record (one JSON file per incident)."""

    # Identity -------------------------------------------------------------
    # "inc-" followed by the first 8 hex digits of a random UUID.
    id: str = Field(default_factory=lambda: f"inc-{uuid.uuid4().hex[:8]}")
    type: str = "incident"
    title: str

    # Classification -------------------------------------------------------
    severity: Severity = Severity.SEV3
    status: IncidentStatus = IncidentStatus.DETECTED
    source: str = "manual"
    affected_services: list[str] = Field(default_factory=list)
    impact: str = ""

    # Ownership ------------------------------------------------------------
    managed_by: str = ""
    created_by: str = ""

    # Lifecycle timestamps (UTC ISO-8601 strings) --------------------------
    detected_at: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat(),
    )
    acknowledged_at: Optional[str] = None
    resolved_at: Optional[str] = None
    closed_at: Optional[str] = None

    # History and cross-references -----------------------------------------
    timeline: list[dict[str, Any]] = Field(default_factory=list)
    related_problem_id: Optional[str] = None
    gtd_item_ids: list[str] = Field(default_factory=list)
    resolution_summary: Optional[str] = None
    tags: list[str] = Field(default_factory=list)
158
+
159
+
160
class Problem(BaseModel):
    """Persisted problem record (one JSON file per problem)."""

    # "prb-" followed by the first 8 hex digits of a random UUID.
    id: str = Field(default_factory=lambda: f"prb-{uuid.uuid4().hex[:8]}")
    type: str = "problem"
    title: str
    status: ProblemStatus = ProblemStatus.IDENTIFIED

    # Analysis results; both stay None until supplied.
    root_cause: Optional[str] = None
    workaround: Optional[str] = None

    managed_by: str = ""
    created_by: str = ""
    # UTC ISO-8601 creation timestamp.
    created_at: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat(),
    )

    # Links to other ITIL records.
    related_incident_ids: list[str] = Field(default_factory=list)
    related_change_id: Optional[str] = None
    kedb_id: Optional[str] = None

    timeline: list[dict[str, Any]] = Field(default_factory=list)
    tags: list[str] = Field(default_factory=list)
177
+
178
+
179
class Change(BaseModel):
    """Persisted change request / RFC (one JSON file per change)."""

    # "chg-" followed by the first 8 hex digits of a random UUID.
    id: str = Field(default_factory=lambda: f"chg-{uuid.uuid4().hex[:8]}")
    type: str = "change"
    title: str

    change_type: ChangeType = ChangeType.NORMAL
    status: ChangeStatus = ChangeStatus.PROPOSED
    risk: Risk = Risk.MEDIUM
    rollback_plan: str = ""
    test_plan: str = ""

    managed_by: str = ""
    created_by: str = ""
    implementer: Optional[str] = None
    # propose_change sets this to False for standard changes.
    cab_required: bool = True
    # UTC ISO-8601 creation timestamp.
    created_at: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat(),
    )

    related_problem_id: Optional[str] = None
    gtd_item_ids: list[str] = Field(default_factory=list)
    timeline: list[dict[str, Any]] = Field(default_factory=list)
    tags: list[str] = Field(default_factory=list)
199
+
200
+
201
class KEDBEntry(BaseModel):
    """Known Error Database entry (one JSON file per entry)."""

    # "ke-" followed by the first 8 hex digits of a random UUID.
    id: str = Field(default_factory=lambda: f"ke-{uuid.uuid4().hex[:8]}")
    title: str

    # Free-text fields; search_kedb matches against these plus title and tags.
    symptoms: list[str] = Field(default_factory=list)
    root_cause: str = ""
    workaround: str = ""

    permanent_fix_change_id: Optional[str] = None
    related_problem_id: Optional[str] = None
    managed_by: str = ""
    # UTC ISO-8601 creation timestamp.
    created_at: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat(),
    )
    tags: list[str] = Field(default_factory=list)
214
+
215
+
216
class CABDecision(BaseModel):
    """A single agent's CAB vote on a change (stored in a per-agent file)."""

    change_id: str
    agent: str
    decision: CABDecisionValue = CABDecisionValue.ABSTAIN
    conditions: str = ""
    # UTC ISO-8601 timestamp of when the vote object was created.
    decided_at: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat(),
    )
224
+
225
+
226
+ # ---------------------------------------------------------------------------
227
+ # Helpers
228
+ # ---------------------------------------------------------------------------
229
+
230
+
231
+ def _slugify(text: str) -> str:
232
+ """Convert text to a filesystem-safe slug."""
233
+ slug = text.lower().strip()
234
+ slug = re.sub(r'[/\\:*?"<>|]', '-', slug)
235
+ slug = re.sub(r'[^\w\s-]', '', slug)
236
+ slug = re.sub(r'[\s_]+', '-', slug)
237
+ return slug.strip('-')[:40]
238
+
239
+
240
+ def _now_iso() -> str:
241
+ return datetime.now(timezone.utc).isoformat()
242
+
243
+
244
def _make_timeline_entry(agent: str, action: str, note: str = "") -> dict[str, str]:
    """Build a timeline dict stamped with the current UTC time.

    Args:
        agent: Name of the agent performing the action.
        action: Short action label (e.g. ``"created"`` or ``"note"``).
        note: Optional free-text detail.

    Returns:
        A dict with keys ``ts``, ``agent``, ``action`` and ``note``.
    """
    return dict(ts=_now_iso(), agent=agent, action=action, note=note)
251
+
252
+
253
+ # ---------------------------------------------------------------------------
254
+ # ITILManager
255
+ # ---------------------------------------------------------------------------
256
+
257
+
258
+ class ITILManager:
259
+ """Manages ITIL records on disk with lifecycle validation.
260
+
261
+ Args:
262
+ home: Path to the shared root (``~/.skcapstone`` or equivalent).
263
+ """
264
+
265
+ def __init__(self, home: Path) -> None:
266
+ self.home = Path(home).expanduser()
267
+ self.itil_dir = self.home / "coordination" / "itil"
268
+ self.incidents_dir = self.itil_dir / "incidents"
269
+ self.problems_dir = self.itil_dir / "problems"
270
+ self.changes_dir = self.itil_dir / "changes"
271
+ self.kedb_dir = self.itil_dir / "kedb"
272
+ self.cab_dir = self.itil_dir / "cab-decisions"
273
+
274
def ensure_dirs(self) -> None:
    """Create the five ITIL record directories, including missing parents.

    Safe to call repeatedly: existing directories are left untouched.
    """
    targets = [
        self.incidents_dir,
        self.problems_dir,
        self.changes_dir,
        self.kedb_dir,
        self.cab_dir,
    ]
    for target in targets:
        target.mkdir(parents=True, exist_ok=True)
284
+
285
+ # ── File I/O ──────────────────────────────────────────────────────
286
+
287
def _write_record(self, directory: Path, record_id: str, title: str, data: dict) -> Path:
    """Serialise *data* as indented JSON into *directory* and return the path.

    The filename is ``<record_id>-<slug>.json`` where the slug comes from
    *title*; when the slug is empty the file is just ``<record_id>.json``.
    Values json cannot encode natively are stringified (``default=str``).
    """
    self.ensure_dirs()

    name_parts = [record_id]
    slug = _slugify(title)
    if slug:
        name_parts.append(slug)
    target = directory / ("-".join(name_parts) + ".json")

    payload = json.dumps(data, indent=2, default=str)
    target.write_text(payload + "\n", encoding="utf-8")
    return target
297
+
298
+ def _load_records(self, directory: Path, model_class: type) -> list:
299
+ """Load all JSON records from a directory, validating with model_class."""
300
+ records = []
301
+ if not directory.exists():
302
+ return records
303
+ for f in sorted(directory.glob("*.json")):
304
+ try:
305
+ data = json.loads(f.read_text(encoding="utf-8"))
306
+ records.append(model_class.model_validate(data))
307
+ except (json.JSONDecodeError, Exception):
308
+ continue
309
+ return records
310
+
311
+ def _find_record_path(self, directory: Path, record_id: str) -> Optional[Path]:
312
+ """Find a record file by ID prefix in filename."""
313
+ if not directory.exists():
314
+ return None
315
+ for f in directory.glob(f"{record_id}*.json"):
316
+ return f
317
+ return None
318
+
319
+ def _load_record(self, directory: Path, record_id: str, model_class: type):
320
+ """Load a single record by ID."""
321
+ path = self._find_record_path(directory, record_id)
322
+ if path is None:
323
+ return None
324
+ try:
325
+ data = json.loads(path.read_text(encoding="utf-8"))
326
+ return model_class.model_validate(data)
327
+ except (json.JSONDecodeError, Exception):
328
+ return None
329
+
330
+ def _update_record(self, directory: Path, record_id: str, title: str, data: dict) -> Path:
331
+ """Update a record, removing old file if slug changed."""
332
+ old_path = self._find_record_path(directory, record_id)
333
+ new_path = self._write_record(directory, record_id, title, data)
334
+ if old_path and old_path != new_path and old_path.exists():
335
+ old_path.unlink()
336
+ return new_path
337
+
338
+ # ── Incidents ─────────────────────────────────────────────────────
339
+
340
def create_incident(
    self,
    title: str,
    severity: str = "sev3",
    source: str = "manual",
    affected_services: list[str] | None = None,
    impact: str = "",
    managed_by: str = "",
    created_by: str = "",
    tags: list[str] | None = None,
) -> Incident:
    """Create, persist and announce a new incident.

    The owning agent is ``managed_by``, falling back to ``created_by`` and
    then ``"unknown"``. After the record is written an
    ``itil.incident.created`` event is published and a GTD item is
    requested; if one is created its ID is stored on the incident and the
    record is saved again.

    Raises:
        ValueError: If *severity* is not a valid ``Severity`` value.
    """
    owner = managed_by or created_by or "unknown"
    incident = Incident(
        title=title,
        severity=Severity(severity),
        source=source,
        affected_services=affected_services or [],
        impact=impact,
        managed_by=owner,
        created_by=created_by or owner,
        tags=tags or [],
    )

    creation_note = f"Incident detected: {title}"
    incident.timeline.append(_make_timeline_entry(owner, "created", creation_note))
    self._write_record(self.incidents_dir, incident.id, title, incident.model_dump())

    # Announce the new incident.
    event_payload = {
        "id": incident.id,
        "title": title,
        "severity": severity,
        "managed_by": owner,
    }
    self._publish_event("itil.incident.created", event_payload)

    # Link a GTD item when the integration produced one.
    gtd_id = self._create_gtd_item_for_incident(incident)
    if not gtd_id:
        return incident

    incident.gtd_item_ids.append(gtd_id)
    self._update_record(self.incidents_dir, incident.id, title, incident.model_dump())
    return incident
387
+
388
def update_incident(
    self,
    incident_id: str,
    agent: str,
    new_status: str | None = None,
    severity: str | None = None,
    note: str = "",
    resolution_summary: str | None = None,
    related_problem_id: str | None = None,
) -> Incident:
    """Update an incident's status, severity, or metadata.

    Args:
        incident_id: ID of the incident to update.
        agent: Agent recorded on every timeline entry this call adds.
        new_status: Target status; must be a valid transition from the
            current status according to ``_INCIDENT_TRANSITIONS``.
        severity: New severity. A change is recorded on the timeline and
            published as ``itil.incident.escalated``.
        note: Free text attached to the status/severity timeline entry, or
            recorded as a standalone ``note`` entry when neither changed.
        resolution_summary: Stored only when transitioning to ``resolved``.
        related_problem_id: Problem record to link to this incident.

    Returns:
        The updated incident, after it has been written back to disk.

    Raises:
        ValueError: If the incident cannot be loaded, the transition is
            not allowed, or *severity* is not a valid ``Severity`` value.
    """
    inc = self._load_record(self.incidents_dir, incident_id, Incident)
    if inc is None:
        raise ValueError(f"Incident {incident_id} not found")

    # Tracks whether *note* has already been attached to a timeline entry.
    note_recorded = False

    if new_status:
        current = inc.status.value
        allowed = _INCIDENT_TRANSITIONS.get(current, set())
        if new_status not in allowed:
            raise ValueError(
                f"Invalid transition: {current} -> {new_status}. "
                f"Valid: {allowed}"
            )
        inc.status = IncidentStatus(new_status)
        inc.timeline.append(
            _make_timeline_entry(agent, f"status:{current}->{new_status}", note)
        )
        note_recorded = True

        if new_status == "acknowledged":
            inc.acknowledged_at = _now_iso()
        elif new_status == "resolved":
            inc.resolved_at = _now_iso()
            if resolution_summary:
                inc.resolution_summary = resolution_summary
            self._complete_gtd_items(inc.gtd_item_ids)
        elif new_status == "closed":
            inc.closed_at = _now_iso()

    if severity and severity != inc.severity.value:
        old_sev = inc.severity.value
        inc.severity = Severity(severity)
        inc.timeline.append(
            _make_timeline_entry(agent, f"severity:{old_sev}->{severity}", note)
        )
        note_recorded = True
        self._publish_event("itil.incident.escalated", {
            "id": inc.id,
            "old_severity": old_sev,
            "new_severity": severity,
        })

    if related_problem_id:
        inc.related_problem_id = related_problem_id

    # Previously a note was dropped whenever *severity* was passed, even if
    # it equalled the current severity and so produced no timeline entry.
    if note and not note_recorded:
        inc.timeline.append(_make_timeline_entry(agent, "note", note))

    self._update_record(
        self.incidents_dir, inc.id, inc.title, inc.model_dump()
    )
    return inc
448
+
449
def list_incidents(
    self,
    status: str | None = None,
    severity: str | None = None,
    service: str | None = None,
) -> list[Incident]:
    """Return incidents matching every filter that is supplied.

    Args:
        status: Keep only incidents whose status value equals this.
        severity: Keep only incidents whose severity value equals this.
        service: Keep only incidents listing this service as affected.
    """
    def _matches(incident: Incident) -> bool:
        if status and incident.status.value != status:
            return False
        if severity and incident.severity.value != severity:
            return False
        if service and service not in incident.affected_services:
            return False
        return True

    loaded = self._load_records(self.incidents_dir, Incident)
    return [incident for incident in loaded if _matches(incident)]
466
+
467
def find_open_incident_for_service(self, service: str) -> Optional[Incident]:
    """Return the first still-open incident affecting *service*, if any.

    "Open" means detected, acknowledged, investigating or escalated. Used
    as a de-duplication check before raising a new incident.
    """
    still_open = ("detected", "acknowledged", "investigating", "escalated")
    matches = (
        candidate
        for candidate in self.list_incidents()
        if candidate.status.value in still_open
        and service in candidate.affected_services
    )
    return next(matches, None)
474
+
475
+ # ── Problems ──────────────────────────────────────────────────────
476
+
477
def create_problem(
    self,
    title: str,
    managed_by: str = "",
    created_by: str = "",
    related_incident_ids: list[str] | None = None,
    workaround: str = "",
    tags: list[str] | None = None,
) -> Problem:
    """Create, persist and announce a new problem record.

    The owning agent is ``managed_by``, falling back to ``created_by`` and
    then ``"unknown"``. After writing the record an
    ``itil.problem.created`` event is published and a GTD project is
    requested for the investigation.
    """
    owner = managed_by or created_by or "unknown"
    linked_incidents = related_incident_ids or []

    problem = Problem(
        title=title,
        managed_by=owner,
        created_by=created_by or owner,
        related_incident_ids=linked_incidents,
        workaround=workaround,
        tags=tags or [],
    )
    opening_note = f"Problem identified: {title}"
    problem.timeline.append(_make_timeline_entry(owner, "created", opening_note))
    self._write_record(self.problems_dir, problem.id, title, problem.model_dump())

    event_payload = {
        "id": problem.id,
        "title": title,
        "related_incidents": related_incident_ids or [],
    }
    self._publish_event("itil.problem.created", event_payload)

    # Track the investigation as a GTD project (best effort).
    self._create_gtd_project_for_problem(problem)

    return problem
513
+
514
def update_problem(
    self,
    problem_id: str,
    agent: str,
    new_status: str | None = None,
    root_cause: str | None = None,
    workaround: str | None = None,
    note: str = "",
    create_kedb: bool = False,
) -> Problem:
    """Update a problem's status or analysis fields and persist it.

    Args:
        problem_id: ID of the problem to update.
        agent: Agent recorded on timeline entries and on any KEDB entry.
        new_status: Target status; must be allowed by
            ``_PROBLEM_TRANSITIONS`` from the current status.
        root_cause: New root cause text (ignored when empty).
        workaround: New workaround text (ignored when empty).
        note: Attached to the status entry, or logged as a standalone
            ``note`` entry when no status change is requested.
        create_kedb: When true and the problem has a root cause, a KEDB
            entry is created and linked via ``kedb_id``. This is not tied
            to any particular status.

    Raises:
        ValueError: If the problem cannot be loaded or the transition is
            not allowed.
    """
    prb = self._load_record(self.problems_dir, problem_id, Problem)
    if prb is None:
        raise ValueError(f"Problem {problem_id} not found")

    if new_status:
        previous = prb.status.value
        allowed = _PROBLEM_TRANSITIONS.get(previous, set())
        if new_status not in allowed:
            raise ValueError(
                f"Invalid transition: {previous} -> {new_status}. "
                f"Valid: {allowed}"
            )
        prb.status = ProblemStatus(new_status)
        transition = f"status:{previous}->{new_status}"
        prb.timeline.append(_make_timeline_entry(agent, transition, note))
    elif note:
        prb.timeline.append(_make_timeline_entry(agent, "note", note))

    if root_cause:
        prb.root_cause = root_cause
    if workaround:
        prb.workaround = workaround

    # Publish the finding to the Known Error Database on request.
    if create_kedb and prb.root_cause:
        known_error = self.create_kedb_entry(
            title=prb.title,
            symptoms=[],
            root_cause=prb.root_cause,
            workaround=prb.workaround or "",
            related_problem_id=prb.id,
            managed_by=agent,
        )
        prb.kedb_id = known_error.id

    self._update_record(self.problems_dir, prb.id, prb.title, prb.model_dump())
    return prb
565
+
566
def list_problems(self, status: str | None = None) -> list[Problem]:
    """Return all problems, or only those whose status equals *status*."""
    loaded = self._load_records(self.problems_dir, Problem)
    if not status:
        return loaded
    return [problem for problem in loaded if problem.status.value == status]
572
+
573
+ # ── Changes ───────────────────────────────────────────────────────
574
+
575
def propose_change(
    self,
    title: str,
    change_type: str = "normal",
    risk: str = "medium",
    rollback_plan: str = "",
    test_plan: str = "",
    managed_by: str = "",
    created_by: str = "",
    implementer: str | None = None,
    related_problem_id: str | None = None,
    tags: list[str] | None = None,
) -> Change:
    """Record a new change request (RFC) and announce it.

    Standard changes skip CAB review: they are stored as already approved
    with ``cab_required`` false. Every other type starts as proposed with
    ``cab_required`` true. An ``itil.change.proposed`` event is published
    after the record is written.

    Raises:
        ValueError: If *change_type* or *risk* is not a valid enum value.
    """
    owner = managed_by or created_by or "unknown"
    kind = ChangeType(change_type)
    is_standard = kind == ChangeType.STANDARD

    change = Change(
        title=title,
        change_type=kind,
        risk=Risk(risk),
        rollback_plan=rollback_plan,
        test_plan=test_plan,
        managed_by=owner,
        created_by=created_by or owner,
        implementer=implementer,
        cab_required=not is_standard,
        related_problem_id=related_problem_id,
        tags=tags or [],
    )
    change.timeline.append(_make_timeline_entry(owner, "proposed", f"RFC: {title}"))

    if is_standard:
        # Pre-authorised change type: approve without a CAB vote.
        change.status = ChangeStatus.APPROVED
        change.timeline.append(
            _make_timeline_entry(owner, "auto-approved", "Standard change")
        )

    self._write_record(self.changes_dir, change.id, title, change.model_dump())

    event_payload = {
        "id": change.id,
        "title": title,
        "change_type": change_type,
        "cab_required": change.cab_required,
    }
    self._publish_event("itil.change.proposed", event_payload)

    return change
627
+
628
def update_change(
    self,
    change_id: str,
    agent: str,
    new_status: str | None = None,
    note: str = "",
) -> Change:
    """Move a change to a new status, or just attach a note, and persist it.

    Approving publishes ``itil.change.approved`` and, when an implementer
    is set, requests a GTD item for them. Deploying publishes
    ``itil.change.deployed``.

    Raises:
        ValueError: If the change cannot be loaded or the transition is
            not allowed by ``_CHANGE_TRANSITIONS``.
    """
    chg = self._load_record(self.changes_dir, change_id, Change)
    if chg is None:
        raise ValueError(f"Change {change_id} not found")

    if not new_status:
        # Note-only update.
        if note:
            chg.timeline.append(_make_timeline_entry(agent, "note", note))
    else:
        previous = chg.status.value
        allowed = _CHANGE_TRANSITIONS.get(previous, set())
        if new_status not in allowed:
            raise ValueError(
                f"Invalid transition: {previous} -> {new_status}. "
                f"Valid: {allowed}"
            )
        chg.status = ChangeStatus(new_status)
        transition = f"status:{previous}->{new_status}"
        chg.timeline.append(_make_timeline_entry(agent, transition, note))

        if new_status == "approved":
            approved_payload = {
                "id": chg.id, "title": chg.title, "implementer": chg.implementer,
            }
            self._publish_event("itil.change.approved", approved_payload)
            # Give the implementer a GTD next-action.
            if chg.implementer:
                self._create_gtd_item_for_change(chg)
        elif new_status == "deployed":
            deployed_payload = {"id": chg.id, "title": chg.title}
            self._publish_event("itil.change.deployed", deployed_payload)

    self._update_record(self.changes_dir, chg.id, chg.title, chg.model_dump())
    return chg
671
+
672
def list_changes(self, status: str | None = None) -> list[Change]:
    """Return all changes, or only those whose status equals *status*."""
    loaded = self._load_records(self.changes_dir, Change)
    if not status:
        return loaded
    return [change for change in loaded if change.status.value == status]
678
+
679
+ # ── CAB ───────────────────────────────────────────────────────────
680
+
681
def submit_cab_vote(
    self,
    change_id: str,
    agent: str,
    decision: str = "abstain",
    conditions: str = "",
) -> CABDecision:
    """Submit a CAB vote for a change (one file per change/agent pair).

    Writing the same change/agent pair again overwrites the earlier vote.
    After writing, the votes are re-evaluated, which may approve or reject
    the change.

    Args:
        change_id: ID of the change being voted on.
        agent: Name of the voting agent.
        decision: One of the ``CABDecisionValue`` values.
        conditions: Optional free-text conditions attached to the vote.

    Returns:
        The stored vote.

    Raises:
        ValueError: If *decision* is not a valid ``CABDecisionValue``.
    """
    self.ensure_dirs()
    vote = CABDecision(
        change_id=change_id,
        agent=agent,
        decision=CABDecisionValue(decision),
        conditions=conditions,
    )

    # Both filename parts are caller-supplied. Replace anything other than
    # word characters, ".", "@" and "-" so a value such as "../../x" cannot
    # write outside cab_dir. Ordinary IDs and agent names are unchanged; the
    # vote payload keeps the original, unmodified values.
    safe_change = re.sub(r"[^\w.@-]", "-", change_id)
    safe_agent = re.sub(r"[^\w.@-]", "-", agent)
    path = self.cab_dir / f"{safe_change}-{safe_agent}.json"
    path.write_text(
        json.dumps(vote.model_dump(), indent=2, default=str) + "\n",
        encoding="utf-8",
    )

    # Check whether the votes are now decisive and auto-approve/reject.
    self._evaluate_cab(change_id)

    return vote
707
+
708
def get_cab_votes(self, change_id: str) -> list[CABDecision]:
    """Return every readable CAB vote stored for *change_id*.

    Vote files are matched by the ``<change_id>-*.json`` filename pattern.
    A file that cannot be read, parsed or validated is skipped and logged,
    so a damaged vote is visible in the logs rather than silently ignored.
    """
    if not self.cab_dir.exists():
        return []

    votes = []
    for path in self.cab_dir.glob(f"{change_id}-*.json"):
        try:
            payload = json.loads(path.read_text(encoding="utf-8"))
            votes.append(CABDecision.model_validate(payload))
        except Exception as exc:  # best-effort tally: skip, but leave a trace
            logger.warning("Skipping unreadable CAB vote %s: %s", path, exc)
    return votes
720
+
721
def _evaluate_cab(self, change_id: str) -> None:
    """Apply the CAB voting rules to a change that is still under review.

    Only changes in ``proposed`` or ``reviewing`` are considered. Any
    rejection vote rejects the change. Otherwise the change is approved
    once an approving vote from the agent named ``"human"`` exists. A
    transition refused by ``update_change`` is ignored.
    """
    chg = self._load_record(self.changes_dir, change_id, Change)
    if chg is None:
        return
    if chg.status.value not in ("proposed", "reviewing"):
        return

    votes = self.get_cab_votes(change_id)
    if not votes:
        return

    rejecting = [v.agent for v in votes if v.decision == CABDecisionValue.REJECTED]
    approving = [v.agent for v in votes if v.decision == CABDecisionValue.APPROVED]

    if rejecting:
        # A single rejection is enough to block the change.
        outcome = "rejected"
        summary = f"Rejected by: {', '.join(rejecting)}"
    elif "human" in approving:
        # At least one human approval is required before approving.
        outcome = "approved"
        summary = f"Approved by: {', '.join(approving)}"
    else:
        return

    try:
        self.update_change(change_id, "cab-system", new_status=outcome, note=summary)
    except ValueError:
        pass
751
+
752
+ # ── KEDB ──────────────────────────────────────────────────────────
753
+
754
def create_kedb_entry(
    self,
    title: str,
    symptoms: list[str],
    root_cause: str = "",
    workaround: str = "",
    permanent_fix_change_id: str | None = None,
    related_problem_id: str | None = None,
    managed_by: str = "",
    tags: list[str] | None = None,
) -> KEDBEntry:
    """Create a Known Error Database entry and write it to the KEDB folder.

    Returns:
        The newly created entry, with its generated ID.
    """
    fields = {
        "title": title,
        "symptoms": symptoms,
        "root_cause": root_cause,
        "workaround": workaround,
        "permanent_fix_change_id": permanent_fix_change_id,
        "related_problem_id": related_problem_id,
        "managed_by": managed_by,
        "tags": tags or [],
    }
    entry = KEDBEntry(**fields)
    self._write_record(self.kedb_dir, entry.id, title, entry.model_dump())
    return entry
780
+
781
def search_kedb(self, query: str) -> list[KEDBEntry]:
    """Return KEDB entries containing *query* as a case-insensitive substring.

    The title, symptoms, root cause, workaround and tags are all searched.
    """
    def _haystack(entry: KEDBEntry) -> str:
        parts = [
            entry.title,
            " ".join(entry.symptoms),
            entry.root_cause,
            entry.workaround,
            " ".join(entry.tags),
        ]
        return " ".join(parts).lower()

    entries = self._load_records(self.kedb_dir, KEDBEntry)
    needle = query.lower()
    return [entry for entry in entries if needle in _haystack(entry)]
797
+
798
+ # ── Status dashboard ──────────────────────────────────────────────
799
+
800
def get_status(self) -> dict[str, Any]:
    """Build a dashboard summary across all ITIL record types.

    Returns:
        A dict with ``incidents``, ``problems``, ``changes`` and ``kedb``
        sections. Each holds totals; the first three also list the
        open/active/pending records as plain dicts.
    """
    incidents = self._load_records(self.incidents_dir, Incident)
    problems = self._load_records(self.problems_dir, Problem)
    changes = self._load_records(self.changes_dir, Change)
    kedb = self._load_records(self.kedb_dir, KEDBEntry)

    open_states = {"detected", "acknowledged", "investigating", "escalated"}
    pending_states = ("proposed", "reviewing", "approved", "implementing")

    open_incidents = [inc for inc in incidents if inc.status.value in open_states]
    active_problems = [prb for prb in problems if prb.status.value != "resolved"]
    pending_changes = [chg for chg in changes if chg.status.value in pending_states]

    # Count open incidents per severity, keeping zero counts in the result.
    severity_counts = {}
    for level in Severity:
        severity_counts[level.value] = len(
            [inc for inc in open_incidents if inc.severity == level]
        )

    incident_rows = [
        {
            "id": inc.id,
            "title": inc.title,
            "severity": inc.severity.value,
            "status": inc.status.value,
            "managed_by": inc.managed_by,
            "detected_at": inc.detected_at,
        }
        for inc in open_incidents
    ]
    problem_rows = [
        {
            "id": prb.id,
            "title": prb.title,
            "status": prb.status.value,
            "managed_by": prb.managed_by,
        }
        for prb in active_problems
    ]
    change_rows = [
        {
            "id": chg.id,
            "title": chg.title,
            "status": chg.status.value,
            "change_type": chg.change_type.value,
            "managed_by": chg.managed_by,
        }
        for chg in pending_changes
    ]

    return {
        "incidents": {
            "total": len(incidents),
            "open": len(open_incidents),
            "by_severity": severity_counts,
            "open_list": incident_rows,
        },
        "problems": {
            "total": len(problems),
            "active": len(active_problems),
            "active_list": problem_rows,
        },
        "changes": {
            "total": len(changes),
            "pending": len(pending_changes),
            "pending_list": change_rows,
        },
        "kedb": {
            "total": len(kedb),
        },
    }
866
+
867
+ # ── Auto-close / Escalation (for scheduled tasks) ─────────────────
868
+
869
def auto_close_resolved(self, stable_hours: int = 24) -> list[str]:
    """Close incidents that have stayed resolved for at least *stable_hours*.

    Incidents without a ``resolved_at`` timestamp are skipped, as are
    incidents whose timestamp cannot be parsed or whose close attempt
    raises ``ValueError``/``TypeError``.

    Returns:
        IDs of the incidents that were closed.
    """
    closed_ids = []
    now = datetime.now(timezone.utc)

    for inc in self.list_incidents(status="resolved"):
        if not inc.resolved_at:
            continue
        try:
            # Accept a trailing "Z" as a UTC offset.
            resolved_at = datetime.fromisoformat(
                inc.resolved_at.replace("Z", "+00:00")
            )
            age_hours = (now - resolved_at).total_seconds() / 3600
            if age_hours < stable_hours:
                continue
            self.update_incident(
                inc.id,
                "auto-close",
                new_status="closed",
                note=f"Auto-closed after {int(age_hours)}h stable",
            )
            closed_ids.append(inc.id)
        except (ValueError, TypeError):
            continue

    return closed_ids
890
+
891
def check_sla_breaches(self) -> list[dict[str, Any]]:
    """Report incidents left unacknowledged past their severity's limit.

    Only incidents still in ``detected`` are checked. Limits are 5, 15, 60
    and 240 minutes for sev1 to sev4. Each breach is returned and also
    published as an ``itil.sla.breach`` event. Incidents whose timestamp
    cannot be parsed are skipped.
    """
    ack_limits = {"sev1": 5, "sev2": 15, "sev3": 60, "sev4": 240}
    now = datetime.now(timezone.utc)
    breaches = []

    for inc in self.list_incidents():
        if inc.status.value != "detected" or not inc.detected_at:
            continue
        try:
            # Accept a trailing "Z" as a UTC offset.
            detected = datetime.fromisoformat(
                inc.detected_at.replace("Z", "+00:00")
            )
            waited_min = (now - detected).total_seconds() / 60
            severity = inc.severity.value
            limit = ack_limits.get(severity, 60)
            if waited_min > limit:
                breaches.append({
                    "id": inc.id,
                    "severity": severity,
                    "breach_type": "unacknowledged",
                    "elapsed_minutes": round(waited_min),
                    "sla_minutes": limit,
                })
                self._publish_event("itil.sla.breach", {
                    "id": inc.id,
                    "severity": inc.severity.value,
                    "breach_type": "unacknowledged",
                })
        except (ValueError, TypeError):
            continue

    return breaches
923
+
924
+ # ── ITIL Board generation ─────────────────────────────────────────
925
+
926
def generate_board_md(self) -> str:
    """Render the dashboard from ``get_status`` as a Markdown document.

    The board has a title, a generation timestamp (UTC) and four sections:
    open incidents, active problems, pending changes and the known-error
    count. Empty sections show a placeholder line.
    """
    status = self.get_status()
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    severity_labels = {"sev1": "P1", "sev2": "P2", "sev3": "P3", "sev4": "P4"}

    out = [
        "# ITIL Service Management Board",
        f"*Auto-generated {stamp} — do not edit manually*",
        "",
    ]

    # Incidents
    incidents = status["incidents"]
    incident_rows = [
        f"- **[{row['id']}]** {severity_labels.get(row['severity'], '?')} {row['title']} "
        f"({row['status']}) @{row['managed_by']}"
        for row in incidents["open_list"]
    ]
    out += [f"## Open Incidents ({incidents['open']})", ""]
    out += incident_rows or ["*No open incidents*"]
    out.append("")

    # Problems
    problems = status["problems"]
    problem_rows = [
        f"- **[{row['id']}]** {row['title']} ({row['status']}) @{row['managed_by']}"
        for row in problems["active_list"]
    ]
    out += [f"## Active Problems ({problems['active']})", ""]
    out += problem_rows or ["*No active problems*"]
    out.append("")

    # Changes
    changes = status["changes"]
    change_rows = [
        f"- **[{row['id']}]** {row['title']} ({row['status']}, "
        f"{row['change_type']}) @{row['managed_by']}"
        for row in changes["pending_list"]
    ]
    out += [f"## Pending Changes ({changes['pending']})", ""]
    out += change_rows or ["*No pending changes*"]
    out.append("")

    # KEDB
    out += [f"## Known Errors ({status['kedb']['total']})", ""]

    return "\n".join(out)
986
+
987
def write_board_md(self) -> Path:
    """Regenerate the board and save it as ``ITIL-BOARD.md``.

    Calls ``self.ensure_dirs()`` first, then writes the output of
    ``self.generate_board_md()`` as UTF-8 into ``self.itil_dir``.

    Returns:
        Path of the written ``ITIL-BOARD.md`` file.
    """
    self.ensure_dirs()
    markdown = self.generate_board_md()
    board_file = self.itil_dir / "ITIL-BOARD.md"
    board_file.write_text(markdown, encoding="utf-8")
    return board_file
994
+
995
+ # ── GTD integration helpers ───────────────────────────────────────
996
+
997
def _create_gtd_item_for_incident(self, incident: Incident) -> Optional[str]:
    """Auto-create a GTD inbox/next-action item for an incident.

    sev1/sev2 incidents are appended to the ``next-actions`` list with
    status ``"next"``. Any other severity is appended to ``inbox`` with
    whatever status ``_make_item`` assigned. The item's priority is derived
    from the severity; unmapped severities fall back to ``"medium"``.

    Args:
        incident: Incident to mirror; its ``id``, ``title`` and
            ``severity`` are read.

    Returns:
        The ``id`` of the created GTD item, or ``None`` if any step failed.
        This is best-effort: failures are logged at debug level (with
        traceback) and never raised to the caller.
    """
    try:
        # Imported inside the try so an import failure is handled like any
        # other GTD failure instead of propagating.
        from .mcp_tools.gtd_tools import _make_item, _load_list, _save_list

        priority_map = {"sev1": "critical", "sev2": "high", "sev3": "medium", "sev4": "low"}
        severity = incident.severity.value

        item = _make_item(
            text=f"[ITIL:{incident.id}] {incident.title}",
            source="itil",
            context="@ops",
        )
        item["priority"] = priority_map.get(severity, "medium")

        if severity in ("sev1", "sev2"):
            # Urgent: go straight to next-actions
            item["status"] = "next"
            target_list = "next-actions"
        else:
            # Minor: inbox for processing
            target_list = "inbox"

        items = _load_list(target_list)
        items.append(item)
        _save_list(target_list, items)

        return item["id"]
    except Exception:
        # Keep the traceback so the cause of a failed GTD sync is recoverable.
        logger.debug(
            "Failed to create GTD item for incident %s", incident.id, exc_info=True
        )
        return None
1025
+
1026
def _create_gtd_project_for_problem(self, problem: Problem) -> Optional[str]:
    """Auto-create a GTD project for a problem investigation.

    Builds an item titled ``[ITIL:<id>] Investigate: <title>``, sets its
    status to ``"project"`` and appends it to the ``projects`` list.

    Args:
        problem: Problem to mirror; its ``id`` and ``title`` are read.

    Returns:
        The ``id`` of the created GTD item, or ``None`` if any step failed.
        This is best-effort: failures are logged at debug level (with
        traceback) and never raised to the caller.
    """
    try:
        # Imported inside the try so an import failure is handled like any
        # other GTD failure instead of propagating.
        from .mcp_tools.gtd_tools import _make_item, _load_list, _save_list

        text = f"[ITIL:{problem.id}] Investigate: {problem.title}"
        item = _make_item(text=text, source="itil", context="@ops")
        item["status"] = "project"

        projects = _load_list("projects")
        projects.append(item)
        _save_list("projects", projects)

        return item["id"]
    except Exception:
        # Keep the traceback so the cause of a failed GTD sync is recoverable.
        logger.debug(
            "Failed to create GTD project for problem %s", problem.id, exc_info=True
        )
        return None
1043
+
1044
def _create_gtd_item_for_change(self, change: Change) -> Optional[str]:
    """Auto-create a GTD next-action for an approved change.

    Builds an item titled ``[ITIL:<id>] Implement: <title>`` with status
    ``"next"`` and priority ``"high"``, then appends it to the
    ``next-actions`` list. On success the new item id is also appended to
    ``change.gtd_item_ids`` in memory; this method does not persist the
    change record itself.

    Args:
        change: Change to mirror; its ``id`` and ``title`` are read and its
            ``gtd_item_ids`` list is mutated.

    Returns:
        The ``id`` of the created GTD item, or ``None`` if any step failed.
        This is best-effort: failures are logged at debug level (with
        traceback) and never raised to the caller.
    """
    try:
        # Imported inside the try so an import failure is handled like any
        # other GTD failure instead of propagating.
        from .mcp_tools.gtd_tools import _make_item, _load_list, _save_list

        text = f"[ITIL:{change.id}] Implement: {change.title}"
        item = _make_item(text=text, source="itil", context="@ops")
        item["status"] = "next"
        item["priority"] = "high"

        items = _load_list("next-actions")
        items.append(item)
        _save_list("next-actions", items)

        change.gtd_item_ids.append(item["id"])
        return item["id"]
    except Exception:
        # Keep the traceback so the cause of a failed GTD sync is recoverable.
        logger.debug(
            "Failed to create GTD item for change %s", change.id, exc_info=True
        )
        return None
1063
+
1064
def _complete_gtd_items(self, gtd_item_ids: list[str]) -> None:
    """Mark the linked GTD items as done and move them to the archive.

    Each id is looked up across the GTD lists. When found, the item is
    removed from its list, stamped with status ``"done"`` and a
    ``completed_at`` timestamp, and appended to the archive. Ids that
    cannot be found are skipped.

    This is best-effort: the first failure stops processing of the
    remaining ids, is logged at debug level (with traceback), and is never
    raised to the caller.

    Args:
        gtd_item_ids: Ids of the GTD items to complete.
    """
    try:
        # Imported inside the try so an import failure is handled like any
        # other GTD failure instead of propagating.
        from .mcp_tools.gtd_tools import (
            _find_item_across_lists,
            _remove_item_from_list,
            _load_archive,
            _save_archive,
        )

        for item_id in gtd_item_ids:
            source_list, item, _ = _find_item_across_lists(item_id)
            if source_list and item:
                _remove_item_from_list(source_list, item_id)
                item["status"] = "done"
                item["completed_at"] = _now_iso()
                # The archive is reloaded and saved per item, so items
                # already archived persist if a later one fails.
                archive = _load_archive()
                archive.append(item)
                _save_archive(archive)
    except Exception:
        # Keep the traceback so the cause of a failed GTD sync is recoverable.
        logger.debug("Failed to complete GTD items: %s", gtd_item_ids, exc_info=True)
1085
+
1086
+ # ── PubSub helper ─────────────────────────────────────────────────
1087
+
1088
def _publish_event(self, topic: str, payload: dict) -> None:
    """Publish an ITIL event via PubSub and the activity bus (best-effort).

    The two deliveries are independent: a failure in one does not prevent
    the other. Failures are logged at debug level (with traceback) and
    never raised to the caller.

    Args:
        topic: Event topic, e.g. ``"itil.sla.breach"``.
        payload: Event data. Its ``managed_by`` value, if present, is used
            as the publishing agent name; otherwise ``"itil-system"``.
    """
    try:
        from .pubsub import PubSub

        agent_name = payload.get("managed_by", "itil-system")
        bus = PubSub(self.home, agent_name=agent_name)
        bus.publish(topic, payload, ttl_seconds=86400)
    except Exception:
        logger.debug("Failed to publish event %s", topic, exc_info=True)

    # Also push to activity bus
    try:
        from . import activity

        activity.push(topic, payload)
    except Exception:
        # Still best-effort, but no longer silent: record why the push failed.
        logger.debug("Failed to push event %s to activity bus", topic, exc_info=True)