@smilintux/skcapstone 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/CUSTOM_AGENT.md +184 -0
- package/docs/GETTING_STARTED.md +3 -0
- package/package.json +1 -1
- package/scripts/archive-sessions.sh +72 -0
- package/scripts/nvidia-proxy.mjs +79 -15
- package/scripts/telegram-catchup-all.sh +136 -0
- package/src/skcapstone/blueprints/builtins/itil-operations.yaml +40 -0
- package/src/skcapstone/cli/__init__.py +2 -0
- package/src/skcapstone/cli/itil.py +434 -0
- package/src/skcapstone/coordination.py +1 -0
- package/src/skcapstone/itil.py +1104 -0
- package/src/skcapstone/mcp_server.py +258 -0
- package/src/skcapstone/mcp_tools/__init__.py +2 -0
- package/src/skcapstone/mcp_tools/gtd_tools.py +1 -1
- package/src/skcapstone/mcp_tools/itil_tools.py +657 -0
- package/src/skcapstone/scheduled_tasks.py +62 -0
- package/src/skcapstone/service_health.py +81 -2
|
@@ -374,6 +374,50 @@ def make_dreaming_task(
|
|
|
374
374
|
return _run
|
|
375
375
|
|
|
376
376
|
|
|
377
|
+
def make_itil_auto_close_task(home: Path) -> Callable[[], None]:
|
|
378
|
+
"""Return a callback that auto-closes resolved incidents after 24h stable.
|
|
379
|
+
|
|
380
|
+
Args:
|
|
381
|
+
home: Shared root directory.
|
|
382
|
+
"""
|
|
383
|
+
|
|
384
|
+
def _run() -> None:
|
|
385
|
+
from .itil import ITILManager
|
|
386
|
+
|
|
387
|
+
mgr = ITILManager(home)
|
|
388
|
+
closed = mgr.auto_close_resolved(stable_hours=24)
|
|
389
|
+
if closed:
|
|
390
|
+
logger.info("ITIL auto-close: %d incident(s) closed: %s", len(closed), closed)
|
|
391
|
+
else:
|
|
392
|
+
logger.debug("ITIL auto-close: no incidents to close")
|
|
393
|
+
|
|
394
|
+
return _run
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def make_itil_escalation_task(home: Path) -> Callable[[], None]:
|
|
398
|
+
"""Return a callback that checks SLA breaches on open incidents.
|
|
399
|
+
|
|
400
|
+
Args:
|
|
401
|
+
home: Shared root directory.
|
|
402
|
+
"""
|
|
403
|
+
|
|
404
|
+
def _run() -> None:
|
|
405
|
+
from .itil import ITILManager
|
|
406
|
+
|
|
407
|
+
mgr = ITILManager(home)
|
|
408
|
+
breaches = mgr.check_sla_breaches()
|
|
409
|
+
if breaches:
|
|
410
|
+
for b in breaches:
|
|
411
|
+
logger.warning(
|
|
412
|
+
"ITIL SLA breach: %s (%s) unacknowledged for %d min (limit: %d min)",
|
|
413
|
+
b["id"], b["severity"], b["elapsed_minutes"], b["sla_minutes"],
|
|
414
|
+
)
|
|
415
|
+
else:
|
|
416
|
+
logger.debug("ITIL escalation check: no SLA breaches")
|
|
417
|
+
|
|
418
|
+
return _run
|
|
419
|
+
|
|
420
|
+
|
|
377
421
|
# ---------------------------------------------------------------------------
|
|
378
422
|
# Convenience builder
|
|
379
423
|
# ---------------------------------------------------------------------------
|
|
@@ -481,4 +525,22 @@ def build_scheduler(
|
|
|
481
525
|
except ImportError:
|
|
482
526
|
logger.debug("service_health not available — service_health_check task skipped")
|
|
483
527
|
|
|
528
|
+
# ITIL escalation check — SLA breach detection every 5 minutes
|
|
529
|
+
try:
|
|
530
|
+
from . import SHARED_ROOT
|
|
531
|
+
|
|
532
|
+
shared = Path(SHARED_ROOT).expanduser()
|
|
533
|
+
scheduler.register(
|
|
534
|
+
name="itil_escalation_check",
|
|
535
|
+
interval_seconds=300, # 5 minutes
|
|
536
|
+
callback=make_itil_escalation_task(shared),
|
|
537
|
+
)
|
|
538
|
+
scheduler.register(
|
|
539
|
+
name="itil_auto_close",
|
|
540
|
+
interval_seconds=1800, # 30 minutes
|
|
541
|
+
callback=make_itil_auto_close_task(shared),
|
|
542
|
+
)
|
|
543
|
+
except Exception:
|
|
544
|
+
logger.debug("ITIL scheduled tasks not available — skipped")
|
|
545
|
+
|
|
484
546
|
return scheduler
|
|
@@ -197,16 +197,90 @@ def check_all_services() -> list[dict[str, Any]]:
|
|
|
197
197
|
# ---------------------------------------------------------------------------
|
|
198
198
|
|
|
199
199
|
|
|
200
|
+
def _create_incident_for_down_service(service_result: dict[str, Any]) -> None:
|
|
201
|
+
"""Auto-create an ITIL incident for a down service (with dedup).
|
|
202
|
+
|
|
203
|
+
Only creates a new incident if there is no existing open incident
|
|
204
|
+
for the same service. Uses best-effort: failures are logged but
|
|
205
|
+
never block the health check.
|
|
206
|
+
"""
|
|
207
|
+
try:
|
|
208
|
+
from . import SHARED_ROOT
|
|
209
|
+
from .itil import ITILManager
|
|
210
|
+
|
|
211
|
+
svc_name = service_result["name"]
|
|
212
|
+
mgr = ITILManager(os.path.expanduser(SHARED_ROOT))
|
|
213
|
+
|
|
214
|
+
# Dedup: skip if there's already an open incident for this service
|
|
215
|
+
existing = mgr.find_open_incident_for_service(svc_name)
|
|
216
|
+
if existing:
|
|
217
|
+
logger.debug(
|
|
218
|
+
"Skipping incident creation for %s — open incident %s exists",
|
|
219
|
+
svc_name, existing.id,
|
|
220
|
+
)
|
|
221
|
+
return
|
|
222
|
+
|
|
223
|
+
error_info = service_result.get("error") or "unreachable"
|
|
224
|
+
mgr.create_incident(
|
|
225
|
+
title=f"{svc_name} down",
|
|
226
|
+
severity="sev3",
|
|
227
|
+
source="service_health",
|
|
228
|
+
affected_services=[svc_name],
|
|
229
|
+
impact=f"Service unreachable: {error_info}",
|
|
230
|
+
managed_by="lumina",
|
|
231
|
+
created_by="service_health",
|
|
232
|
+
tags=["auto-detected", "service-health"],
|
|
233
|
+
)
|
|
234
|
+
logger.info("Auto-created incident for down service: %s", svc_name)
|
|
235
|
+
except Exception as exc:
|
|
236
|
+
logger.debug("Failed to create incident for %s: %s", service_result.get("name"), exc)
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _auto_resolve_recovered_service(service_result: dict[str, Any]) -> None:
|
|
240
|
+
"""Auto-resolve sev4 incidents when a service recovers."""
|
|
241
|
+
try:
|
|
242
|
+
from . import SHARED_ROOT
|
|
243
|
+
from .itil import ITILManager
|
|
244
|
+
|
|
245
|
+
svc_name = service_result["name"]
|
|
246
|
+
mgr = ITILManager(os.path.expanduser(SHARED_ROOT))
|
|
247
|
+
existing = mgr.find_open_incident_for_service(svc_name)
|
|
248
|
+
if existing is None:
|
|
249
|
+
return
|
|
250
|
+
|
|
251
|
+
if existing.severity.value == "sev4":
|
|
252
|
+
mgr.update_incident(
|
|
253
|
+
existing.id, "service_health",
|
|
254
|
+
new_status="resolved",
|
|
255
|
+
note=f"Service {svc_name} recovered automatically",
|
|
256
|
+
resolution_summary="Auto-resolved: service came back up",
|
|
257
|
+
)
|
|
258
|
+
logger.info("Auto-resolved sev4 incident %s for recovered service %s",
|
|
259
|
+
existing.id, svc_name)
|
|
260
|
+
else:
|
|
261
|
+
mgr.update_incident(
|
|
262
|
+
existing.id, "service_health",
|
|
263
|
+
note=f"Service {svc_name} appears to be back up",
|
|
264
|
+
)
|
|
265
|
+
except Exception as exc:
|
|
266
|
+
logger.debug("Failed to auto-resolve incident for %s: %s",
|
|
267
|
+
service_result.get("name"), exc)
|
|
268
|
+
|
|
269
|
+
|
|
200
270
|
def make_service_health_task() -> callable:
|
|
201
271
|
"""Return a zero-arg callback suitable for TaskScheduler.register().
|
|
202
272
|
|
|
203
273
|
Runs check_all_services() and logs results. Down services are logged
|
|
204
|
-
at WARNING level; all-up is logged at DEBUG level.
|
|
274
|
+
at WARNING level; all-up is logged at DEBUG level. Auto-creates ITIL
|
|
275
|
+
incidents for down services and auto-resolves sev4 incidents for
|
|
276
|
+
recovered services.
|
|
205
277
|
"""
|
|
206
278
|
|
|
207
279
|
def _run() -> None:
|
|
208
280
|
results = check_all_services()
|
|
209
281
|
down = [r for r in results if r["status"] == "down"]
|
|
282
|
+
up = [r for r in results if r["status"] == "up"]
|
|
283
|
+
|
|
210
284
|
if down:
|
|
211
285
|
names = ", ".join(r["name"] for r in down)
|
|
212
286
|
logger.warning(
|
|
@@ -216,8 +290,9 @@ def make_service_health_task() -> callable:
|
|
|
216
290
|
logger.warning(
|
|
217
291
|
" %s (%s): %s", r["name"], r["url"], r["error"] or "unreachable"
|
|
218
292
|
)
|
|
293
|
+
_create_incident_for_down_service(r)
|
|
219
294
|
else:
|
|
220
|
-
up_count =
|
|
295
|
+
up_count = len(up)
|
|
221
296
|
logger.debug(
|
|
222
297
|
"Service health: %d/%d up, %d unknown",
|
|
223
298
|
up_count,
|
|
@@ -225,4 +300,8 @@ def make_service_health_task() -> callable:
|
|
|
225
300
|
len(results) - up_count,
|
|
226
301
|
)
|
|
227
302
|
|
|
303
|
+
# Check for recovered services
|
|
304
|
+
for r in up:
|
|
305
|
+
_auto_resolve_recovered_service(r)
|
|
306
|
+
|
|
228
307
|
return _run
|