@smilintux/skcapstone 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -374,6 +374,50 @@ def make_dreaming_task(
374
374
  return _run
375
375
 
376
376
 
377
def make_itil_auto_close_task(home: Path) -> Callable[[], None]:
    """Build the scheduler callback that auto-closes resolved ITIL incidents.

    Args:
        home: Shared root directory handed to ``ITILManager``.

    Returns:
        A zero-argument callable. Each invocation asks the manager to
        auto-close resolved incidents using a 24-hour stability window
        and logs what was closed (INFO) or that nothing was (DEBUG).
    """

    def _run() -> None:
        # Imported at call time rather than when the task is built.
        from .itil import ITILManager

        closed_items = ITILManager(home).auto_close_resolved(stable_hours=24)
        if not closed_items:
            logger.debug("ITIL auto-close: no incidents to close")
            return
        logger.info(
            "ITIL auto-close: %d incident(s) closed: %s",
            len(closed_items),
            closed_items,
        )

    return _run
395
+
396
+
397
def make_itil_escalation_task(home: Path) -> Callable[[], None]:
    """Build the scheduler callback that reports SLA breaches on incidents.

    Args:
        home: Shared root directory handed to ``ITILManager``.

    Returns:
        A zero-argument callable. Each invocation queries the manager for
        SLA breaches and emits one WARNING per breach, or a single DEBUG
        line when there are none.
    """

    def _run() -> None:
        # Imported at call time rather than when the task is built.
        from .itil import ITILManager

        breaches = ITILManager(home).check_sla_breaches()
        if not breaches:
            logger.debug("ITIL escalation check: no SLA breaches")
            return

        # Each breach is read as a mapping with the four keys used below.
        for breach in breaches:
            logger.warning(
                "ITIL SLA breach: %s (%s) unacknowledged for %d min (limit: %d min)",
                breach["id"],
                breach["severity"],
                breach["elapsed_minutes"],
                breach["sla_minutes"],
            )

    return _run
419
+
420
+
377
421
  # ---------------------------------------------------------------------------
378
422
  # Convenience builder
379
423
  # ---------------------------------------------------------------------------
@@ -481,4 +525,22 @@ def build_scheduler(
481
525
  except ImportError:
482
526
  logger.debug("service_health not available — service_health_check task skipped")
483
527
 
528
+ # ITIL escalation check — SLA breach detection every 5 minutes
529
+ try:
530
+ from . import SHARED_ROOT
531
+
532
+ shared = Path(SHARED_ROOT).expanduser()
533
+ scheduler.register(
534
+ name="itil_escalation_check",
535
+ interval_seconds=300, # 5 minutes
536
+ callback=make_itil_escalation_task(shared),
537
+ )
538
+ scheduler.register(
539
+ name="itil_auto_close",
540
+ interval_seconds=1800, # 30 minutes
541
+ callback=make_itil_auto_close_task(shared),
542
+ )
543
+ except Exception:
544
+ logger.debug("ITIL scheduled tasks not available — skipped")
545
+
484
546
  return scheduler
@@ -197,16 +197,90 @@ def check_all_services() -> list[dict[str, Any]]:
197
197
  # ---------------------------------------------------------------------------
198
198
 
199
199
 
200
def _create_incident_for_down_service(service_result: dict[str, Any]) -> None:
    """Open an ITIL incident for a down service unless one is already open.

    The whole operation is best-effort: any exception is logged at DEBUG
    level and swallowed so the health check itself is never interrupted.

    Args:
        service_result: Health-check result for one service. Its ``"name"``
            key is required; ``"error"`` is optional and is used to
            describe the impact.
    """
    try:
        from . import SHARED_ROOT
        from .itil import ITILManager

        service = service_result["name"]
        manager = ITILManager(os.path.expanduser(SHARED_ROOT))

        # Deduplicate: an already-open incident for this service wins.
        open_incident = manager.find_open_incident_for_service(service)
        if open_incident:
            logger.debug(
                "Skipping incident creation for %s — open incident %s exists",
                service,
                open_incident.id,
            )
        else:
            reason = service_result.get("error") or "unreachable"
            manager.create_incident(
                title=f"{service} down",
                severity="sev3",
                source="service_health",
                affected_services=[service],
                impact=f"Service unreachable: {reason}",
                managed_by="lumina",
                created_by="service_health",
                tags=["auto-detected", "service-health"],
            )
            logger.info("Auto-created incident for down service: %s", service)
    except Exception as exc:
        logger.debug(
            "Failed to create incident for %s: %s",
            service_result.get("name"),
            exc,
        )
237
+
238
+
239
def _auto_resolve_recovered_service(service_result: dict[str, Any]) -> None:
    """Update the open incident, if any, for a service that is up again.

    An open incident whose severity value is ``"sev4"`` is moved to
    ``resolved``; an open incident of any other severity only receives a
    note saying the service appears to be back. Best-effort: any exception
    is logged at DEBUG level and swallowed.

    Args:
        service_result: Health-check result for one service; its ``"name"``
            key is required.
    """
    try:
        from . import SHARED_ROOT
        from .itil import ITILManager

        service = service_result["name"]
        manager = ITILManager(os.path.expanduser(SHARED_ROOT))

        incident = manager.find_open_incident_for_service(service)
        if incident is None:
            return

        if incident.severity.value != "sev4":
            # NOTE(review): this branch runs on every call while a non-sev4
            # incident stays open, so the same note may be appended
            # repeatedly — confirm whether update_incident deduplicates.
            manager.update_incident(
                incident.id,
                "service_health",
                note=f"Service {service} appears to be back up",
            )
            return

        manager.update_incident(
            incident.id,
            "service_health",
            new_status="resolved",
            note=f"Service {service} recovered automatically",
            resolution_summary="Auto-resolved: service came back up",
        )
        logger.info(
            "Auto-resolved sev4 incident %s for recovered service %s",
            incident.id,
            service,
        )
    except Exception as exc:
        logger.debug(
            "Failed to auto-resolve incident for %s: %s",
            service_result.get("name"),
            exc,
        )
268
+
269
+
200
270
  def make_service_health_task() -> callable:
201
271
  """Return a zero-arg callback suitable for TaskScheduler.register().
202
272
 
203
273
  Runs check_all_services() and logs results. Down services are logged
204
- at WARNING level; all-up is logged at DEBUG level.
274
+ at WARNING level; all-up is logged at DEBUG level. Auto-creates ITIL
275
+ incidents for down services and auto-resolves sev4 incidents for
276
+ recovered services.
205
277
  """
206
278
 
207
279
  def _run() -> None:
208
280
  results = check_all_services()
209
281
  down = [r for r in results if r["status"] == "down"]
282
+ up = [r for r in results if r["status"] == "up"]
283
+
210
284
  if down:
211
285
  names = ", ".join(r["name"] for r in down)
212
286
  logger.warning(
@@ -216,8 +290,9 @@ def make_service_health_task() -> callable:
216
290
  logger.warning(
217
291
  " %s (%s): %s", r["name"], r["url"], r["error"] or "unreachable"
218
292
  )
293
+ _create_incident_for_down_service(r)
219
294
  else:
220
- up_count = sum(1 for r in results if r["status"] == "up")
295
+ up_count = len(up)
221
296
  logger.debug(
222
297
  "Service health: %d/%d up, %d unknown",
223
298
  up_count,
@@ -225,4 +300,8 @@ def make_service_health_task() -> callable:
225
300
  len(results) - up_count,
226
301
  )
227
302
 
303
+ # Check for recovered services
304
+ for r in up:
305
+ _auto_resolve_recovered_service(r)
306
+
228
307
  return _run