shkit 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. healing_kit/__init__.py +3 -0
  2. healing_kit/auth.py +79 -0
  3. healing_kit/clients/__init__.py +1 -0
  4. healing_kit/clients/databricks_client.py +183 -0
  5. healing_kit/clients/teams_client.py +128 -0
  6. healing_kit/models/__init__.py +1 -0
  7. healing_kit/models/diagnosis.py +45 -0
  8. healing_kit/models/events.py +30 -0
  9. healing_kit/models/evidence.py +83 -0
  10. healing_kit/runtime/__init__.py +6 -0
  11. healing_kit/runtime/approval.py +141 -0
  12. healing_kit/runtime/maintenance.py +52 -0
  13. healing_kit/services/__init__.py +1 -0
  14. healing_kit/services/cache_service.py +120 -0
  15. healing_kit/services/circuit_breaker.py +114 -0
  16. healing_kit/services/context_agent.py +127 -0
  17. healing_kit/services/dependency_graph.py +141 -0
  18. healing_kit/services/diagnosis_engine.py +165 -0
  19. healing_kit/services/identity.py +61 -0
  20. healing_kit/services/model_router.py +52 -0
  21. healing_kit/services/query_guard.py +168 -0
  22. healing_kit/services/resolution_verifier.py +100 -0
  23. healing_kit/services/token_budget.py +137 -0
  24. healing_kit/utils/__init__.py +1 -0
  25. healing_kit/utils/error_hash.py +15 -0
  26. healing_kit/utils/hmac_tokens.py +86 -0
  27. healing_kit/utils/sql_safety.py +84 -0
  28. iic/__init__.py +51 -0
  29. iic/__main__.py +18 -0
  30. iic/_console.py +235 -0
  31. iic/_doctor.py +143 -0
  32. iic/change/__init__.py +7 -0
  33. iic/change/change_detector.py +154 -0
  34. iic/context/__init__.py +7 -0
  35. iic/context/context_builder.py +117 -0
  36. iic/dependency/__init__.py +7 -0
  37. iic/dependency/dependency_analyzer.py +93 -0
  38. iic/diagnosis/__init__.py +7 -0
  39. iic/diagnosis/diagnosis_engine.py +183 -0
  40. iic/dna/__init__.py +7 -0
  41. iic/dna/dna_builder.py +184 -0
  42. iic/impact/__init__.py +7 -0
  43. iic/impact/impact_engine.py +102 -0
  44. iic/ingestion/__init__.py +14 -0
  45. iic/ingestion/base.py +21 -0
  46. iic/ingestion/databricks_source.py +98 -0
  47. iic/ingestion/static_source.py +23 -0
  48. iic/ingestion/webhook_source.py +39 -0
  49. iic/models/__init__.py +44 -0
  50. iic/models/change.py +77 -0
  51. iic/models/context.py +46 -0
  52. iic/models/diagnosis.py +37 -0
  53. iic/models/dna.py +77 -0
  54. iic/models/event.py +78 -0
  55. iic/models/impact.py +60 -0
  56. iic/models/report.py +88 -0
  57. iic/models/routing.py +41 -0
  58. iic/notify/__init__.py +7 -0
  59. iic/notify/teams_notifier.py +112 -0
  60. iic/report/__init__.py +7 -0
  61. iic/report/report_generator.py +67 -0
  62. iic/routing/__init__.py +7 -0
  63. iic/routing/router.py +42 -0
  64. iic/runtime/__init__.py +10 -0
  65. iic/runtime/_sql.py +11 -0
  66. iic/runtime/agent_config.py +48 -0
  67. iic/runtime/agent_runtime.py +70 -0
  68. iic/runtime/antibodies.py +100 -0
  69. iic/runtime/bootstrap.py +157 -0
  70. iic/runtime/constants.py +40 -0
  71. iic/runtime/context.py +46 -0
  72. iic/runtime/detective.py +72 -0
  73. iic/runtime/hooks.py +85 -0
  74. iic/runtime/incident_engine.py +207 -0
  75. iic/runtime/inprocess.py +350 -0
  76. iic/runtime/ledger.py +120 -0
  77. iic/runtime/monitor.py +155 -0
  78. iic/runtime/pattern_store.py +53 -0
  79. iic/runtime/reconciler.py +139 -0
  80. iic/runtime/scope_config.py +127 -0
  81. iic/runtime/store.py +150 -0
  82. iic/runtime/wrapper.py +28 -0
  83. iic_autoload.pth +1 -0
  84. onboarding/__init__.py +1 -0
  85. onboarding/cli.py +168 -0
  86. onboarding/config_schema.py +62 -0
  87. onboarding/manifest.py +27 -0
  88. onboarding/preflight.py +129 -0
  89. onboarding/provisioner.py +573 -0
  90. onboarding/rollback.py +81 -0
  91. shkit-1.2.0.dist-info/METADATA +239 -0
  92. shkit-1.2.0.dist-info/RECORD +94 -0
  93. shkit-1.2.0.dist-info/WHEEL +4 -0
  94. shkit-1.2.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,39 @@
1
+ """Logs-webhook ingestion — normalize an arbitrary JSON failure payload.
2
+
3
+ A pure function (no I/O) so it is trivially unit-testable and reusable from a
4
+ Lakehouse webhook, a custom poller, or a test. Unknown shapes degrade gracefully
5
+ to a LOG_WEBHOOK event with placeholder fields rather than raising.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from iic.models.event import EventSource, NormalizedFailureEvent
11
+
12
+
13
def normalize_log_webhook(payload: dict) -> NormalizedFailureEvent:
    """Map a loosely-structured webhook body onto a NormalizedFailureEvent.

    Accepts common aliases (``pipeline``/``job``/``job_name``,
    ``task``/``task_key``/``step``, ``error_message``/``error``/``message``)
    so callers needn't match exactly. For each field the first alias holding
    a truthy value wins and is coerced with ``str``; fields with no usable
    alias fall back to a placeholder (``"unknown"``,
    ``"(no error message)"``) or to the empty string.

    Args:
        payload: Decoded webhook body. ``None`` or any value that is not a
            mapping (for example a JSON array or a bare string) is treated
            as an empty payload instead of raising.

    Returns:
        An event whose ``source`` is ``EventSource.LOG_WEBHOOK``.
    """
    from collections.abc import Mapping

    # A webhook body is untrusted input: valid JSON may decode to a list or a
    # scalar, and calling ``.get`` on those would raise AttributeError.
    p = payload if isinstance(payload, Mapping) else {}

    def pick(*keys, default=""):
        # First alias with a truthy value wins; everything is stringified.
        for k in keys:
            v = p.get(k)
            if v:
                return str(v)
        return default

    return NormalizedFailureEvent(
        source=EventSource.LOG_WEBHOOK,
        pipeline=pick("pipeline", "job", "job_name", default="unknown"),
        task=pick("task", "task_key", "step", default="unknown"),
        error_message=pick("error_message", "error", "message", default="(no error message)"),
        error_trace=pick("error_trace", "stack_trace", "trace"),
        timestamp=pick("timestamp", "time", "ts"),
        run_id=pick("run_id", "run"),
        job_id=pick("job_id"),
        cluster_id=pick("cluster_id", "cluster"),
        notebook_path=pick("notebook_path", "notebook", "path"),
    )
iic/models/__init__.py ADDED
@@ -0,0 +1,44 @@
1
+ """Core IIC data models — the structured objects that flow through the pipeline.
2
+
3
+ The whole system is organised around a small set of immutable-ish dataclasses, in
4
+ the order they are produced by the 11-stage pipeline:
5
+
6
+ NormalizedFailureEvent (stage 2) raw failure, source-agnostic
7
+ IncidentContextBundle (stage 3) everything we gathered about the failure
8
+ ImpactGraph (stage 4) upstream/downstream blast radius
9
+ ChangeDiffObject (stage 5) what changed since the last success
10
+ IncidentDNA (stage 6) structured failure fingerprint (the heart)
11
+ ImpactScore (stage 7) deterministic severity / business risk
12
+ RoutingDecision (stage 8) which model (if any) to use
13
+ DiagnosisResult (stage 9) LLM root-cause + reasoning + evidence
14
+ IncidentReport (stage 10) the final, serialisable product
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ from iic.models.change import ChangeDiffObject, FieldChange
20
+ from iic.models.context import IncidentContextBundle
21
+ from iic.models.diagnosis import DiagnosisResult
22
+ from iic.models.dna import FailureType, IncidentDNA, SystemLayer
23
+ from iic.models.event import EventSource, NormalizedFailureEvent
24
+ from iic.models.impact import BusinessRisk, ImpactScore, Severity
25
+ from iic.models.report import IncidentReport
26
+ from iic.models.routing import ModelTier, RoutingDecision
27
+
28
+ __all__ = [
29
+ "NormalizedFailureEvent",
30
+ "EventSource",
31
+ "IncidentContextBundle",
32
+ "ChangeDiffObject",
33
+ "FieldChange",
34
+ "IncidentDNA",
35
+ "FailureType",
36
+ "SystemLayer",
37
+ "ImpactScore",
38
+ "Severity",
39
+ "BusinessRisk",
40
+ "RoutingDecision",
41
+ "ModelTier",
42
+ "DiagnosisResult",
43
+ "IncidentReport",
44
+ ]
iic/models/change.py ADDED
@@ -0,0 +1,77 @@
1
+ """Stage 5 — the change-diff object.
2
+
3
+ Captures what changed between the last successful execution and the failed one.
4
+ This is a strong, often-decisive signal for root cause: most production failures
5
+ correlate with a recent change (a new column, a config tweak, a runtime bump).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+
12
+
13
@dataclass
class FieldChange:
    """One before/after difference, tagged with the category it belongs to."""

    # One of: schema | config | code | runtime | deployment
    category: str
    field: str
    before: str = ""
    after: str = ""

    def describe(self) -> str:
        """Render the change as a short human-readable phrase."""
        had_value = bool(self.before)
        has_value = bool(self.after)
        if had_value and has_value:
            return f"{self.field}: {self.before} → {self.after}"
        if has_value:
            return f"{self.field}: added ({self.after})"
        if had_value:
            return f"{self.field}: removed (was {self.before})"
        # Neither side is populated: only the field name is known.
        return self.field
30
+
31
+
32
@dataclass
class ChangeDiffObject:
    """What differs between the failed run and the last successful one."""

    has_prior_success: bool = False
    last_success_run_id: str = ""
    schema_changes: list[FieldChange] = field(default_factory=list)
    config_changes: list[FieldChange] = field(default_factory=list)
    code_changes: list[FieldChange] = field(default_factory=list)
    runtime_changes: list[FieldChange] = field(default_factory=list)
    deployment_changes: list[FieldChange] = field(default_factory=list)

    @property
    def all_changes(self) -> list[FieldChange]:
        """Every change in a fresh list, ordered schema → config → code → runtime → deployment."""
        combined = self.schema_changes + self.config_changes
        combined = combined + self.code_changes
        combined = combined + self.runtime_changes
        return combined + self.deployment_changes

    @property
    def has_changes(self) -> bool:
        """True when at least one category holds a change."""
        return len(self.all_changes) > 0

    @property
    def change_count(self) -> int:
        """Total number of changes across all categories."""
        everything = self.all_changes
        return len(everything)

    def summaries(self) -> list[str]:
        """One ``describe()`` string per change, in ``all_changes`` order."""
        described = []
        for change in self.all_changes:
            described.append(change.describe())
        return described

    def to_dict(self) -> dict:
        """Serialise to plain dicts/lists; each change keeps field/before/after."""
        payload = {
            "has_prior_success": self.has_prior_success,
            "last_success_run_id": self.last_success_run_id,
        }
        categories = (
            "schema_changes",
            "config_changes",
            "code_changes",
            "runtime_changes",
            "deployment_changes",
        )
        for name in categories:
            payload[name] = [
                {"field": entry.field, "before": entry.before, "after": entry.after}
                for entry in getattr(self, name)
            ]
        return payload
iic/models/context.py ADDED
@@ -0,0 +1,46 @@
1
+ """Stage 3 — the gathered context bundle.
2
+
3
+ Everything the context builder could collect about a failure, in one place. All
4
+ fields are optional: the builder fills in what it can reach and leaves the rest
5
+ empty rather than failing, so a partial context still flows through the pipeline.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+
12
+
13
@dataclass
class IncidentContextBundle:
    """Container for the evidence collected about one failure."""

    event_id: str
    logs: list[str] = field(default_factory=list)
    notebook_source: str = ""
    cluster_state: dict = field(default_factory=dict)
    job_metadata: dict = field(default_factory=dict)
    # Task keys, resolved from the job DAG.
    upstream_tasks: list[str] = field(default_factory=list)
    downstream_tasks: list[str] = field(default_factory=list)
    # Best-effort schema of the table the failing task writes to, if known.
    schema_snapshot: dict = field(default_factory=dict)
    # Tables referenced by the failing task (parsed from notebook / metadata).
    referenced_tables: list[str] = field(default_factory=list)

    @property
    def error_text(self) -> str:
        """All log entries joined with newlines."""
        separator = "\n"
        return separator.join(self.logs)

    def to_dict(self) -> dict:
        """Serialise the bundle; the notebook source is reported by length only."""
        source_length = len(self.notebook_source)
        return dict(
            event_id=self.event_id,
            logs=self.logs,
            notebook_source_len=source_length,
            cluster_state=self.cluster_state,
            job_metadata=self.job_metadata,
            upstream_tasks=self.upstream_tasks,
            downstream_tasks=self.downstream_tasks,
            schema_snapshot=self.schema_snapshot,
            referenced_tables=self.referenced_tables,
        )
@@ -0,0 +1,37 @@
1
+ """Stage 9 — the diagnosis result.
2
+
3
+ The *only* place an LLM result enters the system, and even here it is structured
4
+ (never free-form chat). When the router decides no LLM is needed, the diagnosis
5
+ engine still returns one of these, populated deterministically from the DNA.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+
12
+
13
@dataclass
class DiagnosisResult:
    """Root-cause analysis, whether produced by rules or the LLM.

    ``confidence`` is normalised on construction to a float in the closed
    range [0.0, 1.0]. Values that cannot be interpreted as a number (for
    example ``None``, ``"high"`` or NaN) become ``0.0`` rather than raising
    or being reported as full confidence.
    """

    root_cause: str
    confidence: float = 0.0  # 0.0–1.0
    reasoning: str = ""
    suggested_fix: str = ""
    evidence: list[str] = field(default_factory=list)
    alternatives: list[str] = field(default_factory=list)
    produced_by: str = "rules"  # "rules" | "llm:<model>"

    def __post_init__(self) -> None:
        """Coerce and clamp ``confidence`` into [0.0, 1.0]."""
        try:
            value = float(self.confidence or 0.0)
        except (TypeError, ValueError, OverflowError):
            # Non-numeric confidence (e.g. the string "high"): treat as unknown.
            value = 0.0
        # NaN is the only float that differs from itself. Without this guard
        # ``max(0.0, min(1.0, nan))`` evaluates to 1.0, i.e. NaN would be
        # reported as 100% confidence.
        if value != value:
            value = 0.0
        self.confidence = max(0.0, min(1.0, value))

    def to_dict(self) -> dict:
        """Serialise to a plain dict; confidence is rounded to 3 decimals."""
        return {
            "root_cause": self.root_cause,
            "confidence": round(self.confidence, 3),
            "reasoning": self.reasoning,
            "suggested_fix": self.suggested_fix,
            "evidence": self.evidence,
            "alternatives": self.alternatives,
            "produced_by": self.produced_by,
        }
iic/models/dna.py ADDED
@@ -0,0 +1,77 @@
1
+ """Stage 6 — IncidentDNA, the core intelligence asset.
2
+
3
+ This is the structured fingerprint of a failure. Everything before it exists to
4
+ build it; everything after it consumes it. It is produced *deterministically*
5
+ (rules over context + changes) so the system's behaviour is predictable and the
6
+ LLM — which only runs later — operates on structure rather than raw text.
7
+
8
+ ``pattern_id`` is a stable, human-readable key (e.g. ``schema_drift_v1``) that lets
9
+ recurring failures be counted, cached, and trended over time.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from dataclasses import dataclass, field
15
+ from enum import Enum
16
+
17
+
18
class FailureType(str, Enum):
    """Canonical failure taxonomy. The DNA builder maps raw errors onto these."""

    # column added/removed/retyped
    SCHEMA_DRIFT = "SCHEMA_DRIFT"
    # nulls, duplicates, constraint/threshold
    DATA_QUALITY = "DATA_QUALITY"
    # empty/absent source table or path
    MISSING_DATA = "MISSING_DATA"
    # access denied, unauthorized
    PERMISSION = "PERMISSION"
    # upstream task/table failed first
    DEPENDENCY = "DEPENDENCY"
    # OOM, executor lost, disk, capacity
    RESOURCE = "RESOURCE"
    # exceeded wall-clock / query timeout
    TIMEOUT = "TIMEOUT"
    # bad/missing parameter, secret, path
    CONFIG = "CONFIG"
    # syntax / type / attribute / import
    CODE_ERROR = "CODE_ERROR"
    # could not be classified
    UNKNOWN = "UNKNOWN"
31
+
32
+
33
class SystemLayer(str, Enum):
    """Infrastructure layer a failure is attributed to."""

    DATABRICKS = "databricks"
    SPARK = "spark"
    STORAGE = "storage"
    NETWORK = "network"
    # Fallback member; values are lowercase strings.
    UNKNOWN = "unknown"
41
+
42
+
43
+ # Medallion / pipeline layer inferred from the asset name.
44
+ _DATA_LAYERS = ("bronze", "silver", "gold", "raw", "staging", "mart", "unknown")
45
+
46
+
47
@dataclass
class IncidentDNA:
    """Deterministic, pre-LLM structured description of a failure."""

    failure_type: FailureType = FailureType.UNKNOWN
    system_layer: SystemLayer = SystemLayer.UNKNOWN
    # bronze | silver | gold | ...
    affected_layer: str = "unknown"
    # the single most telling phrase
    root_signal: str = ""
    # low | medium | high (rule confidence)
    confidence_signature: str = "low"
    # stable recurrence key
    pattern_id: str = "unknown_v1"
    # Evidence the rules fired on — keeps the classification fully traceable.
    signals: list[str] = field(default_factory=list)
    likely_caused_by_change: bool = False

    def __post_init__(self) -> None:
        """Promote string inputs for the two enum fields to enum members."""
        raw_type = self.failure_type
        if isinstance(raw_type, str):
            self.failure_type = FailureType(raw_type)
        raw_layer = self.system_layer
        if isinstance(raw_layer, str):
            self.system_layer = SystemLayer(raw_layer)

    def to_dict(self) -> dict:
        """Serialise to a plain dict, with enum fields as their string values."""
        return dict(
            failure_type=self.failure_type.value,
            system_layer=self.system_layer.value,
            affected_layer=self.affected_layer,
            root_signal=self.root_signal,
            confidence_signature=self.confidence_signature,
            pattern_id=self.pattern_id,
            signals=self.signals,
            likely_caused_by_change=self.likely_caused_by_change,
        )
iic/models/event.py ADDED
@@ -0,0 +1,78 @@
1
+ """Stage 2 — the normalized event model.
2
+
3
+ Every ingestion source (Databricks Jobs API, a logs webhook, …) is responsible
4
+ for emitting this one shape. Nothing downstream of ingestion ever sees a
5
+ source-specific payload, which is what keeps the core source-agnostic.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import hashlib
11
+ import uuid
12
+ from dataclasses import dataclass
13
+ from enum import Enum
14
+
15
+
16
class EventSource(str, Enum):
    """Origin of a failure event. Add a member here when adding an ingester."""

    DATABRICKS = "databricks"
    LOG_WEBHOOK = "log_webhook"
    MANUAL = "manual"
22
+
23
+
24
@dataclass
class NormalizedFailureEvent:
    """One pipeline failure in the shape shared by every ingestion source.

    When no ``event_id`` is supplied it is derived from source, run and task
    if both ``run_id`` and ``task`` are set, so the same failure maps to the
    same id; otherwise a random id is generated. Timestamps are carried as
    strings.
    """

    source: EventSource
    pipeline: str
    task: str
    error_message: str
    timestamp: str
    run_id: str = ""
    job_id: str = ""
    cluster_id: str = ""
    notebook_path: str = ""
    error_trace: str = ""
    event_id: str = ""

    def __post_init__(self) -> None:
        """Coerce a string ``source`` to the enum and fill in a missing id."""
        origin = self.source
        if isinstance(origin, str):
            self.source = EventSource(origin)
        if self.event_id:
            return
        self.event_id = self._derive_id()

    def _derive_id(self) -> str:
        """Return a hash-based id when run and task are known, else a random one."""
        seed = f"{self.source.value}|{self.run_id}|{self.task}"
        if not (self.run_id and self.task):
            # Not enough identity to be deterministic: use a random id.
            return f"EVT-{uuid.uuid4().hex[:12]}"
        fingerprint = hashlib.sha256(seed.encode()).hexdigest()
        return f"EVT-{fingerprint[:12]}"

    @property
    def short_error(self) -> str:
        """First line of the stripped error message, capped at 200 characters."""
        lines = (self.error_message or "").strip().splitlines()
        if not lines:
            return ""
        return lines[0][:200]

    def to_dict(self) -> dict:
        """Serialise to a plain dict, with ``source`` as its string value."""
        return dict(
            event_id=self.event_id,
            source=self.source.value,
            pipeline=self.pipeline,
            task=self.task,
            error_message=self.error_message,
            error_trace=self.error_trace,
            timestamp=self.timestamp,
            run_id=self.run_id,
            job_id=self.job_id,
            cluster_id=self.cluster_id,
            notebook_path=self.notebook_path,
        )
iic/models/impact.py ADDED
@@ -0,0 +1,60 @@
1
+ """Stage 7 — the deterministic impact score.
2
+
3
+ Produced with NO LLM. The score is a pure function of the blast radius and
4
+ recurrence so the same situation always yields the same severity — auditable and
5
+ cheap. Severity/business-risk thresholds live with the engine, not here.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+ from enum import Enum
12
+
13
+
14
class Severity(str, Enum):
    """Incident severity levels; each value equals its member name."""

    LOW = "LOW"
    MEDIUM = "MEDIUM"
    HIGH = "HIGH"
    CRITICAL = "CRITICAL"
19
+
20
+
21
class BusinessRisk(str, Enum):
    """Business-risk levels; each value equals its member name."""

    LOW = "LOW"
    MODERATE = "MODERATE"
    HIGH = "HIGH"
    CRITICAL = "CRITICAL"
26
+
27
+
28
@dataclass
class ImpactScore:
    """Blast-radius and business-risk figures for one incident."""

    raw_score: float = 0.0
    blast_radius: int = 0
    downstream_jobs: int = 0
    affected_tables: int = 0
    dashboard_impact: int = 0
    recurrence_score: int = 0
    severity: Severity = Severity.LOW
    business_risk: BusinessRisk = BusinessRisk.LOW
    # The exact term contributions, for a fully transparent score.
    breakdown: dict = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Promote string inputs for the two enum fields to enum members."""
        level = self.severity
        if isinstance(level, str):
            self.severity = Severity(level)
        risk = self.business_risk
        if isinstance(risk, str):
            self.business_risk = BusinessRisk(risk)

    def to_dict(self) -> dict:
        """Serialise to a plain dict; ``raw_score`` is rounded to 2 decimals."""
        return dict(
            raw_score=round(self.raw_score, 2),
            blast_radius=self.blast_radius,
            downstream_jobs=self.downstream_jobs,
            affected_tables=self.affected_tables,
            dashboard_impact=self.dashboard_impact,
            recurrence_score=self.recurrence_score,
            severity=self.severity.value,
            business_risk=self.business_risk.value,
            breakdown=self.breakdown,
        )
iic/models/report.py ADDED
@@ -0,0 +1,88 @@
1
+ """Stage 10 — the final IncidentReport.
2
+
3
+ The product the whole pipeline exists to produce: a single object that is both
4
+ machine-consumable (``to_dict``) and human-readable (``to_markdown``). It carries
5
+ the full provenance chain (DNA, impact, diagnosis, evidence, timeline) so any
6
+ conclusion can be traced back to the evidence it rests on.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import dataclass, field
12
+ from typing import Optional
13
+
14
+ from iic.models.diagnosis import DiagnosisResult
15
+ from iic.models.dna import IncidentDNA
16
+ from iic.models.impact import ImpactScore
17
+ from iic.models.routing import RoutingDecision
18
+
19
+
20
@dataclass
class IncidentReport:
    """Final incident record, renderable as a dict or as Markdown.

    Bundles the DNA, impact score, diagnosis and optional routing decision
    with the evidence, timeline and change summaries for one incident.
    """

    incident_id: str
    pipeline: str
    task: str
    timestamp: str
    summary: str
    dna: IncidentDNA
    impact: ImpactScore
    diagnosis: DiagnosisResult
    routing: Optional[RoutingDecision] = None
    evidence: list[str] = field(default_factory=list)
    timeline: list[str] = field(default_factory=list)
    changes: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialise the report to a plain dict.

        Root cause, confidence, failure type, pattern id and suggested fix
        are lifted to the top level; impact, diagnosis and routing are
        nested via their own ``to_dict`` (``routing`` is ``None`` when no
        routing decision is attached).
        """
        return {
            "incident_id": self.incident_id,
            "pipeline": self.pipeline,
            "task": self.task,
            "timestamp": self.timestamp,
            "summary": self.summary,
            "root_cause": self.diagnosis.root_cause,
            "confidence": round(self.diagnosis.confidence, 3),
            "failure_type": self.dna.failure_type.value,
            "pattern_id": self.dna.pattern_id,
            "impact": self.impact.to_dict(),
            "diagnosis": self.diagnosis.to_dict(),
            "routing": self.routing.to_dict() if self.routing else None,
            "evidence": self.evidence,
            "changes": self.changes,
            "timeline": self.timeline,
            "suggested_fix": self.diagnosis.suggested_fix,
        }

    def to_markdown(self) -> str:
        """Compact human-readable summary (used by the notifier and logs).

        The changes, suggested-fix, evidence and timeline sections are only
        emitted when they have content.
        """
        imp = self.impact
        # round(), not int(): 0.29 * 100 is 28.999999999999996 in binary
        # floating point, which int() would truncate to 28.
        confidence_pct = round(self.diagnosis.confidence * 100)
        lines = [
            f"# Incident {self.incident_id} — {imp.severity.value}",
            f"**Pipeline:** {self.pipeline} · **Task:** {self.task} · **{self.timestamp}**",
            "",
            f"**Summary:** {self.summary}",
            "",
            f"**Root cause** ({confidence_pct}% confidence): "
            f"{self.diagnosis.root_cause}",
            "",
            "## Impact",
            f"- Severity: **{imp.severity.value}** · Business risk: **{imp.business_risk.value}**",
            f"- Blast radius: {imp.blast_radius} asset(s) · "
            f"{imp.downstream_jobs} downstream job(s) · {imp.dashboard_impact} dashboard(s)",
            "",
            "## Failure DNA",
            f"- Type: `{self.dna.failure_type.value}` · Layer: `{self.dna.affected_layer}` "
            f"· Pattern: `{self.dna.pattern_id}`",
            f"- Root signal: {self.dna.root_signal or 'n/a'}",
        ]
        if self.changes:
            lines += ["", "## Recent changes (since last success)"]
            lines += [f"- {c}" for c in self.changes]
        if self.diagnosis.suggested_fix:
            lines += ["", "## Suggested fix", self.diagnosis.suggested_fix]
        if self.evidence:
            lines += ["", "## Evidence"]
            lines += [f"- {e}" for e in self.evidence]
        if self.timeline:
            lines += ["", "## Timeline"]
            lines += [f"- {t}" for t in self.timeline]
        return "\n".join(lines)
iic/models/routing.py ADDED
@@ -0,0 +1,41 @@
1
+ """Stage 8 — the routing decision.
2
+
3
+ Records which model (if any) the diagnosis stage should use and *why*, so the
4
+ choice is auditable. The router is deterministic: impact + cache state decide the
5
+ tier, never the LLM itself.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+ from enum import Enum
12
+
13
+
14
class ModelTier(str, Enum):
    """Model tiers a routing decision can select."""

    # skip the LLM entirely (cache hit / trivial)
    NONE = "none"
    LIGHTWEIGHT = "lightweight"
    POWERFUL = "powerful"
18
+
19
+
20
@dataclass
class RoutingDecision:
    """Which model tier was chosen for diagnosis, and the recorded reason."""

    tier: ModelTier
    # resolved endpoint name ("" when tier is NONE)
    model: str = ""
    reason: str = ""
    cache_hit: bool = False

    def __post_init__(self) -> None:
        """Promote a string ``tier`` to its ``ModelTier`` member."""
        chosen = self.tier
        if isinstance(chosen, str):
            self.tier = ModelTier(chosen)

    @property
    def requires_llm(self) -> bool:
        """True unless the tier is ``ModelTier.NONE``."""
        skip_tier = ModelTier.NONE
        return self.tier != skip_tier

    def to_dict(self) -> dict:
        """Serialise to a plain dict, with ``tier`` as its string value."""
        return dict(
            tier=self.tier.value,
            model=self.model,
            reason=self.reason,
            cache_hit=self.cache_hit,
        )
iic/notify/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """Stage 11 — notification (optional)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from iic.notify.teams_notifier import TeamsNotifier, build_incident_card
6
+
7
+ __all__ = ["TeamsNotifier", "build_incident_card"]