shkit 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- healing_kit/__init__.py +3 -0
- healing_kit/auth.py +79 -0
- healing_kit/clients/__init__.py +1 -0
- healing_kit/clients/databricks_client.py +183 -0
- healing_kit/clients/teams_client.py +128 -0
- healing_kit/models/__init__.py +1 -0
- healing_kit/models/diagnosis.py +45 -0
- healing_kit/models/events.py +30 -0
- healing_kit/models/evidence.py +83 -0
- healing_kit/runtime/__init__.py +6 -0
- healing_kit/runtime/approval.py +141 -0
- healing_kit/runtime/maintenance.py +52 -0
- healing_kit/services/__init__.py +1 -0
- healing_kit/services/cache_service.py +120 -0
- healing_kit/services/circuit_breaker.py +114 -0
- healing_kit/services/context_agent.py +127 -0
- healing_kit/services/dependency_graph.py +141 -0
- healing_kit/services/diagnosis_engine.py +165 -0
- healing_kit/services/identity.py +61 -0
- healing_kit/services/model_router.py +52 -0
- healing_kit/services/query_guard.py +168 -0
- healing_kit/services/resolution_verifier.py +100 -0
- healing_kit/services/token_budget.py +137 -0
- healing_kit/utils/__init__.py +1 -0
- healing_kit/utils/error_hash.py +15 -0
- healing_kit/utils/hmac_tokens.py +86 -0
- healing_kit/utils/sql_safety.py +84 -0
- iic/__init__.py +51 -0
- iic/__main__.py +18 -0
- iic/_console.py +235 -0
- iic/_doctor.py +143 -0
- iic/change/__init__.py +7 -0
- iic/change/change_detector.py +154 -0
- iic/context/__init__.py +7 -0
- iic/context/context_builder.py +117 -0
- iic/dependency/__init__.py +7 -0
- iic/dependency/dependency_analyzer.py +93 -0
- iic/diagnosis/__init__.py +7 -0
- iic/diagnosis/diagnosis_engine.py +183 -0
- iic/dna/__init__.py +7 -0
- iic/dna/dna_builder.py +184 -0
- iic/impact/__init__.py +7 -0
- iic/impact/impact_engine.py +102 -0
- iic/ingestion/__init__.py +14 -0
- iic/ingestion/base.py +21 -0
- iic/ingestion/databricks_source.py +98 -0
- iic/ingestion/static_source.py +23 -0
- iic/ingestion/webhook_source.py +39 -0
- iic/models/__init__.py +44 -0
- iic/models/change.py +77 -0
- iic/models/context.py +46 -0
- iic/models/diagnosis.py +37 -0
- iic/models/dna.py +77 -0
- iic/models/event.py +78 -0
- iic/models/impact.py +60 -0
- iic/models/report.py +88 -0
- iic/models/routing.py +41 -0
- iic/notify/__init__.py +7 -0
- iic/notify/teams_notifier.py +112 -0
- iic/report/__init__.py +7 -0
- iic/report/report_generator.py +67 -0
- iic/routing/__init__.py +7 -0
- iic/routing/router.py +42 -0
- iic/runtime/__init__.py +10 -0
- iic/runtime/_sql.py +11 -0
- iic/runtime/agent_config.py +48 -0
- iic/runtime/agent_runtime.py +70 -0
- iic/runtime/antibodies.py +100 -0
- iic/runtime/bootstrap.py +157 -0
- iic/runtime/constants.py +40 -0
- iic/runtime/context.py +46 -0
- iic/runtime/detective.py +72 -0
- iic/runtime/hooks.py +85 -0
- iic/runtime/incident_engine.py +207 -0
- iic/runtime/inprocess.py +350 -0
- iic/runtime/ledger.py +120 -0
- iic/runtime/monitor.py +155 -0
- iic/runtime/pattern_store.py +53 -0
- iic/runtime/reconciler.py +139 -0
- iic/runtime/scope_config.py +127 -0
- iic/runtime/store.py +150 -0
- iic/runtime/wrapper.py +28 -0
- iic_autoload.pth +1 -0
- onboarding/__init__.py +1 -0
- onboarding/cli.py +168 -0
- onboarding/config_schema.py +62 -0
- onboarding/manifest.py +27 -0
- onboarding/preflight.py +129 -0
- onboarding/provisioner.py +573 -0
- onboarding/rollback.py +81 -0
- shkit-1.2.0.dist-info/METADATA +239 -0
- shkit-1.2.0.dist-info/RECORD +94 -0
- shkit-1.2.0.dist-info/WHEEL +4 -0
- shkit-1.2.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Logs-webhook ingestion — normalize an arbitrary JSON failure payload.
|
|
2
|
+
|
|
3
|
+
A pure function (no I/O) so it is trivially unit-testable and reusable from a
|
|
4
|
+
Lakehouse webhook, a custom poller, or a test. Unknown shapes degrade gracefully
|
|
5
|
+
to a LOG_WEBHOOK event with placeholder field values rather than raising.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from iic.models.event import EventSource, NormalizedFailureEvent
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def normalize_log_webhook(payload: dict) -> NormalizedFailureEvent:
    """Map a loosely-structured webhook body onto a NormalizedFailureEvent.

    Each event field is looked up under several aliases, in order, so callers
    needn't match exactly:

    * pipeline: ``pipeline`` / ``job`` / ``job_name``
    * task: ``task`` / ``task_key`` / ``step``
    * error_message: ``error_message`` / ``error`` / ``message``
    * error_trace: ``error_trace`` / ``stack_trace`` / ``trace``
    * timestamp: ``timestamp`` / ``time`` / ``ts``
    * run_id: ``run_id`` / ``run``
    * job_id: ``job_id``
    * cluster_id: ``cluster_id`` / ``cluster``
    * notebook_path: ``notebook_path`` / ``notebook`` / ``path``

    The first alias holding a truthy value wins and is converted with ``str()``.

    Args:
        payload: Decoded webhook body. ``None``, an empty mapping, or a
            non-mapping value (for example a JSON list or a bare string) is
            treated as an empty payload instead of raising.

    Returns:
        An event whose ``source`` is always ``EventSource.LOG_WEBHOOK``.
        Missing ``pipeline``/``task`` become ``"unknown"``, a missing error
        message becomes ``"(no error message)"``, and every other missing
        field becomes ``""``.
    """
    # Function-scope import so this fix does not depend on the file's import block.
    from collections.abc import Mapping

    # A webhook body is untrusted JSON: it may decode to a list, string or
    # number. Those have no ``.get`` and used to raise AttributeError here.
    body = payload if isinstance(payload, Mapping) else {}

    def pick(*keys, default=""):
        """Return the first truthy value among ``keys`` as a string, else ``default``."""
        for key in keys:
            value = body.get(key)
            if value:
                return str(value)
        return default

    return NormalizedFailureEvent(
        source=EventSource.LOG_WEBHOOK,
        pipeline=pick("pipeline", "job", "job_name", default="unknown"),
        task=pick("task", "task_key", "step", default="unknown"),
        error_message=pick("error_message", "error", "message", default="(no error message)"),
        error_trace=pick("error_trace", "stack_trace", "trace"),
        timestamp=pick("timestamp", "time", "ts"),
        run_id=pick("run_id", "run"),
        job_id=pick("job_id"),
        cluster_id=pick("cluster_id", "cluster"),
        notebook_path=pick("notebook_path", "notebook", "path"),
    )
|
iic/models/__init__.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Core IIC data models — the structured objects that flow through the pipeline.
|
|
2
|
+
|
|
3
|
+
The whole system is organised around a small set of immutable-ish dataclasses, in
|
|
4
|
+
the order they are produced by the 11-stage pipeline:
|
|
5
|
+
|
|
6
|
+
NormalizedFailureEvent (stage 2) raw failure, source-agnostic
|
|
7
|
+
IncidentContextBundle (stage 3) everything we gathered about the failure
|
|
8
|
+
ImpactGraph (stage 4) upstream/downstream blast radius
|
|
9
|
+
ChangeDiffObject (stage 5) what changed since the last success
|
|
10
|
+
IncidentDNA (stage 6) structured failure fingerprint (the heart)
|
|
11
|
+
ImpactScore (stage 7) deterministic severity / business risk
|
|
12
|
+
RoutingDecision (stage 8) which model (if any) to use
|
|
13
|
+
DiagnosisResult (stage 9) LLM root-cause + reasoning + evidence
|
|
14
|
+
IncidentReport (stage 10) the final, serialisable product
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from iic.models.change import ChangeDiffObject, FieldChange
|
|
20
|
+
from iic.models.context import IncidentContextBundle
|
|
21
|
+
from iic.models.diagnosis import DiagnosisResult
|
|
22
|
+
from iic.models.dna import FailureType, IncidentDNA, SystemLayer
|
|
23
|
+
from iic.models.event import EventSource, NormalizedFailureEvent
|
|
24
|
+
from iic.models.impact import BusinessRisk, ImpactScore, Severity
|
|
25
|
+
from iic.models.report import IncidentReport
|
|
26
|
+
from iic.models.routing import ModelTier, RoutingDecision
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"NormalizedFailureEvent",
|
|
30
|
+
"EventSource",
|
|
31
|
+
"IncidentContextBundle",
|
|
32
|
+
"ChangeDiffObject",
|
|
33
|
+
"FieldChange",
|
|
34
|
+
"IncidentDNA",
|
|
35
|
+
"FailureType",
|
|
36
|
+
"SystemLayer",
|
|
37
|
+
"ImpactScore",
|
|
38
|
+
"Severity",
|
|
39
|
+
"BusinessRisk",
|
|
40
|
+
"RoutingDecision",
|
|
41
|
+
"ModelTier",
|
|
42
|
+
"DiagnosisResult",
|
|
43
|
+
"IncidentReport",
|
|
44
|
+
]
|
iic/models/change.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Stage 5 — the change-diff object.
|
|
2
|
+
|
|
3
|
+
Captures what changed between the last successful execution and the failed one.
|
|
4
|
+
This is a strong, often-decisive signal for root cause: most production failures
|
|
5
|
+
correlate with a recent change (a new column, a config tweak, a runtime bump).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class FieldChange:
    """One before/after delta, tagged with the kind of change it belongs to."""

    category: str  # schema | config | code | runtime | deployment
    field: str
    before: str = ""
    after: str = ""

    def describe(self) -> str:
        """Render the change as a single human-readable line.

        The wording depends on which of ``before`` / ``after`` are non-empty:
        both gives ``field: before → after``, only ``after`` gives an
        "added" line, only ``before`` gives a "removed" line, and neither
        gives just the field name.
        """
        name, old, new = self.field, self.before, self.after
        if old:
            if new:
                return f"{name}: {old} → {new}"
            return f"{name}: removed (was {old})"
        if new:
            return f"{name}: added ({new})"
        return name
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class ChangeDiffObject:
    """What differs between the failed run and the last successful run.

    Changes are kept in five per-category buckets; the helpers below expose
    them as one flat view.
    """

    has_prior_success: bool = False
    last_success_run_id: str = ""
    schema_changes: list[FieldChange] = field(default_factory=list)
    config_changes: list[FieldChange] = field(default_factory=list)
    code_changes: list[FieldChange] = field(default_factory=list)
    runtime_changes: list[FieldChange] = field(default_factory=list)
    deployment_changes: list[FieldChange] = field(default_factory=list)

    @property
    def all_changes(self) -> list[FieldChange]:
        """A new list of every change: schema, config, code, runtime, deployment."""
        combined: list[FieldChange] = []
        for bucket in (
            self.schema_changes,
            self.config_changes,
            self.code_changes,
            self.runtime_changes,
            self.deployment_changes,
        ):
            combined.extend(bucket)
        return combined

    @property
    def has_changes(self) -> bool:
        """True when at least one bucket is non-empty."""
        return len(self.all_changes) > 0

    @property
    def change_count(self) -> int:
        """Total number of changes across all buckets."""
        return len(self.all_changes)

    def summaries(self) -> list[str]:
        """Return ``describe()`` of every change, in ``all_changes`` order."""
        described: list[str] = []
        for change in self.all_changes:
            described.append(change.describe())
        return described

    def to_dict(self) -> dict:
        """Serialise to plain dicts/lists.

        Each change is emitted as ``field``/``before``/``after`` only; its
        category is implied by the bucket key it appears under.
        """
        payload: dict = {
            "has_prior_success": self.has_prior_success,
            "last_success_run_id": self.last_success_run_id,
        }
        for bucket_name in (
            "schema_changes",
            "config_changes",
            "code_changes",
            "runtime_changes",
            "deployment_changes",
        ):
            payload[bucket_name] = [
                {"field": change.field, "before": change.before, "after": change.after}
                for change in getattr(self, bucket_name)
            ]
        return payload
|
iic/models/context.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Stage 3 — the gathered context bundle.
|
|
2
|
+
|
|
3
|
+
Everything the context builder could collect about a failure, in one place. All
|
|
4
|
+
fields are optional: the builder fills in what it can reach and leaves the rest
|
|
5
|
+
empty rather than failing, so a partial context still flows through the pipeline.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class IncidentContextBundle:
    """Evidence collected for one failure; every field except ``event_id`` defaults to empty."""

    event_id: str
    logs: list[str] = field(default_factory=list)
    notebook_source: str = ""
    cluster_state: dict = field(default_factory=dict)
    job_metadata: dict = field(default_factory=dict)
    # Task keys, resolved from the job DAG.
    upstream_tasks: list[str] = field(default_factory=list)
    downstream_tasks: list[str] = field(default_factory=list)
    # Best-effort schema of the table the failing task writes to, if known.
    schema_snapshot: dict = field(default_factory=dict)
    # Tables referenced by the failing task (parsed from notebook / metadata).
    referenced_tables: list[str] = field(default_factory=list)

    @property
    def error_text(self) -> str:
        """All log entries joined with newlines (used by the DNA builder and the LLM prompt)."""
        newline = "\n"
        return newline.join(self.logs)

    def to_dict(self) -> dict:
        """Serialise the bundle; the notebook source is reported by length only, not content."""
        snapshot: dict = {"event_id": self.event_id, "logs": self.logs}
        snapshot["notebook_source_len"] = len(self.notebook_source)
        for name in (
            "cluster_state",
            "job_metadata",
            "upstream_tasks",
            "downstream_tasks",
            "schema_snapshot",
            "referenced_tables",
        ):
            snapshot[name] = getattr(self, name)
        return snapshot
|
iic/models/diagnosis.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Stage 9 — the diagnosis result.
|
|
2
|
+
|
|
3
|
+
The *only* place an LLM result enters the system, and even here it is structured
|
|
4
|
+
(never free-form chat). When the router decides no LLM is needed, the diagnosis
|
|
5
|
+
engine still returns one of these, populated deterministically from the DNA.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class DiagnosisResult:
    """Root-cause analysis, whether produced by rules or the LLM.

    ``confidence`` is normalised on construction to a float in ``[0.0, 1.0]``:
    out-of-range values are clamped, and ``None``, NaN or a value that cannot
    be parsed as a number all become ``0.0``.
    """

    root_cause: str
    confidence: float = 0.0  # 0.0–1.0
    reasoning: str = ""
    suggested_fix: str = ""
    evidence: list[str] = field(default_factory=list)
    alternatives: list[str] = field(default_factory=list)
    produced_by: str = "rules"  # "rules" | "llm:<model>"

    def __post_init__(self) -> None:
        """Coerce and clamp ``confidence`` so later rounding/formatting is safe."""
        # Function-scope import so this fix does not depend on the file's import block.
        import math

        try:
            value = float(self.confidence or 0.0)
        except (TypeError, ValueError):
            # A non-numeric confidence (e.g. "high") is treated like a missing
            # one rather than discarding the whole diagnosis.
            value = 0.0
        # NaN must be handled before clamping: min(1.0, nan) returns 1.0, which
        # would silently report a NaN confidence as 100%.
        if math.isnan(value):
            value = 0.0
        self.confidence = max(0.0, min(1.0, value))

    def to_dict(self) -> dict:
        """Serialise to a plain dict; ``confidence`` is rounded to 3 decimals."""
        return {
            "root_cause": self.root_cause,
            "confidence": round(self.confidence, 3),
            "reasoning": self.reasoning,
            "suggested_fix": self.suggested_fix,
            "evidence": self.evidence,
            "alternatives": self.alternatives,
            "produced_by": self.produced_by,
        }
|
iic/models/dna.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Stage 6 — IncidentDNA, the core intelligence asset.
|
|
2
|
+
|
|
3
|
+
This is the structured fingerprint of a failure. Everything before it exists to
|
|
4
|
+
build it; everything after it consumes it. It is produced *deterministically*
|
|
5
|
+
(rules over context + changes) so the system's behaviour is predictable and the
|
|
6
|
+
LLM — which only runs later — operates on structure rather than raw text.
|
|
7
|
+
|
|
8
|
+
``pattern_id`` is a stable, human-readable key (e.g. ``schema_drift_v1``) that lets
|
|
9
|
+
recurring failures be counted, cached, and trended over time.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from enum import Enum
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FailureType(str, Enum):
    """Closed set of failure categories that raw errors are classified into."""

    SCHEMA_DRIFT = "SCHEMA_DRIFT"  # column added/removed/retyped
    DATA_QUALITY = "DATA_QUALITY"  # nulls, duplicates, constraint/threshold
    MISSING_DATA = "MISSING_DATA"  # empty/absent source table or path
    PERMISSION = "PERMISSION"  # access denied, unauthorized
    DEPENDENCY = "DEPENDENCY"  # upstream task/table failed first
    RESOURCE = "RESOURCE"  # OOM, executor lost, disk, capacity
    TIMEOUT = "TIMEOUT"  # exceeded wall-clock / query timeout
    CONFIG = "CONFIG"  # bad/missing parameter, secret, path
    CODE_ERROR = "CODE_ERROR"  # syntax / type / attribute / import
    UNKNOWN = "UNKNOWN"  # could not be classified


class SystemLayer(str, Enum):
    """Infrastructure layer a failure is attributed to."""

    DATABRICKS = "databricks"
    SPARK = "spark"
    STORAGE = "storage"
    NETWORK = "network"
    UNKNOWN = "unknown"


# Medallion / pipeline layer inferred from the asset name.
_DATA_LAYERS = (
    "bronze",
    "silver",
    "gold",
    "raw",
    "staging",
    "mart",
    "unknown",
)


@dataclass
class IncidentDNA:
    """Structured fingerprint of a failure, built before any LLM is involved.

    ``failure_type`` and ``system_layer`` may be given as enum members or as
    their string values; strings are converted to the enum on construction,
    and an unrecognised string raises ``ValueError``.
    """

    failure_type: FailureType = FailureType.UNKNOWN
    system_layer: SystemLayer = SystemLayer.UNKNOWN
    affected_layer: str = "unknown"  # bronze | silver | gold | ...
    root_signal: str = ""  # the single most telling phrase
    confidence_signature: str = "low"  # low | medium | high (rule confidence)
    pattern_id: str = "unknown_v1"  # stable recurrence key
    # Evidence the rules fired on — keeps the classification fully traceable.
    signals: list[str] = field(default_factory=list)
    likely_caused_by_change: bool = False

    def __post_init__(self) -> None:
        """Convert string-valued enum fields into their enum members."""
        for attr, enum_cls in (
            ("failure_type", FailureType),
            ("system_layer", SystemLayer),
        ):
            raw = getattr(self, attr)
            if isinstance(raw, str):
                setattr(self, attr, enum_cls(raw))

    def to_dict(self) -> dict:
        """Serialise to a plain dict, with enum fields emitted as their string values."""
        serialised: dict = {
            "failure_type": self.failure_type.value,
            "system_layer": self.system_layer.value,
        }
        for name in (
            "affected_layer",
            "root_signal",
            "confidence_signature",
            "pattern_id",
            "signals",
            "likely_caused_by_change",
        ):
            serialised[name] = getattr(self, name)
        return serialised
|
iic/models/event.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Stage 2 — the normalized event model.
|
|
2
|
+
|
|
3
|
+
Every ingestion source (Databricks Jobs API, a logs webhook, …) is responsible
|
|
4
|
+
for emitting this one shape. Nothing downstream of ingestion ever sees a
|
|
5
|
+
source-specific payload, which is what keeps the core source-agnostic.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import hashlib
|
|
11
|
+
import uuid
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from enum import Enum
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class EventSource(str, Enum):
    """Origin of a failure event; add a member here for each new ingester."""

    DATABRICKS = "databricks"
    LOG_WEBHOOK = "log_webhook"
    MANUAL = "manual"


@dataclass
class NormalizedFailureEvent:
    """One pipeline failure in the common, source-independent shape.

    If no ``event_id`` is supplied one is derived on construction: a SHA-256
    based id over source, run id and task when both ``run_id`` and ``task``
    are set (so the same failure yields the same id), otherwise a random
    UUID-based id. ``source`` may be passed as an ``EventSource`` or as its
    string value. Timestamps are carried as strings.
    """

    source: EventSource
    pipeline: str
    task: str
    error_message: str
    timestamp: str
    run_id: str = ""
    job_id: str = ""
    cluster_id: str = ""
    notebook_path: str = ""
    error_trace: str = ""
    event_id: str = ""

    def __post_init__(self) -> None:
        """Coerce a string ``source`` to the enum and fill in a missing ``event_id``."""
        origin = self.source
        if isinstance(origin, str):
            self.source = EventSource(origin)
        if not self.event_id:
            self.event_id = self._derive_id()

    def _derive_id(self) -> str:
        """Build an ``EVT-`` id followed by 12 hex characters."""
        seed = f"{self.source.value}|{self.run_id}|{self.task}"
        reproducible = bool(self.run_id) and bool(self.task)
        # Hash the seed when it identifies the failure; otherwise fall back to a random id.
        token = hashlib.sha256(seed.encode()).hexdigest() if reproducible else uuid.uuid4().hex
        return f"EVT-{token[:12]}"

    @property
    def short_error(self) -> str:
        """First line of the whitespace-stripped error message, capped at 200 characters."""
        lines = (self.error_message or "").strip().splitlines()
        head = next(iter(lines), "")
        return head[:200]

    def to_dict(self) -> dict:
        """Serialise to a plain dict, with ``source`` emitted as its string value."""
        record: dict = {"event_id": self.event_id, "source": self.source.value}
        for name in (
            "pipeline",
            "task",
            "error_message",
            "error_trace",
            "timestamp",
            "run_id",
            "job_id",
            "cluster_id",
            "notebook_path",
        ):
            record[name] = getattr(self, name)
        return record
|
iic/models/impact.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Stage 7 — the deterministic impact score.
|
|
2
|
+
|
|
3
|
+
Produced with NO LLM. The score is a pure function of the blast radius and
|
|
4
|
+
recurrence so the same situation always yields the same severity — auditable and
|
|
5
|
+
cheap. Severity/business-risk thresholds live with the engine, not here.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from enum import Enum
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Severity(str, Enum):
    """Severity levels an impact score can carry."""

    LOW = "LOW"
    MEDIUM = "MEDIUM"
    HIGH = "HIGH"
    CRITICAL = "CRITICAL"


class BusinessRisk(str, Enum):
    """Business-risk levels an impact score can carry."""

    LOW = "LOW"
    MODERATE = "MODERATE"
    HIGH = "HIGH"
    CRITICAL = "CRITICAL"


@dataclass
class ImpactScore:
    """Blast-radius and business-risk assessment for one incident.

    ``severity`` and ``business_risk`` accept either enum members or their
    string values; strings are converted on construction.
    """

    raw_score: float = 0.0
    blast_radius: int = 0
    downstream_jobs: int = 0
    affected_tables: int = 0
    dashboard_impact: int = 0
    recurrence_score: int = 0
    severity: Severity = Severity.LOW
    business_risk: BusinessRisk = BusinessRisk.LOW
    # The exact term contributions, for a fully transparent score.
    breakdown: dict = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Convert string-valued enum fields into their enum members."""
        for attr, enum_cls in (
            ("severity", Severity),
            ("business_risk", BusinessRisk),
        ):
            raw = getattr(self, attr)
            if isinstance(raw, str):
                setattr(self, attr, enum_cls(raw))

    def to_dict(self) -> dict:
        """Serialise to a plain dict; ``raw_score`` is rounded to 2 decimals."""
        out: dict = {"raw_score": round(self.raw_score, 2)}
        for counter in (
            "blast_radius",
            "downstream_jobs",
            "affected_tables",
            "dashboard_impact",
            "recurrence_score",
        ):
            out[counter] = getattr(self, counter)
        out.update(
            severity=self.severity.value,
            business_risk=self.business_risk.value,
            breakdown=self.breakdown,
        )
        return out
|
iic/models/report.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Stage 10 — the final IncidentReport.
|
|
2
|
+
|
|
3
|
+
The product the whole pipeline exists to produce: a single object that is both
|
|
4
|
+
machine-consumable (``to_dict``) and human-readable (``to_markdown``). It carries
|
|
5
|
+
the full provenance chain (DNA, impact, diagnosis, evidence, timeline) so any
|
|
6
|
+
conclusion can be traced back to the evidence it rests on.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
from iic.models.diagnosis import DiagnosisResult
|
|
15
|
+
from iic.models.dna import IncidentDNA
|
|
16
|
+
from iic.models.impact import ImpactScore
|
|
17
|
+
from iic.models.routing import RoutingDecision
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class IncidentReport:
    """Final report for one incident, renderable as a dict or as Markdown.

    Bundles the incident's identity (id, pipeline, task, timestamp, summary)
    with its DNA, impact score, diagnosis and optional routing decision, plus
    free-text evidence, timeline and change lines.
    """

    incident_id: str
    pipeline: str
    task: str
    timestamp: str
    summary: str
    dna: IncidentDNA
    impact: ImpactScore
    diagnosis: DiagnosisResult
    routing: Optional[RoutingDecision] = None
    evidence: list[str] = field(default_factory=list)
    timeline: list[str] = field(default_factory=list)
    changes: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialise the report to a plain dict.

        Headline values (root cause, confidence rounded to 3 decimals,
        failure type, pattern id, suggested fix) are lifted to the top level
        next to the nested ``impact`` / ``diagnosis`` / ``routing`` dicts.
        ``routing`` is ``None`` when no routing decision is attached.
        """
        return {
            "incident_id": self.incident_id,
            "pipeline": self.pipeline,
            "task": self.task,
            "timestamp": self.timestamp,
            "summary": self.summary,
            "root_cause": self.diagnosis.root_cause,
            "confidence": round(self.diagnosis.confidence, 3),
            "failure_type": self.dna.failure_type.value,
            "pattern_id": self.dna.pattern_id,
            "impact": self.impact.to_dict(),
            "diagnosis": self.diagnosis.to_dict(),
            "routing": self.routing.to_dict() if self.routing else None,
            "evidence": self.evidence,
            "changes": self.changes,
            "timeline": self.timeline,
            "suggested_fix": self.diagnosis.suggested_fix,
        }

    def to_markdown(self) -> str:
        """Compact human-readable summary (used by the notifier and logs).

        The changes, suggested-fix, evidence and timeline sections are only
        emitted when they have content.
        """
        imp = self.impact
        # round(), not int(): 0.29 * 100 is 28.999999999999996 in binary
        # floating point, so truncating would display 28% instead of 29%.
        confidence_pct = round(self.diagnosis.confidence * 100)
        lines = [
            f"# Incident {self.incident_id} — {imp.severity.value}",
            f"**Pipeline:** {self.pipeline} · **Task:** {self.task} · **{self.timestamp}**",
            "",
            f"**Summary:** {self.summary}",
            "",
            f"**Root cause** ({confidence_pct}% confidence): "
            f"{self.diagnosis.root_cause}",
            "",
            "## Impact",
            f"- Severity: **{imp.severity.value}** · Business risk: **{imp.business_risk.value}**",
            f"- Blast radius: {imp.blast_radius} asset(s) · "
            f"{imp.downstream_jobs} downstream job(s) · {imp.dashboard_impact} dashboard(s)",
            "",
            "## Failure DNA",
            f"- Type: `{self.dna.failure_type.value}` · Layer: `{self.dna.affected_layer}` "
            f"· Pattern: `{self.dna.pattern_id}`",
            f"- Root signal: {self.dna.root_signal or 'n/a'}",
        ]
        if self.changes:
            lines += ["", "## Recent changes (since last success)"]
            lines += [f"- {c}" for c in self.changes]
        if self.diagnosis.suggested_fix:
            lines += ["", "## Suggested fix", self.diagnosis.suggested_fix]
        if self.evidence:
            lines += ["", "## Evidence"]
            lines += [f"- {e}" for e in self.evidence]
        if self.timeline:
            lines += ["", "## Timeline"]
            lines += [f"- {t}" for t in self.timeline]
        return "\n".join(lines)
|
iic/models/routing.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Stage 8 — the routing decision.
|
|
2
|
+
|
|
3
|
+
Records which model (if any) the diagnosis stage should use and *why*, so the
|
|
4
|
+
choice is auditable. The router is deterministic: impact + cache state decide the
|
|
5
|
+
tier, never the LLM itself.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from enum import Enum
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ModelTier(str, Enum):
    """Model tiers the router can select."""

    NONE = "none"  # skip the LLM entirely (cache hit / trivial)
    LIGHTWEIGHT = "lightweight"
    POWERFUL = "powerful"


@dataclass
class RoutingDecision:
    """Which model tier was chosen for diagnosis, and the recorded reason.

    ``tier`` may be passed as a ``ModelTier`` or as its string value; a string
    is converted to the enum on construction.
    """

    tier: ModelTier
    model: str = ""  # resolved endpoint name ("" when tier is NONE)
    reason: str = ""
    cache_hit: bool = False

    def __post_init__(self) -> None:
        """Convert a string ``tier`` into its ``ModelTier`` member."""
        chosen = self.tier
        if isinstance(chosen, str):
            self.tier = ModelTier(chosen)

    @property
    def requires_llm(self) -> bool:
        """True for every tier other than ``ModelTier.NONE``."""
        skip_llm = self.tier == ModelTier.NONE
        return not skip_llm

    def to_dict(self) -> dict:
        """Serialise to a plain dict, with ``tier`` emitted as its string value."""
        record: dict = {"tier": self.tier.value}
        for name in ("model", "reason", "cache_hit"):
            record[name] = getattr(self, name)
        return record
|