shkit 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- healing_kit/__init__.py +3 -0
- healing_kit/auth.py +79 -0
- healing_kit/clients/__init__.py +1 -0
- healing_kit/clients/databricks_client.py +183 -0
- healing_kit/clients/teams_client.py +128 -0
- healing_kit/models/__init__.py +1 -0
- healing_kit/models/diagnosis.py +45 -0
- healing_kit/models/events.py +30 -0
- healing_kit/models/evidence.py +83 -0
- healing_kit/runtime/__init__.py +6 -0
- healing_kit/runtime/approval.py +141 -0
- healing_kit/runtime/maintenance.py +52 -0
- healing_kit/services/__init__.py +1 -0
- healing_kit/services/cache_service.py +120 -0
- healing_kit/services/circuit_breaker.py +114 -0
- healing_kit/services/context_agent.py +127 -0
- healing_kit/services/dependency_graph.py +141 -0
- healing_kit/services/diagnosis_engine.py +165 -0
- healing_kit/services/identity.py +61 -0
- healing_kit/services/model_router.py +52 -0
- healing_kit/services/query_guard.py +168 -0
- healing_kit/services/resolution_verifier.py +100 -0
- healing_kit/services/token_budget.py +137 -0
- healing_kit/utils/__init__.py +1 -0
- healing_kit/utils/error_hash.py +15 -0
- healing_kit/utils/hmac_tokens.py +86 -0
- healing_kit/utils/sql_safety.py +84 -0
- iic/__init__.py +51 -0
- iic/__main__.py +18 -0
- iic/_console.py +235 -0
- iic/_doctor.py +143 -0
- iic/change/__init__.py +7 -0
- iic/change/change_detector.py +154 -0
- iic/context/__init__.py +7 -0
- iic/context/context_builder.py +117 -0
- iic/dependency/__init__.py +7 -0
- iic/dependency/dependency_analyzer.py +93 -0
- iic/diagnosis/__init__.py +7 -0
- iic/diagnosis/diagnosis_engine.py +183 -0
- iic/dna/__init__.py +7 -0
- iic/dna/dna_builder.py +184 -0
- iic/impact/__init__.py +7 -0
- iic/impact/impact_engine.py +102 -0
- iic/ingestion/__init__.py +14 -0
- iic/ingestion/base.py +21 -0
- iic/ingestion/databricks_source.py +98 -0
- iic/ingestion/static_source.py +23 -0
- iic/ingestion/webhook_source.py +39 -0
- iic/models/__init__.py +44 -0
- iic/models/change.py +77 -0
- iic/models/context.py +46 -0
- iic/models/diagnosis.py +37 -0
- iic/models/dna.py +77 -0
- iic/models/event.py +78 -0
- iic/models/impact.py +60 -0
- iic/models/report.py +88 -0
- iic/models/routing.py +41 -0
- iic/notify/__init__.py +7 -0
- iic/notify/teams_notifier.py +112 -0
- iic/report/__init__.py +7 -0
- iic/report/report_generator.py +67 -0
- iic/routing/__init__.py +7 -0
- iic/routing/router.py +42 -0
- iic/runtime/__init__.py +10 -0
- iic/runtime/_sql.py +11 -0
- iic/runtime/agent_config.py +48 -0
- iic/runtime/agent_runtime.py +70 -0
- iic/runtime/antibodies.py +100 -0
- iic/runtime/bootstrap.py +157 -0
- iic/runtime/constants.py +40 -0
- iic/runtime/context.py +46 -0
- iic/runtime/detective.py +72 -0
- iic/runtime/hooks.py +85 -0
- iic/runtime/incident_engine.py +207 -0
- iic/runtime/inprocess.py +350 -0
- iic/runtime/ledger.py +120 -0
- iic/runtime/monitor.py +155 -0
- iic/runtime/pattern_store.py +53 -0
- iic/runtime/reconciler.py +139 -0
- iic/runtime/scope_config.py +127 -0
- iic/runtime/store.py +150 -0
- iic/runtime/wrapper.py +28 -0
- iic_autoload.pth +1 -0
- onboarding/__init__.py +1 -0
- onboarding/cli.py +168 -0
- onboarding/config_schema.py +62 -0
- onboarding/manifest.py +27 -0
- onboarding/preflight.py +129 -0
- onboarding/provisioner.py +573 -0
- onboarding/rollback.py +81 -0
- shkit-1.2.0.dist-info/METADATA +239 -0
- shkit-1.2.0.dist-info/RECORD +94 -0
- shkit-1.2.0.dist-info/WHEEL +4 -0
- shkit-1.2.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""Stage 9 — the diagnosis engine.
|
|
2
|
+
|
|
3
|
+
The only place an LLM enters the system, and only after the DNA, impact, and
|
|
4
|
+
routing decision exist. Two paths:
|
|
5
|
+
|
|
6
|
+
* ``tier == NONE`` → no LLM. A deterministic narrative is synthesised from the
|
|
7
|
+
DNA (cache replay, or a derived-dependency pointer, or a templated explanation
|
|
8
|
+
for a confidently-classified failure). The system is fully useful with zero
|
|
9
|
+
tokens spent.
|
|
10
|
+
* LLM tier → a strict-JSON prompt built *from the structured DNA* (not raw
|
|
11
|
+
text), parsed into a structured :class:`DiagnosisResult`. The model interprets
|
|
12
|
+
structure; it does not classify or chat.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import json
|
|
18
|
+
|
|
19
|
+
from iic.models.change import ChangeDiffObject
|
|
20
|
+
from iic.models.context import IncidentContextBundle
|
|
21
|
+
from iic.models.diagnosis import DiagnosisResult
|
|
22
|
+
from iic.models.dna import FailureType, IncidentDNA
|
|
23
|
+
from iic.models.event import NormalizedFailureEvent
|
|
24
|
+
from iic.models.routing import RoutingDecision
|
|
25
|
+
|
|
26
|
+
_SYSTEM_MSG = (
|
|
27
|
+
"You are a senior data reliability engineer. You are given a STRUCTURED incident "
|
|
28
|
+
"fingerprint that was already classified deterministically. Do not re-classify; "
|
|
29
|
+
"explain the root cause and propose a concrete fix. Respond ONLY in valid JSON. "
|
|
30
|
+
"Never suggest DROP, DELETE, or TRUNCATE."
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
_JSON_SPEC = (
|
|
34
|
+
'\n\nReturn ONLY this JSON object:\n'
|
|
35
|
+
'{"root_cause": "one-sentence root cause", "confidence": 0.0-1.0, '
|
|
36
|
+
'"reasoning": "why, citing the signals/changes", '
|
|
37
|
+
'"suggested_fix": "concrete remediation step", '
|
|
38
|
+
'"alternatives": ["other hypothesis", "..."]}'
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# Canned (root cause, suggested fix) pairs, one per failure type. They are the
# whole diagnosis when no LLM is called and the fallback when the model errors
# or returns something unusable.
_TEMPLATES: dict[FailureType, tuple[str, str]] = {
    FailureType.SCHEMA_DRIFT: (
        "The failing task hit a schema mismatch — a column it expects is missing or changed type.",
        "Reconcile the ingestion/transform mapping with the new source schema"
        " (add/rename the column or enable schema evolution), then re-run.",
    ),
    FailureType.DATA_QUALITY: (
        "A data-quality rule failed (duplicates, nulls, or a threshold/constraint breach).",
        "Inspect the offending rows from the evidence query, fix the source/transform,"
        " and re-run the quality gate.",
    ),
    FailureType.MISSING_DATA: (
        "An expected input table or file path was empty or absent when the task ran.",
        "Confirm the upstream load completed and the path/table exists, then re-run;"
        " add a presence check upstream.",
    ),
    FailureType.PERMISSION: (
        "The executing identity lacks a required privilege on a table, path, or secret.",
        "Grant the missing privilege to the run identity (least-privilege) and re-run;"
        " no data change needed.",
    ),
    FailureType.DEPENDENCY: (
        "This task did not fail on its own — an upstream task in the same run failed first.",
        "Resolve the upstream root-cause incident; this task should recover on its re-run.",
    ),
    FailureType.RESOURCE: (
        "The task exhausted compute resources (memory/executors/disk).",
        "Increase cluster size or partition the workload; re-run with adjusted resources.",
    ),
    FailureType.TIMEOUT: (
        "The task exceeded its time budget.",
        "Raise the timeout or optimise the slow stage (skew/partitioning), then re-run.",
    ),
    FailureType.CONFIG: (
        "A required configuration value, parameter, or secret was missing or invalid.",
        "Correct the parameter/secret value in the job/secret scope and re-run.",
    ),
    FailureType.CODE_ERROR: (
        "The notebook raised a code-level exception (syntax/type/import).",
        "Fix the code defect indicated by the traceback and redeploy/re-run.",
    ),
    # Catch-all entry; also the lookup default for any type without its own template.
    FailureType.UNKNOWN: (
        "The failure could not be classified deterministically.",
        "Review the captured logs and notebook source; escalate to the owning engineer.",
    ),
}
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class DiagnosisEngine:
    """Produces a structured :class:`DiagnosisResult` for an incident.

    :meth:`diagnose` picks one of two paths per incident:

    * deterministic — a templated narrative looked up by ``dna.failure_type``;
    * LLM — a prompt built from the structured DNA, whose JSON reply is
      validated field by field before any of it is used.

    Every failure on the LLM path (client error, unparseable or non-object
    reply) degrades to the deterministic result instead of raising.
    """

    def __init__(self, client=None):
        # Optional model client. With ``None`` every diagnosis is deterministic.
        self.client = client

    def diagnose(
        self,
        event: NormalizedFailureEvent,
        dna: IncidentDNA,
        routing: RoutingDecision,
        context: IncidentContextBundle | None = None,
        change_diff: ChangeDiffObject | None = None,
        evidence: list[str] | None = None,
    ) -> tuple[DiagnosisResult, int]:
        """Return ``(DiagnosisResult, tokens_used)``.

        The LLM is called only when ``routing.requires_llm`` is truthy and a
        client was supplied; otherwise, or if the call raises, the deterministic
        template is returned with ``tokens_used == 0``.
        """
        if not routing.requires_llm or self.client is None:
            return self._deterministic(dna, routing, change_diff), 0

        prompt = self._build_prompt(event, dna, context, change_diff, evidence)
        try:
            content, usage = self.client.invoke_model_full(
                routing.model,
                [{"role": "system", "content": _SYSTEM_MSG}, {"role": "user", "content": prompt}],
                max_tokens=900,
            )
        except Exception as ex:
            # Deliberate best-effort: a model outage must not block the report.
            result = self._deterministic(dna, routing, change_diff)
            result.reasoning = f"LLM unavailable ({str(ex)[:80]}); used deterministic template."
            return result, 0

        tokens = int((usage or {}).get("total_tokens", 0) or 0)
        return self._parse(content, dna, routing), tokens

    # ─── deterministic path ───

    def _deterministic(self, dna: IncidentDNA, routing: RoutingDecision,
                       change_diff: ChangeDiffObject | None) -> DiagnosisResult:
        """Build a templated diagnosis from the DNA without calling a model."""
        root_cause, fix = _TEMPLATES.get(dna.failure_type, _TEMPLATES[FailureType.UNKNOWN])
        confidence = {"high": 0.85, "medium": 0.6, "low": 0.35}.get(dna.confidence_signature, 0.4)
        evidence = list(dna.signals)
        if change_diff and change_diff.has_changes:
            # At most three change summaries, so the evidence list stays short.
            evidence += [f"change: {s}" for s in change_diff.summaries()[:3]]
        reasoning = routing.reason or "Deterministic classification; no LLM required."
        return DiagnosisResult(
            root_cause=root_cause,
            confidence=confidence,
            reasoning=reasoning,
            suggested_fix=fix,
            evidence=evidence,
            produced_by="rules",
        )

    # ─── LLM path ───

    def _build_prompt(self, event, dna, context, change_diff, evidence) -> str:
        """Render the user prompt from the structured DNA plus truncated raw context."""
        changes = (change_diff.summaries() if change_diff else []) or ["none detected"]
        ev = evidence or []
        nb = (context.notebook_source[:1200] if context and context.notebook_source else "")
        # Guard against a missing message: this runs outside the try/except in
        # diagnose(), so a None here would otherwise abort the whole diagnosis.
        error_text = (event.error_message or "")[:1500]
        return (
            f"INCIDENT FINGERPRINT (pre-classified):\n"
            f"- failure_type: {dna.failure_type.value}\n"
            f"- affected_layer: {dna.affected_layer}\n"
            f"- system_layer: {dna.system_layer.value}\n"
            f"- root_signal: {dna.root_signal}\n"
            f"- signals: {', '.join(dna.signals) or 'n/a'}\n"
            f"- pattern_id: {dna.pattern_id}\n\n"
            f"PIPELINE: {event.pipeline} · TASK: {event.task}\n"
            f"ERROR:\n{error_text}\n\n"
            f"RECENT CHANGES (since last success):\n- " + "\n- ".join(changes[:8]) + "\n\n"
            "DATA EVIDENCE:\n- " + ("\n- ".join(ev[:6]) if ev else "none") + "\n\n"
            + (f"NOTEBOOK (truncated):\n{nb}\n" if nb else "")
            + _JSON_SPEC
        )

    @staticmethod
    def _coerce_confidence(value, default: float = 0.5) -> float:
        """Turn a model-supplied confidence into a float clamped to ``[0.0, 1.0]``.

        Missing, boolean, non-numeric and NaN values yield ``default``.
        """
        if value is None or isinstance(value, bool):
            return default
        try:
            number = float(value)
        except (TypeError, ValueError):
            return default
        if number != number:  # NaN is the only value that is not equal to itself
            return default
        return min(1.0, max(0.0, number))

    @staticmethod
    def _clip_text(value, limit: int) -> str:
        """Stringify ``value`` and cut it to ``limit`` characters; ``None`` becomes ``""``."""
        return "" if value is None else str(value)[:limit]

    def _parse(self, content: str, dna: IncidentDNA, routing: RoutingDecision) -> DiagnosisResult:
        """Parse the model reply into a :class:`DiagnosisResult`.

        Strips an optional Markdown code fence, then requires the payload to be
        a JSON object. Anything else (invalid JSON, or a JSON array/scalar)
        falls back to the deterministic template.
        """
        cleaned = (content or "").strip()
        if cleaned.startswith("```"):
            # Drop the opening fence line (which may carry a language tag).
            cleaned = cleaned.split("\n", 1)[-1]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        try:
            data = json.loads(cleaned.strip())
        except Exception:
            data = None
        if not isinstance(data, dict):
            result = self._deterministic(dna, routing, None)
            result.reasoning = "LLM returned unparseable output; used deterministic template."
            return result

        # Same fallback lookup as _deterministic, so an unmapped type cannot raise.
        template_cause, _ = _TEMPLATES.get(dna.failure_type, _TEMPLATES[FailureType.UNKNOWN])

        alternatives = data.get("alternatives")
        if isinstance(alternatives, str):
            # A bare string is one hypothesis, not a sequence of characters.
            alternatives = [alternatives]
        elif not isinstance(alternatives, (list, tuple)):
            alternatives = []

        return DiagnosisResult(
            root_cause=self._clip_text(data.get("root_cause"), 500) or template_cause,
            confidence=self._coerce_confidence(data.get("confidence")),
            reasoning=self._clip_text(data.get("reasoning"), 800),
            suggested_fix=self._clip_text(data.get("suggested_fix"), 500),
            alternatives=[str(a)[:200] for a in alternatives][:4],
            evidence=list(dna.signals),
            produced_by=f"llm:{routing.model}",
        )
|
iic/dna/__init__.py
ADDED
iic/dna/dna_builder.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""Stage 6 — build the IncidentDNA. The heart of the system.
|
|
2
|
+
|
|
3
|
+
This is *deterministic, rule-based* classification: it maps the raw error text,
|
|
4
|
+
the gathered context, and the change diff onto the canonical
|
|
5
|
+
:class:`FailureType` taxonomy, and records exactly which signals fired so the
|
|
6
|
+
result is fully traceable. The LLM never runs here — it runs later, on the DNA
|
|
7
|
+
this stage produces.
|
|
8
|
+
|
|
9
|
+
Why rules and not an LLM for classification:
|
|
10
|
+
* predictable, testable, free, and instant;
|
|
11
|
+
* the classification drives cost decisions (model routing), so it must not
|
|
12
|
+
itself cost an LLM call;
|
|
13
|
+
* a wrong-but-confident LLM label would poison everything downstream.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from iic.models.change import ChangeDiffObject
|
|
19
|
+
from iic.models.context import IncidentContextBundle
|
|
20
|
+
from iic.models.dna import FailureType, IncidentDNA, SystemLayer
|
|
21
|
+
from iic.models.event import NormalizedFailureEvent
|
|
22
|
+
|
|
23
|
+
# Classification rules, scanned top to bottom; the first failure type with at
# least one matching pattern wins (the dependency override is applied before
# this table is consulted). Patterns are lowercased substrings, and their order
# within a rule is the order the fired signals are recorded in.
# NOTE(review): some patterns are very broad — a bare "403" matches any text
# containing those digits (ids, row counts), and "secret" / "configuration" /
# "constraint" match ordinary prose. Consider anchoring them.
_RULES: list[tuple[FailureType, tuple[str, ...]]] = [
    (
        FailureType.PERMISSION,
        (
            "permission denied", "access denied", "unauthorized", "not authorized",
            "forbidden", "does not have privilege", "requires permission", "403",
        ),
    ),
    (
        FailureType.SCHEMA_DRIFT,
        (
            "no such column", "cannot resolve column", "cannot resolve '",
            "unresolved column", "missing column", "schema mismatch",
            "incompatible schema", "schema is not compatible", "field not found",
            "column not found", "mergeschema", "schema drift",
        ),
    ),
    (
        FailureType.MISSING_DATA,
        (
            "path does not exist", "table or view not found", "no such table",
            "file not found", "filenotfound", "does not exist", "no files found",
            "empty input", "0 rows", "returned no rows",
        ),
    ),
    (
        FailureType.DATA_QUALITY,
        (
            "duplicate", "constraint", "expectation", "check constraint",
            "not null", "null value", "validation failed", "data quality",
            "threshold", "violat",
        ),
    ),
    (
        FailureType.RESOURCE,
        (
            "out of memory", "outofmemory", "java heap", "gc overhead",
            "executor lost", "container killed", "no space left", "disk is full",
            "exceeded memory", "lost executor", "exceeds the allowed",
        ),
    ),
    (
        FailureType.TIMEOUT,
        (
            "timeout", "timed out", "deadline exceeded", "exceeded the timeout",
            "query has exceeded the time", "cancelled because it exceeded",
        ),
    ),
    (
        FailureType.CONFIG,
        (
            "not found in scope", "missing required", "invalid argument",
            "secret", "environment variable", "missing parameter", "no value for",
            "configuration", "could not find a value",
        ),
    ),
    (
        FailureType.CODE_ERROR,
        (
            "syntaxerror", "typeerror", "attributeerror", "nameerror",
            "importerror", "modulenotfounderror", "valueerror", "indexerror",
            "keyerror", "zerodivisionerror", "traceback (most recent call last)",
        ),
    ),
]

# System-layer hints (lowercased substrings); the first layer with any hit wins.
# NOTE(review): "s3" and "disk" are short enough to match unrelated text.
_LAYER_HINTS: list[tuple[SystemLayer, tuple[str, ...]]] = [
    (
        SystemLayer.SPARK,
        ("spark", "executor", "java heap", "stage failure", "py4j", "analysisexception"),
    ),
    (
        SystemLayer.STORAGE,
        ("s3", "abfss", "dbfs", "adls", "blob", "no space", "disk", "path does not exist"),
    ),
    (
        SystemLayer.NETWORK,
        ("connection", "timeout", "timed out", "unreachable", "dns", "socket"),
    ),
]

# Data-layer names searched for (as substrings) in task, notebook and table names.
_MEDALLION = (
    "bronze", "silver", "gold", "raw",
    "staging", "mart", "clean", "curated",
)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class IncidentDNABuilder:
    """Constructs an :class:`IncidentDNA` deterministically from structured inputs."""

    def build(
        self,
        event: NormalizedFailureEvent,
        context: IncidentContextBundle | None = None,
        change_diff: ChangeDiffObject | None = None,
        upstream_failed: bool = False,
    ) -> IncidentDNA:
        """Classify ``event`` and return its :class:`IncidentDNA`.

        ``upstream_failed`` short-circuits classification to ``DEPENDENCY``.
        Otherwise the first rule in ``_RULES`` with a matching pattern decides
        the type, and every pattern of that rule that matched is recorded as a
        signal. A missing ``change_diff`` is treated as "no changes".
        """
        text = self._haystack(event, context)
        signals: list[str] = []

        failure_type = FailureType.UNKNOWN
        # Dependency override: a task whose upstream also failed is a *derived*
        # failure — its true root cause is upstream, so we never mis-attribute it.
        if upstream_failed:
            failure_type = FailureType.DEPENDENCY
            signals.append("upstream task in the same run also failed")
        else:
            for ftype, patterns in _RULES:
                hit = [p for p in patterns if p in text]
                if hit:
                    failure_type = ftype
                    signals.extend(hit)
                    break

        change_diff = change_diff or ChangeDiffObject()
        likely_change = self._change_corroborates(failure_type, change_diff)
        if likely_change:
            signals.append(f"{change_diff.change_count} change(s) since last success")

        return IncidentDNA(
            failure_type=failure_type,
            system_layer=self._system_layer(text),
            affected_layer=self._affected_layer(event, context),
            root_signal=self._root_signal(event, signals),
            confidence_signature=self._confidence(failure_type, signals, likely_change),
            pattern_id=self._pattern_id(failure_type, event, context),
            signals=signals,
            likely_caused_by_change=likely_change,
        )

    # ─── helpers ───

    @staticmethod
    def _haystack(event: NormalizedFailureEvent, context: IncidentContextBundle | None) -> str:
        """Join all available error text into one lowercased string for matching."""
        parts = [event.error_message or "", event.error_trace or ""]
        if context:
            # Same None guard as the event fields; join() rejects a None part.
            parts.append(context.error_text or "")
        return "\n".join(parts).lower()

    @staticmethod
    def _system_layer(text: str) -> SystemLayer:
        """Return the first layer in ``_LAYER_HINTS`` with a hint found in ``text``."""
        for layer, hints in _LAYER_HINTS:
            if any(h in text for h in hints):
                return layer
        return SystemLayer.DATABRICKS

    @staticmethod
    def _affected_layer(event: NormalizedFailureEvent, context: IncidentContextBundle | None) -> str:
        """Return the first ``_MEDALLION`` name found in the task, notebook or table names.

        Candidates are checked in that order; ``"unknown"`` means none matched.
        Missing (``None``) names are treated as empty rather than raising.
        """
        candidates = [(event.task or "").lower(), (event.notebook_path or "").lower()]
        if context:
            candidates.extend(str(t).lower() for t in (context.referenced_tables or ()) if t)
        for cand in candidates:
            for layer in _MEDALLION:
                if layer in cand:
                    return layer
        return "unknown"

    @staticmethod
    def _root_signal(event: NormalizedFailureEvent, signals: list[str]) -> str:
        """Return the first recorded signal, else the event's short error text."""
        if signals:
            return signals[0]
        return event.short_error or "no error text captured"

    def _change_corroborates(self, ftype: FailureType, diff: ChangeDiffObject) -> bool:
        """Tell whether the recent changes plausibly explain a failure of type ``ftype``."""
        if not diff.has_changes:
            return False
        # A change of the matching category strongly corroborates the type.
        if ftype == FailureType.SCHEMA_DRIFT and diff.schema_changes:
            return True
        if ftype == FailureType.CONFIG and diff.config_changes:
            return True
        if ftype == FailureType.CODE_ERROR and (diff.code_changes or diff.deployment_changes):
            return True
        if ftype in (FailureType.RESOURCE, FailureType.TIMEOUT) and diff.runtime_changes:
            return True
        # Any recent change is at least weak corroboration for an otherwise
        # unexplained failure.
        return ftype == FailureType.UNKNOWN

    @staticmethod
    def _confidence(ftype: FailureType, signals: list[str], likely_change: bool) -> str:
        """Grade the classification as ``"high"``, ``"medium"`` or ``"low"``.

        Only text/dependency signals are counted; the synthetic
        "N change(s) ..." signal contributes through ``likely_change`` instead.
        """
        if ftype == FailureType.UNKNOWN:
            return "low"
        score = len([s for s in signals if "change(s)" not in s])
        if score >= 2 or (score >= 1 and likely_change):
            return "high"
        if score >= 1:
            return "medium"
        return "low"

    @staticmethod
    def _pattern_id(ftype: FailureType, event: NormalizedFailureEvent, context: IncidentContextBundle | None) -> str:
        """Return a pattern key of the form ``<type>__<layer>_v1`` (or ``<type>_v1``)."""
        layer = IncidentDNABuilder._affected_layer(event, context)
        base = ftype.value.lower()
        return f"{base}__{layer}_v1" if layer != "unknown" else f"{base}_v1"
|
iic/impact/__init__.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Stage 7 — the deterministic impact engine.
|
|
2
|
+
|
|
3
|
+
PURE DETERMINISTIC. No LLM, no I/O. Severity and business risk are a transparent
|
|
4
|
+
function of the blast radius and recurrence, so the same situation always yields
|
|
5
|
+
the same score and every point is explained in ``breakdown``.
|
|
6
|
+
|
|
7
|
+
Scoring formula (weights chosen so dashboard impact dominates — a broken exec
|
|
8
|
+
dashboard is worse than a blocked internal job):
|
|
9
|
+
|
|
10
|
+
score = downstream_jobs * 2
|
|
11
|
+
+ affected_tables * 1.5
|
|
12
|
+
+ dashboard_impact * 3
|
|
13
|
+
+ recurrence_score * 2
|
|
14
|
+
|
|
15
|
+
A critical-layer failure (gold / mart / curated — the business-facing layers) bumps severity
|
|
16
|
+
by one band, since the same blast radius matters more there.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from iic.dependency.dependency_analyzer import BlastRadius
|
|
22
|
+
from iic.models.dna import IncidentDNA
|
|
23
|
+
from iic.models.impact import BusinessRisk, ImpactScore, Severity
|
|
24
|
+
|
|
25
|
+
# Per-unit weights of the four terms that are summed into the raw score.
W_DOWNSTREAM_JOBS = 2.0
W_AFFECTED_TABLES = 1.5
W_DASHBOARDS = 3.0
W_RECURRENCE = 2.0

# (inclusive lower bound, severity) pairs, highest bound first, so a top-down
# scan stops at the first band the raw score reaches.
_SEVERITY_BANDS = [
    (20.0, Severity.CRITICAL),
    (10.0, Severity.HIGH),
    (4.0, Severity.MEDIUM),
    (0.0, Severity.LOW),
]

# Affected-layer names for which severity is escalated by one band.
_BUSINESS_LAYERS = (
    "gold",
    "mart",
    "curated",
)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class ImpactEngine:
    """Computes an :class:`ImpactScore` from blast radius + recurrence + DNA."""

    def score(self, blast: BlastRadius, dna: IncidentDNA, recurrence: int = 0) -> ImpactScore:
        """Score an incident.

        The raw score is the weighted sum of downstream jobs, affected tables,
        dashboards and recurrence. It is mapped to a severity band, which is
        raised one band when ``dna.affected_layer`` is a business-facing layer.
        Negative ``recurrence`` is treated as zero.
        """
        downstream_jobs = blast.downstream_jobs
        affected_tables = blast.affected_tables
        dashboards = blast.dashboard_impact
        recurrence = max(0, int(recurrence))

        # Kept as named terms so the same numbers can be reported in ``breakdown``.
        terms = {
            "downstream_jobs": downstream_jobs * W_DOWNSTREAM_JOBS,
            "affected_tables": affected_tables * W_AFFECTED_TABLES,
            "dashboards": dashboards * W_DASHBOARDS,
            "recurrence": recurrence * W_RECURRENCE,
        }
        raw = sum(terms.values())

        severity = self._band(raw)
        # Business-facing layer failures are escalated one band.
        if dna.affected_layer in _BUSINESS_LAYERS:
            severity = self._bump(severity)

        return ImpactScore(
            raw_score=raw,
            blast_radius=blast.total,
            downstream_jobs=downstream_jobs,
            affected_tables=affected_tables,
            dashboard_impact=dashboards,
            recurrence_score=recurrence,
            severity=severity,
            business_risk=self._business_risk(severity, dna),
            breakdown={k: round(v, 2) for k, v in terms.items()},
        )

    @staticmethod
    def _band(raw: float) -> Severity:
        """Return the first band in ``_SEVERITY_BANDS`` whose threshold ``raw`` reaches."""
        for threshold, sev in _SEVERITY_BANDS:
            if raw >= threshold:
                return sev
        # Only reachable for a score below every threshold (e.g. negative input).
        return Severity.LOW

    @staticmethod
    def _bump(sev: Severity) -> Severity:
        """Return the next severity up, saturating at ``CRITICAL``."""
        order = [Severity.LOW, Severity.MEDIUM, Severity.HIGH, Severity.CRITICAL]
        idx = min(order.index(sev) + 1, len(order) - 1)
        return order[idx]

    @staticmethod
    def _business_risk(sev: Severity, dna: IncidentDNA) -> BusinessRisk:
        """Map severity to business risk, with a floor for reporting-integrity failures."""
        mapping = {
            Severity.LOW: BusinessRisk.LOW,
            Severity.MEDIUM: BusinessRisk.MODERATE,
            Severity.HIGH: BusinessRisk.HIGH,
            Severity.CRITICAL: BusinessRisk.CRITICAL,
        }
        risk = mapping[sev]
        # A business-layer data-quality / schema problem is a reporting-integrity
        # risk even at moderate blast radius — never below HIGH there.
        # The check uses the enum member *name*, which is fixed by the identifier,
        # rather than ``.value``, whose casing is defined outside this module.
        if dna.affected_layer in _BUSINESS_LAYERS and dna.failure_type.name in ("SCHEMA_DRIFT", "DATA_QUALITY"):
            if risk in (BusinessRisk.LOW, BusinessRisk.MODERATE):
                risk = BusinessRisk.HIGH
        return risk
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Stage 1+2 — event ingestion.
|
|
2
|
+
|
|
3
|
+
Each source implements :class:`FailureSource` and emits
|
|
4
|
+
:class:`~iic.models.event.NormalizedFailureEvent` objects. The core never sees a
|
|
5
|
+
source-specific payload, which is what keeps everything downstream uniform.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from iic.ingestion.base import FailureSource
|
|
11
|
+
from iic.ingestion.databricks_source import DatabricksFailureSource
|
|
12
|
+
from iic.ingestion.webhook_source import normalize_log_webhook
|
|
13
|
+
|
|
14
|
+
__all__ = ["FailureSource", "DatabricksFailureSource", "normalize_log_webhook"]
|
iic/ingestion/base.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""The ingestion source contract.
|
|
2
|
+
|
|
3
|
+
Adding a new failure source (e.g. a different orchestrator) means implementing
|
|
4
|
+
``discover`` to return :class:`NormalizedFailureEvent` objects — nothing else in
|
|
5
|
+
the system changes.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from abc import ABC, abstractmethod
|
|
11
|
+
|
|
12
|
+
from iic.models.event import NormalizedFailureEvent
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class FailureSource(ABC):
    """Abstract base for anything that can report pipeline failures.

    Concrete sources implement :meth:`discover` and hand back normalized events.
    """

    @abstractmethod
    def discover(self) -> list[NormalizedFailureEvent]:
        """Return the failed tasks this source currently knows about.

        Subclasses must override this; the base implementation only raises.
        """
        raise NotImplementedError
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Databricks ingestion — turn failed Job tasks into NormalizedFailureEvents.
|
|
2
|
+
|
|
3
|
+
Prefers the exact parent run handed in by the trigger task; otherwise discovers
|
|
4
|
+
the most recent failed run of the protected job. The healing-trigger task itself
|
|
5
|
+
is always excluded so the system never reports on itself.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from iic.ingestion.base import FailureSource
|
|
11
|
+
from iic.models.event import EventSource, NormalizedFailureEvent
|
|
12
|
+
|
|
13
|
+
# Task keys of the trigger task itself (current and legacy name); failures of
# these tasks are never reported.
TRIGGER_TASK_KEY = "trigger_incident_intelligence"
_LEGACY_TRIGGER_KEY = "trigger_self_healing"


class DatabricksFailureSource(FailureSource):
    """Discovers failed tasks via the Databricks Jobs/Runs API."""

    def __init__(self, client, job_id: str, parent_run_id: str = "", lookback: int = 5):
        self.client = client
        self.job_id = job_id
        # Normalised to a stripped string so "" / None / whitespace all mean "not given".
        self.parent_run_id = (parent_run_id or "").strip()
        self.lookback = lookback
        # Populated as a side effect of discover(), consumed by later stages.
        self.run_info: dict = {}

    def _failed_tasks(self, run: dict) -> list[dict]:
        """Return the tasks of ``run`` whose result state is ``FAILED``.

        The trigger task is excluded. A missing or null ``tasks`` / ``state``
        entry is treated as empty instead of raising.
        """
        out = []
        for t in run.get("tasks") or []:
            if (t.get("state") or {}).get("result_state") != "FAILED":
                continue
            if t.get("task_key") in (TRIGGER_TASK_KEY, _LEGACY_TRIGGER_KEY):
                continue
            out.append(t)
        return out

    def _locate_run(self) -> tuple[list[dict], dict]:
        """Return ``(failed_tasks, run)`` for the run to report on, or ``([], {})``.

        The explicit parent run is tried first. If it cannot be fetched or has
        no failed tasks, the last ``lookback`` runs of ``job_id`` are scanned
        and the first one with a failed task is used.
        """
        if self.parent_run_id:
            try:
                # NOTE(review): this reaches into the client's private ``_get`` helper.
                run = self.client._get("/api/2.1/jobs/runs/get", {"run_id": int(self.parent_run_id)})
                ft = self._failed_tasks(run)
                if ft:
                    return ft, run
            except Exception:
                # Best-effort by design: fall through to discovery by job id.
                pass
        if not self.job_id:
            return [], {}
        try:
            for run in self.client.list_runs(int(self.job_id), limit=self.lookback):
                ft = self._failed_tasks(run)
                if ft:
                    return ft, run
        except Exception:
            # Best-effort by design: an API failure means "nothing discovered".
            pass
        return [], {}

    def discover(self) -> list[NormalizedFailureEvent]:
        """Return one :class:`NormalizedFailureEvent` per failed task of the located run.

        Also stores the located run (or ``{}``) in ``self.run_info``.
        """
        failed, run = self._locate_run()
        self.run_info = run or {}
        run_id = str(run.get("run_id", "")) if run else ""
        pipeline = run.get("run_name", "") if run else ""
        events: list[NormalizedFailureEvent] = []
        for task in failed:
            task_run_id = str(task.get("run_id", ""))
            error_message, error_trace = "", ""
            if task_run_id:
                try:
                    out = self.client.get_run_output(int(task_run_id))
                    error_message = (out.get("error") or "").strip()
                    error_trace = (out.get("error_trace") or "").strip()
                except Exception:
                    # Output is optional; the event is still emitted without it.
                    pass
            # ``or {}`` / ``or ""`` also cover keys that are present but null.
            cluster = task.get("cluster_instance") or {}
            cluster_id = cluster.get("cluster_id") or ""
            notebook_task = task.get("notebook_task") or {}
            events.append(NormalizedFailureEvent(
                source=EventSource.DATABRICKS,
                pipeline=pipeline or str(self.job_id),
                task=task.get("task_key", "unknown"),
                error_message=error_message or "(no error message returned)",
                error_trace=error_trace,
                timestamp=_iso_from_ms(task.get("start_time")),
                run_id=run_id,
                job_id=str(self.job_id),
                cluster_id=cluster_id,
                notebook_path=notebook_task.get("notebook_path", ""),
            ))
        return events
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _iso_from_ms(ms) -> str:
|
|
93
|
+
"""Databricks returns epoch-millis. Render ISO-8601 without importing wall-clock."""
|
|
94
|
+
try:
|
|
95
|
+
from datetime import datetime, timezone
|
|
96
|
+
return datetime.fromtimestamp(int(ms) / 1000.0, tz=timezone.utc).isoformat()
|
|
97
|
+
except Exception:
|
|
98
|
+
return ""
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""A failure source backed by already-known events.
|
|
2
|
+
|
|
3
|
+
The v4 wrapper (``iic_monitor``) and the Spark listener catch a failure *as it
|
|
4
|
+
happens* — they already hold the error, so there's nothing to "discover" from the
|
|
5
|
+
Jobs API. They hand the engine a pre-built :class:`NormalizedFailureEvent` through
|
|
6
|
+
this source instead of polling.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from iic.ingestion.base import FailureSource
|
|
12
|
+
from iic.models.event import NormalizedFailureEvent
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class StaticFailureSource(FailureSource):
    """Serves a fixed, caller-supplied list of events.

    ``run_info`` holds whatever run description the caller passed, or ``{}``.
    """

    def __init__(self, events: list[NormalizedFailureEvent], run_info: dict | None = None):
        self._events = events
        self.run_info = run_info if run_info else {}

    def discover(self) -> list[NormalizedFailureEvent]:
        """Return a fresh list of the stored events, so callers cannot mutate ours."""
        return [*self._events]
|