shkit 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. healing_kit/__init__.py +3 -0
  2. healing_kit/auth.py +79 -0
  3. healing_kit/clients/__init__.py +1 -0
  4. healing_kit/clients/databricks_client.py +183 -0
  5. healing_kit/clients/teams_client.py +128 -0
  6. healing_kit/models/__init__.py +1 -0
  7. healing_kit/models/diagnosis.py +45 -0
  8. healing_kit/models/events.py +30 -0
  9. healing_kit/models/evidence.py +83 -0
  10. healing_kit/runtime/__init__.py +6 -0
  11. healing_kit/runtime/approval.py +141 -0
  12. healing_kit/runtime/maintenance.py +52 -0
  13. healing_kit/services/__init__.py +1 -0
  14. healing_kit/services/cache_service.py +120 -0
  15. healing_kit/services/circuit_breaker.py +114 -0
  16. healing_kit/services/context_agent.py +127 -0
  17. healing_kit/services/dependency_graph.py +141 -0
  18. healing_kit/services/diagnosis_engine.py +165 -0
  19. healing_kit/services/identity.py +61 -0
  20. healing_kit/services/model_router.py +52 -0
  21. healing_kit/services/query_guard.py +168 -0
  22. healing_kit/services/resolution_verifier.py +100 -0
  23. healing_kit/services/token_budget.py +137 -0
  24. healing_kit/utils/__init__.py +1 -0
  25. healing_kit/utils/error_hash.py +15 -0
  26. healing_kit/utils/hmac_tokens.py +86 -0
  27. healing_kit/utils/sql_safety.py +84 -0
  28. iic/__init__.py +51 -0
  29. iic/__main__.py +18 -0
  30. iic/_console.py +235 -0
  31. iic/_doctor.py +143 -0
  32. iic/change/__init__.py +7 -0
  33. iic/change/change_detector.py +154 -0
  34. iic/context/__init__.py +7 -0
  35. iic/context/context_builder.py +117 -0
  36. iic/dependency/__init__.py +7 -0
  37. iic/dependency/dependency_analyzer.py +93 -0
  38. iic/diagnosis/__init__.py +7 -0
  39. iic/diagnosis/diagnosis_engine.py +183 -0
  40. iic/dna/__init__.py +7 -0
  41. iic/dna/dna_builder.py +184 -0
  42. iic/impact/__init__.py +7 -0
  43. iic/impact/impact_engine.py +102 -0
  44. iic/ingestion/__init__.py +14 -0
  45. iic/ingestion/base.py +21 -0
  46. iic/ingestion/databricks_source.py +98 -0
  47. iic/ingestion/static_source.py +23 -0
  48. iic/ingestion/webhook_source.py +39 -0
  49. iic/models/__init__.py +44 -0
  50. iic/models/change.py +77 -0
  51. iic/models/context.py +46 -0
  52. iic/models/diagnosis.py +37 -0
  53. iic/models/dna.py +77 -0
  54. iic/models/event.py +78 -0
  55. iic/models/impact.py +60 -0
  56. iic/models/report.py +88 -0
  57. iic/models/routing.py +41 -0
  58. iic/notify/__init__.py +7 -0
  59. iic/notify/teams_notifier.py +112 -0
  60. iic/report/__init__.py +7 -0
  61. iic/report/report_generator.py +67 -0
  62. iic/routing/__init__.py +7 -0
  63. iic/routing/router.py +42 -0
  64. iic/runtime/__init__.py +10 -0
  65. iic/runtime/_sql.py +11 -0
  66. iic/runtime/agent_config.py +48 -0
  67. iic/runtime/agent_runtime.py +70 -0
  68. iic/runtime/antibodies.py +100 -0
  69. iic/runtime/bootstrap.py +157 -0
  70. iic/runtime/constants.py +40 -0
  71. iic/runtime/context.py +46 -0
  72. iic/runtime/detective.py +72 -0
  73. iic/runtime/hooks.py +85 -0
  74. iic/runtime/incident_engine.py +207 -0
  75. iic/runtime/inprocess.py +350 -0
  76. iic/runtime/ledger.py +120 -0
  77. iic/runtime/monitor.py +155 -0
  78. iic/runtime/pattern_store.py +53 -0
  79. iic/runtime/reconciler.py +139 -0
  80. iic/runtime/scope_config.py +127 -0
  81. iic/runtime/store.py +150 -0
  82. iic/runtime/wrapper.py +28 -0
  83. iic_autoload.pth +1 -0
  84. onboarding/__init__.py +1 -0
  85. onboarding/cli.py +168 -0
  86. onboarding/config_schema.py +62 -0
  87. onboarding/manifest.py +27 -0
  88. onboarding/preflight.py +129 -0
  89. onboarding/provisioner.py +573 -0
  90. onboarding/rollback.py +81 -0
  91. shkit-1.2.0.dist-info/METADATA +239 -0
  92. shkit-1.2.0.dist-info/RECORD +94 -0
  93. shkit-1.2.0.dist-info/WHEEL +4 -0
  94. shkit-1.2.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,183 @@
1
+ """Stage 9 — the diagnosis engine.
2
+
3
+ The only place an LLM enters the system, and only after the DNA, impact, and
4
+ routing decision exist. Two paths:
5
+
6
+ * ``tier == NONE`` → no LLM. A deterministic narrative is synthesised from the
7
+ DNA (cache replay, or a derived-dependency pointer, or a templated explanation
8
+ for a confidently-classified failure). The system is fully useful with zero
9
+ tokens spent.
10
+ * LLM tier → a strict-JSON prompt built *from the structured DNA* (not raw
11
+ text), parsed into a structured :class:`DiagnosisResult`. The model interprets
12
+ structure; it does not classify or chat.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import json
18
+
19
+ from iic.models.change import ChangeDiffObject
20
+ from iic.models.context import IncidentContextBundle
21
+ from iic.models.diagnosis import DiagnosisResult
22
+ from iic.models.dna import FailureType, IncidentDNA
23
+ from iic.models.event import NormalizedFailureEvent
24
+ from iic.models.routing import RoutingDecision
25
+
26
# System prompt for the LLM path: the model explains an incident that was
# already classified, must answer in JSON, and must not propose destructive SQL.
_SYSTEM_MSG = " ".join((
    "You are a senior data reliability engineer.",
    "You are given a STRUCTURED incident fingerprint that was already classified deterministically.",
    "Do not re-classify; explain the root cause and propose a concrete fix.",
    "Respond ONLY in valid JSON.",
    "Never suggest DROP, DELETE, or TRUNCATE.",
))

# Trailer appended to every user prompt: the exact JSON shape the reply must take.
_JSON_SPEC = (
    "\n\nReturn ONLY this JSON object:\n{"
    + ", ".join((
        '"root_cause": "one-sentence root cause"',
        '"confidence": 0.0-1.0',
        '"reasoning": "why, citing the signals/changes"',
        '"suggested_fix": "concrete remediation step"',
        '"alternatives": ["other hypothesis", "..."]',
    ))
    + "}"
)
40
+
41
# Canned ``(root_cause, suggested_fix)`` narratives, one per failure type. They
# are returned when no LLM is called (tier == NONE) and serve as the fallback
# when the model errors.
_TEMPLATES: dict[FailureType, tuple[str, str]] = dict((
    (FailureType.SCHEMA_DRIFT, (
        "The failing task hit a schema mismatch — a column it expects is missing or changed type.",
        "Reconcile the ingestion/transform mapping with the new source schema "
        "(add/rename the column or enable schema evolution), then re-run.",
    )),
    (FailureType.DATA_QUALITY, (
        "A data-quality rule failed (duplicates, nulls, or a threshold/constraint breach).",
        "Inspect the offending rows from the evidence query, fix the source/transform, "
        "and re-run the quality gate.",
    )),
    (FailureType.MISSING_DATA, (
        "An expected input table or file path was empty or absent when the task ran.",
        "Confirm the upstream load completed and the path/table exists, then re-run; "
        "add a presence check upstream.",
    )),
    (FailureType.PERMISSION, (
        "The executing identity lacks a required privilege on a table, path, or secret.",
        "Grant the missing privilege to the run identity (least-privilege) and re-run; "
        "no data change needed.",
    )),
    (FailureType.DEPENDENCY, (
        "This task did not fail on its own — an upstream task in the same run failed first.",
        "Resolve the upstream root-cause incident; this task should recover on its re-run.",
    )),
    (FailureType.RESOURCE, (
        "The task exhausted compute resources (memory/executors/disk).",
        "Increase cluster size or partition the workload; re-run with adjusted resources.",
    )),
    (FailureType.TIMEOUT, (
        "The task exceeded its time budget.",
        "Raise the timeout or optimise the slow stage (skew/partitioning), then re-run.",
    )),
    (FailureType.CONFIG, (
        "A required configuration value, parameter, or secret was missing or invalid.",
        "Correct the parameter/secret value in the job/secret scope and re-run.",
    )),
    (FailureType.CODE_ERROR, (
        "The notebook raised a code-level exception (syntax/type/import).",
        "Fix the code defect indicated by the traceback and redeploy/re-run.",
    )),
    (FailureType.UNKNOWN, (
        "The failure could not be classified deterministically.",
        "Review the captured logs and notebook source; escalate to the owning engineer.",
    )),
))
86
+
87
+
88
class DiagnosisEngine:
    """Produce a structured :class:`DiagnosisResult` for an incident.

    Two paths exist: a deterministic template when routing does not require an
    LLM (or no client was supplied), and an LLM call whose JSON reply is parsed
    and sanitised. An LLM error or an unusable reply falls back to the template.
    """

    def __init__(self, client=None):
        # ``client`` is only used through ``invoke_model_full(model, messages,
        # max_tokens=...)``; ``None`` disables the LLM path entirely.
        self.client = client

    def diagnose(
        self,
        event: NormalizedFailureEvent,
        dna: IncidentDNA,
        routing: RoutingDecision,
        context: IncidentContextBundle | None = None,
        change_diff: ChangeDiffObject | None = None,
        evidence: list[str] | None = None,
    ) -> tuple[DiagnosisResult, int]:
        """Return ``(DiagnosisResult, tokens_used)``.

        ``tokens_used`` is 0 on the deterministic path and when the LLM call
        itself raises; otherwise it is the ``total_tokens`` entry of the usage
        mapping returned by the client (0 if absent).
        """
        if not routing.requires_llm or self.client is None:
            return self._deterministic(dna, routing, change_diff), 0

        prompt = self._build_prompt(event, dna, context, change_diff, evidence)
        try:
            content, usage = self.client.invoke_model_full(
                routing.model,
                [{"role": "system", "content": _SYSTEM_MSG}, {"role": "user", "content": prompt}],
                max_tokens=900,
            )
        except Exception as ex:
            # Deliberate catch-all: a model outage must never prevent a
            # diagnosis, so any client error degrades to the template.
            result = self._deterministic(dna, routing, change_diff)
            result.reasoning = f"LLM unavailable ({str(ex)[:80]}); used deterministic template."
            return result, 0

        tokens = int((usage or {}).get("total_tokens", 0) or 0)
        return self._parse(content, dna, routing, change_diff), tokens

    # ─── deterministic path ───

    def _deterministic(self, dna: IncidentDNA, routing: RoutingDecision,
                       change_diff: ChangeDiffObject | None) -> DiagnosisResult:
        """Build a template-based result from the DNA, with no model call."""
        root_cause, fix = _TEMPLATES.get(dna.failure_type, _TEMPLATES[FailureType.UNKNOWN])
        confidence = {"high": 0.85, "medium": 0.6, "low": 0.35}.get(dna.confidence_signature, 0.4)
        evidence = list(dna.signals)
        if change_diff and change_diff.has_changes:
            # Only the first three change summaries are attached as evidence.
            evidence += [f"change: {s}" for s in change_diff.summaries()[:3]]
        reasoning = routing.reason or "Deterministic classification; no LLM required."
        return DiagnosisResult(
            root_cause=root_cause,
            confidence=confidence,
            reasoning=reasoning,
            suggested_fix=fix,
            evidence=evidence,
            produced_by="rules",
        )

    # ─── LLM path ───

    def _build_prompt(self, event, dna, context, change_diff, evidence) -> str:
        """Render the user prompt from the structured DNA plus truncated context."""
        changes = (change_diff.summaries() if change_diff else []) or ["none detected"]
        ev = evidence or []
        nb = (context.notebook_source[:1200] if context and context.notebook_source else "")
        # The error message may be missing; slicing None would raise.
        error_text = (event.error_message or "")[:1500]
        change_lines = "\n- ".join(str(c) for c in changes[:8])
        evidence_lines = "\n- ".join(str(e) for e in ev[:6]) if ev else "none"
        return (
            f"INCIDENT FINGERPRINT (pre-classified):\n"
            f"- failure_type: {dna.failure_type.value}\n"
            f"- affected_layer: {dna.affected_layer}\n"
            f"- system_layer: {dna.system_layer.value}\n"
            f"- root_signal: {dna.root_signal}\n"
            f"- signals: {', '.join(dna.signals) or 'n/a'}\n"
            f"- pattern_id: {dna.pattern_id}\n\n"
            f"PIPELINE: {event.pipeline} · TASK: {event.task}\n"
            f"ERROR:\n{error_text}\n\n"
            f"RECENT CHANGES (since last success):\n- {change_lines}\n\n"
            f"DATA EVIDENCE:\n- {evidence_lines}\n\n"
            + (f"NOTEBOOK (truncated):\n{nb}\n" if nb else "")
            + _JSON_SPEC
        )

    @staticmethod
    def _strip_code_fence(content) -> str:
        """Return ``content`` without a surrounding Markdown code fence.

        Non-string input yields ``""`` so the caller's JSON parse fails cleanly.
        """
        cleaned = content.strip() if isinstance(content, str) else ""
        if cleaned.startswith("```"):
            # Drop the whole opening fence line (it may carry a language tag).
            cleaned = cleaned.split("\n", 1)[-1]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        return cleaned.strip()

    @staticmethod
    def _coerce_confidence(value, default: float = 0.5) -> float:
        """Convert a model-supplied confidence to a float clamped to [0.0, 1.0].

        Booleans, NaN and anything ``float()`` rejects yield ``default``.
        """
        if isinstance(value, bool):
            return default
        try:
            number = float(value)
        except (TypeError, ValueError):
            return default
        if number != number:  # NaN is the only value not equal to itself
            return default
        return min(1.0, max(0.0, number))

    @staticmethod
    def _clip(value, limit: int) -> str:
        """Stringify ``value`` and truncate to ``limit`` chars; ``None`` becomes ``""``."""
        return "" if value is None else str(value)[:limit]

    def _parse(self, content: str, dna: IncidentDNA, routing: RoutingDecision,
               change_diff: ChangeDiffObject | None = None) -> DiagnosisResult:
        """Turn the model reply into a :class:`DiagnosisResult`.

        Falls back to the deterministic template when the reply is not a JSON
        object. ``change_diff`` is optional and only feeds that fallback.
        """
        try:
            data = json.loads(self._strip_code_fence(content))
        except (TypeError, ValueError, RecursionError):
            data = None
        if not isinstance(data, dict):
            # Valid JSON that is not an object (list, string, number) is as
            # unusable as malformed JSON.
            result = self._deterministic(dna, routing, change_diff)
            result.reasoning = "LLM returned unparseable output; used deterministic template."
            return result

        template_cause = _TEMPLATES.get(dna.failure_type, _TEMPLATES[FailureType.UNKNOWN])[0]
        alternatives = data.get("alternatives") or []
        if isinstance(alternatives, str):
            # A bare string would otherwise be iterated character by character.
            alternatives = [alternatives]
        elif not isinstance(alternatives, (list, tuple)):
            alternatives = []
        return DiagnosisResult(
            root_cause=self._clip(data.get("root_cause"), 500) or template_cause,
            confidence=self._coerce_confidence(data.get("confidence", 0.5)),
            reasoning=self._clip(data.get("reasoning"), 800),
            suggested_fix=self._clip(data.get("suggested_fix"), 500),
            alternatives=[str(a)[:200] for a in alternatives][:4],
            evidence=list(dna.signals),
            produced_by=f"llm:{routing.model}",
        )
iic/dna/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """Stage 6 — Incident DNA construction (the core intelligence layer)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from iic.dna.dna_builder import IncidentDNABuilder
6
+
7
+ __all__ = ["IncidentDNABuilder"]
iic/dna/dna_builder.py ADDED
@@ -0,0 +1,184 @@
1
+ """Stage 6 — build the IncidentDNA. The heart of the system.
2
+
3
+ This is *deterministic, rule-based* classification: it maps the raw error text,
4
+ the gathered context, and the change diff onto the canonical
5
+ :class:`FailureType` taxonomy, and records exactly which signals fired so the
6
+ result is fully traceable. The LLM never runs here — it runs later, on the DNA
7
+ this stage produces.
8
+
9
+ Why rules and not an LLM for classification:
10
+ * predictable, testable, free, and instant;
11
+ * the classification drives cost decisions (model routing), so it must not
12
+ itself cost an LLM call;
13
+ * a wrong-but-confident LLM label would poison everything downstream.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from iic.models.change import ChangeDiffObject
19
+ from iic.models.context import IncidentContextBundle
20
+ from iic.models.dna import FailureType, IncidentDNA, SystemLayer
21
+ from iic.models.event import NormalizedFailureEvent
22
+
23
# Classification rules, most specific first. Unless the dependency override
# applies, the first failure type with at least one matching pattern wins.
# Every pattern is a lowercased substring.
_RULES: list[tuple[FailureType, tuple[str, ...]]] = [
    (
        FailureType.PERMISSION,
        (
            "permission denied", "access denied", "unauthorized",
            "not authorized", "forbidden", "does not have privilege",
            "requires permission", "403",
        ),
    ),
    (
        FailureType.SCHEMA_DRIFT,
        (
            "no such column", "cannot resolve column", "cannot resolve '",
            "unresolved column", "missing column", "schema mismatch",
            "incompatible schema", "schema is not compatible",
            "field not found", "column not found", "mergeschema",
            "schema drift",
        ),
    ),
    (
        FailureType.MISSING_DATA,
        (
            "path does not exist", "table or view not found",
            "no such table", "file not found", "filenotfound",
            "does not exist", "no files found", "empty input", "0 rows",
            "returned no rows",
        ),
    ),
    (
        FailureType.DATA_QUALITY,
        (
            "duplicate", "constraint", "expectation", "check constraint",
            "not null", "null value", "validation failed", "data quality",
            "threshold", "violat",
        ),
    ),
    (
        FailureType.RESOURCE,
        (
            "out of memory", "outofmemory", "java heap", "gc overhead",
            "executor lost", "container killed", "no space left",
            "disk is full", "exceeded memory", "lost executor",
            "exceeds the allowed",
        ),
    ),
    (
        FailureType.TIMEOUT,
        (
            "timeout", "timed out", "deadline exceeded",
            "exceeded the timeout", "query has exceeded the time",
            "cancelled because it exceeded",
        ),
    ),
    (
        FailureType.CONFIG,
        (
            "not found in scope", "missing required", "invalid argument",
            "secret", "environment variable", "missing parameter",
            "no value for", "configuration", "could not find a value",
        ),
    ),
    (
        FailureType.CODE_ERROR,
        (
            "syntaxerror", "typeerror", "attributeerror", "nameerror",
            "importerror", "modulenotfounderror", "valueerror",
            "indexerror", "keyerror", "zerodivisionerror",
            "traceback (most recent call last)",
        ),
    ),
]

# Substrings (lowercased) hinting at the system layer; the first layer with a
# matching hint is used.
_LAYER_HINTS: list[tuple[SystemLayer, tuple[str, ...]]] = [
    (
        SystemLayer.SPARK,
        ("spark", "executor", "java heap", "stage failure", "py4j", "analysisexception"),
    ),
    (
        SystemLayer.STORAGE,
        ("s3", "abfss", "dbfs", "adls", "blob", "no space", "disk", "path does not exist"),
    ),
    (
        SystemLayer.NETWORK,
        ("connection", "timeout", "timed out", "unreachable", "dns", "socket"),
    ),
]

# Data-layer names searched for (as substrings) in task, notebook and table names.
_MEDALLION = (
    "bronze", "silver", "gold", "raw",
    "staging", "mart", "clean", "curated",
)
75
+
76
+
77
class IncidentDNABuilder:
    """Constructs an :class:`IncidentDNA` deterministically from structured inputs."""

    def build(
        self,
        event: NormalizedFailureEvent,
        context: IncidentContextBundle | None = None,
        change_diff: ChangeDiffObject | None = None,
        upstream_failed: bool = False,
    ) -> IncidentDNA:
        """Classify ``event`` and record which signals drove the classification.

        ``upstream_failed`` forces the DEPENDENCY type without consulting the
        text rules. ``context`` and ``change_diff`` are optional; a missing
        diff is treated as "no changes".
        """
        text = self._haystack(event, context)
        signals: list[str] = []

        failure_type = FailureType.UNKNOWN
        # Dependency override: a task whose upstream also failed is a *derived*
        # failure — its true root cause is upstream, so we never mis-attribute it.
        if upstream_failed:
            failure_type = FailureType.DEPENDENCY
            signals.append("upstream task in the same run also failed")
        else:
            for ftype, patterns in _RULES:
                hit = [p for p in patterns if p in text]
                if hit:
                    # First matching rule wins; all of its matching patterns
                    # are kept as signals.
                    failure_type = ftype
                    signals.extend(hit)
                    break

        change_diff = change_diff or ChangeDiffObject()
        likely_change = self._change_corroborates(failure_type, change_diff)
        if likely_change:
            signals.append(f"{change_diff.change_count} change(s) since last success")

        return IncidentDNA(
            failure_type=failure_type,
            system_layer=self._system_layer(text),
            affected_layer=self._affected_layer(event, context),
            root_signal=self._root_signal(event, signals),
            confidence_signature=self._confidence(failure_type, signals, likely_change),
            pattern_id=self._pattern_id(failure_type, event, context),
            signals=signals,
            likely_caused_by_change=likely_change,
        )

    # ─── helpers ───

    @staticmethod
    def _haystack(event: NormalizedFailureEvent, context: IncidentContextBundle | None) -> str:
        """Return all available error text, newline-joined and lowercased.

        Every part is coerced to ``""`` when missing so a ``None`` field cannot
        break the join.
        """
        parts = [event.error_message or "", event.error_trace or ""]
        if context:
            parts.append(context.error_text or "")
        return "\n".join(parts).lower()

    @staticmethod
    def _system_layer(text: str) -> SystemLayer:
        """Return the first layer whose hint appears in ``text``, else DATABRICKS."""
        for layer, hints in _LAYER_HINTS:
            if any(h in text for h in hints):
                return layer
        return SystemLayer.DATABRICKS

    @staticmethod
    def _affected_layer(event: NormalizedFailureEvent, context: IncidentContextBundle | None) -> str:
        """Return the first layer name found in task, notebook path or tables.

        Candidates are checked in that order; ``"unknown"`` if none matches.
        Missing (``None``) names are treated as empty strings.
        """
        candidates = [(event.task or "").lower(), (event.notebook_path or "").lower()]
        if context:
            candidates.extend((t or "").lower() for t in (context.referenced_tables or ()))
        for cand in candidates:
            for layer in _MEDALLION:
                if layer in cand:
                    return layer
        return "unknown"

    @staticmethod
    def _root_signal(event: NormalizedFailureEvent, signals: list[str]) -> str:
        """Return the first signal, else the event's short error, else a placeholder."""
        if signals:
            return signals[0]
        return event.short_error or "no error text captured"

    def _change_corroborates(self, ftype: FailureType, diff: ChangeDiffObject) -> bool:
        """Report whether the recorded changes support ``ftype`` as change-induced."""
        if not diff.has_changes:
            return False
        # A change of the matching category strongly corroborates the type.
        if ftype == FailureType.SCHEMA_DRIFT and diff.schema_changes:
            return True
        if ftype == FailureType.CONFIG and diff.config_changes:
            return True
        if ftype == FailureType.CODE_ERROR and (diff.code_changes or diff.deployment_changes):
            return True
        if ftype in (FailureType.RESOURCE, FailureType.TIMEOUT) and diff.runtime_changes:
            return True
        # Any recent change is at least weak corroboration for an otherwise
        # unexplained failure.
        return ftype == FailureType.UNKNOWN

    @staticmethod
    def _confidence(ftype: FailureType, signals: list[str], likely_change: bool) -> str:
        """Map signal count and change corroboration to "high", "medium" or "low"."""
        if ftype == FailureType.UNKNOWN:
            return "low"
        # The synthetic "N change(s) ..." signal is not counted as a text match.
        score = sum(1 for s in signals if "change(s)" not in s)
        if score >= 2 or (score >= 1 and likely_change):
            return "high"
        if score >= 1:
            return "medium"
        return "low"

    @staticmethod
    def _pattern_id(ftype: FailureType, event: NormalizedFailureEvent, context: IncidentContextBundle | None) -> str:
        """Return ``<type>__<layer>_v1``, or ``<type>_v1`` when the layer is unknown."""
        layer = IncidentDNABuilder._affected_layer(event, context)
        base = ftype.value.lower()
        return f"{base}__{layer}_v1" if layer != "unknown" else f"{base}_v1"
iic/impact/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """Stage 7 — deterministic impact scoring (NO LLM)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from iic.impact.impact_engine import ImpactEngine
6
+
7
+ __all__ = ["ImpactEngine"]
@@ -0,0 +1,102 @@
1
+ """Stage 7 — the deterministic impact engine.
2
+
3
+ PURE DETERMINISTIC. No LLM, no I/O. Severity and business risk are a transparent
4
+ function of the blast radius and recurrence, so the same situation always yields
5
+ the same score and every point is explained in ``breakdown``.
6
+
7
+ Scoring formula (weights chosen so dashboard impact dominates — a broken exec
8
+ dashboard is worse than a blocked internal job):
9
+
10
+ score = downstream_jobs * 2
11
+ + affected_tables * 1.5
12
+ + dashboard_impact * 3
13
+ + recurrence_score * 2
14
+
15
+ A critical-layer failure (gold / mart / curated — the business-facing layers) bumps severity
16
+ by one band, since the same blast radius matters more there.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from iic.dependency.dependency_analyzer import BlastRadius
22
+ from iic.models.dna import IncidentDNA
23
+ from iic.models.impact import BusinessRisk, ImpactScore, Severity
24
+
25
# Per-unit weights of the four terms that make up the raw impact score.
W_DOWNSTREAM_JOBS = 2.0
W_AFFECTED_TABLES = 1.5
W_DASHBOARDS = 3.0
W_RECURRENCE = 2.0

# Severity bands as (inclusive lower bound on raw_score, severity), listed
# from the highest threshold down so the first satisfied band is the result.
_SEVERITY_BANDS = [
    (20.0, Severity.CRITICAL), (10.0, Severity.HIGH),
    (4.0, Severity.MEDIUM), (0.0, Severity.LOW),
]

# Affected-layer names that count as business-facing.
_BUSINESS_LAYERS = ("gold", "mart", "curated")
39
+
40
+
41
class ImpactEngine:
    """Derive an :class:`ImpactScore` from blast radius, recurrence and DNA."""

    def score(self, blast: BlastRadius, dna: IncidentDNA, recurrence: int = 0) -> ImpactScore:
        """Score one incident.

        The raw score is the weighted sum of downstream jobs, affected tables,
        dashboards and recurrence (negative recurrence counts as 0). The
        severity band of that score is raised one step for business-facing
        layers, and each weighted term is reported in ``breakdown``.
        """
        jobs = blast.downstream_jobs
        tables = blast.affected_tables
        boards = blast.dashboard_impact
        repeats = max(0, int(recurrence))

        weighted = {
            "downstream_jobs": jobs * W_DOWNSTREAM_JOBS,
            "affected_tables": tables * W_AFFECTED_TABLES,
            "dashboards": boards * W_DASHBOARDS,
            "recurrence": repeats * W_RECURRENCE,
        }
        total = sum(weighted.values())

        band = self._band(total)
        if dna.affected_layer in _BUSINESS_LAYERS:
            # Business-facing layers are escalated by one band.
            band = self._bump(band)

        rounded_terms = {name: round(points, 2) for name, points in weighted.items()}
        return ImpactScore(
            raw_score=total,
            blast_radius=blast.total,
            downstream_jobs=jobs,
            affected_tables=tables,
            dashboard_impact=boards,
            recurrence_score=repeats,
            severity=band,
            business_risk=self._business_risk(band, dna),
            breakdown=rounded_terms,
        )

    @staticmethod
    def _band(raw: float) -> Severity:
        """Return the first band whose lower bound ``raw`` reaches, else LOW."""
        return next(
            (sev for threshold, sev in _SEVERITY_BANDS if raw >= threshold),
            Severity.LOW,
        )

    @staticmethod
    def _bump(sev: Severity) -> Severity:
        """Return the next severity up; CRITICAL stays CRITICAL."""
        ladder = (Severity.LOW, Severity.MEDIUM, Severity.HIGH, Severity.CRITICAL)
        position = ladder.index(sev)
        if position + 1 >= len(ladder):
            return ladder[-1]
        return ladder[position + 1]

    @staticmethod
    def _business_risk(sev: Severity, dna: IncidentDNA) -> BusinessRisk:
        """Translate severity to business risk, with a floor for reporting integrity.

        A schema-drift or data-quality failure in a business-facing layer is
        never rated below HIGH.
        """
        risk = {
            Severity.LOW: BusinessRisk.LOW,
            Severity.MEDIUM: BusinessRisk.MODERATE,
            Severity.HIGH: BusinessRisk.HIGH,
            Severity.CRITICAL: BusinessRisk.CRITICAL,
        }[sev]
        reporting_integrity = (
            dna.affected_layer in _BUSINESS_LAYERS
            and dna.failure_type.value in ("SCHEMA_DRIFT", "DATA_QUALITY")
        )
        if reporting_integrity and risk in (BusinessRisk.LOW, BusinessRisk.MODERATE):
            return BusinessRisk.HIGH
        return risk
@@ -0,0 +1,14 @@
1
+ """Stage 1+2 — event ingestion.
2
+
3
+ Each source implements :class:`FailureSource` and emits
4
+ :class:`~iic.models.event.NormalizedFailureEvent` objects. The core never sees a
5
+ source-specific payload, which is what keeps everything downstream uniform.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from iic.ingestion.base import FailureSource
11
+ from iic.ingestion.databricks_source import DatabricksFailureSource
12
+ from iic.ingestion.webhook_source import normalize_log_webhook
13
+
14
+ __all__ = ["FailureSource", "DatabricksFailureSource", "normalize_log_webhook"]
iic/ingestion/base.py ADDED
@@ -0,0 +1,21 @@
1
+ """The ingestion source contract.
2
+
3
+ Adding a new failure source (e.g. a different orchestrator) means implementing
4
+ ``discover`` to return :class:`NormalizedFailureEvent` objects — nothing else in
5
+ the system changes.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from abc import ABC, abstractmethod
11
+
12
+ from iic.models.event import NormalizedFailureEvent
13
+
14
+
15
class FailureSource(ABC):
    """Contract for anything that can report pipeline failures as normalized events."""

    @abstractmethod
    def discover(self) -> list[NormalizedFailureEvent]:
        """List the failed tasks this source currently knows about.

        Concrete sources must override this method; the base implementation
        only raises ``NotImplementedError``.
        """
        raise NotImplementedError
@@ -0,0 +1,98 @@
1
+ """Databricks ingestion — turn failed Job tasks into NormalizedFailureEvents.
2
+
3
+ Prefers the exact parent run handed in by the trigger task; otherwise discovers
4
+ the most recent failed run of the protected job. The healing-trigger task itself
5
+ is always excluded so the system never reports on itself.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from iic.ingestion.base import FailureSource
11
+ from iic.models.event import EventSource, NormalizedFailureEvent
12
+
13
# Task keys that are always skipped during failure discovery, so the trigger
# task is never reported as a failure itself.
TRIGGER_TASK_KEY = "trigger_incident_intelligence"
# Second excluded key — presumably the trigger task's earlier name; confirm.
_LEGACY_TRIGGER_KEY = "trigger_self_healing"
15
+
16
+
17
class DatabricksFailureSource(FailureSource):
    """Discovers failed tasks via the Databricks Jobs/Runs API.

    Discovery is best-effort: API errors are logged at DEBUG level and treated
    as "nothing found" rather than raised.
    """

    def __init__(self, client, job_id: str, parent_run_id: str = "", lookback: int = 5):
        self.client = client
        self.job_id = job_id
        # Normalised through str() so an integer run id is accepted as well.
        self.parent_run_id = str(parent_run_id or "").strip()
        self.lookback = lookback
        # Populated as a side effect of discover(), consumed by later stages.
        self.run_info: dict = {}

    @staticmethod
    def _note_swallowed(action: str) -> None:
        """Log the exception currently being handled, without re-raising it."""
        import logging

        logging.getLogger(__name__).debug(
            "%s failed; continuing best-effort", action, exc_info=True
        )

    def _failed_tasks(self, run: dict) -> list[dict]:
        """Return the FAILED tasks of ``run``, excluding the trigger task keys."""
        excluded = (TRIGGER_TASK_KEY, _LEGACY_TRIGGER_KEY)
        return [
            t for t in (run.get("tasks") or [])
            # ``or {}`` also covers an explicit null ``state`` in the payload.
            if (t.get("state") or {}).get("result_state") == "FAILED"
            and t.get("task_key") not in excluded
        ]

    def _locate_run(self) -> tuple[list[dict], dict]:
        """Return ``(failed_tasks, run)`` for the run to report on.

        The explicit parent run is tried first. If it is absent, unreadable or
        has no failed tasks, the job's recent runs (up to ``lookback``) are
        scanned in the order the client returns them. ``([], {})`` means
        nothing was found.
        """
        if self.parent_run_id:
            try:
                run = self.client._get("/api/2.1/jobs/runs/get", {"run_id": int(self.parent_run_id)})
                ft = self._failed_tasks(run)
                if ft:
                    return ft, run
            except Exception:
                self._note_swallowed("parent run lookup")
        if not self.job_id:
            return [], {}
        try:
            for run in self.client.list_runs(int(self.job_id), limit=self.lookback):
                ft = self._failed_tasks(run)
                if ft:
                    return ft, run
        except Exception:
            self._note_swallowed("recent run listing")
        return [], {}

    def discover(self) -> list[NormalizedFailureEvent]:
        """Return one normalized event per failed task of the located run.

        Also stores the located run (or ``{}``) in ``self.run_info``.
        """
        failed, run = self._locate_run()
        self.run_info = run or {}
        run_id = str(run.get("run_id", "")) if run else ""
        pipeline = run.get("run_name", "") if run else ""
        events: list[NormalizedFailureEvent] = []
        for task in failed:
            task_run_id = str(task.get("run_id", ""))
            error_message, error_trace = "", ""
            if task_run_id:
                try:
                    out = self.client.get_run_output(int(task_run_id))
                    error_message = (out.get("error") or "").strip()
                    error_trace = (out.get("error_trace") or "").strip()
                except Exception:
                    # The event is still emitted, with a placeholder message.
                    self._note_swallowed("run output fetch")
            cluster = task.get("cluster_instance") or {}
            notebook = task.get("notebook_task") or {}
            events.append(NormalizedFailureEvent(
                source=EventSource.DATABRICKS,
                pipeline=pipeline or str(self.job_id),
                task=task.get("task_key", "unknown"),
                error_message=error_message or "(no error message returned)",
                error_trace=error_trace,
                timestamp=_iso_from_ms(task.get("start_time")),
                run_id=run_id,
                job_id=str(self.job_id),
                cluster_id=cluster.get("cluster_id", ""),
                notebook_path=notebook.get("notebook_path", ""),
            ))
        return events
90
+
91
+
92
def _iso_from_ms(ms) -> str:
    """Render an epoch-milliseconds value as an ISO-8601 UTC timestamp.

    Accepts anything ``int()`` can convert. Returns ``""`` when ``ms`` is
    missing, not numeric, or outside the range the platform can represent.
    """
    from datetime import datetime, timezone

    try:
        return datetime.fromtimestamp(int(ms) / 1000.0, tz=timezone.utc).isoformat()
    except (TypeError, ValueError, OverflowError, OSError):
        # Only conversion failures are expected here; anything else is a real
        # bug and should surface.
        return ""
@@ -0,0 +1,23 @@
1
+ """A failure source backed by already-known events.
2
+
3
+ The v4 wrapper (``iic_monitor``) and the Spark listener catch a failure *as it
4
+ happens* — they already hold the error, so there's nothing to "discover" from the
5
+ Jobs API. They hand the engine a pre-built :class:`NormalizedFailureEvent` through
6
+ this source instead of polling.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from iic.ingestion.base import FailureSource
12
+ from iic.models.event import NormalizedFailureEvent
13
+
14
+
15
class StaticFailureSource(FailureSource):
    """Serve a pre-built list of events; ``run_info`` carries the job DAG if known."""

    def __init__(self, events: list[NormalizedFailureEvent], run_info: dict | None = None):
        self.run_info = run_info or {}
        self._events = events

    def discover(self) -> list[NormalizedFailureEvent]:
        """Return a fresh list holding the stored events."""
        return [*self._events]