shkit 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- healing_kit/__init__.py +3 -0
- healing_kit/auth.py +79 -0
- healing_kit/clients/__init__.py +1 -0
- healing_kit/clients/databricks_client.py +183 -0
- healing_kit/clients/teams_client.py +128 -0
- healing_kit/models/__init__.py +1 -0
- healing_kit/models/diagnosis.py +45 -0
- healing_kit/models/events.py +30 -0
- healing_kit/models/evidence.py +83 -0
- healing_kit/runtime/__init__.py +6 -0
- healing_kit/runtime/approval.py +141 -0
- healing_kit/runtime/maintenance.py +52 -0
- healing_kit/services/__init__.py +1 -0
- healing_kit/services/cache_service.py +120 -0
- healing_kit/services/circuit_breaker.py +114 -0
- healing_kit/services/context_agent.py +127 -0
- healing_kit/services/dependency_graph.py +141 -0
- healing_kit/services/diagnosis_engine.py +165 -0
- healing_kit/services/identity.py +61 -0
- healing_kit/services/model_router.py +52 -0
- healing_kit/services/query_guard.py +168 -0
- healing_kit/services/resolution_verifier.py +100 -0
- healing_kit/services/token_budget.py +137 -0
- healing_kit/utils/__init__.py +1 -0
- healing_kit/utils/error_hash.py +15 -0
- healing_kit/utils/hmac_tokens.py +86 -0
- healing_kit/utils/sql_safety.py +84 -0
- iic/__init__.py +51 -0
- iic/__main__.py +18 -0
- iic/_console.py +235 -0
- iic/_doctor.py +143 -0
- iic/change/__init__.py +7 -0
- iic/change/change_detector.py +154 -0
- iic/context/__init__.py +7 -0
- iic/context/context_builder.py +117 -0
- iic/dependency/__init__.py +7 -0
- iic/dependency/dependency_analyzer.py +93 -0
- iic/diagnosis/__init__.py +7 -0
- iic/diagnosis/diagnosis_engine.py +183 -0
- iic/dna/__init__.py +7 -0
- iic/dna/dna_builder.py +184 -0
- iic/impact/__init__.py +7 -0
- iic/impact/impact_engine.py +102 -0
- iic/ingestion/__init__.py +14 -0
- iic/ingestion/base.py +21 -0
- iic/ingestion/databricks_source.py +98 -0
- iic/ingestion/static_source.py +23 -0
- iic/ingestion/webhook_source.py +39 -0
- iic/models/__init__.py +44 -0
- iic/models/change.py +77 -0
- iic/models/context.py +46 -0
- iic/models/diagnosis.py +37 -0
- iic/models/dna.py +77 -0
- iic/models/event.py +78 -0
- iic/models/impact.py +60 -0
- iic/models/report.py +88 -0
- iic/models/routing.py +41 -0
- iic/notify/__init__.py +7 -0
- iic/notify/teams_notifier.py +112 -0
- iic/report/__init__.py +7 -0
- iic/report/report_generator.py +67 -0
- iic/routing/__init__.py +7 -0
- iic/routing/router.py +42 -0
- iic/runtime/__init__.py +10 -0
- iic/runtime/_sql.py +11 -0
- iic/runtime/agent_config.py +48 -0
- iic/runtime/agent_runtime.py +70 -0
- iic/runtime/antibodies.py +100 -0
- iic/runtime/bootstrap.py +157 -0
- iic/runtime/constants.py +40 -0
- iic/runtime/context.py +46 -0
- iic/runtime/detective.py +72 -0
- iic/runtime/hooks.py +85 -0
- iic/runtime/incident_engine.py +207 -0
- iic/runtime/inprocess.py +350 -0
- iic/runtime/ledger.py +120 -0
- iic/runtime/monitor.py +155 -0
- iic/runtime/pattern_store.py +53 -0
- iic/runtime/reconciler.py +139 -0
- iic/runtime/scope_config.py +127 -0
- iic/runtime/store.py +150 -0
- iic/runtime/wrapper.py +28 -0
- iic_autoload.pth +1 -0
- onboarding/__init__.py +1 -0
- onboarding/cli.py +168 -0
- onboarding/config_schema.py +62 -0
- onboarding/manifest.py +27 -0
- onboarding/preflight.py +129 -0
- onboarding/provisioner.py +573 -0
- onboarding/rollback.py +81 -0
- shkit-1.2.0.dist-info/METADATA +239 -0
- shkit-1.2.0.dist-info/RECORD +94 -0
- shkit-1.2.0.dist-info/WHEEL +4 -0
- shkit-1.2.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""Stage 11 — push incident reports to Teams (optional, informational).
|
|
2
|
+
|
|
3
|
+
The new product *explains and prioritises*; it does not auto-fix, so the card has
|
|
4
|
+
no Approve/Reject buttons — it carries the intelligence (severity-ranked summary,
|
|
5
|
+
root cause, impact, changes, evidence, suggested fix) and a deep link to the run.
|
|
6
|
+
|
|
7
|
+
``build_incident_card`` is a pure function (no I/O) so the card layout is unit
|
|
8
|
+
tested; ``TeamsNotifier.send`` is the thin transport wrapper.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from iic.models.impact import Severity
|
|
14
|
+
from iic.models.report import IncidentReport
|
|
15
|
+
|
|
16
|
+
_SEVERITY_RANK = {Severity.CRITICAL: 0, Severity.HIGH: 1, Severity.MEDIUM: 2, Severity.LOW: 3}
|
|
17
|
+
_SEVERITY_EMOJI = {
|
|
18
|
+
Severity.CRITICAL: "\U0001f534", # red
|
|
19
|
+
Severity.HIGH: "\U0001f7e0", # orange
|
|
20
|
+
Severity.MEDIUM: "\U0001f7e1", # yellow
|
|
21
|
+
Severity.LOW: "\U0001f7e2", # green
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _ledger_block(state: str, entry: dict | None) -> dict:
    """Render the Antibody Ledger line for one incident as a TextBlock.

    ``state`` is one of "resolved", "known_unresolved" or anything else (treated
    as "new"); ``entry`` is the ledger entry dict, or None when there is none.
    """
    details = entry or {}
    seen = details.get("times_seen", 0)
    if state == "resolved":
        resolution = str(details.get("resolution", "")).strip()
        text = (f"♻️ **Known issue** (seen {seen}×) — "
                f"Recorded fix that worked for a similar issue: {resolution}")
    elif state == "known_unresolved":
        text = f"⚠️ **Recurring issue** (seen {seen}×) — no resolution recorded yet"
    else:  # new
        text = "\U0001f195 **New issue** — no resolution recorded yet"
    return {"type": "TextBlock", "wrap": True, "text": text}


def build_incident_card(reports: list[IncidentReport], run_id: str = "",
                        host: str = "", job_id: str = "", antibodies: dict | None = None) -> dict:
    """Build a single severity-ranked Adaptive Card for a batch of incidents.

    Pure function (no I/O). Reports are sorted by ``_SEVERITY_RANK`` (unknown
    severities last; the sort is stable, so ties keep their input order).

    ``antibodies`` (optional) maps ``pattern_id -> (state, entry)`` from the
    Antibody Ledger, where ``state`` ∈ {"resolved", "known_unresolved", "new"}.
    When a report's pattern is present, a ledger block is added ABOVE the machine
    "Suggested fix" (which always stays — it is the deterministic suggestion; the
    ledger line is the separate, human-confirmed answer). Callers that don't pass
    ``antibodies`` get the card without any ledger block.

    A "View Run" action is attached only when ``host``, ``job_id`` and ``run_id``
    are all non-empty, so the deep link is never built with an empty path segment.

    Returns the Teams message envelope: ``{"type": "message", "attachments": [...]}``.
    """
    ranked = sorted(reports, key=lambda r: _SEVERITY_RANK.get(r.impact.severity, 9))
    ledger = antibodies or {}

    body = [
        {"type": "TextBlock", "size": "Large", "weight": "Bolder",
         "text": "\U0001f9e0 Incident Intelligence Report"},
        {"type": "TextBlock", "isSubtle": True, "wrap": True,
         "text": f"Run {run_id} · {len(ranked)} incident(s), highest severity first"},
        {"type": "TextBlock", "separator": True, "text": "---"},
    ]

    for i, r in enumerate(ranked, 1):
        emoji = _SEVERITY_EMOJI.get(r.impact.severity, "")
        body.append({"type": "TextBlock", "weight": "Bolder", "size": "Medium", "separator": True, "wrap": True,
                     "text": f"{emoji} {i}. [{r.impact.severity.value}] {r.task} — {r.dna.failure_type.value}"})
        # round(), not int(): 0.57 * 100 is 56.999… in binary floating point and
        # truncation would display 56%.
        confidence_pct = round(r.diagnosis.confidence * 100)
        body.append({"type": "TextBlock", "wrap": True,
                     "text": f"**Root cause** ({confidence_pct}%): {r.diagnosis.root_cause}"})
        body.append({"type": "FactSet", "facts": [
            {"title": "Business risk", "value": r.impact.business_risk.value},
            {"title": "Blast radius", "value": f"{r.impact.downstream_jobs} jobs · "
                                               f"{r.impact.affected_tables} tables · "
                                               f"{r.impact.dashboard_impact} dashboards"},
            {"title": "Layer", "value": r.dna.affected_layer},
            {"title": "Diagnosed by", "value": r.diagnosis.produced_by},
        ]})
        if r.changes:
            # Only the first three change summaries are shown to keep the card short.
            body.append({"type": "TextBlock", "wrap": True,
                         "text": "**Recent changes:** " + "; ".join(r.changes[:3])})
        ab = ledger.get(r.dna.pattern_id)
        if ab:
            state, entry = ab
            body.append(_ledger_block(state, entry))
        if r.diagnosis.suggested_fix:
            body.append({"type": "TextBlock", "wrap": True,
                         "text": f"**Suggested fix:** {r.diagnosis.suggested_fix}"})

    actions = []
    if host and job_id and run_id:
        actions.append({"type": "Action.OpenUrl", "title": "\U0001f50d View Run",
                        "url": f"{host.rstrip('/')}/#job/{job_id}/run/{run_id}"})

    content = {
        "$schema": "http://adaptivecards.io/schemas/adaptive-card.json",
        "type": "AdaptiveCard", "version": "1.4", "body": body,
    }
    if actions:
        content["actions"] = actions
    return {"type": "message", "attachments": [
        {"contentType": "application/vnd.microsoft.card.adaptive", "content": content}]}
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class TeamsNotifier:
    """Thin transport wrapper that POSTs an incident card to a Teams webhook."""

    def __init__(self, webhook_url: str = ""):
        # An empty URL disables sending: ``send`` then returns False immediately.
        self.webhook_url = webhook_url

    def send(self, reports: list[IncidentReport], run_id: str = "",
             host: str = "", job_id: str = "", antibodies: dict | None = None) -> bool:
        """Build the card for ``reports`` and POST it to the webhook.

        ``antibodies`` is forwarded unchanged to ``build_incident_card``; omitting
        it produces the card without any ledger block.

        Returns True only when the webhook answers 200 or 202. Returns False when
        there is no webhook URL, no reports, or anything fails while importing
        ``requests``, building the card or posting it. Notification is
        best-effort, so this method never raises.
        """
        if not (self.webhook_url and reports):
            return False
        try:
            # Imported lazily so the module can be loaded without ``requests``;
            # kept inside the try so a missing dependency or a malformed report
            # degrades to "not sent" instead of raising into the caller.
            import requests
            card = build_incident_card(reports, run_id=run_id, host=host,
                                       job_id=job_id, antibodies=antibodies)
            r = requests.post(self.webhook_url, json=card,
                              headers={"Content-Type": "application/json"}, timeout=15)
            return r.status_code in (200, 202)
        except Exception:
            return False
|
iic/report/__init__.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Stage 10 — assemble the final IncidentReport.
|
|
2
|
+
|
|
3
|
+
Pure assembly: it stitches the products of every prior stage into one
|
|
4
|
+
:class:`IncidentReport`, building a one-line executive summary and a timeline.
|
|
5
|
+
No I/O, no LLM — given the same inputs it always produces the same report.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from iic.models.change import ChangeDiffObject
|
|
11
|
+
from iic.models.context import IncidentContextBundle
|
|
12
|
+
from iic.models.diagnosis import DiagnosisResult
|
|
13
|
+
from iic.models.dna import IncidentDNA
|
|
14
|
+
from iic.models.event import NormalizedFailureEvent
|
|
15
|
+
from iic.models.impact import ImpactScore
|
|
16
|
+
from iic.models.report import IncidentReport
|
|
17
|
+
from iic.models.routing import RoutingDecision
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ReportGenerator:
    """Stage-10 assembler: combines the outputs of earlier stages into one report."""

    def build(
        self,
        incident_id: str,
        event: NormalizedFailureEvent,
        dna: IncidentDNA,
        impact: ImpactScore,
        diagnosis: DiagnosisResult,
        routing: RoutingDecision | None = None,
        context: IncidentContextBundle | None = None,
        change_diff: ChangeDiffObject | None = None,
        evidence: list[str] | None = None,
    ) -> IncidentReport:
        """Assemble an ``IncidentReport`` from the stage products.

        ``context`` is accepted but not used by this method. ``evidence`` is
        copied into a new list; ``change_diff.summaries()`` supplies the change
        lines when a diff is given, otherwise the list is empty.
        """
        headline = self._summary(event, dna, impact)
        evidence_items = list(evidence or [])
        change_lines = change_diff.summaries() if change_diff else []
        events_log = self._timeline(event, dna, diagnosis)
        return IncidentReport(
            incident_id=incident_id,
            pipeline=event.pipeline,
            task=event.task,
            timestamp=event.timestamp,
            summary=headline,
            dna=dna,
            impact=impact,
            diagnosis=diagnosis,
            routing=routing,
            evidence=evidence_items,
            changes=change_lines,
            timeline=events_log,
        )

    @staticmethod
    def _summary(event: NormalizedFailureEvent, dna: IncidentDNA, impact: ImpactScore) -> str:
        """Return the one-line executive summary for the incident."""
        # Failure type is shown in lower case with underscores turned into spaces.
        failure_label = dna.failure_type.value.replace("_", " ").lower()
        where = f"{impact.severity.value} {failure_label} in {event.pipeline}.{event.task} "
        blast = f"({dna.affected_layer} layer) — {impact.downstream_jobs} downstream job(s), "
        risk = f"{impact.dashboard_impact} dashboard(s) at risk; business risk {impact.business_risk.value}."
        return where + blast + risk

    @staticmethod
    def _timeline(event: NormalizedFailureEvent, dna: IncidentDNA, diagnosis: DiagnosisResult) -> list[str]:
        """Return the ordered timeline lines; optional entries are skipped when empty."""
        entries = [f"{event.timestamp} — task '{event.task}' failed"] if event.timestamp else []
        if dna.root_signal:
            entries.append(f"signal detected — {dna.root_signal}")
        entries += [
            f"classified as {dna.failure_type.value} ({dna.confidence_signature} confidence)",
            f"diagnosed by {diagnosis.produced_by}",
        ]
        return entries
|
iic/routing/__init__.py
ADDED
iic/routing/router.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Stage 8 — decide which model (if any) the diagnosis stage uses.
|
|
2
|
+
|
|
3
|
+
Deterministic and auditable: impact + cache state + DNA confidence pick the tier,
|
|
4
|
+
never the LLM. The goal is to spend the expensive model only where it earns its
|
|
5
|
+
cost — high-severity or low-confidence incidents — and to skip the LLM entirely
|
|
6
|
+
when the answer is already known (cache) or structurally obvious (a derived
|
|
7
|
+
dependency failure).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from iic.models.dna import FailureType, IncidentDNA
|
|
13
|
+
from iic.models.impact import ImpactScore, Severity
|
|
14
|
+
from iic.models.routing import ModelTier, RoutingDecision
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class IncidentModelRouter:
    """Deterministic chooser of the model tier used by the diagnosis stage."""

    def __init__(self, lightweight_model: str, powerful_model: str):
        self.lightweight_model = lightweight_model
        self.powerful_model = powerful_model

    def route(self, dna: IncidentDNA, impact: ImpactScore, cache_hit: bool = False) -> RoutingDecision:
        """Return the routing decision for one incident.

        Checked in order: a cache hit and a dependency-type failure both skip the
        LLM (``ModelTier.NONE``); HIGH/CRITICAL severity or a "low" DNA confidence
        selects the powerful model; everything else gets the lightweight model.
        """
        if cache_hit:
            return RoutingDecision(ModelTier.NONE,
                                   reason="Known pattern — cached resolution replayed (zero tokens).",
                                   cache_hit=True)

        # A derived dependency failure needs no LLM: the real cause is the upstream
        # incident, which gets its own report.
        if dna.failure_type == FailureType.DEPENDENCY:
            return RoutingDecision(ModelTier.NONE,
                                   reason="Derived from an upstream failure — diagnosed deterministically.")

        # High stakes or low confidence → powerful model; otherwise the cheap one.
        escalate = impact.severity in (Severity.HIGH, Severity.CRITICAL) or dna.confidence_signature == "low"
        if escalate:
            tier, model = ModelTier.POWERFUL, self.powerful_model
        else:
            tier, model = ModelTier.LIGHTWEIGHT, self.lightweight_model
        rationale = f"severity={impact.severity.value}, dna_confidence={dna.confidence_signature}"
        return RoutingDecision(tier, model=model, reason=rationale)
|
iic/runtime/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""IIC runtime package.
|
|
2
|
+
|
|
3
|
+
Intentionally EMPTY of eager imports: the self-arming `.pth` does
|
|
4
|
+
``import iic.runtime.bootstrap``, which imports this package first. If this module
|
|
5
|
+
eagerly imported the engine, every interpreter (incl. a serverless kernel boot)
|
|
6
|
+
would pull the whole engine at startup — slow, and a kernel-restart risk. Import
|
|
7
|
+
the engine lazily from :mod:`iic.runtime.incident_engine` where you need it.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
iic/runtime/_sql.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Tiny re-export so IIC runtime has one safe SQL-literal helper.
|
|
2
|
+
|
|
3
|
+
Reuses the audited escaping from the shared library rather than re-implementing
|
|
4
|
+
string interpolation (the original codebase's #1 injection foot-gun).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from healing_kit.utils.sql_safety import sql_literal as lit
|
|
10
|
+
|
|
11
|
+
__all__ = ["lit"]
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Environment-driven config for the v4 Databricks-native agent.
|
|
2
|
+
|
|
3
|
+
All knobs come from env vars so the same wheel behaves correctly whether it runs
|
|
4
|
+
inside a job, a notebook, or the reconciler. Sensible free defaults: LLM off,
|
|
5
|
+
Postgres optional.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _b(name: str, default: bool) -> bool:
    """Read env var ``name`` as a boolean.

    True for "1", "true", "yes" or "on" (case-insensitive, surrounding whitespace
    ignored); any other set value is False. An unset variable yields ``default``.
    """
    return os.environ.get(name, str(default)).strip().lower() in ("1", "true", "yes", "on")


def _i(name: str, default: int) -> int:
    """Read env var ``name`` as an int; unset, blank or malformed → ``default``.

    Falling back instead of raising keeps a typo in one tuning knob from
    disabling the whole agent.
    """
    try:
        return int(os.environ.get(name, "").strip() or default)
    except ValueError:
        return default


@dataclass
class AgentConfig:
    """Environment-driven settings for the agent; see ``from_env`` for the variable names."""

    databricks_host: str = ""
    databricks_token: str = ""          # usually the ambient cluster token
    teams_webhook_url: str = ""
    slack_webhook_url: str = ""
    llm_enabled: bool = False
    lightweight_model: str = "databricks-meta-llama-3-3-70b-instruct"
    powerful_model: str = "databricks-claude-opus-4-8"
    pg_dsn: str = ""                    # empty → no persistence/dedup (wrapper-only)
    poll_interval_seconds: int = 90     # reconciler
    lookback_hours: int = 24            # reconciler backfill window
    max_jobs_scanned: int = 50
    notify: bool = True

    @classmethod
    def from_env(cls) -> "AgentConfig":
        """Build a config from environment variables.

        Unset variables fall back to the field defaults declared on the class, so
        each default lives in one place. A trailing "/" is stripped from
        ``DATABRICKS_HOST``. Integer knobs that are blank or not a valid integer
        also fall back to their defaults rather than raising.
        """
        env = os.environ.get
        return cls(
            databricks_host=env("DATABRICKS_HOST", "").rstrip("/"),
            databricks_token=env("DATABRICKS_TOKEN", ""),
            teams_webhook_url=env("TEAMS_WEBHOOK_URL", ""),
            slack_webhook_url=env("SLACK_WEBHOOK_URL", ""),
            llm_enabled=_b("LLM_ENABLED", cls.llm_enabled),
            lightweight_model=env("LIGHTWEIGHT_MODEL", cls.lightweight_model),
            powerful_model=env("POWERFUL_MODEL", cls.powerful_model),
            pg_dsn=env("POSTGRES_DSN", ""),
            poll_interval_seconds=_i("POLL_INTERVAL_SECONDS", cls.poll_interval_seconds),
            lookback_hours=_i("LOOKBACK_HOURS", cls.lookback_hours),
            max_jobs_scanned=_i("MAX_JOBS_SCANNED", cls.max_jobs_scanned),
            notify=_b("IIC_NOTIFY", cls.notify),
        )
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Shared analysis path for the v4 agent — used by both the wrapper and the reconciler.
|
|
2
|
+
|
|
3
|
+
Given one or more already-known failure events, run the unchanged v2
|
|
4
|
+
``IncidentEngine``, dedup by fingerprint, persist, and (the engine itself) notify.
|
|
5
|
+
Keeping this in one place means the two detection sources produce identical
|
|
6
|
+
incidents and can't diverge.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from iic.ingestion.static_source import StaticFailureSource
|
|
12
|
+
from iic.models.event import NormalizedFailureEvent
|
|
13
|
+
from iic.runtime.agent_config import AgentConfig
|
|
14
|
+
from iic.runtime.store import fingerprint
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def analyze_events(events: list[NormalizedFailureEvent], *, run_info: dict, client,
                   config: AgentConfig, store) -> list:
    """Analyze fresh (non-duplicate) events; return the IncidentReports produced.

    An event is dropped when ``store.already_seen`` reports its fingerprint, or
    when an earlier event in this same batch had the same fingerprint. The
    remaining events are run through the engine built by ``_build_engine`` and
    every resulting report is persisted with ``store.save_incident``.

    Returns ``[]`` when nothing is fresh; otherwise ``result.reports`` from the
    engine run.
    """
    # Dedup against what either source already processed, and within this batch.
    # NOTE(review): assumes fingerprint() returns a hashable value (it is used as
    # a set member below) — confirm against iic.runtime.store.
    fresh, fp_by_task, batch_fps = [], {}, set()
    for ev in events or ():
        fp = fingerprint(ev.run_id, ev.task, ev.error_message)
        if fp in batch_fps or store.already_seen(fp):
            continue
        batch_fps.add(fp)
        fresh.append(ev)
        fp_by_task[ev.task] = fp
    if not fresh:
        return []

    engine = _build_engine(fresh, run_info, client, config, store)
    result = engine.run()

    run_id = _run_id(run_info)  # constant for the whole batch
    for report in result.reports:
        # Reports whose task has no ingested event (none is recorded in
        # fp_by_task) get a fingerprint derived from the run and task alone.
        fp = fp_by_task.get(report.task) or fingerprint(run_id, report.task, "")
        store.save_incident(
            incident_id=report.incident_id, run_id=run_id,
            task_key=report.task, fingerprint=fp, pattern_id=report.dna.pattern_id,
            failure_type=report.dna.failure_type.value, severity=report.impact.severity.value,
            root_cause=report.diagnosis.root_cause, confidence=report.diagnosis.confidence,
            produced_by=report.diagnosis.produced_by, report_json=report.to_dict())
    return result.reports
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _build_engine(events, run_info, client, config: AgentConfig, store):
    """Wire an ``IncidentEngine`` over ``events`` using the given client, config and store.

    Engine pieces are imported here rather than at module load. The diagnosis
    engine receives the client only when ``config.llm_enabled`` is true; the job
    id is taken from the first event ("" when there are no events).
    """
    from iic.change.change_detector import ChangeDetector
    from iic.context.context_builder import ContextBuilder
    from iic.dependency.dependency_analyzer import DependencyAnalyzer
    from iic.diagnosis.diagnosis_engine import DiagnosisEngine
    from iic.runtime.incident_engine import EngineConfig, IncidentEngine

    first_job_id = events[0].job_id if events else ""
    engine_cfg = EngineConfig(
        job_id=first_job_id,
        host=config.databricks_host,
        lightweight_model=config.lightweight_model,
        powerful_model=config.powerful_model,
        teams_webhook=config.teams_webhook_url,
        notify=config.notify,
    )
    # With the LLM switched off the diagnosis stage gets no client at all.
    llm_client = client if config.llm_enabled else None
    source = StaticFailureSource(events, run_info=run_info)
    return IncidentEngine(
        engine_cfg,
        source=source,
        context_builder=ContextBuilder(client=client),
        dependency_analyzer=DependencyAnalyzer(client=client),
        change_detector=ChangeDetector(client=client),
        diagnosis_engine=DiagnosisEngine(client=llm_client),
        pattern_store=store,
    )
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _run_id(run_info: dict) -> str:
|
|
70
|
+
return str((run_info or {}).get("run_id", ""))
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Antibody Ledger (runtime side) — per-tenant memory of failure patterns and the
|
|
2
|
+
human-recorded resolutions that worked.
|
|
3
|
+
|
|
4
|
+
The runtime is **read-only** on the shared ``antibodies.yaml`` (humans edit it in
|
|
5
|
+
git via the GitHub web editor; the sync workflows copy it to/from the Volume). The
|
|
6
|
+
only thing the runtime *writes* is a tiny per-occurrence marker file into
|
|
7
|
+
``.iic_pending/`` — mirroring the proven ``.iic_seen/`` dedup-marker mechanism, so
|
|
8
|
+
two concurrent processes never contend on a single shared file (one file per
|
|
9
|
+
process per occurrence). The pull workflow folds those markers into the git ledger.
|
|
10
|
+
|
|
11
|
+
HARD RULES (identical to the rest of the failure path):
|
|
12
|
+
* fail-open everywhere — any read/write error → behave exactly as today;
|
|
13
|
+
* never raise into the workload;
|
|
14
|
+
* lazy heavy imports (yaml) inside the functions that need them.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
import os
|
|
21
|
+
import re
|
|
22
|
+
|
|
23
|
+
from iic.runtime.constants import ANTIBODIES_FILENAME, DEFAULT_CONFIG_PATH, PENDING_DIRNAME
|
|
24
|
+
|
|
25
|
+
# Redaction patterns for the human-readable ``example`` sub-field. The ledger KEY
# is the already-generic pattern_id; the example is only context for the human
# writing a resolution, so it must never carry a credential or PII.
_SECRET_RE = re.compile(r"(?i)(token|secret|password|passwd|pwd|api[_-]?key|key)\s*[=:]\s*\S+")
_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}")
_URL_CRED_RE = re.compile(r"([A-Za-z][A-Za-z0-9+.\-]*://)[^\s:/@]+:[^\s:/@]+@")
# "Bearer <credential>" as echoed from an Authorization header; _SECRET_RE does
# not match it because no keyword is followed by "=" or ":".
_BEARER_RE = re.compile(r"(?i)\b(bearer)\s+[A-Za-z0-9._~+/=\-]+")
# Bare Databricks personal access tokens ("dapi" followed by 32 alphanumerics).
# NOTE(review): token shape taken from commonly published formats — confirm.
_DBX_PAT_RE = re.compile(r"\bdapi[0-9A-Za-z]{32}(?:-\d+)?")


def sanitize_example(short_error: str) -> str:
    """First line only, ≤120 chars, with obvious secrets / emails / URL creds redacted.

    Non-string input (for example an exception object) is converted with ``str``
    first; falsy input yields "". Redaction runs before truncation so that a
    secret straddling the 120-character cut is still removed whole. Any
    unexpected error yields "" — this helper never raises.
    """
    try:
        raw = short_error or ""
        if not isinstance(raw, str):
            raw = str(raw)
        lines = raw.strip().splitlines()
        text = lines[0] if lines else ""
        text = _URL_CRED_RE.sub(r"\1***@", text)
        text = _BEARER_RE.sub(r"\1 ***", text)
        text = _DBX_PAT_RE.sub("***", text)
        text = _SECRET_RE.sub(r"\1=***", text)
        text = _EMAIL_RE.sub("***", text)
        return text[:120]
    except Exception:
        return ""
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _base(base_dir: str) -> str:
|
|
47
|
+
return base_dir or os.path.dirname(DEFAULT_CONFIG_PATH)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def load_ledger(base_dir: str) -> dict:
    """Read ``{base_dir}/antibodies.yaml`` (the volume_path anchor). Absent/corrupt → {}.

    The result is always a dict: a missing file, a YAML document whose top level
    is not a mapping, or any read/parse error all yield ``{}``. Never raises.
    """
    try:
        path = os.path.join(_base(base_dir), ANTIBODIES_FILENAME)
        if not os.path.exists(path):
            return {}
        import yaml  # lazy: only needed once a ledger file actually exists
        # Context manager so the handle is closed deterministically; explicit
        # UTF-8 so parsing does not depend on the process locale.
        with open(path, encoding="utf-8") as fh:
            data = yaml.safe_load(fh) or {}
        return data if isinstance(data, dict) else {}
    except Exception:
        return {}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def lookup(ledger: dict, pattern_id: str):
    """Return ``(state, entry)`` with ``state`` ∈ {"resolved", "known_unresolved", "new"}.

    "new" (with ``None``) when the ledger is not a dict or holds no dict entry for
    ``pattern_id``; "resolved" when the entry has a non-blank ``resolution``;
    "known_unresolved" otherwise.
    """
    if not isinstance(ledger, dict):
        return "new", None
    entry = ledger.get(pattern_id)
    if not isinstance(entry, dict):
        return "new", None
    # A missing, None or whitespace-only resolution all count as "not recorded".
    has_fix = bool(str(entry.get("resolution", "") or "").strip())
    state = "resolved" if has_fix else "known_unresolved"
    return state, entry
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def record_occurrence(base_dir: str, pattern_id: str, example: str) -> None:
    """Write one small JSON marker file into the pending directory under ``base_dir``.

    The file name combines a filesystem-safe form of ``pattern_id`` (at most 80
    characters), the integer epoch second and eight random hex characters, so
    each call writes its own file and the shared ledger is never touched. An
    empty ``pattern_id`` is a no-op. Fail-open: any error is swallowed.
    """
    try:
        if pattern_id:
            import time
            import uuid
            target_dir = os.path.join(_base(base_dir), PENDING_DIRNAME)
            os.makedirs(target_dir, exist_ok=True)
            # Anything outside [A-Za-z0-9_.-] becomes "_" to keep the name portable.
            slug = re.sub(r"[^A-Za-z0-9_.\-]", "_", str(pattern_id))[:80]
            stamp = time.time()
            marker_name = f"{slug}__{int(stamp)}_{uuid.uuid4().hex[:8]}.json"
            record = {"pattern_id": str(pattern_id), "example": example or "", "ts": _iso(stamp)}
            serialized = json.dumps(record)
            with open(os.path.join(target_dir, marker_name), "w") as out:
                out.write(serialized)
    except Exception:
        pass
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _iso(epoch: float) -> str:
|
|
96
|
+
try:
|
|
97
|
+
import datetime
|
|
98
|
+
return datetime.datetime.fromtimestamp(epoch, datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
99
|
+
except Exception:
|
|
100
|
+
return ""
|
iic/runtime/bootstrap.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""Self-arming tripwire — imported by ``iic_autoload.pth`` at interpreter startup.
|
|
2
|
+
|
|
3
|
+
"Airbag, not camera": at every Python startup this arms in-process failure hooks
|
|
4
|
+
in microseconds, then does nothing. The engine runs ONLY at the moment an
|
|
5
|
+
unhandled exception occurs. On a successful run it leaves zero trace.
|
|
6
|
+
|
|
7
|
+
HARD RULES enforced here:
|
|
8
|
+
* stdlib-only imports at module load (os, sys, threading, builtins). The engine,
|
|
9
|
+
requests, yaml, etc. are imported lazily inside the failure handler.
|
|
10
|
+
* fail-open everywhere — a .pth import error would print noise at the start of
|
|
11
|
+
every process, so the whole module body is wrapped in try/except.
|
|
12
|
+
* never alter the user's failure — every hook runs our handler first (itself
|
|
13
|
+
fully guarded) then ALWAYS calls the previous hook, preserving the original
|
|
14
|
+
traceback/exit behavior.
|
|
15
|
+
* idempotent (``_ARMED``) and re-entrant.
|
|
16
|
+
* kill switch: ``IIC_DISABLE=1`` disarms entirely.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
try:  # the entire module must never raise at import (it runs in every process)
    import os
    import sys

    _ARMED = False
    _IPY_ARMED = False  # True once the IPython/notebook hook owns failure reporting

    def _handle_failure(exc_type, exc, tb):
        """Lazy, fully-guarded bridge to the in-process responder."""
        try:
            from iic.runtime.inprocess import process_local_failure
            process_local_failure(exc_type, exc, tb)
        except Exception:
            pass  # monitoring must never break or slow the workload

    # ── IPython / notebook arming (cells bypass sys.excepthook) ──

    def _arm_ipython():
        """Arm notebook-cell failure capture if a live IPython shell exists.

        Databricks notebook cells do NOT route exceptions through
        ``set_custom_exc`` (Databricks wraps cell execution itself), so the
        reliable hook is the ``post_run_cell`` event, which fires after every cell
        with ``result.error_in_exec`` set on failure. We register BOTH (custom_exc
        for plain Jupyter, post_run_cell for Databricks); the per-process
        fingerprint dedup ensures at most one incident if both fire.

        IPython is looked up in ``sys.modules`` and never imported here: a live
        shell can only exist once the host has imported IPython itself, and
        importing it from this module would load a heavy third-party package at
        the startup of every interpreter that merely has it installed.

        Returns True if armed (now or previously), False otherwise.
        """
        global _IPY_ARMED
        try:
            ipython = sys.modules.get("IPython")
            # getattr guards the window in which IPython is only partly imported.
            get_shell = getattr(ipython, "get_ipython", None)
            if get_shell is None:
                return False
            shell = get_shell()
            if shell is None:
                return False
            if getattr(shell, "_iic_armed", False):
                return True

            def _custom_exc(shell, etype, evalue, tb, tb_offset=None):
                _handle_failure(etype, evalue, tb)
                # Always hand over to IPython's normal traceback display.
                return shell.showtraceback((etype, evalue, tb), tb_offset=tb_offset)

            def _post_run_cell(result):
                try:
                    err = getattr(result, "error_in_exec", None)
                    if err is not None:
                        _handle_failure(type(err), err, getattr(err, "__traceback__", None))
                except Exception:
                    pass

            try:
                shell.set_custom_exc((BaseException,), _custom_exc)
            except Exception:
                pass
            try:
                shell.events.register("post_run_cell", _post_run_cell)  # the Databricks-reliable hook
            except Exception:
                pass

            shell._iic_armed = True
            _IPY_ARMED = True  # excepthook now defers to the notebook hook (avoid double cards)
            return True
        except Exception:
            return False

    def _install_ipython_watcher():
        """Arm the notebook hook once the IPython shell exists.

        IMPORTANT: do NOT wrap ``builtins.__import__`` — doing so fires on every
        import during kernel boot and re-imports IPython re-entrantly, which can
        deadlock the kernel startup (ERROR_RESTART_PYTHON). Instead, a tiny daemon
        thread polls for the shell, arms it once, and exits. It never touches the
        import machinery and never blocks the main (kernel) thread.
        """
        if _arm_ipython():
            return  # shell already live → armed now
        try:
            import threading
            import time

            def _poll():
                for _ in range(150):  # ~30s max, then give up (script-task path still covered)
                    try:
                        if _arm_ipython():
                            return
                    except Exception:
                        pass
                    time.sleep(0.2)

            threading.Thread(target=_poll, name="iic-ipython-arm", daemon=True).start()
        except Exception:
            pass

    # ── main arming ──

    def activate():
        """Install the failure hooks once per process.

        No-op when already armed, when ``IIC_DISABLE=1``, or when ``DB_IS_DRIVER``
        is set to anything other than "TRUE" (case-insensitive).
        """
        global _ARMED
        if _ARMED or os.environ.get("IIC_DISABLE") == "1":
            return
        # Never arm Spark executor processes; only the driver. On serverless the
        # var is absent, which means proceed.
        db_driver = os.environ.get("DB_IS_DRIVER")
        if db_driver is not None and db_driver.upper() != "TRUE":
            return
        _ARMED = True

        # sys.excepthook (script tasks / top-level) — run handler, then the previous hook.
        _prev_excepthook = sys.excepthook

        def _excepthook(exc_type, exc, tb):
            # In a notebook/job a live IPython shell exists and post_run_cell is the
            # single source of truth — the job wrapper ALSO hits sys.excepthook with
            # a re-raised exception, which would double-report (and misclassify). So
            # only report here when IPython is NOT armed (pure script / non-IPython).
            if not _IPY_ARMED:
                _handle_failure(exc_type, exc, tb)
            return _prev_excepthook(exc_type, exc, tb)

        sys.excepthook = _excepthook

        # notebook cells
        _install_ipython_watcher()

        # worker-thread failures (best-effort; threading.excepthook is 3.8+)
        try:
            import threading
            _prev_thread_hook = threading.excepthook

            def _thread_hook(args):
                _handle_failure(args.exc_type, args.exc_value, args.exc_traceback)
                return _prev_thread_hook(args)

            threading.excepthook = _thread_hook
        except Exception:
            pass

    activate()

except Exception:
    pass
|