shkit 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. healing_kit/__init__.py +3 -0
  2. healing_kit/auth.py +79 -0
  3. healing_kit/clients/__init__.py +1 -0
  4. healing_kit/clients/databricks_client.py +183 -0
  5. healing_kit/clients/teams_client.py +128 -0
  6. healing_kit/models/__init__.py +1 -0
  7. healing_kit/models/diagnosis.py +45 -0
  8. healing_kit/models/events.py +30 -0
  9. healing_kit/models/evidence.py +83 -0
  10. healing_kit/runtime/__init__.py +6 -0
  11. healing_kit/runtime/approval.py +141 -0
  12. healing_kit/runtime/maintenance.py +52 -0
  13. healing_kit/services/__init__.py +1 -0
  14. healing_kit/services/cache_service.py +120 -0
  15. healing_kit/services/circuit_breaker.py +114 -0
  16. healing_kit/services/context_agent.py +127 -0
  17. healing_kit/services/dependency_graph.py +141 -0
  18. healing_kit/services/diagnosis_engine.py +165 -0
  19. healing_kit/services/identity.py +61 -0
  20. healing_kit/services/model_router.py +52 -0
  21. healing_kit/services/query_guard.py +168 -0
  22. healing_kit/services/resolution_verifier.py +100 -0
  23. healing_kit/services/token_budget.py +137 -0
  24. healing_kit/utils/__init__.py +1 -0
  25. healing_kit/utils/error_hash.py +15 -0
  26. healing_kit/utils/hmac_tokens.py +86 -0
  27. healing_kit/utils/sql_safety.py +84 -0
  28. iic/__init__.py +51 -0
  29. iic/__main__.py +18 -0
  30. iic/_console.py +235 -0
  31. iic/_doctor.py +143 -0
  32. iic/change/__init__.py +7 -0
  33. iic/change/change_detector.py +154 -0
  34. iic/context/__init__.py +7 -0
  35. iic/context/context_builder.py +117 -0
  36. iic/dependency/__init__.py +7 -0
  37. iic/dependency/dependency_analyzer.py +93 -0
  38. iic/diagnosis/__init__.py +7 -0
  39. iic/diagnosis/diagnosis_engine.py +183 -0
  40. iic/dna/__init__.py +7 -0
  41. iic/dna/dna_builder.py +184 -0
  42. iic/impact/__init__.py +7 -0
  43. iic/impact/impact_engine.py +102 -0
  44. iic/ingestion/__init__.py +14 -0
  45. iic/ingestion/base.py +21 -0
  46. iic/ingestion/databricks_source.py +98 -0
  47. iic/ingestion/static_source.py +23 -0
  48. iic/ingestion/webhook_source.py +39 -0
  49. iic/models/__init__.py +44 -0
  50. iic/models/change.py +77 -0
  51. iic/models/context.py +46 -0
  52. iic/models/diagnosis.py +37 -0
  53. iic/models/dna.py +77 -0
  54. iic/models/event.py +78 -0
  55. iic/models/impact.py +60 -0
  56. iic/models/report.py +88 -0
  57. iic/models/routing.py +41 -0
  58. iic/notify/__init__.py +7 -0
  59. iic/notify/teams_notifier.py +112 -0
  60. iic/report/__init__.py +7 -0
  61. iic/report/report_generator.py +67 -0
  62. iic/routing/__init__.py +7 -0
  63. iic/routing/router.py +42 -0
  64. iic/runtime/__init__.py +10 -0
  65. iic/runtime/_sql.py +11 -0
  66. iic/runtime/agent_config.py +48 -0
  67. iic/runtime/agent_runtime.py +70 -0
  68. iic/runtime/antibodies.py +100 -0
  69. iic/runtime/bootstrap.py +157 -0
  70. iic/runtime/constants.py +40 -0
  71. iic/runtime/context.py +46 -0
  72. iic/runtime/detective.py +72 -0
  73. iic/runtime/hooks.py +85 -0
  74. iic/runtime/incident_engine.py +207 -0
  75. iic/runtime/inprocess.py +350 -0
  76. iic/runtime/ledger.py +120 -0
  77. iic/runtime/monitor.py +155 -0
  78. iic/runtime/pattern_store.py +53 -0
  79. iic/runtime/reconciler.py +139 -0
  80. iic/runtime/scope_config.py +127 -0
  81. iic/runtime/store.py +150 -0
  82. iic/runtime/wrapper.py +28 -0
  83. iic_autoload.pth +1 -0
  84. onboarding/__init__.py +1 -0
  85. onboarding/cli.py +168 -0
  86. onboarding/config_schema.py +62 -0
  87. onboarding/manifest.py +27 -0
  88. onboarding/preflight.py +129 -0
  89. onboarding/provisioner.py +573 -0
  90. onboarding/rollback.py +81 -0
  91. shkit-1.2.0.dist-info/METADATA +239 -0
  92. shkit-1.2.0.dist-info/RECORD +94 -0
  93. shkit-1.2.0.dist-info/WHEEL +4 -0
  94. shkit-1.2.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,112 @@
1
+ """Stage 11 — push incident reports to Teams (optional, informational).
2
+
3
+ The new product *explains and prioritises*; it does not auto-fix, so the card has
4
+ no Approve/Reject buttons — it carries the intelligence (severity-ranked summary,
5
+ root cause, impact, changes, evidence, suggested fix) and a deep link to the run.
6
+
7
+ ``build_incident_card`` is a pure function (no I/O) so the card layout is unit
8
+ tested; ``TeamsNotifier.send`` is the thin transport wrapper.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from iic.models.impact import Severity
14
+ from iic.models.report import IncidentReport
15
+
16
# Sort key used when ordering incidents on the card: a lower rank sorts first,
# so CRITICAL incidents lead. Severities missing from this table are ranked 9
# by the lookup in ``build_incident_card``.
_SEVERITY_RANK = {Severity.CRITICAL: 0, Severity.HIGH: 1, Severity.MEDIUM: 2, Severity.LOW: 3}
# Coloured-circle emoji placed in front of each incident heading.
_SEVERITY_EMOJI = {
    Severity.CRITICAL: "\U0001f534",  # red
    Severity.HIGH: "\U0001f7e0",  # orange
    Severity.MEDIUM: "\U0001f7e1",  # yellow
    Severity.LOW: "\U0001f7e2",  # green
}
23
+
24
+
25
def _antibody_block(state: str, entry: dict | None) -> dict:
    """Return the Antibody Ledger TextBlock for one ``(state, entry)`` pair."""
    seen = entry.get("times_seen", 0) if entry else 0
    if state == "resolved":
        resolution = str((entry or {}).get("resolution", "")).strip()
        text = (f"♻️ **Known issue** (seen {seen}×) — "
                f"Recorded fix that worked for a similar issue: {resolution}")
    elif state == "known_unresolved":
        text = (f"⚠️ **Recurring issue** (seen {seen}×) — "
                "no resolution recorded yet")
    else:  # new
        text = "\U0001f195 **New issue** — no resolution recorded yet"
    return {"type": "TextBlock", "wrap": True, "text": text}


def build_incident_card(reports: list[IncidentReport], run_id: str = "",
                        host: str = "", job_id: str = "", antibodies: dict | None = None) -> dict:
    """Build a single severity-ranked Adaptive Card for a batch of incidents.

    Pure function (no I/O): the result is the Teams webhook payload, a
    ``message`` carrying one Adaptive Card attachment.

    Args:
        reports: Incident reports to render; they are sorted by severity, most
            severe first. An empty list yields a header-only card.
        run_id: Run identifier shown in the subtitle and used in the deep link.
        host: Workspace base URL for the deep link (a trailing ``/`` is stripped).
        job_id: Job identifier for the deep link.
        antibodies: Optional mapping ``pattern_id -> (state, entry)`` from the
            Antibody Ledger, where ``state`` ∈ {"resolved", "known_unresolved",
            "new"}. When a report's pattern is present, a ledger block is added
            ABOVE the machine "Suggested fix" (which always stays — it is the
            deterministic suggestion; the ledger line is the separate,
            human-confirmed answer). Callers that don't pass ``antibodies`` get
            a card without any ledger block.

    Returns:
        ``{"type": "message", "attachments": [...]}`` ready to POST as JSON.
    """
    ranked = sorted(reports, key=lambda r: _SEVERITY_RANK.get(r.impact.severity, 9))
    ledger = antibodies or {}

    body = [
        {"type": "TextBlock", "size": "Large", "weight": "Bolder",
         "text": "\U0001f9e0 Incident Intelligence Report"},
        {"type": "TextBlock", "isSubtle": True, "wrap": True,
         "text": f"Run {run_id} · {len(ranked)} incident(s), highest severity first"},
        {"type": "TextBlock", "separator": True, "text": "---"},
    ]

    for i, r in enumerate(ranked, 1):
        emoji = _SEVERITY_EMOJI.get(r.impact.severity, "")
        body.append({"type": "TextBlock", "weight": "Bolder", "size": "Medium", "separator": True, "wrap": True,
                     "text": f"{emoji} {i}. [{r.impact.severity.value}] {r.task} — {r.dna.failure_type.value}"})
        # round(), not int(): 0.29 * 100 is 28.999… in binary floating point and
        # truncation would display 28%.
        confidence_pct = round(r.diagnosis.confidence * 100)
        body.append({"type": "TextBlock", "wrap": True,
                     "text": f"**Root cause** ({confidence_pct}%): {r.diagnosis.root_cause}"})
        body.append({"type": "FactSet", "facts": [
            {"title": "Business risk", "value": r.impact.business_risk.value},
            {"title": "Blast radius", "value": f"{r.impact.downstream_jobs} jobs · "
                                               f"{r.impact.affected_tables} tables · "
                                               f"{r.impact.dashboard_impact} dashboards"},
            {"title": "Layer", "value": r.dna.affected_layer},
            {"title": "Diagnosed by", "value": r.diagnosis.produced_by},
        ]})
        if r.changes:
            # Cap at three change summaries to keep the card compact.
            body.append({"type": "TextBlock", "wrap": True,
                         "text": "**Recent changes:** " + "; ".join(r.changes[:3])})
        ab = ledger.get(r.dna.pattern_id)
        if ab:
            state, entry = ab
            body.append(_antibody_block(state, entry))
        if r.diagnosis.suggested_fix:
            body.append({"type": "TextBlock", "wrap": True,
                         "text": f"**Suggested fix:** {r.diagnosis.suggested_fix}"})

    actions = []
    # The deep link needs all three parts; without a job id the URL would be
    # ".../#job//run/<id>", so the button is omitted instead.
    if host and run_id and job_id:
        actions.append({"type": "Action.OpenUrl", "title": "\U0001f50d View Run",
                        "url": f"{host.rstrip('/')}/#job/{job_id}/run/{run_id}"})

    content = {
        "$schema": "http://adaptivecards.io/schemas/adaptive-card.json",
        "type": "AdaptiveCard", "version": "1.4", "body": body,
    }
    if actions:
        content["actions"] = actions
    return {"type": "message", "attachments": [
        {"contentType": "application/vnd.microsoft.card.adaptive", "content": content}]}
95
+
96
+
97
class TeamsNotifier:
    """Thin transport wrapper that POSTs an incident card to a Teams webhook."""

    def __init__(self, webhook_url: str = ""):
        # An empty URL turns ``send`` into a no-op that returns False.
        self.webhook_url = webhook_url

    def send(self, reports: list[IncidentReport], run_id: str = "",
             host: str = "", job_id: str = "", antibodies: dict | None = None) -> bool:
        """Render ``reports`` into one card and POST it to the webhook.

        Args:
            reports: Incident reports to include; an empty list sends nothing.
            run_id: Forwarded to ``build_incident_card``.
            host: Forwarded to ``build_incident_card``.
            job_id: Forwarded to ``build_incident_card``.
            antibodies: Optional Antibody Ledger lookup, forwarded to
                ``build_incident_card`` so the ledger block can appear on cards
                sent through the notifier. Omitting it keeps the previous card.

        Returns:
            True when the webhook answered 200 or 202. False when nothing was
            sent, the transport failed, or ``requests`` is unavailable.
        """
        if not (self.webhook_url and reports):
            return False
        card = build_incident_card(reports, run_id=run_id, host=host, job_id=job_id,
                                   antibodies=antibodies)
        try:
            # Lazy import inside the guard: notification is optional, so a
            # missing ``requests`` is reported as "not sent" instead of raising.
            import requests
            resp = requests.post(self.webhook_url, json=card,
                                 headers={"Content-Type": "application/json"}, timeout=15)
            return resp.status_code in (200, 202)
        except Exception:
            return False
iic/report/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """Stage 10 — incident report generation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from iic.report.report_generator import ReportGenerator
6
+
7
+ __all__ = ["ReportGenerator"]
@@ -0,0 +1,67 @@
1
+ """Stage 10 — assemble the final IncidentReport.
2
+
3
+ Pure assembly: it stitches the products of every prior stage into one
4
+ :class:`IncidentReport`, building a one-line executive summary and a timeline.
5
+ No I/O, no LLM — given the same inputs it always produces the same report.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from iic.models.change import ChangeDiffObject
11
+ from iic.models.context import IncidentContextBundle
12
+ from iic.models.diagnosis import DiagnosisResult
13
+ from iic.models.dna import IncidentDNA
14
+ from iic.models.event import NormalizedFailureEvent
15
+ from iic.models.impact import ImpactScore
16
+ from iic.models.report import IncidentReport
17
+ from iic.models.routing import RoutingDecision
18
+
19
+
20
class ReportGenerator:
    """Stitch the outputs of the earlier stages into one ``IncidentReport``."""

    def build(
        self,
        incident_id: str,
        event: NormalizedFailureEvent,
        dna: IncidentDNA,
        impact: ImpactScore,
        diagnosis: DiagnosisResult,
        routing: RoutingDecision | None = None,
        context: IncidentContextBundle | None = None,
        change_diff: ChangeDiffObject | None = None,
        evidence: list[str] | None = None,
    ) -> IncidentReport:
        """Assemble the report; ``context`` is accepted but not read here."""
        headline = self._summary(event, dna, impact)
        evidence_items = list(evidence) if evidence else []
        change_summaries = change_diff.summaries() if change_diff else []
        history = self._timeline(event, dna, diagnosis)
        return IncidentReport(
            incident_id=incident_id,
            pipeline=event.pipeline,
            task=event.task,
            timestamp=event.timestamp,
            summary=headline,
            dna=dna,
            impact=impact,
            diagnosis=diagnosis,
            routing=routing,
            evidence=evidence_items,
            changes=change_summaries,
            timeline=history,
        )

    @staticmethod
    def _summary(event: NormalizedFailureEvent, dna: IncidentDNA, impact: ImpactScore) -> str:
        """Return the one-line executive summary for the incident."""
        # The failure type is shown as lower-case words, e.g. SCHEMA_DRIFT -> "schema drift".
        failure_words = dna.failure_type.value.replace("_", " ").lower()
        where = f"{event.pipeline}.{event.task} ({dna.affected_layer} layer)"
        blast = (f"{impact.downstream_jobs} downstream job(s), "
                 f"{impact.dashboard_impact} dashboard(s) at risk")
        return (f"{impact.severity.value} {failure_words} in {where} — {blast}; "
                f"business risk {impact.business_risk.value}.")

    @staticmethod
    def _timeline(event: NormalizedFailureEvent, dna: IncidentDNA, diagnosis: DiagnosisResult) -> list[str]:
        """Return the ordered timeline lines; the first two are optional."""
        entries = []
        if event.timestamp:
            entries.append(f"{event.timestamp} — task '{event.task}' failed")
        if dna.root_signal:
            entries.append(f"signal detected — {dna.root_signal}")
        entries.extend([
            f"classified as {dna.failure_type.value} ({dna.confidence_signature} confidence)",
            f"diagnosed by {diagnosis.produced_by}",
        ])
        return entries
@@ -0,0 +1,7 @@
1
+ """Stage 8 — model routing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from iic.routing.router import IncidentModelRouter
6
+
7
+ __all__ = ["IncidentModelRouter"]
iic/routing/router.py ADDED
@@ -0,0 +1,42 @@
1
+ """Stage 8 — decide which model (if any) the diagnosis stage uses.
2
+
3
+ Deterministic and auditable: impact + cache state + DNA confidence pick the tier,
4
+ never the LLM. The goal is to spend the expensive model only where it earns its
5
+ cost — high-severity or low-confidence incidents — and to skip the LLM entirely
6
+ when the answer is already known (cache) or structurally obvious (a derived
7
+ dependency failure).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from iic.models.dna import FailureType, IncidentDNA
13
+ from iic.models.impact import ImpactScore, Severity
14
+ from iic.models.routing import ModelTier, RoutingDecision
15
+
16
+
17
class IncidentModelRouter:
    """Pick the model tier for the diagnosis stage from impact, cache state and DNA confidence."""

    def __init__(self, lightweight_model: str, powerful_model: str):
        self.lightweight_model, self.powerful_model = lightweight_model, powerful_model

    def route(self, dna: IncidentDNA, impact: ImpactScore, cache_hit: bool = False) -> RoutingDecision:
        """Return the routing decision for one incident.

        The checks run in order: cache hit, derived dependency failure, then
        severity / confidence.
        """
        # Already-known pattern: no model call at all.
        if cache_hit:
            return RoutingDecision(ModelTier.NONE, reason="Known pattern — cached resolution replayed (zero tokens).",
                                   cache_hit=True)

        # A derived dependency failure needs no LLM: the real cause is the
        # upstream incident, which gets its own report.
        if dna.failure_type == FailureType.DEPENDENCY:
            return RoutingDecision(ModelTier.NONE,
                                   reason="Derived from an upstream failure — diagnosed deterministically.")

        # Both remaining tiers report the same audit string.
        why = f"severity={impact.severity.value}, dna_confidence={dna.confidence_signature}"

        # High stakes or low confidence → powerful model; everything else → cheap model.
        escalate = impact.severity in (Severity.HIGH, Severity.CRITICAL) or dna.confidence_signature == "low"
        if escalate:
            tier, chosen = ModelTier.POWERFUL, self.powerful_model
        else:
            tier, chosen = ModelTier.LIGHTWEIGHT, self.lightweight_model
        return RoutingDecision(tier, model=chosen, reason=why)
@@ -0,0 +1,10 @@
1
+ """IIC runtime package.
2
+
3
+ Intentionally EMPTY of eager imports: the self-arming `.pth` does
4
+ ``import iic.runtime.bootstrap``, which imports this package first. If this module
5
+ eagerly imported the engine, every interpreter (incl. a serverless kernel boot)
6
+ would pull the whole engine at startup — slow, and a kernel-restart risk. Import
7
+ the engine lazily from :mod:`iic.runtime.incident_engine` where you need it.
8
+ """
9
+
10
+ from __future__ import annotations
iic/runtime/_sql.py ADDED
@@ -0,0 +1,11 @@
1
+ """Tiny re-export so IIC runtime has one safe SQL-literal helper.
2
+
3
+ Reuses the audited escaping from the shared library rather than re-implementing
4
+ string interpolation (the original codebase's #1 injection foot-gun).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from healing_kit.utils.sql_safety import sql_literal as lit
10
+
11
+ __all__ = ["lit"]
@@ -0,0 +1,48 @@
1
+ """Environment-driven config for the v4 Databricks-native agent.
2
+
3
+ All knobs come from env vars so the same wheel behaves correctly whether it runs
4
+ inside a job, a notebook, or the reconciler. Sensible free defaults: LLM off,
5
+ Postgres optional.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ from dataclasses import dataclass
12
+
13
+
14
def _b(name: str, default: bool) -> bool:
    """Read env var ``name`` as a boolean; "1", "true", "yes", "on" (any case) are true."""
    return os.environ.get(name, str(default)).strip().lower() in ("1", "true", "yes", "on")


def _int_env(name: str, default: int) -> int:
    """Read env var ``name`` as an int, falling back to ``default``.

    An unset, blank or non-numeric value yields ``default`` instead of raising,
    so a typo in one knob cannot abort config loading.
    """
    raw = os.environ.get(name, "").strip()
    if not raw:
        return default
    try:
        return int(raw)
    except ValueError:
        return default


@dataclass
class AgentConfig:
    """Environment-driven settings for the agent; see ``from_env``."""

    databricks_host: str = ""
    databricks_token: str = ""                  # usually the ambient cluster token
    teams_webhook_url: str = ""
    slack_webhook_url: str = ""
    llm_enabled: bool = False
    lightweight_model: str = "databricks-meta-llama-3-3-70b-instruct"
    powerful_model: str = "databricks-claude-opus-4-8"
    pg_dsn: str = ""                            # empty → no persistence/dedup (wrapper-only)
    poll_interval_seconds: int = 90             # reconciler
    lookback_hours: int = 24                    # reconciler backfill window
    max_jobs_scanned: int = 50
    notify: bool = True

    @classmethod
    def from_env(cls) -> "AgentConfig":
        """Build a config from environment variables.

        Every fallback is taken from the field default declared above, so the
        defaults live in exactly one place. Integer knobs that are blank or
        malformed fall back to their default rather than raising.
        """
        return cls(
            databricks_host=os.environ.get("DATABRICKS_HOST", "").rstrip("/"),
            databricks_token=os.environ.get("DATABRICKS_TOKEN", ""),
            teams_webhook_url=os.environ.get("TEAMS_WEBHOOK_URL", ""),
            slack_webhook_url=os.environ.get("SLACK_WEBHOOK_URL", ""),
            llm_enabled=_b("LLM_ENABLED", cls.llm_enabled),
            lightweight_model=os.environ.get("LIGHTWEIGHT_MODEL", cls.lightweight_model),
            powerful_model=os.environ.get("POWERFUL_MODEL", cls.powerful_model),
            pg_dsn=os.environ.get("POSTGRES_DSN", ""),
            poll_interval_seconds=_int_env("POLL_INTERVAL_SECONDS", cls.poll_interval_seconds),
            lookback_hours=_int_env("LOOKBACK_HOURS", cls.lookback_hours),
            max_jobs_scanned=_int_env("MAX_JOBS_SCANNED", cls.max_jobs_scanned),
            notify=_b("IIC_NOTIFY", cls.notify),
        )
@@ -0,0 +1,70 @@
1
+ """Shared analysis path for the v4 agent — used by both the wrapper and the reconciler.
2
+
3
+ Given one or more already-known failure events, run the unchanged v2
4
+ ``IncidentEngine``, dedup by fingerprint, persist, and (the engine itself) notify.
5
+ Keeping this in one place means the two detection sources produce identical
6
+ incidents and can't diverge.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from iic.ingestion.static_source import StaticFailureSource
12
+ from iic.models.event import NormalizedFailureEvent
13
+ from iic.runtime.agent_config import AgentConfig
14
+ from iic.runtime.store import fingerprint
15
+
16
+
17
def analyze_events(events: list[NormalizedFailureEvent], *, run_info: dict, client,
                   config: AgentConfig, store) -> list:
    """Run the engine over the not-yet-seen events and persist each resulting report.

    Events whose fingerprint the store has already seen are dropped. When
    nothing is left, the engine is not built and ``[]`` is returned.
    """
    pending = []
    digest_for_task = {}
    for event in events:
        digest = fingerprint(event.run_id, event.task, event.error_message)
        if not store.already_seen(digest):
            pending.append(event)
            digest_for_task[event.task] = digest

    if not pending:
        return []

    outcome = _build_engine(pending, run_info, client, config, store).run()

    for report in outcome.reports:
        # Reports for tasks that had no input event get a fingerprint derived
        # from the run id and task alone.
        digest = digest_for_task.get(report.task) or fingerprint(_run_id(run_info), report.task, "")
        store.save_incident(
            incident_id=report.incident_id,
            run_id=_run_id(run_info),
            task_key=report.task,
            fingerprint=digest,
            pattern_id=report.dna.pattern_id,
            failure_type=report.dna.failure_type.value,
            severity=report.impact.severity.value,
            root_cause=report.diagnosis.root_cause,
            confidence=report.diagnosis.confidence,
            produced_by=report.diagnosis.produced_by,
            report_json=report.to_dict(),
        )
    return outcome.reports
43
+
44
+
45
def _build_engine(events, run_info, client, config: AgentConfig, store):
    """Wire an ``IncidentEngine`` around ``events`` using settings from ``config``."""
    # Engine pieces are imported here, at call time, rather than at module import.
    from iic.change.change_detector import ChangeDetector
    from iic.context.context_builder import ContextBuilder
    from iic.dependency.dependency_analyzer import DependencyAnalyzer
    from iic.diagnosis.diagnosis_engine import DiagnosisEngine
    from iic.runtime.incident_engine import EngineConfig, IncidentEngine

    # The job id is taken from the first event of the batch.
    first_job_id = events[0].job_id if events else ""
    engine_config = EngineConfig(
        job_id=first_job_id,
        host=config.databricks_host,
        lightweight_model=config.lightweight_model,
        powerful_model=config.powerful_model,
        teams_webhook=config.teams_webhook_url,
        notify=config.notify,
    )

    # The diagnosis stage only receives a client when the LLM is enabled.
    if config.llm_enabled:
        llm_client = client
    else:
        llm_client = None

    failure_source = StaticFailureSource(events, run_info=run_info)
    context_stage = ContextBuilder(client=client)
    dependency_stage = DependencyAnalyzer(client=client)
    change_stage = ChangeDetector(client=client)
    diagnosis_stage = DiagnosisEngine(client=llm_client)
    return IncidentEngine(
        engine_config,
        source=failure_source,
        context_builder=context_stage,
        dependency_analyzer=dependency_stage,
        change_detector=change_stage,
        diagnosis_engine=diagnosis_stage,
        pattern_store=store,
    )
67
+
68
+
69
def _run_id(run_info: dict) -> str:
    """Return ``run_info["run_id"]`` as a string, or "" when it is missing or ``run_info`` is empty/None."""
    info = run_info if run_info else {}
    return str(info.get("run_id", ""))
@@ -0,0 +1,100 @@
1
+ """Antibody Ledger (runtime side) — per-tenant memory of failure patterns and the
2
+ human-recorded resolutions that worked.
3
+
4
+ The runtime is **read-only** on the shared ``antibodies.yaml`` (humans edit it in
5
+ git via the GitHub web editor; the sync workflows copy it to/from the Volume). The
6
+ only thing the runtime *writes* is a tiny per-occurrence marker file into
7
+ ``.iic_pending/`` — mirroring the proven ``.iic_seen/`` dedup-marker mechanism, so
8
+ two concurrent processes never contend on a single shared file (one file per
9
+ process per occurrence). The pull workflow folds those markers into the git ledger.
10
+
11
+ HARD RULES (identical to the rest of the failure path):
12
+ * fail-open everywhere — any read/write error → behave exactly as today;
13
+ * never raise into the workload;
14
+ * lazy heavy imports (yaml) inside the functions that need them.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ import os
21
+ import re
22
+
23
+ from iic.runtime.constants import ANTIBODIES_FILENAME, DEFAULT_CONFIG_PATH, PENDING_DIRNAME
24
+
25
+ # Redaction patterns for the human-readable ``example`` sub-field. The ledger KEY
26
+ # is the already-generic pattern_id; the example is only context for the human
27
+ # writing a resolution, so it must never carry a credential or PII.
28
# ``keyword = value`` / ``keyword: value`` pairs. The keyword may be followed by
# a closing quote (JSON-style ``"password": "x"``) and the value may be a quoted
# string containing spaces; in both cases the whole value is consumed.
_SECRET_RE = re.compile(
    r"(?i)(token|secret|password|passwd|pwd|api[_-]?key|key)"
    r"[\"']?\s*[=:]\s*"
    r"(?:\"[^\"]*\"|'[^']*'|\S+)"
)
_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}")
# ``scheme://user:password@`` credentials embedded in a URL.
_URL_CRED_RE = re.compile(r"([A-Za-z][A-Za-z0-9+.\-]*://)[^\s:/@]+:[^\s:/@]+@")


def sanitize_example(short_error: str) -> str:
    """First line only, ≤120 chars, with obvious secrets / emails / URL creds redacted.

    Args:
        short_error: Raw error text; ``None`` or empty is treated as "".

    Returns:
        The redacted first line, truncated to 120 characters. Any error while
        sanitizing yields "" (fail-open: never raises).
    """
    try:
        lines = (short_error or "").strip().splitlines()
        text = lines[0] if lines else ""
        # URL credentials first, so the e-mail pattern never sees ``user:pw@host``.
        text = _URL_CRED_RE.sub(r"\1***@", text)
        text = _SECRET_RE.sub(r"\1=***", text)
        text = _EMAIL_RE.sub("***", text)
        # Truncate last so a secret is never cut in half before it is matched.
        return text[:120]
    except Exception:
        return ""
44
+
45
+
46
def _base(base_dir: str) -> str:
    """Return ``base_dir`` when it is non-empty, else the directory of ``DEFAULT_CONFIG_PATH``."""
    if base_dir:
        return base_dir
    return os.path.dirname(DEFAULT_CONFIG_PATH)
48
+
49
+
50
def load_ledger(base_dir: str) -> dict:
    """Read ``{base_dir}/antibodies.yaml`` (the volume_path anchor). Absent/corrupt → {}.

    Args:
        base_dir: Directory holding the ledger; empty falls back to the
            default resolved by ``_base``.

    Returns:
        The parsed top-level mapping, or ``{}`` when the file is missing,
        unreadable, not valid YAML, or its top level is not a mapping.
        Never raises.
    """
    try:
        path = os.path.join(_base(base_dir), ANTIBODIES_FILENAME)
        if not os.path.exists(path):
            return {}
        import yaml  # lazy: kept out of module import time
        # Context manager closes the handle even if parsing fails; the encoding
        # is pinned so the result does not depend on the process locale.
        with open(path, encoding="utf-8") as fh:
            data = yaml.safe_load(fh) or {}
        return data if isinstance(data, dict) else {}
    except Exception:
        return {}
61
+
62
+
63
def lookup(ledger: dict, pattern_id: str):
    """Classify ``pattern_id`` against the ledger.

    Returns ``(state, entry)``: ``("new", None)`` when the ledger is not a dict
    or holds no dict entry for the pattern; otherwise ``"resolved"`` when the
    entry has a non-blank ``resolution`` and ``"known_unresolved"`` when not.
    """
    if not isinstance(ledger, dict):
        return "new", None
    entry = ledger.get(pattern_id)
    if not isinstance(entry, dict):
        return "new", None
    has_resolution = bool(str(entry.get("resolution", "") or "").strip())
    if has_resolution:
        return "resolved", entry
    return "known_unresolved", entry
70
+
71
+
72
def record_occurrence(base_dir: str, pattern_id: str, example: str) -> None:
    """Write one small JSON marker for this occurrence into ``{base_dir}/.iic_pending/``.

    The shared ledger file is never touched. Each call writes its own uniquely
    named file. The whole body is fail-open: an empty ``pattern_id`` or any
    error is a silent no-op.
    """
    try:
        if not pattern_id:
            return
        import time
        import uuid

        pending_dir = os.path.join(_base(base_dir), PENDING_DIRNAME)
        os.makedirs(pending_dir, exist_ok=True)

        # File name: filesystem-safe pattern id (max 80 chars), whole-second
        # timestamp, and a short random suffix.
        slug = re.sub(r"[^A-Za-z0-9_.\-]", "_", str(pattern_id))[:80]
        stamp = time.time()
        marker_path = os.path.join(pending_dir, f"{slug}__{int(stamp)}_{uuid.uuid4().hex[:8]}.json")

        record = {"pattern_id": str(pattern_id), "example": example or "", "ts": _iso(stamp)}
        serialized = json.dumps(record)
        with open(marker_path, "w") as out:
            out.write(serialized)
    except Exception:
        pass
93
+
94
+
95
def _iso(epoch: float) -> str:
    """Format a Unix timestamp as UTC ``YYYY-MM-DDTHH:MM:SSZ``; any failure yields ""."""
    try:
        from datetime import datetime, timezone

        moment = datetime.fromtimestamp(epoch, timezone.utc)
        return moment.strftime("%Y-%m-%dT%H:%M:%SZ")
    except Exception:
        return ""
@@ -0,0 +1,157 @@
1
+ """Self-arming tripwire — imported by ``iic_autoload.pth`` at interpreter startup.
2
+
3
+ "Airbag, not camera": at every Python startup this arms in-process failure hooks
4
+ in microseconds, then does nothing. The engine runs ONLY at the moment an
5
+ unhandled exception occurs. On a successful run it leaves zero trace.
6
+
7
+ HARD RULES enforced here:
8
+ * stdlib-only imports at module load (os, sys, threading, builtins). The engine,
9
+ requests, yaml, etc. are imported lazily inside the failure handler.
10
+ * fail-open everywhere — a .pth import error would print noise at the start of
11
+ every process, so the whole module body is wrapped in try/except.
12
+ * never alter the user's failure — every hook runs our handler first (itself
13
+ fully guarded) then ALWAYS calls the previous hook, preserving the original
14
+ traceback/exit behavior.
15
+ * idempotent (``_ARMED``) and re-entrant.
16
+ * kill switch: ``IIC_DISABLE=1`` disarms entirely.
17
+ """
18
+
19
+ try: # the entire module must never raise at import (it runs in every process)
20
+ import os
21
+ import sys
22
+
23
+ _ARMED = False
24
+ _IPY_ARMED = False # True once the IPython/notebook hook owns failure reporting
25
+
26
def _handle_failure(exc_type, exc, tb):
    """Hand the exception to the in-process responder, swallowing any error it raises.

    The responder module is imported here, at failure time, not at startup.
    """
    try:
        import iic.runtime.inprocess as _responder

        _responder.process_local_failure(exc_type, exc, tb)
    except Exception:
        # monitoring must never break or slow the workload
        pass
33
+
34
+ # ── IPython / notebook arming (cells bypass sys.excepthook) ──
35
+
36
def _arm_ipython():
    """Arm notebook-cell failure capture if a live IPython shell exists.

    Databricks notebook cells do NOT route exceptions through
    ``set_custom_exc`` (Databricks wraps cell execution itself), so the
    reliable hook is the ``post_run_cell`` event, which fires after every cell
    with ``result.error_in_exec`` set on failure. We register BOTH (custom_exc
    for plain Jupyter, post_run_cell for Databricks); the per-process
    fingerprint dedup ensures at most one incident if both fire.

    Returns:
        True if the shell is (or already was) armed. False when there is no
        shell, IPython is unavailable, or NEITHER hook could be registered.
        In that last case the shell is not marked armed and ``_IPY_ARMED``
        stays unset, so ``sys.excepthook`` keeps reporting failures instead
        of deferring to a notebook hook that does not exist.
    """
    global _IPY_ARMED
    try:
        import IPython
        shell = IPython.get_ipython()
        if shell is None:
            return False
        if getattr(shell, "_iic_armed", False):
            return True

        def _custom_exc(shell, etype, evalue, tb, tb_offset=None):
            # Report first, then show the traceback as IPython normally would.
            _handle_failure(etype, evalue, tb)
            return shell.showtraceback((etype, evalue, tb), tb_offset=tb_offset)

        def _post_run_cell(result):
            try:
                err = getattr(result, "error_in_exec", None)
                if err is not None:
                    _handle_failure(type(err), err, getattr(err, "__traceback__", None))
            except Exception:
                pass

        hooked = False
        try:
            shell.set_custom_exc((BaseException,), _custom_exc)
            hooked = True
        except Exception:
            pass
        try:
            shell.events.register("post_run_cell", _post_run_cell)  # the Databricks-reliable hook
            hooked = True
        except Exception:
            pass

        if not hooked:
            # Nothing was installed: report "not armed" so the caller may retry
            # and the excepthook path stays the active reporter.
            return False

        shell._iic_armed = True
        _IPY_ARMED = True  # excepthook now defers to the notebook hook (avoid double cards)
        return True
    except Exception:
        return False
82
+
83
def _install_ipython_watcher():
    """Arm the notebook hook now, or keep trying from a background thread.

    IMPORTANT: do NOT wrap ``builtins.__import__`` — doing so fires on every
    import during kernel boot and re-imports IPython re-entrantly, which can
    deadlock the kernel startup (ERROR_RESTART_PYTHON). Instead a daemon thread
    polls for the shell, arms it once, and exits; the polling never runs on
    the calling thread.
    """
    if _arm_ipython():
        return  # shell already live → armed now
    try:
        import threading
        import time

        def _poll():
            # 150 attempts, 0.2 s apart (~30 s), then give up; the script-task
            # path is still covered by sys.excepthook.
            attempts_left = 150
            while attempts_left > 0:
                attempts_left -= 1
                try:
                    if _arm_ipython():
                        return
                except Exception:
                    pass
                time.sleep(0.2)

        watcher = threading.Thread(target=_poll, name="iic-ipython-arm", daemon=True)
        watcher.start()
    except Exception:
        pass
110
+
111
+ # ── main arming ──
112
+
113
def activate():
    """Install the failure hooks for this process, at most once.

    Does nothing when already armed, when ``IIC_DISABLE`` is "1", or when
    ``DB_IS_DRIVER`` is set to anything other than "TRUE" (case-insensitive).
    Otherwise it replaces ``sys.excepthook``, starts the IPython arming via
    ``_install_ipython_watcher``, and replaces ``threading.excepthook``. Each
    replacement hook calls the hook that was installed before it.
    """
    global _ARMED
    if _ARMED or os.environ.get("IIC_DISABLE") == "1":
        return
    # Never arm Spark executor processes; only the driver. On serverless the
    # var is absent, which means proceed.
    db_driver = os.environ.get("DB_IS_DRIVER")
    if db_driver is not None and db_driver.upper() != "TRUE":
        return
    # Flag is set before the hooks are installed, so a second call returns early.
    _ARMED = True

    # sys.excepthook (script tasks / top-level) — run handler, then the previous hook.
    # The previous hook must be captured BEFORE sys.excepthook is reassigned below.
    _prev_excepthook = sys.excepthook

    def _excepthook(exc_type, exc, tb):
        # In a notebook/job a live IPython shell exists and post_run_cell is the
        # single source of truth — the job wrapper ALSO hits sys.excepthook with
        # a re-raised exception, which would double-report (and misclassify). So
        # only report here when IPython is NOT armed (pure script / non-IPython).
        if not _IPY_ARMED:
            _handle_failure(exc_type, exc, tb)
        # Always delegate, so the original traceback/exit behavior is preserved.
        return _prev_excepthook(exc_type, exc, tb)

    sys.excepthook = _excepthook

    # notebook cells
    _install_ipython_watcher()

    # worker-thread failures (best-effort; threading.excepthook is 3.8+)
    try:
        import threading
        _prev_thread_hook = threading.excepthook

        def _thread_hook(args):
            # Unlike _excepthook, this reports regardless of _IPY_ARMED.
            _handle_failure(args.exc_type, args.exc_value, args.exc_traceback)
            return _prev_thread_hook(args)

        threading.excepthook = _thread_hook
    except Exception:
        pass
153
+
154
+ activate()
155
+
156
+ except Exception:
157
+ pass