shkit 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. healing_kit/__init__.py +3 -0
  2. healing_kit/auth.py +79 -0
  3. healing_kit/clients/__init__.py +1 -0
  4. healing_kit/clients/databricks_client.py +183 -0
  5. healing_kit/clients/teams_client.py +128 -0
  6. healing_kit/models/__init__.py +1 -0
  7. healing_kit/models/diagnosis.py +45 -0
  8. healing_kit/models/events.py +30 -0
  9. healing_kit/models/evidence.py +83 -0
  10. healing_kit/runtime/__init__.py +6 -0
  11. healing_kit/runtime/approval.py +141 -0
  12. healing_kit/runtime/maintenance.py +52 -0
  13. healing_kit/services/__init__.py +1 -0
  14. healing_kit/services/cache_service.py +120 -0
  15. healing_kit/services/circuit_breaker.py +114 -0
  16. healing_kit/services/context_agent.py +127 -0
  17. healing_kit/services/dependency_graph.py +141 -0
  18. healing_kit/services/diagnosis_engine.py +165 -0
  19. healing_kit/services/identity.py +61 -0
  20. healing_kit/services/model_router.py +52 -0
  21. healing_kit/services/query_guard.py +168 -0
  22. healing_kit/services/resolution_verifier.py +100 -0
  23. healing_kit/services/token_budget.py +137 -0
  24. healing_kit/utils/__init__.py +1 -0
  25. healing_kit/utils/error_hash.py +15 -0
  26. healing_kit/utils/hmac_tokens.py +86 -0
  27. healing_kit/utils/sql_safety.py +84 -0
  28. iic/__init__.py +51 -0
  29. iic/__main__.py +18 -0
  30. iic/_console.py +235 -0
  31. iic/_doctor.py +143 -0
  32. iic/change/__init__.py +7 -0
  33. iic/change/change_detector.py +154 -0
  34. iic/context/__init__.py +7 -0
  35. iic/context/context_builder.py +117 -0
  36. iic/dependency/__init__.py +7 -0
  37. iic/dependency/dependency_analyzer.py +93 -0
  38. iic/diagnosis/__init__.py +7 -0
  39. iic/diagnosis/diagnosis_engine.py +183 -0
  40. iic/dna/__init__.py +7 -0
  41. iic/dna/dna_builder.py +184 -0
  42. iic/impact/__init__.py +7 -0
  43. iic/impact/impact_engine.py +102 -0
  44. iic/ingestion/__init__.py +14 -0
  45. iic/ingestion/base.py +21 -0
  46. iic/ingestion/databricks_source.py +98 -0
  47. iic/ingestion/static_source.py +23 -0
  48. iic/ingestion/webhook_source.py +39 -0
  49. iic/models/__init__.py +44 -0
  50. iic/models/change.py +77 -0
  51. iic/models/context.py +46 -0
  52. iic/models/diagnosis.py +37 -0
  53. iic/models/dna.py +77 -0
  54. iic/models/event.py +78 -0
  55. iic/models/impact.py +60 -0
  56. iic/models/report.py +88 -0
  57. iic/models/routing.py +41 -0
  58. iic/notify/__init__.py +7 -0
  59. iic/notify/teams_notifier.py +112 -0
  60. iic/report/__init__.py +7 -0
  61. iic/report/report_generator.py +67 -0
  62. iic/routing/__init__.py +7 -0
  63. iic/routing/router.py +42 -0
  64. iic/runtime/__init__.py +10 -0
  65. iic/runtime/_sql.py +11 -0
  66. iic/runtime/agent_config.py +48 -0
  67. iic/runtime/agent_runtime.py +70 -0
  68. iic/runtime/antibodies.py +100 -0
  69. iic/runtime/bootstrap.py +157 -0
  70. iic/runtime/constants.py +40 -0
  71. iic/runtime/context.py +46 -0
  72. iic/runtime/detective.py +72 -0
  73. iic/runtime/hooks.py +85 -0
  74. iic/runtime/incident_engine.py +207 -0
  75. iic/runtime/inprocess.py +350 -0
  76. iic/runtime/ledger.py +120 -0
  77. iic/runtime/monitor.py +155 -0
  78. iic/runtime/pattern_store.py +53 -0
  79. iic/runtime/reconciler.py +139 -0
  80. iic/runtime/scope_config.py +127 -0
  81. iic/runtime/store.py +150 -0
  82. iic/runtime/wrapper.py +28 -0
  83. iic_autoload.pth +1 -0
  84. onboarding/__init__.py +1 -0
  85. onboarding/cli.py +168 -0
  86. onboarding/config_schema.py +62 -0
  87. onboarding/manifest.py +27 -0
  88. onboarding/preflight.py +129 -0
  89. onboarding/provisioner.py +573 -0
  90. onboarding/rollback.py +81 -0
  91. shkit-1.2.0.dist-info/METADATA +239 -0
  92. shkit-1.2.0.dist-info/RECORD +94 -0
  93. shkit-1.2.0.dist-info/WHEEL +4 -0
  94. shkit-1.2.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,40 @@
1
+ """Shared constants so the runtime, the publish workflow, and the docs agree.
2
+
3
+ The publish workflow uploads ``config.yaml`` next to the wheel in the tenant
4
+ Volume; the runtime reads it from the same place (override with ``IIC_CONFIG_PATH``).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
# Default Volume location of the tenant config file. The runtime reads the config
# from this path unless the IIC_CONFIG_PATH environment variable overrides it.
DEFAULT_CONFIG_PATH = "/Volumes/dev_catalog/default/libs/config.yaml"

# Name of the "mute" sentinel: a tenant can be muted without republishing by
# dropping a file with this name next to the config.
DISABLED_SENTINEL_NAME = "DISABLED"

# Antibody Ledger (per-tenant issue memory) — lives next to config.yaml in the
# Volume. The runtime READS this file; it never writes it (humans edit it in git,
# workflows sync it). New occurrences are instead recorded as tiny per-process
# marker files under PENDING_DIRNAME, mirroring the .iic_seen/ dedup mechanism.
ANTIBODIES_FILENAME = "antibodies.yaml"
PENDING_DIRNAME = ".iic_pending"

# Handler network budgets, in seconds. The three timeouts below add up to 10,
# which is the intended ceiling for the whole handler (keep total ≤ ~10s).
TEAMS_TIMEOUT = 5
GITHUB_DISPATCH_TIMEOUT = 3
ENRICH_TIMEOUT = 2
# Per-call budget (seconds) for optional in-process enrichment (pat-gated);
# total added ≤ ~4s.
# NOTE(review): same value as ENRICH_TIMEOUT — confirm both names are still needed.
ENRICH_BUDGET = 2

# ── Secret-scope configuration (the product-model primary config source) ──
# A customer creates ONE Databricks secret scope with this conventional name and
# the fixed keys below. Override the scope name via the IIC_SECRET_SCOPE env var.
DEFAULT_SECRET_SCOPE = "iic"

# Key names looked up inside the secret scope.
SECRET_KEY_TEAMS_WEBHOOK = "teams_webhook"  # required
SECRET_KEY_VOLUME_PATH = "volume_path"  # required — anchors ledger/markers/sentinel
SECRET_KEY_HOST = "host"  # optional — "View Run" links
SECRET_KEY_PAT = "pat"  # optional — enables best-effort enrichment
SECRET_KEY_GITHUB_REPO = "github_repo"  # optional — incident archiving
SECRET_KEY_GITHUB_TOKEN = "github_dispatch_token"  # optional
SECRET_KEY_DEDUP_TTL = "dedup_ttl_seconds"  # optional — default 300
iic/runtime/context.py ADDED
@@ -0,0 +1,46 @@
1
+ """Session-scoped context cache (initialized by auto-bootstrap).
2
+
3
+ On a long-lived cluster, the same notebook source / job metadata is fetched
4
+ repeatedly across failures. ``init_context_cache()`` turns on an in-process cache
5
+ so those lookups are memoized for the cluster session.
6
+
7
+ Crucially the cache is **inactive until initialized**, so off-cluster (tests,
8
+ local) every fetch stays fresh — no cross-test state leaks.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
# Process-wide state. Nothing is memoized while ``_active`` is False, so code
# that never initializes the cache always sees fresh loader results.
_active = False
_cache: dict = {}


def init_context_cache() -> None:
    """Switch the process-wide context cache on, starting from an empty store.

    Safe to call repeatedly: each call empties the store and leaves the cache
    enabled.
    """
    global _active
    _cache.clear()
    _active = True


def is_active() -> bool:
    """Report whether the context cache is currently memoizing lookups."""
    return _active


def reset_context_cache() -> None:
    """Switch the cache off and drop every stored entry (test isolation)."""
    global _active
    _active = False
    _cache.clear()


def cached(key: str, loader):
    """Fetch the value for ``key``, memoizing it only while the cache is active.

    ``loader`` is a zero-argument callable. With the cache inactive it is called
    on every request and nothing is stored; with the cache active it is called
    once per key and its result (including falsy results) is reused afterwards.
    """
    if not _active:
        return loader()
    try:
        hit = _cache[key]
    except KeyError:
        # Miss: fall through so the loader runs outside the handler and any
        # exception it raises is not chained to this KeyError.
        pass
    else:
        return hit
    produced = loader()
    _cache[key] = produced
    return produced
@@ -0,0 +1,72 @@
1
+ """Payload-only incident archiver — runs in GitHub Actions on a repository_dispatch
2
+ after a failure and renders a GitHub issue PURELY from the self-contained
3
+ ``client_payload`` the agent sent.
4
+
5
+ In the product model GitHub never reaches into a customer's Databricks workspace:
6
+ there are no Databricks API calls and no customer credentials here. Any post-mortem
7
+ enrichment now happens in-process (pat-gated) before the agent dispatches, so the
8
+ payload is already self-sufficient. Best-effort; the workflow must not fail on a
9
+ rendering error.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import os
16
+ import sys
17
+
18
+
19
def _severity(p: dict) -> str:
    """Pick the payload's severity label.

    The nested ``impact.severity`` wins over the top-level ``severity``; when
    neither holds a truthy value the result is ``"LOW"``.
    """
    impact = p.get("impact", {}) or {}
    for candidate in (impact.get("severity"), p.get("severity")):
        if candidate:
            return candidate
    return "LOW"
21
+
22
+
23
def _issue_body(p: dict) -> str:
    """Render the Markdown body of the GitHub issue from the payload alone.

    Sections, in order: incident header, root cause / suggested fix, the
    antibody line (only when the antibody has a state), up to eight recent
    changes (only when there are any), and the ``context`` object as JSON
    truncated to 2000 characters.
    """
    out = []
    out.append(f"**Incident:** `{p.get('incident_id', '?')}` · **Severity:** {_severity(p)}")
    out.append(f"**Failure type:** {p.get('failure_type', '?')} · **Pattern:** `{p.get('pattern_id', '?')}`")
    out.append("")
    out.append(f"**Root cause:** {p.get('root_cause', 'n/a')}")
    out.append(f"**Suggested fix:** {p.get('suggested_fix', 'n/a')}")

    antibody = p.get("antibody") or {}
    if antibody.get("state"):
        seen = ""
        if antibody.get("times_seen"):
            seen = f" (seen {antibody['times_seen']}×)"
        fix = ""
        if antibody.get("resolution"):
            fix = f" — recorded fix: {antibody['resolution']}"
        out.extend(["", f"**Antibody:** {antibody.get('state')}{seen}{fix}"])

    recent = p.get("changes") or []
    if recent:
        out.extend(["", "## Recent changes (since last success)"])
        # Cap the list so a noisy change history cannot bloat the issue.
        out.extend(f"- {change}" for change in recent[:8])

    context_json = json.dumps(p.get("context", {}), indent=2)[:2000]
    out.extend(["", "## Context", "```json", context_json, "```"])
    return "\n".join(out)
42
+
43
+
44
def _create_issue(gh_repo: str, gh_token: str, payload: dict) -> str:
    """POST a new ``iic-incident`` issue to ``gh_repo`` and return its URL.

    Returns the issue's ``html_url`` when GitHub answers 200 or 201, otherwise
    an empty string. ``requests`` is imported lazily so the module can be
    imported without it.
    """
    import requests

    title = f"[IIC] {_severity(payload)} — {payload.get('failure_type', 'incident')}"
    endpoint = f"https://api.github.com/repos/{gh_repo}/issues"
    auth_headers = {"Authorization": f"Bearer {gh_token}", "Accept": "application/vnd.github+json"}
    issue = {"title": title, "body": _issue_body(payload), "labels": ["iic-incident"]}

    response = requests.post(endpoint, headers=auth_headers, json=issue, timeout=15)
    if response.status_code not in (200, 201):
        return ""
    return response.json().get("html_url", "")
54
+
55
+
56
def investigate(payload: dict, *, gh_repo: str = "", gh_token: str = "") -> str:
    """Render + file the GitHub issue from the payload alone. No Databricks access.

    Best-effort: the issue is only attempted when both ``gh_repo`` and
    ``gh_token`` are provided, and any error raised while rendering or posting
    it is logged and swallowed so the calling workflow does not fail.

    Args:
        payload: The self-contained ``client_payload`` sent by the agent.
        gh_repo: Target repository in ``owner/name`` form; empty skips filing.
        gh_token: Token used to authenticate the request; empty skips filing.

    Returns:
        The URL of the created issue, or ``""`` when filing was skipped or failed.
    """
    url = ""
    if gh_repo and gh_token:
        try:
            url = _create_issue(gh_repo, gh_token, payload) or ""
        except Exception as ex:
            # Top-level best-effort boundary: a rendering or HTTP failure must
            # not fail the workflow, but it should still be visible in the log.
            print(f"[detective] issue creation failed (ignored): "
                  f"{type(ex).__name__}: {str(ex)[:160]}")
    print(f"[detective] issue: {url or '(skipped)'}")
    return url
61
+
62
+
63
def main() -> int:  # pragma: no cover - workflow entry
    """Workflow entry point: read the payload from the environment and file it.

    Reads the JSON payload from ``IIC_PAYLOAD`` and the target repository and
    token from ``GITHUB_REPOSITORY`` / ``GH_TOKEN``. A payload that is not valid
    JSON, or is not a JSON object, is logged and skipped instead of crashing,
    in line with the module's best-effort contract.

    Returns:
        Always ``0`` so the workflow step never fails because of the archiver.
    """
    raw = os.environ.get("IIC_PAYLOAD", "{}") or "{}"
    try:
        payload = json.loads(raw)
    except ValueError as ex:  # json.JSONDecodeError is a ValueError subclass
        print(f"[detective] invalid IIC_PAYLOAD JSON (skipped): {str(ex)[:160]}")
        return 0
    if not isinstance(payload, dict):
        # The renderer reads the payload with ``.get``; anything else cannot be rendered.
        print(f"[detective] IIC_PAYLOAD is not a JSON object (skipped): {type(payload).__name__}")
        return 0

    investigate(payload,
                gh_repo=os.environ.get("GITHUB_REPOSITORY", ""),
                gh_token=os.environ.get("GH_TOKEN", ""))
    return 0


if __name__ == "__main__":  # pragma: no cover
    sys.exit(main())
iic/runtime/hooks.py ADDED
@@ -0,0 +1,85 @@
1
+ """Optional Spark runtime hooks (best-effort ONLY).
2
+
3
+ This is the v4 spec's "optional enhancement layer". A Py4J-based SparkListener can
4
+ surface task-level failures from inside the driver, but it is **runtime-version
5
+ sensitive and NOT a dependency for correctness** — the wrapper (``iic_monitor``)
6
+ and the reconciler are the reliable paths. Everything here is wrapped so a failure
7
+ to attach simply no-ops.
8
+
9
+ Caveats (why this is best-effort):
10
+ * Py4J can only *implement an interface*, so we must declare the full
11
+ ``SparkListenerInterface`` method set; a Spark version that adds methods can
12
+ break registration (hence the broad try/except).
13
+ * Spark retries tasks, so task-level failures are noisy — the default callback
14
+ only *logs*; it does not auto-run the (potentially costly) IncidentEngine.
15
+ Opt into analysis explicitly via ``on_failure``.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
# Representative SparkListenerInterface methods (Spark 3.x). Unhandled callbacks
# must still exist as no-ops or the JVM raises when it invokes them.
_NOOP_METHODS = [
    "onStageCompleted", "onStageSubmitted", "onTaskStart", "onTaskGettingResult",
    "onJobStart", "onJobEnd", "onEnvironmentUpdate", "onBlockManagerAdded",
    "onBlockManagerRemoved", "onUnpersistRDD", "onApplicationStart", "onApplicationEnd",
    "onExecutorMetricsUpdate", "onStageExecutorMetrics", "onExecutorAdded",
    "onExecutorRemoved", "onExecutorExcluded", "onExecutorExcludedForStage",
    "onNodeExcludedForStage", "onNodeExcluded", "onExecutorUnexcluded",
    "onNodeUnexcluded", "onBlockUpdated", "onSpeculativeTaskSubmitted",
    "onResourceProfileAdded", "onOtherEvent",
]


def register_spark_listener(spark=None, on_failure=None) -> bool:
    """Attach a best-effort Spark task-failure listener. Returns True if attached.

    ``on_failure(info: dict)`` is called when a task ends in failure; if omitted,
    failures are only logged (safe default — avoids noisy/expensive triggering on
    Spark's automatic task retries). ``info`` carries ``stage_id``, ``task_id``
    (``None`` when the event has no task info) and ``reason`` (the failure
    reason's string form, truncated to 300 characters).

    Args:
        spark: Spark session to attach to; defaults to the active session.
        on_failure: Optional callback receiving the ``info`` dict.

    Returns:
        True when the listener was registered, False when there is no session
        or registration raised (the error is printed and ignored).
    """
    try:
        spark = spark or _active_spark()
        if spark is None:
            print("[iic.hooks] no active Spark session — listener not attached")
            return False

        callback = on_failure or (lambda info: print(f"[iic.hooks] task failed: {info}"))

        def _on_task_end(_listener, task_end):
            # This function is installed as a class attribute below, so it is
            # invoked as a bound method: the listener instance arrives first and
            # must be accepted, otherwise every call fails with a TypeError
            # before this body (and its try/except) is ever reached.
            try:
                reason = task_end.reason()
                if reason is None:
                    return
                reason_text = reason.toString()
                # Success reason class name is "Success"; anything else is a failure.
                if "Success" not in reason_text:
                    info = task_end.taskInfo()
                    callback({
                        "stage_id": task_end.stageId(),
                        "task_id": info.taskId() if info else None,
                        "reason": reason_text[:300],
                    })
            except Exception:
                # Deliberately silent: an error in a best-effort hook must never
                # propagate back into the caller that delivered the event.
                pass

        # The no-ops accept any arguments, so they also absorb the bound instance.
        methods = {name: (lambda *a, **k: None) for name in _NOOP_METHODS}
        methods["onTaskEnd"] = _on_task_end

        listener_cls = type("IICSparkListener", (), methods)
        listener_cls.Java = type("Java", (), {
            "implements": ["org.apache.spark.scheduler.SparkListenerInterface"]})

        # NOTE(review): Py4J dispatches calls to Python proxies through its
        # callback server — confirm the target runtime has it started.
        listener = listener_cls()
        spark.sparkContext._jsc.sc().addSparkListener(listener)
        print("[iic.hooks] Spark task-failure listener attached (best-effort)")
        return True
    except Exception as ex:
        print(f"[iic.hooks] could not attach Spark listener (best-effort, ignored): {str(ex)[:160]}")
        return False
78
+
79
+
80
def _active_spark():
    """Return the active Spark session, or ``None`` when it cannot be obtained.

    Both a missing ``pyspark`` install and any error raised while looking the
    session up are treated as "no session".
    """
    session = None
    try:
        from pyspark.sql import SparkSession
        session = SparkSession.getActiveSession()
    except Exception:
        # Best-effort lookup: callers only need to know that no session is usable.
        session = None
    return session
@@ -0,0 +1,207 @@
1
+ """The Incident Intelligence Core orchestrator — the 11-stage pipeline.
2
+
3
+ 1 ingest DatabricksFailureSource.discover()
4
+ 2 normalize → NormalizedFailureEvent (done in ingestion)
5
+ 3 context ContextBuilder.build()
6
+ 4 dependency DependencyAnalyzer.analyze() → blast radius
7
+ 5 change ChangeDetector.detect()
8
+ 6 dna IncidentDNABuilder.build() ← deterministic heart
9
+ 7 impact ImpactEngine.score() ← NO LLM
10
+ 8 route IncidentModelRouter.route()
11
+ 9 diagnose DiagnosisEngine.diagnose() ← LLM only if routed
12
+ 10 report ReportGenerator.build()
13
+ 11 notify TeamsNotifier.send() ← optional
14
+
15
+ Deterministic-first: stages 3–8 never call an LLM. The model is only ever invoked
16
+ at stage 9, and only when stage 8 decides it is worth the cost.
17
+
18
+ The engine is constructed from injected collaborators so the whole flow is unit
19
+ testable with fakes; :func:`run_from_notebook` wires the real Databricks ones.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ from dataclasses import dataclass, field
25
+
26
+ from iic.change.change_detector import ChangeDetector
27
+ from iic.context.context_builder import ContextBuilder
28
+ from iic.dependency.dependency_analyzer import DependencyAnalyzer
29
+ from iic.diagnosis.diagnosis_engine import DiagnosisEngine
30
+ from iic.dna.dna_builder import IncidentDNABuilder
31
+ from iic.impact.impact_engine import ImpactEngine
32
+ from iic.ingestion.databricks_source import DatabricksFailureSource
33
+ from iic.models.report import IncidentReport
34
+ from iic.notify.teams_notifier import TeamsNotifier
35
+ from iic.report.report_generator import ReportGenerator
36
+ from iic.routing.router import IncidentModelRouter
37
+ from iic.runtime.pattern_store import NullPatternStore
38
+
39
+
40
@dataclass
class EngineConfig:
    """Settings for one :class:`IncidentEngine` run."""

    # Job whose failures are analyzed; passed to the change detector and notifier.
    job_id: str = ""
    # Optional parent run; forwarded to DatabricksFailureSource by run_from_notebook.
    parent_run_id: str = ""
    # Model names handed to the default IncidentModelRouter.
    lightweight_model: str = "databricks-meta-llama-3-3-70b-instruct"
    powerful_model: str = "databricks-claude-opus-4-8"
    # Webhook handed to the default TeamsNotifier.
    teams_webhook: str = ""
    # Workspace host passed to the notifier; run_from_notebook sets it from the resolved auth.
    host: str = ""
    # When False, the notify stage (stage 11) is skipped.
    notify: bool = True
49
+
50
+
51
@dataclass
class EngineResult:
    """Outcome of one engine run: the reports plus run-level bookkeeping."""

    reports: list[IncidentReport] = field(default_factory=list)
    tokens_used: int = 0
    notified: bool = False
    summary: str = ""

    def to_dict(self) -> dict:
        """Return a plain-dict view, with each report serialized via ``to_dict()``."""
        incidents = [report.to_dict() for report in self.reports]
        return dict(
            summary=self.summary,
            tokens_used=self.tokens_used,
            notified=self.notified,
            incidents=incidents,
        )
65
+
66
+
67
class IncidentEngine:
    """Runs the staged incident pipeline over every failure reported by ``source``.

    All collaborators are injectable so the flow can be exercised with fakes;
    any collaborator left as ``None`` is replaced by its default implementation.
    """

    def __init__(self, config: EngineConfig, *, source, context_builder=None,
                 dependency_analyzer=None, change_detector=None, dna_builder=None,
                 impact_engine=None, router=None, diagnosis_engine=None,
                 report_generator=None, notifier=None, pattern_store=None):
        """Store the config and collaborators, defaulting those not supplied.

        Defaults are applied with ``is None`` rather than ``or`` so that an
        injected collaborator which happens to be falsy (for example an empty
        store that defines ``__len__``) is kept instead of silently replaced.
        """
        self.config = config
        self.source = source
        self.context_builder = ContextBuilder() if context_builder is None else context_builder
        self.dependency_analyzer = (
            DependencyAnalyzer() if dependency_analyzer is None else dependency_analyzer)
        self.change_detector = ChangeDetector() if change_detector is None else change_detector
        self.dna_builder = IncidentDNABuilder() if dna_builder is None else dna_builder
        self.impact_engine = ImpactEngine() if impact_engine is None else impact_engine
        self.router = (
            IncidentModelRouter(config.lightweight_model, config.powerful_model)
            if router is None else router)
        self.diagnosis_engine = DiagnosisEngine() if diagnosis_engine is None else diagnosis_engine
        self.report_generator = ReportGenerator() if report_generator is None else report_generator
        self.notifier = TeamsNotifier(config.teams_webhook) if notifier is None else notifier
        self.pattern_store = NullPatternStore() if pattern_store is None else pattern_store

    def run(self) -> EngineResult:
        """Execute the pipeline once and return the aggregated result.

        Returns an :class:`EngineResult` with one report per discovered failure,
        sorted by severity rank, or an empty result with an explanatory summary
        when the source reports no failures.
        """
        # Stage 1+2 — ingest & normalize.
        events = self.source.discover()
        if not events:
            return EngineResult(summary="No failures found. Nothing to analyze.")

        run_info = getattr(self.source, "run_info", {}) or {}
        # The first event's run id is used for every incident in this run.
        run_id = events[0].run_id
        job_tasks = run_info.get("tasks") or []
        failed_keys = {e.task for e in events}
        # Authoritative upstream map from the run DAG (always present on the
        # Databricks path) — used for the dependency override without an extra
        # client call. ``or []`` tolerates a task whose ``depends_on`` is
        # present but null.
        upstream_map = {
            t.get("task_key", ""): [d.get("task_key", "") for d in (t.get("depends_on") or [])]
            for t in job_tasks
        }

        reports: list[IncidentReport] = []
        total_tokens = 0

        for i, event in enumerate(events, 1):
            # Stage 3 — context.
            context = self.context_builder.build(event)

            # Stage 4 — dependency / blast radius.
            blast = self.dependency_analyzer.analyze(
                job_tasks, event.task, referenced_tables=context.referenced_tables)

            # Stage 5 — change detection.
            # NOTE(review): the arguments do not depend on the event; hoisting
            # this call out of the loop would change how often the injected
            # detector is invoked, so it is left per-event.
            change_diff = self.change_detector.detect(
                self.config.job_id, run_id, failed_run=run_info or None)

            # Stage 6 — Incident DNA (deterministic heart). A task whose upstream
            # also failed in this run is a derived failure (DAG ∪ context lineage).
            upstream_keys = set(upstream_map.get(event.task, [])) | set(context.upstream_tasks)
            upstream_failed = any(up in failed_keys for up in upstream_keys)
            dna = self.dna_builder.build(event, context, change_diff, upstream_failed=upstream_failed)

            # Stage 7 — deterministic impact (NO LLM).
            recurrence = self.pattern_store.recurrence(dna.pattern_id)
            impact = self.impact_engine.score(blast, dna, recurrence=recurrence)

            # Stage 8 — route. A pattern store may expose is_known(): a recurring,
            # already-understood pattern is treated as a cache hit → LLM skipped.
            is_known = getattr(self.pattern_store, "is_known", None)
            cache_hit = bool(is_known(dna.pattern_id)) if callable(is_known) else False
            routing = self.router.route(dna, impact, cache_hit=cache_hit)

            # Stage 9 — diagnosis (LLM only if routed).
            evidence = list(dna.signals)
            diagnosis, tokens = self.diagnosis_engine.diagnose(
                event, dna, routing, context=context, change_diff=change_diff, evidence=evidence)
            total_tokens += tokens

            # Stage 10 — report.
            incident_id = f"INC-{run_id or 'adhoc'}-{i}"
            report = self.report_generator.build(
                incident_id, event, dna, impact, diagnosis,
                routing=routing, context=context, change_diff=change_diff, evidence=evidence)
            reports.append(report)
            self.pattern_store.record(dna.pattern_id, incident_id, impact.severity.value)

        # Sort by severity for prioritisation (the product's core promise).
        # Severities missing from the rank table sort last (rank 9).
        from iic.notify.teams_notifier import _SEVERITY_RANK
        reports.sort(key=lambda r: _SEVERITY_RANK.get(r.impact.severity, 9))

        # Stage 11 — notify (optional).
        notified = False
        if self.config.notify:
            notified = self.notifier.send(reports, run_id=run_id,
                                          host=self.config.host, job_id=self.config.job_id)

        top = reports[0].impact.severity.value if reports else "n/a"
        summary = (f"{len(reports)} incident(s) analyzed | top severity {top} | "
                   f"{total_tokens} tokens | notified={notified}")
        return EngineResult(reports=reports, tokens_used=total_tokens, notified=notified, summary=summary)
162
+
163
+
164
def run_from_notebook(spark, dbutils) -> str:
    """Drive the pipeline from the thin Databricks notebook.

    Declares and reads the notebook widgets, resolves authentication through the
    given secret scope, wires the Databricks-backed collaborators into an
    :class:`IncidentEngine`, runs it, and returns the one-line summary (suitable
    for ``dbutils.notebook.exit``).
    """
    from healing_kit.auth import build_client, resolve_auth_from_dbutils

    def read_widget(name, default=""):
        # Declaring the widget first makes sure it exists before it is read.
        dbutils.widgets.text(name, default)
        return dbutils.widgets.get(name)

    # Widgets are declared in this exact order.
    secret_scope = read_widget("secret_scope", "iic")
    job_id = read_widget("pipeline_job_id", "")
    parent_run_id = read_widget("parent_run_id", "").strip()
    lightweight_model = read_widget("ai_endpoint", "databricks-meta-llama-3-3-70b-instruct")
    powerful_model = read_widget("ai_endpoint_powerful", "databricks-claude-opus-4-8")
    teams_webhook = read_widget("teams_webhook_url", "")
    # Anything other than "false" (case-insensitive) keeps notifications on.
    notify = read_widget("notify", "true").lower() != "false"
    config = EngineConfig(
        job_id=job_id,
        parent_run_id=parent_run_id,
        lightweight_model=lightweight_model,
        powerful_model=powerful_model,
        teams_webhook=teams_webhook,
        notify=notify,
    )
    catalog = read_widget("catalog", "dev_catalog")
    schema = read_widget("schema", "iic_schema")

    auth = resolve_auth_from_dbutils(dbutils, secret_scope)
    client = build_client(auth)
    config.host = auth.config.host
    print(f"Auth method: {auth.method}")

    from iic.runtime.pattern_store import DeltaPatternStore

    source = DatabricksFailureSource(client, config.job_id, parent_run_id=config.parent_run_id)
    collaborators = {
        "context_builder": ContextBuilder(client=client, spark=spark),
        "dependency_analyzer": DependencyAnalyzer(client=client),
        "change_detector": ChangeDetector(client=client),
        "diagnosis_engine": DiagnosisEngine(client=client),
        "pattern_store": DeltaPatternStore(spark, catalog, schema),
    }
    engine = IncidentEngine(config, source=source, **collaborators)

    outcome = engine.run()
    print(outcome.summary)
    return outcome.summary