shkit 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- healing_kit/__init__.py +3 -0
- healing_kit/auth.py +79 -0
- healing_kit/clients/__init__.py +1 -0
- healing_kit/clients/databricks_client.py +183 -0
- healing_kit/clients/teams_client.py +128 -0
- healing_kit/models/__init__.py +1 -0
- healing_kit/models/diagnosis.py +45 -0
- healing_kit/models/events.py +30 -0
- healing_kit/models/evidence.py +83 -0
- healing_kit/runtime/__init__.py +6 -0
- healing_kit/runtime/approval.py +141 -0
- healing_kit/runtime/maintenance.py +52 -0
- healing_kit/services/__init__.py +1 -0
- healing_kit/services/cache_service.py +120 -0
- healing_kit/services/circuit_breaker.py +114 -0
- healing_kit/services/context_agent.py +127 -0
- healing_kit/services/dependency_graph.py +141 -0
- healing_kit/services/diagnosis_engine.py +165 -0
- healing_kit/services/identity.py +61 -0
- healing_kit/services/model_router.py +52 -0
- healing_kit/services/query_guard.py +168 -0
- healing_kit/services/resolution_verifier.py +100 -0
- healing_kit/services/token_budget.py +137 -0
- healing_kit/utils/__init__.py +1 -0
- healing_kit/utils/error_hash.py +15 -0
- healing_kit/utils/hmac_tokens.py +86 -0
- healing_kit/utils/sql_safety.py +84 -0
- iic/__init__.py +51 -0
- iic/__main__.py +18 -0
- iic/_console.py +235 -0
- iic/_doctor.py +143 -0
- iic/change/__init__.py +7 -0
- iic/change/change_detector.py +154 -0
- iic/context/__init__.py +7 -0
- iic/context/context_builder.py +117 -0
- iic/dependency/__init__.py +7 -0
- iic/dependency/dependency_analyzer.py +93 -0
- iic/diagnosis/__init__.py +7 -0
- iic/diagnosis/diagnosis_engine.py +183 -0
- iic/dna/__init__.py +7 -0
- iic/dna/dna_builder.py +184 -0
- iic/impact/__init__.py +7 -0
- iic/impact/impact_engine.py +102 -0
- iic/ingestion/__init__.py +14 -0
- iic/ingestion/base.py +21 -0
- iic/ingestion/databricks_source.py +98 -0
- iic/ingestion/static_source.py +23 -0
- iic/ingestion/webhook_source.py +39 -0
- iic/models/__init__.py +44 -0
- iic/models/change.py +77 -0
- iic/models/context.py +46 -0
- iic/models/diagnosis.py +37 -0
- iic/models/dna.py +77 -0
- iic/models/event.py +78 -0
- iic/models/impact.py +60 -0
- iic/models/report.py +88 -0
- iic/models/routing.py +41 -0
- iic/notify/__init__.py +7 -0
- iic/notify/teams_notifier.py +112 -0
- iic/report/__init__.py +7 -0
- iic/report/report_generator.py +67 -0
- iic/routing/__init__.py +7 -0
- iic/routing/router.py +42 -0
- iic/runtime/__init__.py +10 -0
- iic/runtime/_sql.py +11 -0
- iic/runtime/agent_config.py +48 -0
- iic/runtime/agent_runtime.py +70 -0
- iic/runtime/antibodies.py +100 -0
- iic/runtime/bootstrap.py +157 -0
- iic/runtime/constants.py +40 -0
- iic/runtime/context.py +46 -0
- iic/runtime/detective.py +72 -0
- iic/runtime/hooks.py +85 -0
- iic/runtime/incident_engine.py +207 -0
- iic/runtime/inprocess.py +350 -0
- iic/runtime/ledger.py +120 -0
- iic/runtime/monitor.py +155 -0
- iic/runtime/pattern_store.py +53 -0
- iic/runtime/reconciler.py +139 -0
- iic/runtime/scope_config.py +127 -0
- iic/runtime/store.py +150 -0
- iic/runtime/wrapper.py +28 -0
- iic_autoload.pth +1 -0
- onboarding/__init__.py +1 -0
- onboarding/cli.py +168 -0
- onboarding/config_schema.py +62 -0
- onboarding/manifest.py +27 -0
- onboarding/preflight.py +129 -0
- onboarding/provisioner.py +573 -0
- onboarding/rollback.py +81 -0
- shkit-1.2.0.dist-info/METADATA +239 -0
- shkit-1.2.0.dist-info/RECORD +94 -0
- shkit-1.2.0.dist-info/WHEEL +4 -0
- shkit-1.2.0.dist-info/entry_points.txt +2 -0
iic/runtime/constants.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Shared constants so the runtime, the publish workflow, and the docs agree.
|
|
2
|
+
|
|
3
|
+
The publish workflow uploads ``config.yaml`` next to the wheel in the tenant
|
|
4
|
+
Volume; the runtime reads it from the same place (override with ``IIC_CONFIG_PATH``).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
# Default Volume location of the tenant config (override via IIC_CONFIG_PATH).
DEFAULT_CONFIG_PATH = "/Volumes/dev_catalog/default/libs/config.yaml"

# A tenant can be muted without republishing by dropping a file with this name
# next to the config.
DISABLED_SENTINEL_NAME = "DISABLED"

# Antibody Ledger (per-tenant issue memory) — lives next to config.yaml in the
# Volume. The runtime READS this file; it never writes it (humans edit it in git,
# workflows sync it). New occurrences are appended as tiny per-process marker
# files under PENDING_DIRNAME, mirroring the .iic_seen/ dedup mechanism.
ANTIBODIES_FILENAME = "antibodies.yaml"
PENDING_DIRNAME = ".iic_pending"

# Handler network budgets (seconds) — keep total handler ≤ ~10s.
# The three timeouts below add up to exactly 10.
TEAMS_TIMEOUT = 5
GITHUB_DISPATCH_TIMEOUT = 3
ENRICH_TIMEOUT = 2
# Per-call budget for optional in-process enrichment (pat-gated); total added ≤ ~4s.
ENRICH_BUDGET = 2

# ── Secret-scope configuration (the product-model primary config source) ──
# A customer creates ONE Databricks secret scope with this conventional name and
# the fixed keys below. Override the scope name via the IIC_SECRET_SCOPE env var.
DEFAULT_SECRET_SCOPE = "iic"

# Key names inside the secret scope; the trailing comment marks each key as
# required or optional.
SECRET_KEY_TEAMS_WEBHOOK = "teams_webhook"  # required
SECRET_KEY_VOLUME_PATH = "volume_path"  # required — anchors ledger/markers/sentinel
SECRET_KEY_HOST = "host"  # optional — "View Run" links
SECRET_KEY_PAT = "pat"  # optional — enables best-effort enrichment
SECRET_KEY_GITHUB_REPO = "github_repo"  # optional — incident archiving
SECRET_KEY_GITHUB_TOKEN = "github_dispatch_token"  # optional
SECRET_KEY_DEDUP_TTL = "dedup_ttl_seconds"  # optional — default 300
|
iic/runtime/context.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Session-scoped context cache (initialized by auto-bootstrap).
|
|
2
|
+
|
|
3
|
+
On a long-lived cluster, the same notebook source / job metadata is fetched
|
|
4
|
+
repeatedly across failures. ``init_context_cache()`` turns on an in-process cache
|
|
5
|
+
so those lookups are memoized for the cluster session.
|
|
6
|
+
|
|
7
|
+
Crucially the cache is **inactive until initialized**, so off-cluster (tests,
|
|
8
|
+
local) every fetch stays fresh — no cross-test state leaks.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
# Master switch for memoization: False until init_context_cache() flips it on,
# so cached() simply calls its loader while this is False.
_active = False
# Process-wide memo table consulted by cached(); keyed by the caller-supplied key.
_cache: dict = {}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def init_context_cache() -> None:
    """Switch the process-wide context cache on, starting from an empty table.

    Calling this again is harmless: every call empties the table and leaves the
    cache active.
    """
    global _active
    # Drop stale entries first, then enable lookups.
    _cache.clear()
    _active = True
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def is_active() -> bool:
    """Report whether the context cache is currently switched on."""
    return _active
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def reset_context_cache() -> None:
    """Switch the cache off and forget every entry.

    Intended for tests, so that one test's memoized values never reach another.
    """
    global _active
    # Disable lookups first, then discard whatever was memoized.
    _active = False
    _cache.clear()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def cached(key: str, loader):
    """Memoize ``loader()`` under ``key`` while the cache is active.

    ``loader`` is a zero-argument callable. When the cache is inactive it is
    called on every invocation and nothing is stored. When the cache is active
    it is called only on a miss, and its result is stored and returned.
    """
    if _active:
        # A private sentinel distinguishes "no entry" from a stored None.
        missing = object()
        hit = _cache.get(key, missing)
        if hit is not missing:
            return hit
        produced = loader()
        _cache[key] = produced
        return produced
    return loader()
|
iic/runtime/detective.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Payload-only incident archiver — runs in GitHub Actions on a repository_dispatch
|
|
2
|
+
after a failure and renders a GitHub issue PURELY from the self-contained
|
|
3
|
+
``client_payload`` the agent sent.
|
|
4
|
+
|
|
5
|
+
In the product model GitHub never reaches into a customer's Databricks workspace:
|
|
6
|
+
there are no Databricks API calls and no customer credentials here. Any post-mortem
|
|
7
|
+
enrichment now happens in-process (pat-gated) before the agent dispatches, so the
|
|
8
|
+
payload is already self-sufficient. Best-effort; the workflow must not fail on a
|
|
9
|
+
rendering error.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import os
|
|
16
|
+
import sys
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _severity(p: dict) -> str:
|
|
20
|
+
return (p.get("impact", {}) or {}).get("severity") or p.get("severity") or "LOW"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _issue_body(p: dict) -> str:
    """Render the Markdown body of the GitHub issue from the payload alone.

    Sections, in order: incident header, root cause and suggested fix, an
    optional antibody line (only when the antibody has a ``state``), an optional
    list of at most eight recent changes, and the ``context`` object as a JSON
    code block truncated to 2000 characters.
    """
    antibody = p.get("antibody") or {}

    body = []
    body.append(f"**Incident:** `{p.get('incident_id', '?')}` · **Severity:** {_severity(p)}")
    body.append(f"**Failure type:** {p.get('failure_type', '?')} · **Pattern:** `{p.get('pattern_id', '?')}`")
    body.append("")
    body.append(f"**Root cause:** {p.get('root_cause', 'n/a')}")
    body.append(f"**Suggested fix:** {p.get('suggested_fix', 'n/a')}")

    if antibody.get("state"):
        times_seen = antibody.get("times_seen")
        resolution = antibody.get("resolution")
        seen_note = f" (seen {times_seen}×)" if times_seen else ""
        fix_note = f" — recorded fix: {resolution}" if resolution else ""
        body.append("")
        body.append(f"**Antibody:** {antibody.get('state')}{seen_note}{fix_note}")

    recent_changes = p.get("changes") or []
    if recent_changes:
        body.extend(["", "## Recent changes (since last success)"])
        # Cap the list so a noisy diff cannot bloat the issue.
        for change in recent_changes[:8]:
            body.append(f"- {change}")

    context_json = json.dumps(p.get("context", {}), indent=2)[:2000]
    body.extend(["", "## Context", "```json", context_json, "```"])
    return "\n".join(body)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _create_issue(gh_repo: str, gh_token: str, payload: dict) -> str:
    """POST a new issue to ``gh_repo`` and return its ``html_url``.

    Returns an empty string when GitHub answers with anything other than
    200 or 201. ``requests`` is imported lazily so importing this module does
    not require it.
    """
    import requests

    issue = {
        "title": f"[IIC] {_severity(payload)} — {payload.get('failure_type', 'incident')}",
        "body": _issue_body(payload),
        "labels": ["iic-incident"],
    }
    auth_headers = {
        "Authorization": f"Bearer {gh_token}",
        "Accept": "application/vnd.github+json",
    }
    response = requests.post(
        f"https://api.github.com/repos/{gh_repo}/issues",
        headers=auth_headers,
        json=issue,
        timeout=15,
    )
    if response.status_code not in (200, 201):
        return ""
    return response.json().get("html_url", "")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def investigate(payload: dict, *, gh_repo: str = "", gh_token: str = "") -> str:
    """Render + file the GitHub issue from the payload alone. No Databricks access.

    Args:
        payload: The self-contained ``client_payload`` sent by the agent.
        gh_repo: ``owner/name`` of the repository to file the issue in.
        gh_token: Token used to authenticate against the GitHub API.

    Returns:
        The URL of the created issue, or ``""`` when filing was skipped (repo or
        token missing) or failed.

    Archiving is best-effort: any error raised while rendering or filing the
    issue is logged and swallowed here so the calling workflow never fails
    because of it.
    """
    url = ""
    if gh_repo and gh_token:
        try:
            url = _create_issue(gh_repo, gh_token, payload)
        except Exception as ex:  # best-effort boundary — must not fail the workflow
            print(f"[detective] issue creation failed (best-effort, ignored): {str(ex)[:160]}")
    print(f"[detective] issue: {url or '(skipped)'}")
    return url
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def main() -> int:  # pragma: no cover - workflow entry
    """Workflow entry point: read the payload and credentials from the environment.

    ``IIC_PAYLOAD`` holds the JSON payload (missing or empty means ``{}``);
    ``GITHUB_REPOSITORY`` and ``GH_TOKEN`` identify where and how to file the
    issue. Always returns exit status 0.
    """
    env = os.environ
    raw_payload = env.get("IIC_PAYLOAD", "{}") or "{}"
    investigate(
        json.loads(raw_payload),
        gh_repo=env.get("GITHUB_REPOSITORY", ""),
        gh_token=env.get("GH_TOKEN", ""),
    )
    return 0


if __name__ == "__main__":  # pragma: no cover
    sys.exit(main())
|
iic/runtime/hooks.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""Optional Spark runtime hooks (best-effort ONLY).
|
|
2
|
+
|
|
3
|
+
This is the v4 spec's "optional enhancement layer". A Py4J-based SparkListener can
|
|
4
|
+
surface task-level failures from inside the driver, but it is **runtime-version
|
|
5
|
+
sensitive and NOT a dependency for correctness** — the wrapper (``iic_monitor``)
|
|
6
|
+
and the reconciler are the reliable paths. Everything here is wrapped so a failure
|
|
7
|
+
to attach simply no-ops.
|
|
8
|
+
|
|
9
|
+
Caveats (why this is best-effort):
|
|
10
|
+
* Py4J can only *implement an interface*, so we must declare the full
|
|
11
|
+
``SparkListenerInterface`` method set; a Spark version that adds methods can
|
|
12
|
+
break registration (hence the broad try/except).
|
|
13
|
+
* Spark retries tasks, so task-level failures are noisy — the default callback
|
|
14
|
+
only *logs*; it does not auto-run the (potentially costly) IncidentEngine.
|
|
15
|
+
Opt into analysis explicitly via ``on_failure``.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
# Representative SparkListenerInterface methods (Spark 3.x). Unhandled callbacks
|
|
21
|
+
# must still exist as no-ops or the JVM raises when it invokes them.
|
|
22
|
+
_NOOP_METHODS = [
    "onStageCompleted", "onStageSubmitted", "onTaskStart", "onTaskGettingResult",
    "onJobStart", "onJobEnd", "onEnvironmentUpdate", "onBlockManagerAdded",
    "onBlockManagerRemoved", "onUnpersistRDD", "onApplicationStart", "onApplicationEnd",
    "onExecutorMetricsUpdate", "onStageExecutorMetrics", "onExecutorAdded",
    "onExecutorRemoved", "onExecutorExcluded", "onExecutorExcludedForStage",
    "onNodeExcludedForStage", "onNodeExcluded", "onExecutorUnexcluded",
    "onNodeUnexcluded", "onBlockUpdated", "onSpeculativeTaskSubmitted",
    "onResourceProfileAdded", "onOtherEvent",
]


def register_spark_listener(spark=None, on_failure=None) -> bool:
    """Attach a best-effort Spark task-failure listener. Returns True if attached.

    ``on_failure(info: dict)`` is called when a task ends in failure; if omitted,
    failures are only logged (safe default — avoids noisy/expensive triggering on
    Spark's automatic task retries). ``info`` carries ``stage_id``, ``task_id``
    (``None`` when the event has no task info) and ``reason`` (truncated to 300
    characters).

    Args:
        spark: Spark session to attach to; defaults to the active session.
        on_failure: Optional callback invoked with the failure ``info`` dict.

    Returns:
        ``True`` when the listener was registered, ``False`` when there is no
        session or registration raised. This function never raises.
    """
    try:
        spark = spark or _active_spark()
        if spark is None:
            print("[iic.hooks] no active Spark session — listener not attached")
            return False

        callback = on_failure or (lambda info: print(f"[iic.hooks] task failed: {info}"))

        # These functions become attributes of the listener class, so they are
        # invoked as bound methods and must accept ``self`` ahead of the event.
        def _on_task_end(self, task_end):
            try:
                reason = task_end.reason()
                # Success reason class name is "Success"; anything else is a failure.
                # NOTE(review): this is a substring test, so a failure whose text
                # happens to contain "Success" would be skipped — confirm acceptable.
                if reason is not None and "Success" not in reason.toString():
                    info = task_end.taskInfo()
                    callback({
                        "stage_id": task_end.stageId(),
                        "task_id": info.taskId() if info else None,
                        "reason": reason.toString()[:300],
                    })
            except Exception:
                # Deliberate: an error in this callback must not reach the JVM.
                pass

        def _noop(self, *args, **kwargs):
            return None

        # Every interface method must exist, even if it does nothing.
        methods = {name: _noop for name in _NOOP_METHODS}
        methods["onTaskEnd"] = _on_task_end

        listener_cls = type("IICSparkListener", (), methods)
        # Py4J reads the nested ``Java.implements`` list to learn which JVM
        # interface this Python object stands in for.
        listener_cls.Java = type("Java", (), {
            "implements": ["org.apache.spark.scheduler.SparkListenerInterface"]})

        listener = listener_cls()
        spark.sparkContext._jsc.sc().addSparkListener(listener)
        print("[iic.hooks] Spark task-failure listener attached (best-effort)")
        return True
    except Exception as ex:
        print(f"[iic.hooks] could not attach Spark listener (best-effort, ignored): {str(ex)[:160]}")
        return False
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _active_spark():
|
|
81
|
+
try:
|
|
82
|
+
from pyspark.sql import SparkSession
|
|
83
|
+
return SparkSession.getActiveSession()
|
|
84
|
+
except Exception:
|
|
85
|
+
return None
|
|
iic/runtime/incident_engine.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"""The Incident Intelligence Core orchestrator — the 11-stage pipeline.
|
|
2
|
+
|
|
3
|
+
1 ingest DatabricksFailureSource.discover()
|
|
4
|
+
2 normalize → NormalizedFailureEvent (done in ingestion)
|
|
5
|
+
3 context ContextBuilder.build()
|
|
6
|
+
4 dependency DependencyAnalyzer.analyze() → blast radius
|
|
7
|
+
5 change ChangeDetector.detect()
|
|
8
|
+
6 dna IncidentDNABuilder.build() ← deterministic heart
|
|
9
|
+
7 impact ImpactEngine.score() ← NO LLM
|
|
10
|
+
8 route IncidentModelRouter.route()
|
|
11
|
+
9 diagnose DiagnosisEngine.diagnose() ← LLM only if routed
|
|
12
|
+
10 report ReportGenerator.build()
|
|
13
|
+
11 notify TeamsNotifier.send() ← optional
|
|
14
|
+
|
|
15
|
+
Deterministic-first: stages 3–8 never call an LLM. The model is only ever invoked
|
|
16
|
+
at stage 9, and only when stage 8 decides it is worth the cost.
|
|
17
|
+
|
|
18
|
+
The engine is constructed from injected collaborators so the whole flow is unit
|
|
19
|
+
testable with fakes; :func:`run_from_notebook` wires the real Databricks ones.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
from dataclasses import dataclass, field
|
|
25
|
+
|
|
26
|
+
from iic.change.change_detector import ChangeDetector
|
|
27
|
+
from iic.context.context_builder import ContextBuilder
|
|
28
|
+
from iic.dependency.dependency_analyzer import DependencyAnalyzer
|
|
29
|
+
from iic.diagnosis.diagnosis_engine import DiagnosisEngine
|
|
30
|
+
from iic.dna.dna_builder import IncidentDNABuilder
|
|
31
|
+
from iic.impact.impact_engine import ImpactEngine
|
|
32
|
+
from iic.ingestion.databricks_source import DatabricksFailureSource
|
|
33
|
+
from iic.models.report import IncidentReport
|
|
34
|
+
from iic.notify.teams_notifier import TeamsNotifier
|
|
35
|
+
from iic.report.report_generator import ReportGenerator
|
|
36
|
+
from iic.routing.router import IncidentModelRouter
|
|
37
|
+
from iic.runtime.pattern_store import NullPatternStore
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class EngineConfig:
    """Settings for one :class:`IncidentEngine` run."""

    # Job identifier; handed to the change detector and to the notifier.
    job_id: str = ""
    # Parent run id; run_from_notebook forwards it to DatabricksFailureSource.
    parent_run_id: str = ""
    # Model names given to the default IncidentModelRouter (lightweight, then powerful).
    lightweight_model: str = "databricks-meta-llama-3-3-70b-instruct"
    powerful_model: str = "databricks-claude-opus-4-8"
    # Webhook passed to the default TeamsNotifier.
    teams_webhook: str = ""
    # Host forwarded to notifier.send(); run_from_notebook fills it from the resolved auth.
    host: str = ""
    # When False the notify stage is skipped entirely.
    notify: bool = True
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class EngineResult:
    """What :meth:`IncidentEngine.run` hands back to its caller."""

    # One report per analyzed failure event.
    reports: list[IncidentReport] = field(default_factory=list)
    # Sum of the token counts returned by the diagnosis stage.
    tokens_used: int = 0
    # Value returned by the notifier, or False when notification was skipped.
    notified: bool = False
    # One-line human-readable summary of the run.
    summary: str = ""

    def to_dict(self) -> dict:
        """Return a plain-dict view, with each report converted via its own ``to_dict``."""
        incidents = [report.to_dict() for report in self.reports]
        return dict(
            summary=self.summary,
            tokens_used=self.tokens_used,
            notified=self.notified,
            incidents=incidents,
        )
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class IncidentEngine:
    """Runs the incident pipeline over every failure the source discovers.

    All collaborators are injected, so the flow can be exercised with fakes;
    any collaborator left as ``None`` is replaced by its default implementation.
    """

    def __init__(self, config: EngineConfig, *, source, context_builder=None,
                 dependency_analyzer=None, change_detector=None, dna_builder=None,
                 impact_engine=None, router=None, diagnosis_engine=None,
                 report_generator=None, notifier=None, pattern_store=None):
        """Store the configuration and resolve each collaborator.

        Defaults are chosen with ``is None`` rather than truthiness so that an
        injected collaborator which happens to be falsy (for example an empty
        store that defines ``__len__``) is still honoured.
        """
        self.config = config
        self.source = source
        self.context_builder = ContextBuilder() if context_builder is None else context_builder
        self.dependency_analyzer = (
            DependencyAnalyzer() if dependency_analyzer is None else dependency_analyzer)
        self.change_detector = ChangeDetector() if change_detector is None else change_detector
        self.dna_builder = IncidentDNABuilder() if dna_builder is None else dna_builder
        self.impact_engine = ImpactEngine() if impact_engine is None else impact_engine
        self.router = (
            IncidentModelRouter(config.lightweight_model, config.powerful_model)
            if router is None else router)
        self.diagnosis_engine = DiagnosisEngine() if diagnosis_engine is None else diagnosis_engine
        self.report_generator = ReportGenerator() if report_generator is None else report_generator
        self.notifier = TeamsNotifier(config.teams_webhook) if notifier is None else notifier
        self.pattern_store = NullPatternStore() if pattern_store is None else pattern_store

    def run(self) -> EngineResult:
        """Execute the pipeline once and return the aggregated result.

        Returns an :class:`EngineResult` with one report per discovered event,
        sorted by severity rank, plus the token total and notification outcome.
        When the source discovers nothing, an empty result with an explanatory
        summary is returned and no other stage runs.
        """
        # Stage 1+2 — ingest & normalize.
        events = self.source.discover()
        if not events:
            return EngineResult(summary="No failures found. Nothing to analyze.")

        run_info = getattr(self.source, "run_info", {}) or {}
        run_id = events[0].run_id
        job_tasks = (run_info.get("tasks") or [])
        failed_keys = {e.task for e in events}
        # Authoritative upstream map from the run DAG (always present on the
        # Databricks path) — used for the dependency override without an extra
        # client call. ``depends_on`` may be absent or explicitly null; both are
        # treated as "no upstream tasks".
        upstream_map = {
            t.get("task_key", ""): [d.get("task_key", "") for d in (t.get("depends_on") or [])]
            for t in job_tasks
        }

        reports: list[IncidentReport] = []
        total_tokens = 0

        for i, event in enumerate(events, 1):
            # Stage 3 — context.
            context = self.context_builder.build(event)

            # Stage 4 — dependency / blast radius.
            blast = self.dependency_analyzer.analyze(
                job_tasks, event.task, referenced_tables=context.referenced_tables)

            # Stage 5 — change detection.
            change_diff = self.change_detector.detect(
                self.config.job_id, run_id, failed_run=run_info or None)

            # Stage 6 — Incident DNA (deterministic heart). A task whose upstream
            # also failed in this run is a derived failure (DAG ∪ context lineage).
            upstream_keys = set(upstream_map.get(event.task, [])) | set(context.upstream_tasks)
            upstream_failed = any(up in failed_keys for up in upstream_keys)
            dna = self.dna_builder.build(event, context, change_diff, upstream_failed=upstream_failed)

            # Stage 7 — deterministic impact (NO LLM).
            recurrence = self.pattern_store.recurrence(dna.pattern_id)
            impact = self.impact_engine.score(blast, dna, recurrence=recurrence)

            # Stage 8 — route. A pattern store may expose is_known(): a recurring,
            # already-understood pattern is treated as a cache hit → LLM skipped.
            is_known = getattr(self.pattern_store, "is_known", None)
            cache_hit = bool(is_known(dna.pattern_id)) if callable(is_known) else False
            routing = self.router.route(dna, impact, cache_hit=cache_hit)

            # Stage 9 — diagnosis (LLM only if routed).
            evidence = list(dna.signals)
            diagnosis, tokens = self.diagnosis_engine.diagnose(
                event, dna, routing, context=context, change_diff=change_diff, evidence=evidence)
            total_tokens += tokens

            # Stage 10 — report. The id combines the run id with the event's
            # 1-based position in the discovered list.
            incident_id = f"INC-{run_id or 'adhoc'}-{i}"
            report = self.report_generator.build(
                incident_id, event, dna, impact, diagnosis,
                routing=routing, context=context, change_diff=change_diff, evidence=evidence)
            reports.append(report)
            self.pattern_store.record(dna.pattern_id, incident_id, impact.severity.value)

        # Sort by severity for prioritisation (the product's core promise).
        # Severities missing from the rank table sort last (rank 9).
        from iic.notify.teams_notifier import _SEVERITY_RANK
        reports.sort(key=lambda r: _SEVERITY_RANK.get(r.impact.severity, 9))

        # Stage 11 — notify (optional).
        notified = False
        if self.config.notify:
            notified = self.notifier.send(reports, run_id=run_id,
                                          host=self.config.host, job_id=self.config.job_id)

        top = reports[0].impact.severity.value if reports else "n/a"
        summary = (f"{len(reports)} incident(s) analyzed | top severity {top} | "
                   f"{total_tokens} tokens | notified={notified}")
        return EngineResult(reports=reports, tokens_used=total_tokens, notified=notified, summary=summary)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def run_from_notebook(spark, dbutils) -> str:
    """Drive the engine from a Databricks notebook.

    Declares and reads the notebook widgets, resolves authentication through
    the given secret scope, wires the Databricks-backed collaborators, runs the
    pipeline once, prints the summary, and returns it so the notebook can pass
    it to ``dbutils.notebook.exit``.
    """
    from healing_kit.auth import build_client, resolve_auth_from_dbutils

    def read_widget(name, default=""):
        # Declare the widget with its default first, then read the current value.
        dbutils.widgets.text(name, default)
        return dbutils.widgets.get(name)

    # Widgets are declared and read in this fixed order.
    secret_scope = read_widget("secret_scope", "iic")
    job_id = read_widget("pipeline_job_id", "")
    parent_run_id = read_widget("parent_run_id", "").strip()
    lightweight_model = read_widget("ai_endpoint", "databricks-meta-llama-3-3-70b-instruct")
    powerful_model = read_widget("ai_endpoint_powerful", "databricks-claude-opus-4-8")
    teams_webhook = read_widget("teams_webhook_url", "")
    # Anything except the literal "false" (case-insensitive) keeps notification on.
    notify = read_widget("notify", "true").lower() != "false"
    config = EngineConfig(
        job_id=job_id,
        parent_run_id=parent_run_id,
        lightweight_model=lightweight_model,
        powerful_model=powerful_model,
        teams_webhook=teams_webhook,
        notify=notify,
    )
    catalog = read_widget("catalog", "dev_catalog")
    schema = read_widget("schema", "iic_schema")

    auth = resolve_auth_from_dbutils(dbutils, secret_scope)
    client = build_client(auth)
    config.host = auth.config.host
    print(f"Auth method: {auth.method}")

    from iic.runtime.pattern_store import DeltaPatternStore

    failure_source = DatabricksFailureSource(
        client, config.job_id, parent_run_id=config.parent_run_id)
    collaborators = {
        "source": failure_source,
        "context_builder": ContextBuilder(client=client, spark=spark),
        "dependency_analyzer": DependencyAnalyzer(client=client),
        "change_detector": ChangeDetector(client=client),
        "diagnosis_engine": DiagnosisEngine(client=client),
        "pattern_store": DeltaPatternStore(spark, catalog, schema),
    }
    outcome = IncidentEngine(config, **collaborators).run()
    print(outcome.summary)
    return outcome.summary
|