shkit 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- healing_kit/__init__.py +3 -0
- healing_kit/auth.py +79 -0
- healing_kit/clients/__init__.py +1 -0
- healing_kit/clients/databricks_client.py +183 -0
- healing_kit/clients/teams_client.py +128 -0
- healing_kit/models/__init__.py +1 -0
- healing_kit/models/diagnosis.py +45 -0
- healing_kit/models/events.py +30 -0
- healing_kit/models/evidence.py +83 -0
- healing_kit/runtime/__init__.py +6 -0
- healing_kit/runtime/approval.py +141 -0
- healing_kit/runtime/maintenance.py +52 -0
- healing_kit/services/__init__.py +1 -0
- healing_kit/services/cache_service.py +120 -0
- healing_kit/services/circuit_breaker.py +114 -0
- healing_kit/services/context_agent.py +127 -0
- healing_kit/services/dependency_graph.py +141 -0
- healing_kit/services/diagnosis_engine.py +165 -0
- healing_kit/services/identity.py +61 -0
- healing_kit/services/model_router.py +52 -0
- healing_kit/services/query_guard.py +168 -0
- healing_kit/services/resolution_verifier.py +100 -0
- healing_kit/services/token_budget.py +137 -0
- healing_kit/utils/__init__.py +1 -0
- healing_kit/utils/error_hash.py +15 -0
- healing_kit/utils/hmac_tokens.py +86 -0
- healing_kit/utils/sql_safety.py +84 -0
- iic/__init__.py +51 -0
- iic/__main__.py +18 -0
- iic/_console.py +235 -0
- iic/_doctor.py +143 -0
- iic/change/__init__.py +7 -0
- iic/change/change_detector.py +154 -0
- iic/context/__init__.py +7 -0
- iic/context/context_builder.py +117 -0
- iic/dependency/__init__.py +7 -0
- iic/dependency/dependency_analyzer.py +93 -0
- iic/diagnosis/__init__.py +7 -0
- iic/diagnosis/diagnosis_engine.py +183 -0
- iic/dna/__init__.py +7 -0
- iic/dna/dna_builder.py +184 -0
- iic/impact/__init__.py +7 -0
- iic/impact/impact_engine.py +102 -0
- iic/ingestion/__init__.py +14 -0
- iic/ingestion/base.py +21 -0
- iic/ingestion/databricks_source.py +98 -0
- iic/ingestion/static_source.py +23 -0
- iic/ingestion/webhook_source.py +39 -0
- iic/models/__init__.py +44 -0
- iic/models/change.py +77 -0
- iic/models/context.py +46 -0
- iic/models/diagnosis.py +37 -0
- iic/models/dna.py +77 -0
- iic/models/event.py +78 -0
- iic/models/impact.py +60 -0
- iic/models/report.py +88 -0
- iic/models/routing.py +41 -0
- iic/notify/__init__.py +7 -0
- iic/notify/teams_notifier.py +112 -0
- iic/report/__init__.py +7 -0
- iic/report/report_generator.py +67 -0
- iic/routing/__init__.py +7 -0
- iic/routing/router.py +42 -0
- iic/runtime/__init__.py +10 -0
- iic/runtime/_sql.py +11 -0
- iic/runtime/agent_config.py +48 -0
- iic/runtime/agent_runtime.py +70 -0
- iic/runtime/antibodies.py +100 -0
- iic/runtime/bootstrap.py +157 -0
- iic/runtime/constants.py +40 -0
- iic/runtime/context.py +46 -0
- iic/runtime/detective.py +72 -0
- iic/runtime/hooks.py +85 -0
- iic/runtime/incident_engine.py +207 -0
- iic/runtime/inprocess.py +350 -0
- iic/runtime/ledger.py +120 -0
- iic/runtime/monitor.py +155 -0
- iic/runtime/pattern_store.py +53 -0
- iic/runtime/reconciler.py +139 -0
- iic/runtime/scope_config.py +127 -0
- iic/runtime/store.py +150 -0
- iic/runtime/wrapper.py +28 -0
- iic_autoload.pth +1 -0
- onboarding/__init__.py +1 -0
- onboarding/cli.py +168 -0
- onboarding/config_schema.py +62 -0
- onboarding/manifest.py +27 -0
- onboarding/preflight.py +129 -0
- onboarding/provisioner.py +573 -0
- onboarding/rollback.py +81 -0
- shkit-1.2.0.dist-info/METADATA +239 -0
- shkit-1.2.0.dist-info/RECORD +94 -0
- shkit-1.2.0.dist-info/WHEEL +4 -0
- shkit-1.2.0.dist-info/entry_points.txt +2 -0
iic/_doctor.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""``iic doctor`` — the #1 support tool. Verifies, in order, with a clear ✅/❌ line:
|
|
2
|
+
secret scope readable → required keys present → volume writable → webhook reachable
|
|
3
|
+
→ optional-key consistency → (optionally) a named principal's READ ACL.
|
|
4
|
+
|
|
5
|
+
``python -m iic doctor [--check-principal <name>]`` and ``iic.doctor()``. Exit code
|
|
6
|
+
is 0 only when all REQUIRED checks pass. Failure messages are actionable on purpose.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
import sys
|
|
13
|
+
|
|
14
|
+
from iic.runtime.constants import (
|
|
15
|
+
DEFAULT_SECRET_SCOPE,
|
|
16
|
+
SECRET_KEY_GITHUB_REPO,
|
|
17
|
+
SECRET_KEY_GITHUB_TOKEN,
|
|
18
|
+
SECRET_KEY_HOST,
|
|
19
|
+
SECRET_KEY_PAT,
|
|
20
|
+
SECRET_KEY_TEAMS_WEBHOOK,
|
|
21
|
+
SECRET_KEY_VOLUME_PATH,
|
|
22
|
+
TEAMS_TIMEOUT,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# Sentinel default for ``doctor(settings=...)``: "load the settings yourself".
# ``None`` cannot be used for that because it already means "scope unreadable".
_LOAD = "__load__"


def doctor(check_principal=None, *, settings=_LOAD, probes=None) -> int:
    """Run the doctor checks in order, printing one ✅/❌ line per check.

    Order: secret scope readable → required keys present → volume writable →
    webhook reachable → optional-key consistency → (optionally) a named
    principal's ACL entry.

    Args:
        check_principal: Optional principal name. When truthy, the
            ``"principal"`` probe is run for it against the secret scope.
        settings: Mapping of secret keys to values. When left at the ``_LOAD``
            sentinel the settings are loaded via ``_safe_load_settings()``;
            ``None`` is reported as an unreadable secret scope.
        probes: Optional mapping with any of the keys ``"volume_write"``,
            ``"webhook"`` and ``"principal"``. Keys that are not supplied fall
            back to the built-in probes.

    Returns:
        ``0`` when every check that ran passed, otherwise ``1``. An unreadable
        scope or a missing required key returns ``1`` immediately, because the
        later checks depend on those values.
    """
    scope = os.environ.get("IIC_SECRET_SCOPE", DEFAULT_SECRET_SCOPE)
    # Overlay caller-supplied probes on the defaults so a partial mapping
    # (e.g. only "webhook" stubbed out) does not raise KeyError further down.
    probes = {**_default_probes(), **(probes or {})}
    ok_all = True

    s = _safe_load_settings() if settings == _LOAD else settings
    if s is None:
        print(f"❌ secret scope '{scope}' not readable as the current identity")
        print(f" → grant READ: databricks secrets put-acl --scope {scope} "
              "--principal <you-or-your-SP> --permission READ")
        return 1
    print(f"✅ secret scope '{scope}' readable")

    missing = [k for k in (SECRET_KEY_TEAMS_WEBHOOK, SECRET_KEY_VOLUME_PATH) if not s.get(k)]
    if missing:
        print(f"❌ missing required key(s): {missing}")
        print(f" → databricks secrets put-secret --scope {scope} --key {missing[0]} ...")
        return 1
    print("✅ required keys present (teams_webhook, volume_path)")

    ok, msg = probes["volume_write"](s.get(SECRET_KEY_VOLUME_PATH))
    print(("✅" if ok else "❌") + f" volume writable: {msg}")
    if not ok:
        print(f" → ensure the run identity can write {s.get(SECRET_KEY_VOLUME_PATH)} "
              "(WRITE VOLUME / Unity Catalog grants)")
        ok_all = False

    ok, msg = probes["webhook"](s.get(SECRET_KEY_TEAMS_WEBHOOK))
    print(("✅" if ok else "❌") + f" webhook reachable: {msg}")
    if not ok:
        print(" → check the teams_webhook URL (is the Power Automate flow enabled?)")
        ok_all = False

    # Optional keys only make sense in pairs; a half-configured pair is
    # reported as a warning and never affects the exit code.
    if bool(s.get(SECRET_KEY_PAT)) != bool(s.get(SECRET_KEY_HOST)):
        print("⚠️ pat/host should be set together — enrichment needs both (non-fatal)")
    if bool(s.get(SECRET_KEY_GITHUB_REPO)) != bool(s.get(SECRET_KEY_GITHUB_TOKEN)):
        print("⚠️ github_repo/github_dispatch_token should be set together (non-fatal)")

    if check_principal:
        ok, msg = probes["principal"](scope, check_principal)
        print(("✅" if ok else "❌") + f" principal '{check_principal}' can read '{scope}': {msg}")
        if not ok:
            print(f" → databricks secrets put-acl --scope {scope} "
                  f"--principal {check_principal} --permission READ")
            ok_all = False

    print("✅ all required checks passed" if ok_all else "❌ doctor found problems (see above)")
    return 0 if ok_all else 1
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _safe_load_settings():
|
|
79
|
+
try:
|
|
80
|
+
from iic.runtime.scope_config import load_settings
|
|
81
|
+
return load_settings()
|
|
82
|
+
except Exception:
|
|
83
|
+
return None
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _default_probes():
    """Return the built-in probe callables keyed by the names ``doctor`` looks up."""
    return dict(
        volume_write=_probe_volume_write,
        webhook=_probe_webhook,
        principal=_probe_principal,
    )
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _probe_volume_write(volume_path):
|
|
92
|
+
import uuid
|
|
93
|
+
if not volume_path:
|
|
94
|
+
return False, "no volume_path"
|
|
95
|
+
try:
|
|
96
|
+
probe = os.path.join(volume_path, f".iic_doctor_{uuid.uuid4().hex[:8]}")
|
|
97
|
+
with open(probe, "w") as f:
|
|
98
|
+
f.write("ok")
|
|
99
|
+
os.remove(probe)
|
|
100
|
+
return True, f"{volume_path} (probe write+delete ok)"
|
|
101
|
+
except Exception as ex:
|
|
102
|
+
return False, f"{volume_path}: {str(ex)[:120]}"
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _probe_webhook(webhook):
|
|
106
|
+
if not webhook:
|
|
107
|
+
return False, "no teams_webhook"
|
|
108
|
+
try:
|
|
109
|
+
import requests
|
|
110
|
+
card = {"type": "message", "attachments": [{
|
|
111
|
+
"contentType": "application/vnd.microsoft.card.adaptive",
|
|
112
|
+
"content": {"type": "AdaptiveCard", "version": "1.4",
|
|
113
|
+
"body": [{"type": "TextBlock", "text": "✅ IIC doctor probe"}]}}]}
|
|
114
|
+
r = requests.post(webhook, json=card, headers={"Content-Type": "application/json"},
|
|
115
|
+
timeout=TEAMS_TIMEOUT)
|
|
116
|
+
return (r.status_code in (200, 202)), f"HTTP {r.status_code}"
|
|
117
|
+
except Exception as ex:
|
|
118
|
+
return False, str(ex)[:120]
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _probe_principal(scope, principal):
|
|
122
|
+
try:
|
|
123
|
+
from databricks.sdk import WorkspaceClient
|
|
124
|
+
w = WorkspaceClient()
|
|
125
|
+
for acl in w.secrets.list_acls(scope=scope):
|
|
126
|
+
if getattr(acl, "principal", None) == principal:
|
|
127
|
+
return True, str(getattr(acl, "permission", ""))
|
|
128
|
+
return False, "no ACL entry"
|
|
129
|
+
except Exception as ex:
|
|
130
|
+
return False, str(ex)[:120]
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def main(argv=None) -> int:
    """Command-line entry point: parse ``--check-principal`` and run ``doctor``.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.

    Returns:
        The exit code from ``doctor``, or ``2`` when ``--check-principal`` was
        given without a principal name.

    Both ``--check-principal NAME`` and ``--check-principal=NAME`` are accepted;
    the first occurrence wins.
    """
    argv = argv if argv is not None else sys.argv[1:]
    flag = "--check-principal"
    principal = None
    for i, arg in enumerate(argv):
        if arg == flag:
            principal = argv[i + 1] if i + 1 < len(argv) else ""
        elif arg.startswith(flag + "="):
            principal = arg[len(flag) + 1:]
        else:
            continue
        if not principal:
            # The user asked for a check we cannot run; silently skipping it
            # could let doctor exit 0 and look like the principal was verified.
            print(f"❌ {flag} requires a principal name")
            return 2
        break
    return doctor(check_principal=principal)


if __name__ == "__main__":  # pragma: no cover
    sys.exit(main())
|
iic/change/__init__.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""Stage 5 — diff the failed run against the last successful run.
|
|
2
|
+
|
|
3
|
+
Most production failures correlate with a recent change, so this is one of the
|
|
4
|
+
highest-value signals. The comparison is split into a pure, fully-testable core
|
|
5
|
+
(``extract_profile`` + ``diff_profiles`` + ``diff_schemas``) and a thin I/O wrapper
|
|
6
|
+
(``ChangeDetector.detect``) that fetches the two runs via the client.
|
|
7
|
+
|
|
8
|
+
What we can compare from the Jobs/Runs API without extra infrastructure:
|
|
9
|
+
* config — task base_parameters
|
|
10
|
+
* runtime — spark_version, node_type_id, worker count
|
|
11
|
+
* deployment — git commit of the run's source
|
|
12
|
+
* code — notebook revision timestamps
|
|
13
|
+
* schema — only when before/after snapshots are supplied
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from iic.models.change import ChangeDiffObject, FieldChange
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def extract_profile(run: dict) -> dict:
    """Flatten a Jobs/Runs API dict into a comparable profile.

    Accepts both shapes: a runs/get payload (``tasks``, ``job_clusters`` and
    ``git_source`` at the top level) and a jobs/get payload (the same keys
    nested under ``settings``). ``None`` or an empty dict yields an empty
    profile.

    Returns:
        A dict with keys ``params`` (``"<task_key>.<param>" -> str``),
        ``spark_version``, ``node_type_id``, ``num_workers`` (as str),
        ``git_commit`` (commit, else branch, else ``""``) and
        ``notebook_revisions`` (``task_key -> str``).
    """
    run = run or {}
    # ``or`` rather than a ``.get`` default: an explicit ``"settings": None``
    # must fall back to the run itself instead of crashing on ``None.get``.
    settings = run.get("settings") or run
    tasks = run.get("tasks") or settings.get("tasks") or []

    params: dict = {}
    notebook_revisions: dict = {}
    for t in tasks:
        task_key = t.get("task_key", "?")
        nb = t.get("notebook_task") or {}
        for k, v in (nb.get("base_parameters") or {}).items():
            params[f"{task_key}.{k}"] = str(v)
        if nb.get("source") == "GIT" or nb.get("notebook_path"):
            rev = t.get("git_source_revision") or nb.get("revision_timestamp")
            if rev:
                notebook_revisions[task_key] = str(rev)

    # Same fallback as ``tasks`` / ``git_source``: a jobs/get payload keeps
    # ``job_clusters`` under ``settings``.
    cluster_source = run if run.get("job_clusters") else settings
    cluster = _first_cluster(cluster_source, tasks)
    git = run.get("git_source") or settings.get("git_source") or {}

    return {
        "params": params,
        "spark_version": cluster.get("spark_version", ""),
        "node_type_id": cluster.get("node_type_id", ""),
        "num_workers": str(cluster.get("num_workers", "")),
        "git_commit": git.get("git_commit") or git.get("git_branch") or "",
        "notebook_revisions": notebook_revisions,
    }


def _first_cluster(run: dict, tasks: list) -> dict:
    """Return the first non-empty ``new_cluster`` spec, or ``{}`` if none exists.

    Shared ``job_clusters`` on ``run`` are checked first, then each task's own
    ``new_cluster``.
    """
    for jc in run.get("job_clusters", []) or []:
        spec = jc.get("new_cluster") or {}
        if spec:
            return spec
    for t in tasks:
        spec = t.get("new_cluster") or {}
        if spec:
            return spec
    return {}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def diff_profiles(failed: dict, success: dict) -> ChangeDiffObject:
    """Compare two run profiles and collect the differences (schema excluded).

    ``success`` supplies the "before" side and ``failed`` the "after" side of
    every recorded change. The result is always marked ``has_prior_success``.
    """
    result = ChangeDiffObject(has_prior_success=True)

    # config: task parameter values
    result.config_changes = _diff_maps(
        failed.get("params", {}), success.get("params", {}), "config"
    )

    # runtime: cluster shape (profile key -> label used in the change record)
    runtime_labels = {
        "spark_version": "spark_version",
        "node_type_id": "node_type",
        "num_workers": "num_workers",
    }
    for key, label in runtime_labels.items():
        old = success.get(key, "")
        new = failed.get(key, "")
        if old == new:
            continue
        if not (old or new):
            continue
        result.runtime_changes.append(FieldChange("runtime", label, old, new))

    # deployment: the git commit moved
    old_commit = success.get("git_commit", "")
    new_commit = failed.get("git_commit", "")
    if old_commit != new_commit and (old_commit or new_commit):
        result.deployment_changes.append(
            FieldChange("deployment", "git_commit", old_commit, new_commit)
        )

    # code: notebook revisions changed
    result.code_changes = _diff_maps(
        failed.get("notebook_revisions", {}),
        success.get("notebook_revisions", {}),
        "code",
    )
    return result
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def diff_schemas(before: dict, after: dict) -> list[FieldChange]:
    """Turn two ``{table: [columns]}`` snapshots into schema FieldChanges.

    Tables are visited in sorted order; for each table the dropped columns
    (sorted) come first, then the added columns (sorted).
    """
    changes: list[FieldChange] = []
    for table in sorted(set(before).union(after)):
        old_cols = set(before.get(table, []))
        new_cols = set(after.get(table, []))
        changes += [
            FieldChange("schema", f"{table}.{col}", before=col, after="")
            for col in sorted(old_cols - new_cols)
        ]
        changes += [
            FieldChange("schema", f"{table}.{col}", before="", after=col)
            for col in sorted(new_cols - old_cols)
        ]
    return changes
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _diff_maps(failed: dict, success: dict, category: str) -> list[FieldChange]:
|
|
104
|
+
changes: list[FieldChange] = []
|
|
105
|
+
for key in sorted(set(failed) | set(success)):
|
|
106
|
+
before, after = success.get(key, ""), failed.get(key, "")
|
|
107
|
+
if before != after:
|
|
108
|
+
changes.append(FieldChange(category, key, str(before), str(after)))
|
|
109
|
+
return changes
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class ChangeDetector:
    """Fetches the failed run + last successful run and diffs them.

    ``client`` must provide ``list_runs(job_id, limit=...)`` and
    ``_get(path, params)``. Every fetch is best-effort: a failing or empty
    response degrades to "no prior success" rather than raising.
    """

    def __init__(self, client=None):
        self.client = client

    def detect(self, job_id: str, failed_run_id: str,
               failed_run: dict | None = None,
               prev_schema: dict | None = None,
               curr_schema: dict | None = None) -> ChangeDiffObject:
        """Diff the failed run against the most recent successful run.

        Args:
            job_id: Job whose run history is searched.
            failed_run_id: Run that failed; excluded from the success search.
            failed_run: Already-fetched failed run dict; fetched when omitted.
            prev_schema: ``{table: [columns]}`` snapshot before the failure.
            curr_schema: ``{table: [columns]}`` snapshot at failure time.

        Returns:
            A ``ChangeDiffObject``. ``has_prior_success`` is ``False`` when
            there is no client, no job id, or no successful run was found.
            Schema changes are added only when both snapshots are supplied.
        """
        if not (self.client and job_id):
            return ChangeDiffObject(has_prior_success=False)

        failed = failed_run or self._get_run(failed_run_id)
        success = self._last_success(job_id, exclude_run_id=failed_run_id)
        if not success:
            diff = ChangeDiffObject(has_prior_success=False)
        else:
            diff = diff_profiles(extract_profile(failed), extract_profile(success))
            diff.last_success_run_id = str(success.get("run_id", ""))

        if prev_schema is not None and curr_schema is not None:
            diff.schema_changes.extend(diff_schemas(prev_schema, curr_schema))
        return diff

    def _get_run(self, run_id: str) -> dict:
        """Fetch one run; any failure or empty response yields ``{}``."""
        try:
            # ``or {}`` keeps the ``-> dict`` promise if the client returns None.
            return self.client._get("/api/2.1/jobs/runs/get", {"run_id": int(run_id)}) or {}
        except Exception:
            return {}

    def _last_success(self, job_id: str, exclude_run_id: str) -> dict:
        """Return the first run in the listing with ``result_state == "SUCCESS"``.

        The run ``exclude_run_id`` is skipped. Returns ``{}`` when the listing
        fails, is empty, or contains no successful run.
        NOTE(review): "last" relies on ``list_runs`` returning newest first —
        confirm against the client.
        """
        try:
            runs = self.client.list_runs(int(job_id), limit=20) or []
        except Exception:
            return {}
        for run in runs:
            if not run:
                continue
            if str(run.get("run_id", "")) == str(exclude_run_id):
                continue
            # ``state`` can be present but None; treat that as "not successful".
            if (run.get("state") or {}).get("result_state") == "SUCCESS":
                # Re-fetch for full task/cluster detail; fall back to the
                # listing entry if the re-fetch fails.
                return self._get_run(str(run.get("run_id", ""))) or run
        return {}
|
iic/context/__init__.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""Stage 3 — gather everything relevant about a failure.
|
|
2
|
+
|
|
3
|
+
Best-effort by design: each fetch is independently guarded so a missing notebook,
|
|
4
|
+
an unreachable cluster, or a lineage gap degrades the bundle rather than aborting
|
|
5
|
+
the pipeline. Table references are parsed from the notebook source so later stages
|
|
6
|
+
(schema snapshot, dependency analysis) have something to work with even when UC
|
|
7
|
+
lineage is unavailable.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
|
|
14
|
+
from iic.models.context import IncidentContextBundle
|
|
15
|
+
from iic.models.event import NormalizedFailureEvent
|
|
16
|
+
|
|
17
|
+
# Two-part (schema.table) or three-part (catalog.schema.table) names following
# FROM/JOIN/INTO/UPDATE/TABLE.
_TABLE_RE = re.compile(
    r"\b(?:FROM|JOIN|INTO|UPDATE|TABLE)\s+([a-zA-Z_][\w]*(?:\.[a-zA-Z_][\w]*){1,2})",
    re.IGNORECASE,
)
# What follows the dotted path in a Python ``from pkg.mod import name`` line.
# Deliberately case-sensitive: only a real Python ``import`` keyword counts.
_PY_IMPORT_TAIL_RE = re.compile(r"[\w.]*\s+import\b")
_MAX_NOTEBOOK_CHARS = 4000


class ContextBuilder:
    """Builds an :class:`IncidentContextBundle` from a normalized event.

    ``client`` is a DatabricksClient-like object; ``spark`` is optional and only
    used to snapshot the failing table's schema. Both may be ``None`` in tests.
    Every fetch is guarded on its own, so one failing source degrades the
    bundle instead of aborting the build.
    """

    def __init__(self, client=None, spark=None):
        self.client = client
        self.spark = spark

    def build(self, event: NormalizedFailureEvent) -> IncidentContextBundle:
        """Assemble logs, notebook source, tables, cluster, job and schema info."""
        bundle = IncidentContextBundle(event_id=event.event_id)

        # Logs always include the failure text we already have.
        logs = []
        if event.error_message:
            logs.append(event.error_message)
        if event.error_trace:
            logs.append(event.error_trace)
        bundle.logs = logs

        bundle.notebook_source = self._notebook(event.notebook_path)
        bundle.referenced_tables = self._tables(bundle.notebook_source)
        bundle.cluster_state = self._cluster(event.cluster_id)
        bundle.job_metadata, dag = self._job_metadata(event.job_id, event.task)
        bundle.upstream_tasks, bundle.downstream_tasks = dag
        bundle.schema_snapshot = self._schema(bundle.referenced_tables)
        return bundle

    # ─── individual, independently-guarded fetches ───

    def _notebook(self, path: str) -> str:
        """Return up to ``_MAX_NOTEBOOK_CHARS`` of the notebook source, or ``""``."""
        if not (path and self.client):
            return ""

        def _fetch() -> str:
            try:
                return (self.client.export_notebook(path) or "")[:_MAX_NOTEBOOK_CHARS]
            except Exception:
                return ""

        # Memoized by path only when the session cache is active (i.e. on a
        # bootstrapped cluster). Off-cluster/tests → always a fresh fetch.
        from iic.runtime.context import cached
        return cached(f"notebook:{path}", _fetch)

    @staticmethod
    def _tables(notebook_source: str) -> list[str]:
        """Return the sorted, lower-cased table names referenced in the source.

        Python ``from pkg.mod import name`` lines also match ``_TABLE_RE``
        (``FROM`` + dotted name); they are skipped so module paths are not
        reported as tables and do not crowd real tables out of later caps.
        """
        if not notebook_source:
            return []
        found = {
            m.group(1).lower()
            for m in _TABLE_RE.finditer(notebook_source)
            if not _PY_IMPORT_TAIL_RE.match(notebook_source, m.end())
        }
        return sorted(found)

    def _cluster(self, cluster_id: str) -> dict:
        """Return the cluster id plus the types of its 5 most recent events."""
        if not (cluster_id and self.client):
            return {}
        try:
            # ``or []``: a client returning None must not break the build.
            events = self.client.get_cluster_events(cluster_id, limit=5) or []
            recent = [e.get("type", "") for e in events]
        except Exception:
            return {}
        return {"cluster_id": cluster_id, "recent_events": recent}

    def _job_metadata(self, job_id: str, failed_task: str):
        """Return (metadata_dict, (upstream_task_keys, downstream_task_keys))."""
        if not (job_id and self.client):
            return {}, ([], [])
        try:
            job = self.client.get_job(int(job_id)) or {}
        except Exception:
            return {}, ([], [])
        settings = job.get("settings", {}) or {}
        tasks = settings.get("tasks", []) or []
        upstream, downstream = [], []
        for t in tasks:
            # ``depends_on`` may be absent or explicitly None for root tasks.
            deps = t.get("depends_on") or []
            if t.get("task_key") == failed_task:
                upstream = [d.get("task_key", "") for d in deps]
            if any(d.get("task_key") == failed_task for d in deps):
                downstream.append(t.get("task_key", ""))
        meta = {"job_name": settings.get("name", ""), "task_count": len(tasks)}
        return meta, (upstream, downstream)

    def _schema(self, tables: list[str]) -> dict:
        """Return ``{table: [column names]}`` for the first three tables.

        Tables whose ``DESCRIBE TABLE`` fails are left out. Table names come
        from ``_tables`` and therefore consist of identifier characters and
        dots only.
        """
        if not (tables and self.spark):
            return {}
        snapshot = {}
        for table in tables[:3]:  # cap the cost; first few referenced tables
            try:
                rows = self.spark.sql(f"DESCRIBE TABLE {table}").collect()
                snapshot[table] = [r["col_name"] for r in rows if r["col_name"] and not r["col_name"].startswith("#")]
            except Exception:
                continue
        return snapshot
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Stage 4 — compute the blast radius of a failure.
|
|
2
|
+
|
|
3
|
+
Combines two graphs, both deterministic:
|
|
4
|
+
* the job DAG (task-level downstream tasks that are now blocked), reusing the
|
|
5
|
+
existing :class:`DependencyGraphBuilder`; and
|
|
6
|
+
* Unity Catalog table lineage (downstream tables / dashboards), when reachable.
|
|
7
|
+
|
|
8
|
+
The result feeds the impact engine. No LLM, no Spark.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
|
|
15
|
+
from healing_kit.services.dependency_graph import DependencyGraphBuilder
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class BlastRadius:
    """What sits downstream of the failed task: tasks, tables and dashboards."""

    downstream_tasks: list[str] = field(default_factory=list)
    downstream_tables: list[str] = field(default_factory=list)
    downstream_dashboards: list[str] = field(default_factory=list)

    @property
    def downstream_jobs(self) -> int:
        """Number of downstream tasks."""
        return len(self.downstream_tasks)

    @property
    def affected_tables(self) -> int:
        """Number of downstream tables."""
        return len(self.downstream_tables)

    @property
    def dashboard_impact(self) -> int:
        """Number of downstream dashboards."""
        return len(self.downstream_dashboards)

    @property
    def total(self) -> int:
        """Sum of the three counts above."""
        groups = (self.downstream_tasks, self.downstream_tables, self.downstream_dashboards)
        return sum(len(group) for group in groups)

    def to_dict(self) -> dict:
        """Return the three lists keyed by their field names."""
        names = ("downstream_tasks", "downstream_tables", "downstream_dashboards")
        return {name: getattr(self, name) for name in names}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class DependencyAnalyzer:
    """Works out the :class:`BlastRadius` of a failed task.

    The task-level part comes only from the job DAG handed to ``analyze``;
    ``client`` is optional and is consulted solely for table lineage.
    """

    def __init__(self, client=None):
        self.client = client
        self._builder = DependencyGraphBuilder()

    def analyze(self, tasks: list[dict], failed_task: str, referenced_tables: list[str] | None = None) -> BlastRadius:
        """Return the downstream tasks, tables and dashboards of ``failed_task``."""
        # Task-level: everything transitively downstream in the job DAG.
        # A graph failure degrades to "no downstream tasks".
        try:
            dag = self._builder.build_from_tasks(tasks or [])
            blocked = self._builder.get_all_downstream(dag, failed_task)
        except Exception:
            blocked = []

        # Table-level: lineage of the tables the failing task references.
        tables, dashboards = self._lineage(referenced_tables or [])
        return BlastRadius(
            downstream_tasks=blocked,
            downstream_tables=tables,
            downstream_dashboards=dashboards,
        )

    def _lineage(self, tables: list[str]):
        """Return ``(sorted downstream tables, sorted dashboards)``.

        Only the first five tables are queried; a table whose lineage lookup
        raises is skipped. Without a client both lists are empty.
        """
        if not self.client:
            return [], []

        seen_tables: set[str] = set()
        seen_dashboards: set[str] = set()
        for source in tables[:5]:
            try:
                info = self.client.get_table_lineage(source) or {}
            except Exception:
                continue
            for edge in info.get("downstreams", []) or []:
                target = (edge.get("tableInfo") or {}).get("name")
                if target:
                    seen_tables.add(target)
                for board in edge.get("dashboards", []) or []:
                    label = board.get("name") or board.get("id")
                    if label:
                        seen_dashboards.add(str(label))
        return sorted(seen_tables), sorted(seen_dashboards)
|