shkit 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- healing_kit/__init__.py +3 -0
- healing_kit/auth.py +79 -0
- healing_kit/clients/__init__.py +1 -0
- healing_kit/clients/databricks_client.py +183 -0
- healing_kit/clients/teams_client.py +128 -0
- healing_kit/models/__init__.py +1 -0
- healing_kit/models/diagnosis.py +45 -0
- healing_kit/models/events.py +30 -0
- healing_kit/models/evidence.py +83 -0
- healing_kit/runtime/__init__.py +6 -0
- healing_kit/runtime/approval.py +141 -0
- healing_kit/runtime/maintenance.py +52 -0
- healing_kit/services/__init__.py +1 -0
- healing_kit/services/cache_service.py +120 -0
- healing_kit/services/circuit_breaker.py +114 -0
- healing_kit/services/context_agent.py +127 -0
- healing_kit/services/dependency_graph.py +141 -0
- healing_kit/services/diagnosis_engine.py +165 -0
- healing_kit/services/identity.py +61 -0
- healing_kit/services/model_router.py +52 -0
- healing_kit/services/query_guard.py +168 -0
- healing_kit/services/resolution_verifier.py +100 -0
- healing_kit/services/token_budget.py +137 -0
- healing_kit/utils/__init__.py +1 -0
- healing_kit/utils/error_hash.py +15 -0
- healing_kit/utils/hmac_tokens.py +86 -0
- healing_kit/utils/sql_safety.py +84 -0
- iic/__init__.py +51 -0
- iic/__main__.py +18 -0
- iic/_console.py +235 -0
- iic/_doctor.py +143 -0
- iic/change/__init__.py +7 -0
- iic/change/change_detector.py +154 -0
- iic/context/__init__.py +7 -0
- iic/context/context_builder.py +117 -0
- iic/dependency/__init__.py +7 -0
- iic/dependency/dependency_analyzer.py +93 -0
- iic/diagnosis/__init__.py +7 -0
- iic/diagnosis/diagnosis_engine.py +183 -0
- iic/dna/__init__.py +7 -0
- iic/dna/dna_builder.py +184 -0
- iic/impact/__init__.py +7 -0
- iic/impact/impact_engine.py +102 -0
- iic/ingestion/__init__.py +14 -0
- iic/ingestion/base.py +21 -0
- iic/ingestion/databricks_source.py +98 -0
- iic/ingestion/static_source.py +23 -0
- iic/ingestion/webhook_source.py +39 -0
- iic/models/__init__.py +44 -0
- iic/models/change.py +77 -0
- iic/models/context.py +46 -0
- iic/models/diagnosis.py +37 -0
- iic/models/dna.py +77 -0
- iic/models/event.py +78 -0
- iic/models/impact.py +60 -0
- iic/models/report.py +88 -0
- iic/models/routing.py +41 -0
- iic/notify/__init__.py +7 -0
- iic/notify/teams_notifier.py +112 -0
- iic/report/__init__.py +7 -0
- iic/report/report_generator.py +67 -0
- iic/routing/__init__.py +7 -0
- iic/routing/router.py +42 -0
- iic/runtime/__init__.py +10 -0
- iic/runtime/_sql.py +11 -0
- iic/runtime/agent_config.py +48 -0
- iic/runtime/agent_runtime.py +70 -0
- iic/runtime/antibodies.py +100 -0
- iic/runtime/bootstrap.py +157 -0
- iic/runtime/constants.py +40 -0
- iic/runtime/context.py +46 -0
- iic/runtime/detective.py +72 -0
- iic/runtime/hooks.py +85 -0
- iic/runtime/incident_engine.py +207 -0
- iic/runtime/inprocess.py +350 -0
- iic/runtime/ledger.py +120 -0
- iic/runtime/monitor.py +155 -0
- iic/runtime/pattern_store.py +53 -0
- iic/runtime/reconciler.py +139 -0
- iic/runtime/scope_config.py +127 -0
- iic/runtime/store.py +150 -0
- iic/runtime/wrapper.py +28 -0
- iic_autoload.pth +1 -0
- onboarding/__init__.py +1 -0
- onboarding/cli.py +168 -0
- onboarding/config_schema.py +62 -0
- onboarding/manifest.py +27 -0
- onboarding/preflight.py +129 -0
- onboarding/provisioner.py +573 -0
- onboarding/rollback.py +81 -0
- shkit-1.2.0.dist-info/METADATA +239 -0
- shkit-1.2.0.dist-info/RECORD +94 -0
- shkit-1.2.0.dist-info/WHEEL +4 -0
- shkit-1.2.0.dist-info/entry_points.txt +2 -0
healing_kit/__init__.py
ADDED
healing_kit/auth.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Identity & auth (#1) — prefer a service principal (OAuth M2M) over a PAT.
|
|
2
|
+
|
|
3
|
+
The healer should authenticate as a least-privileged **service principal**, not a
|
|
4
|
+
human PAT. This module mints a short-lived OAuth token via the Databricks OIDC
|
|
5
|
+
`client_credentials` grant. It falls back to a PAT or the notebook's ambient
|
|
6
|
+
token only when no SP credentials are configured, so existing deploys keep working.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
|
|
11
|
+
import requests
|
|
12
|
+
|
|
13
|
+
from healing_kit.clients.databricks_client import DatabricksClient, DatabricksConfig
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class AuthResult:
    """Outcome of credential resolution, kept together for auditing.

    Pairs the connection config that carries the bearer token with a label
    naming which credential source produced that token.
    """

    # Host + bearer token, ready to hand to a DatabricksClient.
    config: DatabricksConfig
    # One of: "service_principal" | "pat" | "ambient"
    method: str
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def get_sp_oauth_token(host: str, client_id: str, client_secret: str, scope: str = "all-apis") -> str:
    """Trade service-principal client credentials for an OAuth access token.

    Sends a ``client_credentials`` grant to ``{host}/oidc/v1/token`` using HTTP
    basic auth (client id / client secret) and returns the ``access_token``
    field of the JSON reply.

    Args:
        host: Workspace base URL; any trailing slashes are ignored.
        client_id: Service principal client id.
        client_secret: Service principal client secret.
        scope: OAuth scope to request.

    Raises:
        requests.HTTPError: If the token endpoint answers with an error status.
        KeyError: If the reply carries no ``access_token`` entry.
    """
    token_endpoint = host.rstrip("/") + "/oidc/v1/token"
    grant_form = {"grant_type": "client_credentials", "scope": scope}
    response = requests.post(
        token_endpoint,
        auth=(client_id, client_secret),
        data=grant_form,
        timeout=30,
    )
    response.raise_for_status()
    reply = response.json()
    return reply["access_token"]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def resolve_auth(
    host: str,
    *,
    sp_client_id: str | None = None,
    sp_client_secret: str | None = None,
    pat: str | None = None,
    ambient_token: str | None = None,
) -> AuthResult:
    """Pick the first usable credential: service principal, then PAT, then ambient.

    The service-principal path is taken only when BOTH the client id and the
    client secret are present; it mints a token via ``get_sp_oauth_token``.
    Otherwise the PAT is used if given, then the ambient token.

    Raises:
        ValueError: If none of the three credential sources is available.
    """
    base = host.rstrip("/")

    if sp_client_id and sp_client_secret:
        bearer = get_sp_oauth_token(base, sp_client_id, sp_client_secret)
        how = "service_principal"
    elif pat:
        bearer, how = pat, "pat"
    elif ambient_token:
        bearer, how = ambient_token, "ambient"
    else:
        raise ValueError("No credentials: provide SP client_id/secret, a PAT, or an ambient token")

    return AuthResult(DatabricksConfig(host=base, token=bearer), how)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def build_client(auth: AuthResult) -> DatabricksClient:
    """Create a ``DatabricksClient`` from the config held by a resolved ``AuthResult``."""
    resolved_config = auth.config
    return DatabricksClient(resolved_config)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def resolve_auth_from_dbutils(dbutils, secret_scope: str, host: str | None = None) -> AuthResult:
    """Notebook-side entry point for credential resolution.

    Looks up ``sp_client_id`` / ``sp_client_secret`` in ``secret_scope`` and
    passes them, together with the notebook context's API token, to
    ``resolve_auth``. No PAT is ever supplied from here. When ``host`` is not
    given, the notebook context's API URL is used.
    """
    notebook_ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
    if not host:
        host = notebook_ctx.apiUrl().get()
    host = host.rstrip("/")
    ambient = notebook_ctx.apiToken().get()

    def _read_secret(name):
        # Best effort: any failure to read the secret is treated as "not configured".
        try:
            value = dbutils.secrets.get(scope=secret_scope, key=name)
        except Exception:
            value = None
        return value

    # Read in the same order as the keyword arguments: client id first, then secret.
    sp_creds = {name: _read_secret(name) for name in ("sp_client_id", "sp_client_secret")}
    return resolve_auth(host, ambient_token=ambient, **sp_creds)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""External service clients."""
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""Unified Databricks API client wrapping Jobs, Runs, Workspace, Clusters, UC Lineage, Model Serving."""
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
import time
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
import requests
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class DatabricksConfig:
    """Connection configuration for the Databricks REST API.

    Attributes:
        host: Workspace base URL.
        token: Bearer token sent in the ``Authorization`` header.
    """

    host: str
    token: str

    def __repr__(self) -> str:
        # The auto-generated dataclass repr would print the bearer token
        # verbatim, leaking it into logs and tracebacks. Mask it instead.
        # (@dataclass leaves an explicitly defined __repr__ untouched.)
        return f"{type(self).__name__}(host={self.host!r}, token='***')"

    @property
    def headers(self) -> dict:
        """HTTP headers for an authenticated JSON request."""
        return {"Authorization": f"Bearer {self.token}", "Content-Type": "application/json"}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class DatabricksClient:
    """Unified client for Databricks REST API interactions.

    Wraps the Jobs, Workspace, Clusters, Model Serving, SCIM and Unity Catalog
    lineage endpoints. Every request is authenticated with the bearer token
    carried by the supplied ``DatabricksConfig``; this class does not obtain
    credentials itself.
    """

    # HTTP statuses that invoke_model_full treats as transient and retries.
    _RETRYABLE_STATUSES = (429, 500, 502, 503, 504)

    def __init__(self, config: "DatabricksConfig"):
        self.config = config
        self.base_url = config.host.rstrip("/")

    def _get(self, path: str, params: Optional[dict] = None) -> dict:
        """GET ``path`` and return the decoded JSON body; raises on HTTP error status."""
        r = requests.get(f"{self.base_url}{path}", headers=self.config.headers, params=params, timeout=30)
        r.raise_for_status()
        return r.json()

    def _post(self, path: str, payload: Optional[dict] = None) -> dict:
        """POST ``payload`` as JSON; returns the decoded body, or ``{}`` when it is empty."""
        r = requests.post(f"{self.base_url}{path}", headers=self.config.headers, json=payload or {}, timeout=60)
        r.raise_for_status()
        return r.json() if r.content else {}

    @staticmethod
    def _scim_quote(value: str) -> str:
        """Render ``value`` as a quoted SCIM filter string literal.

        Backslashes and double quotes are escaped so the value cannot close
        the literal and inject extra filter clauses.
        """
        escaped = value.replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    # ─── Jobs API ───

    def list_jobs(self, limit: int = 50) -> list[dict]:
        """Return up to ``limit`` jobs (empty list when the response has none)."""
        data = self._get("/api/2.1/jobs/list", {"limit": limit})
        return data.get("jobs", [])

    def get_job(self, job_id: int) -> dict:
        """Return the job description for ``job_id``."""
        return self._get("/api/2.1/jobs/get", {"job_id": job_id})

    def list_runs(self, job_id: int, limit: int = 5, expand_tasks: bool = True) -> list[dict]:
        """Return up to ``limit`` runs of ``job_id`` (empty list when none)."""
        # The flag is sent as a lowercase "true"/"false" query-string value.
        params = {"job_id": job_id, "limit": limit, "expand_tasks": str(expand_tasks).lower()}
        data = self._get("/api/2.1/jobs/runs/list", params)
        return data.get("runs", [])

    def get_run_output(self, run_id: int) -> dict:
        """Return the output payload of run ``run_id``."""
        return self._get("/api/2.1/jobs/runs/get-output", {"run_id": run_id})

    def repair_run(self, run_id: int, rerun_tasks: list[str]) -> dict:
        """Request a repair of ``run_id`` that re-runs the given task keys."""
        return self._post("/api/2.1/jobs/runs/repair", {"run_id": run_id, "rerun_tasks": rerun_tasks})

    def create_job(self, settings: dict) -> int:
        """Create a job from ``settings`` and return its ``job_id``."""
        result = self._post("/api/2.1/jobs/create", settings)
        return result["job_id"]

    def reset_job(self, job_id: int, new_settings: dict) -> None:
        """Replace the settings of ``job_id`` with ``new_settings``."""
        self._post("/api/2.1/jobs/reset", {"job_id": job_id, "new_settings": new_settings})

    def run_job_now(self, job_id: int, params: dict = None) -> int:
        """Trigger ``job_id`` and return the new ``run_id``.

        ``params``, when non-empty, is sent as ``notebook_params``. The result
        is ``None`` if the response carries no ``run_id``.
        """
        payload = {"job_id": job_id}
        if params:
            payload["notebook_params"] = params
        result = self._post("/api/2.1/jobs/run-now", payload)
        return result.get("run_id")

    # ─── Workspace API ───

    def export_notebook(self, path: str) -> Optional[str]:
        """Export notebook source code. Returns decoded content or None."""
        try:
            data = self._get("/api/2.0/workspace/export", {"path": path, "format": "SOURCE"})
            return base64.b64decode(data.get("content", "")).decode("utf-8", errors="replace")
        except Exception:
            # Deliberate best effort: any failure is reported as "no content".
            return None

    def import_notebook(self, path: str, content: str, language: str = "PYTHON") -> bool:
        """Upload a notebook, overwriting any existing one. Returns True on success."""
        try:
            # Ensure the parent directory exists. A root-level ("/nb") or
            # slash-less path has no parent to create; calling mkdirs with an
            # empty path would fail and abort the whole import.
            parent, _, _ = path.rpartition("/")
            if parent:
                self._post("/api/2.0/workspace/mkdirs", {"path": parent})
            payload = {
                "path": path,
                "content": base64.b64encode(content.encode("utf-8")).decode(),
                "language": language,
                "format": "SOURCE",
                "overwrite": True,
            }
            self._post("/api/2.0/workspace/import", payload)
            return True
        except Exception:
            # Deliberate best effort: callers only receive a success flag.
            return False

    # ─── Clusters API ───

    def get_cluster_events(self, cluster_id: str, limit: int = 10) -> list[dict]:
        """Return up to ``limit`` events for ``cluster_id``; ``[]`` on any failure."""
        try:
            data = self._post("/api/2.0/clusters/events", {"cluster_id": cluster_id, "limit": limit})
            return data.get("events", [])
        except Exception:
            return []

    # ─── Model Serving ───

    def list_serving_endpoints(self) -> list[dict]:
        """Return the serving endpoints (empty list when none)."""
        data = self._get("/api/2.0/serving-endpoints")
        return data.get("endpoints", [])

    def invoke_model(self, endpoint_name: str, messages: list[dict], max_tokens: int = 1500, temperature: float = 0.1) -> str:
        """Call a Model Serving chat endpoint. Returns the response content string."""
        content, _ = self.invoke_model_full(endpoint_name, messages, max_tokens, temperature)
        return content

    def invoke_model_full(self, endpoint_name: str, messages: list[dict], max_tokens: int = 1500,
                          temperature: float = 0.1, retries: int = 4):
        """Call a Model Serving chat endpoint with backoff on 429/5xx.

        Makes up to ``retries`` attempts, waiting 2s, 4s, 8s, ... between
        them. No wait is performed after the final attempt.

        Returns ``(content, usage_dict)`` where usage carries real token counts so
        the token budget can be enforced. Raises after exhausting retries."""
        payload = {"messages": messages, "max_tokens": max_tokens, "temperature": temperature}
        url = f"{self.base_url}/serving-endpoints/{endpoint_name}/invocations"
        delay, last = 2.0, None
        for attempt in range(retries):
            r = requests.post(url, headers=self.config.headers, json=payload, timeout=120)
            last = r
            if r.status_code == 200:
                data = r.json()
                return data["choices"][0]["message"]["content"], (data.get("usage", {}) or {})
            if r.status_code in self._RETRYABLE_STATUSES:
                # Sleeping is only useful when another attempt follows.
                if attempt < retries - 1:
                    time.sleep(delay)
                    delay *= 2
                continue
            r.raise_for_status()
        if last is not None:
            last.raise_for_status()
        raise RuntimeError("Model invocation failed with no response")

    # ─── SCIM / Identity ───

    def find_user_by_identity(self, identity: str) -> Optional[dict]:
        """Look up a workspace user by userName or email via SCIM. Returns the
        SCIM user object (or None). Used to authorize approvers (#3).

        The identity is escaped before being placed in the filter, so a value
        containing ``"`` or ``\\`` cannot alter the filter expression.
        """
        ident = (identity or "").strip()
        if not ident:
            return None
        literal = self._scim_quote(ident)
        for flt in (f"userName eq {literal}", f"emails.value eq {literal}"):
            try:
                data = self._get("/api/2.0/preview/scim/v2/Users", {"filter": flt})
            except Exception:
                # A failed lookup on one attribute should not block the other.
                continue
            resources = data.get("Resources", [])
            if resources:
                return resources[0]
        return None

    def user_in_group(self, user: dict, group_name: str) -> bool:
        """True if the SCIM user object lists membership in group_name.

        An empty ``group_name`` means "no group restriction" and yields True.
        A group matches on either its ``display`` or its ``value`` entry.
        """
        if not group_name:
            return True
        for g in user.get("groups", []) or []:
            if g.get("display") == group_name or g.get("value") == group_name:
                return True
        return False

    # ─── Unity Catalog Lineage ───

    def get_table_lineage(self, table_name: str) -> dict:
        """Get upstream/downstream lineage for a table; ``{}`` on any failure."""
        try:
            return self._get("/api/2.1/unity-catalog/lineage/table-lineage", {"table_name": table_name})
        except Exception:
            return {}
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""Microsoft Teams webhook client — sends full diagnostic report cards with Approve/Reject."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
|
|
7
|
+
from healing_kit.utils.hmac_tokens import generate_token
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TeamsClient:
    """Sends comprehensive diagnostic report cards to Teams with approval buttons.

    Args:
        webhook_url: Teams incoming-webhook URL; when empty nothing is sent.
        approval_job_url: Base URL the Approve/Reject buttons point at.
        secret_key: Key passed to ``generate_token`` to sign approval tokens.
            Buttons are only added when both this and ``approval_job_url``
            are set.
    """

    def __init__(self, webhook_url: str, approval_job_url: str = "", secret_key: bytes = b""):
        self.webhook_url = webhook_url
        self.approval_job_url = approval_job_url
        self.secret_key = secret_key

    def _action_url(self, token, decision: str, task_key: str) -> str:
        """Build the approve/reject link with properly encoded query parameters.

        Raw interpolation would corrupt the link whenever the token or task
        key contains characters such as ``&``, ``#``, ``+`` or spaces.
        """
        from urllib.parse import urlencode

        query = urlencode({"token": token, "action": decision, "task": task_key})
        # Tolerate a base URL that already carries its own query string.
        joiner = "&" if "?" in self.approval_job_url else "?"
        return f"{self.approval_job_url}{joiner}{query}"

    def send_diagnosis_report(self, results: list[dict], run_id: str, pipeline_name: str = "",
                              approver_email: str = "team@company.com") -> bool:
        """
        Send ONE consolidated report card with full analysis + Approve/Reject per failure.
        NEVER auto-fixes. Always waits for human approval.

        Each failure includes:
        - Root cause analysis
        - Data evidence (diagnostic query results)
        - Proposed fix
        - Fix simulation (impact analysis)
        - Approve / Reject buttons (signed HMAC URLs)

        Each item of ``results`` is read through the keys ``context``,
        ``analysis``, ``evidence`` and ``simulation``; missing or ``None``
        entries are treated as empty.

        Returns:
            True when the webhook answers 200 or 202; False when no webhook
            is configured, on any other status, or when the request fails.
        """
        if not self.webhook_url:
            return False

        body_blocks = [
            {"type": "TextBlock", "size": "Large", "weight": "Bolder", "text": "\U0001f916 Pipeline Failure Report — Approval Required"},
            {"type": "TextBlock", "text": f"**Pipeline:** {pipeline_name} | **Run:** {run_id}", "isSubtle": True, "wrap": True},
            {"type": "TextBlock", "text": f"**Time:** {datetime.utcnow().strftime('%Y-%m-%d %H:%M UTC')} | **Failures:** {len(results)}", "isSubtle": True, "wrap": True},
            {"type": "TextBlock", "text": "---", "separator": True},
        ]

        actions = []

        for i, item in enumerate(results, 1):
            # `or {}` also covers keys that are present but explicitly None.
            ctx = item.get("context") or {}
            analysis = item.get("analysis") or {}
            evidence = item.get("evidence")
            simulation = item.get("simulation") or {}

            task_key = ctx.get("task_key", "unknown")
            action_label = str(analysis.get("action_id", "RETRY"))
            confidence = analysis.get("confidence_score", 0)
            # Fractional scores (0.0-1.0) are shown as whole percentages.
            if isinstance(confidence, float) and confidence <= 1.0:
                confidence = int(confidence * 100)

            # Section header
            body_blocks.append({"type": "TextBlock", "size": "Medium", "weight": "Bolder",
                                "text": f"\u26a0\ufe0f Issue {i}: {task_key}", "separator": True, "wrap": True})

            # Root cause
            body_blocks.append({"type": "TextBlock", "weight": "Bolder", "text": "Root Cause:", "wrap": True})
            body_blocks.append({"type": "TextBlock", "text": str(analysis.get("root_cause", "Unknown"))[:400], "wrap": True})

            # Data evidence — skipped when empty or when its text contains
            # "0 rows" or "BLOCKED".
            if evidence:
                evidence_text = str(evidence)
                if "0 rows" not in evidence_text and "BLOCKED" not in evidence_text:
                    body_blocks.append({"type": "TextBlock", "weight": "Bolder", "text": "Data Evidence:", "wrap": True})
                    body_blocks.append({"type": "TextBlock", "text": f"```\n{evidence_text[:350]}\n```", "wrap": True, "fontType": "Monospace"})

            # Proposed fix
            body_blocks.append({"type": "TextBlock", "weight": "Bolder", "text": "Proposed Fix:", "wrap": True})
            body_blocks.append({"type": "TextBlock", "text": str(analysis.get("reasoning", analysis.get("suggested_fix", "N/A")))[:300], "wrap": True})

            # Fix simulation / impact analysis
            if simulation:
                body_blocks.append({"type": "TextBlock", "weight": "Bolder", "text": "Impact Simulation:", "wrap": True})
                sim_text = "\n".join([
                    f"\u2022 Action: {simulation.get('action_description', 'N/A')}",
                    f"\u2022 Affected: {simulation.get('affected_tables', 'N/A')}",
                    f"\u2022 Expected outcome: {simulation.get('expected_outcome', 'N/A')}",
                    f"\u2022 Risk if fails: {simulation.get('risk_if_fails', 'N/A')}",
                    f"\u2022 Reversible: {simulation.get('reversible', 'Unknown')}",
                    f"\u2022 Confidence: {confidence}%",
                ])
                body_blocks.append({"type": "TextBlock", "text": sim_text, "wrap": True})
            else:
                body_blocks.append({"type": "FactSet", "facts": [
                    {"title": "Action", "value": action_label},
                    {"title": "Confidence", "value": f"{confidence}%"},
                    {"title": "Reversible", "value": "Yes (rerun is safe)"},
                ]})

            # Generate signed approval tokens
            if self.secret_key and self.approval_job_url:
                token = generate_token(
                    run_id=run_id,
                    approver_email=approver_email,
                    action_id=action_label,
                    secret_key=self.secret_key,
                )
                approve_url = self._action_url(token, "approve", task_key)
                reject_url = self._action_url(token, "reject", task_key)

                actions.append({"type": "Action.OpenUrl", "title": f"\u2705 Approve Fix: {task_key}", "url": approve_url})
                actions.append({"type": "Action.OpenUrl", "title": f"\u274c Reject: {task_key}", "url": reject_url})

        # Summary footer
        body_blocks.append({"type": "TextBlock", "text": "---", "separator": True})
        body_blocks.append({"type": "TextBlock", "text": "\u23f0 **Approval tokens expire in 15 minutes.** No action is taken until you approve.", "wrap": True, "isSubtle": True})

        content = {
            "$schema": "http://adaptivecards.io/schemas/adaptive-card.json",
            "type": "AdaptiveCard",
            "version": "1.4",
            "body": body_blocks,
        }
        # Only include the actions field when there is at least one button.
        if actions:
            content["actions"] = actions

        card = {
            "type": "message",
            "attachments": [{
                "contentType": "application/vnd.microsoft.card.adaptive",
                "content": content,
            }],
        }

        try:
            r = requests.post(self.webhook_url, json=card, headers={"Content-Type": "application/json"}, timeout=15)
            return r.status_code in (200, 202)
        except Exception:
            # Notification is best effort; failures are reported as False.
            return False
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Data models for the healing pipeline."""
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Diagnosis response model from the AI engine."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from enum import Enum
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class ActionId(str, Enum):
    """Closed set of action identifiers used to route a resolution.

    Members are also plain strings, and each member's value equals its name.
    """

    CLUSTER_RESIZE = "CLUSTER_RESIZE"
    RETRY = "RETRY"
    SCHEMA_FIX = "SCHEMA_FIX"
    CODE_PATCH = "CODE_PATCH"
    ESCALATE = "ESCALATE"
    UNKNOWN = "UNKNOWN"
    CACHE_HIT = "CACHE_HIT"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class DiagnosisResponse:
    """Structured response from the AI Diagnosis Engine.

    After construction ``confidence_score`` is always a float within
    [0.0, 1.0] and a string ``action_id`` has been converted to ``ActionId``.

    Raises:
        ValueError: If ``action_id`` is a string that names no ``ActionId``
            member, or ``confidence_score`` is a non-numeric string.
        TypeError: If ``confidence_score`` cannot be converted to float.
    """

    root_cause: str
    confidence_score: float  # 0.0 to 1.0
    action_id: ActionId
    action_params: dict = field(default_factory=dict)
    reasoning: str = ""
    evidence_used: list[str] = field(default_factory=list)

    def __post_init__(self):
        """Validate constraints."""
        import math

        score = float(self.confidence_score)
        # NaN compares false against everything, so max(0.0, min(1.0, nan))
        # evaluates to 1.0 — an undefined score would be reported as full
        # confidence. Treat it as no confidence instead.
        if math.isnan(score):
            self.confidence_score = 0.0
        else:
            self.confidence_score = max(0.0, min(1.0, score))
        if isinstance(self.action_id, str):
            self.action_id = ActionId(self.action_id)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class ExecutionResult:
    """Outcome record for one executed remediation action."""

    # Required, in constructor order.
    action_id: str
    success: bool
    run_id: str
    # Optional; each instance gets its own fresh dict.
    details: dict = field(default_factory=dict)
    # Empty string unless an error text is supplied.
    error: str = ""
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Event data models for failure detection and batching."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class FailureEvent:
    """A single raw failure, as received from a webhook or an API poll."""

    # Required, in constructor order.
    run_id: str
    job_id: str
    task_key: str
    workspace_id: str
    failure_time: datetime
    # Optional error text; both default to the empty string.
    error_message: str = ""
    error_trace: str = ""
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class RootCauseEvent:
    """A root-cause failure left after deduplication via the dependency graph."""

    # Required, in constructor order.
    root_run_id: str
    root_job_id: str
    root_task_key: str
    error_hash: str
    # FailureEvent records attached to this root cause; fresh list per instance.
    derived_failures: list[FailureEvent] = field(default_factory=list)
    total_downstream_impact: int = 0
    # Defaults to the naive UTC time at construction (datetime.utcnow).
    detected_at: datetime = field(default_factory=datetime.utcnow)
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Structured evidence package assembled by the Context Agent."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class SparkLogEvidence:
    """Findings pulled from Spark event logs.

    Every field is a list of strings and starts out empty.
    """

    first_errors: list[str] = field(default_factory=list)
    oom_events: list[str] = field(default_factory=list)
    task_failure_traces: list[str] = field(default_factory=list)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class DriverLogEvidence:
    """Findings pulled from driver stdout.

    Every field is a list of strings and starts out empty.
    """

    schema_errors: list[str] = field(default_factory=list)
    data_source_exceptions: list[str] = field(default_factory=list)
    missing_table_errors: list[str] = field(default_factory=list)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class TaskMetricsEvidence:
    """Numeric findings from task runtime metrics; every counter defaults to zero."""

    spill_to_disk_bytes: int = 0
    gc_overhead_pct: float = 0.0
    shuffle_read_bytes: int = 0
    shuffle_write_bytes: int = 0
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class LineageEvidence:
    """Findings from Unity Catalog lineage."""

    # Entries shaped like [{name, last_modified}]; fresh empty list per instance.
    upstream_tables: list[dict] = field(default_factory=list)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class SchemaHistoryEvidence:
    """Findings from DESCRIBE HISTORY."""

    # Last 5 schema changes; fresh empty list per instance.
    recent_changes: list[dict] = field(default_factory=list)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class GitContextEvidence:
    """Findings from the job's git metadata (details of the last commit)."""

    last_commit_message: str = ""
    last_commit_author: str = ""
    # None when no commit timestamp is supplied.
    last_commit_timestamp: Optional[datetime] = None
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass
class ClusterEventsEvidence:
    """Findings from the cluster events API."""

    termination_reason: str = ""
    # Fresh empty list per instance.
    autoscaling_events: list[str] = field(default_factory=list)
    spot_preemptions: int = 0
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass
class EvidencePackage:
    """Everything gathered for one failed task, bundled for AI diagnosis.

    The three identifiers are required. Each evidence section is optional
    and is ``None`` unless supplied.
    """

    # Identity of the failed task.
    run_id: str
    job_id: str
    task_key: str
    # Evidence sections, one per source.
    spark_event_logs: Optional[SparkLogEvidence] = None
    driver_stdout: Optional[DriverLogEvidence] = None
    task_metrics: Optional[TaskMetricsEvidence] = None
    lineage: Optional[LineageEvidence] = None
    schema_history: Optional[SchemaHistoryEvidence] = None
    git_context: Optional[GitContextEvidence] = None
    cluster_events: Optional[ClusterEventsEvidence] = None
    # Bookkeeping; fresh empty list per instance.
    missing_sources: list[str] = field(default_factory=list)
    # Defaults to the naive UTC time at construction (datetime.utcnow).
    collected_at: datetime = field(default_factory=datetime.utcnow)
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
"""Runtime orchestrators imported by the thin Databricks notebook drivers (#5).
|
|
2
|
+
|
|
3
|
+
All healing logic lives here in the packaged wheel so it is the single, tested
|
|
4
|
+
source of truth — the notebooks are now just entrypoints that pass `spark`/
|
|
5
|
+
`dbutils` in. This replaces the previously inlined copies in the notebooks.
|
|
6
|
+
"""
|