shkit 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. healing_kit/__init__.py +3 -0
  2. healing_kit/auth.py +79 -0
  3. healing_kit/clients/__init__.py +1 -0
  4. healing_kit/clients/databricks_client.py +183 -0
  5. healing_kit/clients/teams_client.py +128 -0
  6. healing_kit/models/__init__.py +1 -0
  7. healing_kit/models/diagnosis.py +45 -0
  8. healing_kit/models/events.py +30 -0
  9. healing_kit/models/evidence.py +83 -0
  10. healing_kit/runtime/__init__.py +6 -0
  11. healing_kit/runtime/approval.py +141 -0
  12. healing_kit/runtime/maintenance.py +52 -0
  13. healing_kit/services/__init__.py +1 -0
  14. healing_kit/services/cache_service.py +120 -0
  15. healing_kit/services/circuit_breaker.py +114 -0
  16. healing_kit/services/context_agent.py +127 -0
  17. healing_kit/services/dependency_graph.py +141 -0
  18. healing_kit/services/diagnosis_engine.py +165 -0
  19. healing_kit/services/identity.py +61 -0
  20. healing_kit/services/model_router.py +52 -0
  21. healing_kit/services/query_guard.py +168 -0
  22. healing_kit/services/resolution_verifier.py +100 -0
  23. healing_kit/services/token_budget.py +137 -0
  24. healing_kit/utils/__init__.py +1 -0
  25. healing_kit/utils/error_hash.py +15 -0
  26. healing_kit/utils/hmac_tokens.py +86 -0
  27. healing_kit/utils/sql_safety.py +84 -0
  28. iic/__init__.py +51 -0
  29. iic/__main__.py +18 -0
  30. iic/_console.py +235 -0
  31. iic/_doctor.py +143 -0
  32. iic/change/__init__.py +7 -0
  33. iic/change/change_detector.py +154 -0
  34. iic/context/__init__.py +7 -0
  35. iic/context/context_builder.py +117 -0
  36. iic/dependency/__init__.py +7 -0
  37. iic/dependency/dependency_analyzer.py +93 -0
  38. iic/diagnosis/__init__.py +7 -0
  39. iic/diagnosis/diagnosis_engine.py +183 -0
  40. iic/dna/__init__.py +7 -0
  41. iic/dna/dna_builder.py +184 -0
  42. iic/impact/__init__.py +7 -0
  43. iic/impact/impact_engine.py +102 -0
  44. iic/ingestion/__init__.py +14 -0
  45. iic/ingestion/base.py +21 -0
  46. iic/ingestion/databricks_source.py +98 -0
  47. iic/ingestion/static_source.py +23 -0
  48. iic/ingestion/webhook_source.py +39 -0
  49. iic/models/__init__.py +44 -0
  50. iic/models/change.py +77 -0
  51. iic/models/context.py +46 -0
  52. iic/models/diagnosis.py +37 -0
  53. iic/models/dna.py +77 -0
  54. iic/models/event.py +78 -0
  55. iic/models/impact.py +60 -0
  56. iic/models/report.py +88 -0
  57. iic/models/routing.py +41 -0
  58. iic/notify/__init__.py +7 -0
  59. iic/notify/teams_notifier.py +112 -0
  60. iic/report/__init__.py +7 -0
  61. iic/report/report_generator.py +67 -0
  62. iic/routing/__init__.py +7 -0
  63. iic/routing/router.py +42 -0
  64. iic/runtime/__init__.py +10 -0
  65. iic/runtime/_sql.py +11 -0
  66. iic/runtime/agent_config.py +48 -0
  67. iic/runtime/agent_runtime.py +70 -0
  68. iic/runtime/antibodies.py +100 -0
  69. iic/runtime/bootstrap.py +157 -0
  70. iic/runtime/constants.py +40 -0
  71. iic/runtime/context.py +46 -0
  72. iic/runtime/detective.py +72 -0
  73. iic/runtime/hooks.py +85 -0
  74. iic/runtime/incident_engine.py +207 -0
  75. iic/runtime/inprocess.py +350 -0
  76. iic/runtime/ledger.py +120 -0
  77. iic/runtime/monitor.py +155 -0
  78. iic/runtime/pattern_store.py +53 -0
  79. iic/runtime/reconciler.py +139 -0
  80. iic/runtime/scope_config.py +127 -0
  81. iic/runtime/store.py +150 -0
  82. iic/runtime/wrapper.py +28 -0
  83. iic_autoload.pth +1 -0
  84. onboarding/__init__.py +1 -0
  85. onboarding/cli.py +168 -0
  86. onboarding/config_schema.py +62 -0
  87. onboarding/manifest.py +27 -0
  88. onboarding/preflight.py +129 -0
  89. onboarding/provisioner.py +573 -0
  90. onboarding/rollback.py +81 -0
  91. shkit-1.2.0.dist-info/METADATA +239 -0
  92. shkit-1.2.0.dist-info/RECORD +94 -0
  93. shkit-1.2.0.dist-info/WHEEL +4 -0
  94. shkit-1.2.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,3 @@
1
"""Self-Healing Pipeline Kit — Enterprise AI-driven autonomous remediation for Databricks."""

# NOTE(review): the wheel that ships this package is published as shkit 1.2.0,
# while this constant reads 2.0.0 — confirm whether the sub-package is meant to
# be versioned independently of the distribution.
__version__ = "2.0.0"
healing_kit/auth.py ADDED
@@ -0,0 +1,79 @@
1
+ """Identity & auth (#1) — prefer a service principal (OAuth M2M) over a PAT.
2
+
3
+ The healer should authenticate as a least-privileged **service principal**, not a
4
+ human PAT. This module mints a short-lived OAuth token via the Databricks OIDC
5
+ `client_credentials` grant. It falls back to a PAT or the notebook's ambient
6
+ token only when no SP credentials are configured, so existing deploys keep working.
7
+ """
8
+
9
+ from dataclasses import dataclass
10
+
11
+ import requests
12
+
13
+ from healing_kit.clients.databricks_client import DatabricksClient, DatabricksConfig
14
+
15
+
16
@dataclass
class AuthResult:
    """Outcome of credential resolution.

    Pairs the client configuration built around the resolved bearer token with
    a label recording which mechanism produced that token, so the choice can be
    audited.
    """

    # Connection settings (host + token) built from the resolved credential.
    config: DatabricksConfig
    # How the token was obtained: "service_principal" | "pat" | "ambient".
    method: str
22
+
23
+
24
def get_sp_oauth_token(host: str, client_id: str, client_secret: str, scope: str = "all-apis") -> str:
    """Trade service-principal client credentials for an OAuth access token.

    Posts a ``client_credentials`` grant to ``{host}/oidc/v1/token`` using HTTP
    basic auth and returns the ``access_token`` field of the JSON reply.

    Args:
        host: Workspace base URL; a trailing slash is tolerated.
        client_id: Service principal client id (basic-auth user).
        client_secret: Service principal secret (basic-auth password).
        scope: OAuth scope to request.

    Raises:
        requests.HTTPError: If the token endpoint answers with an error status.
        KeyError: If the reply carries no ``access_token``.
    """
    token_url = f"{host.rstrip('/')}/oidc/v1/token"
    grant = {"grant_type": "client_credentials", "scope": scope}
    response = requests.post(token_url, auth=(client_id, client_secret), data=grant, timeout=30)
    response.raise_for_status()
    return response.json()["access_token"]
35
+
36
+
37
def resolve_auth(
    host: str,
    *,
    sp_client_id: str | None = None,
    sp_client_secret: str | None = None,
    pat: str | None = None,
    ambient_token: str | None = None,
) -> AuthResult:
    """Pick a credential, preferring service principal, then PAT, then ambient token.

    The service-principal path is taken only when both the client id and the
    secret are provided; it exchanges them for an OAuth token via
    ``get_sp_oauth_token``. The other two paths use the supplied token as-is.

    Returns:
        An ``AuthResult`` whose ``method`` names the path that was taken.

    Raises:
        ValueError: If none of the three credential sources is usable.
    """
    base = host.rstrip("/")

    if sp_client_id and sp_client_secret:
        bearer = get_sp_oauth_token(base, sp_client_id, sp_client_secret)
        how = "service_principal"
    elif pat:
        bearer, how = pat, "pat"
    elif ambient_token:
        bearer, how = ambient_token, "ambient"
    else:
        raise ValueError("No credentials: provide SP client_id/secret, a PAT, or an ambient token")

    return AuthResult(DatabricksConfig(host=base, token=bearer), how)
55
+
56
+
57
def build_client(auth: AuthResult) -> DatabricksClient:
    """Create a ``DatabricksClient`` from the configuration held by ``auth``."""
    resolved_config = auth.config
    return DatabricksClient(resolved_config)
59
+
60
+
61
def resolve_auth_from_dbutils(dbutils, secret_scope: str, host: str | None = None) -> AuthResult:
    """Resolve auth from inside a notebook without embedding a static PAT.

    Service-principal credentials are looked up in ``secret_scope`` under the
    keys ``sp_client_id`` and ``sp_client_secret``; a key that cannot be read is
    treated as absent. The notebook context's own API token is passed along as
    the ambient fallback, and its API URL is used when ``host`` is not given.
    """
    notebook_ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
    if not host:
        host = notebook_ctx.apiUrl().get()
    workspace_url = host.rstrip("/")
    notebook_token = notebook_ctx.apiToken().get()

    def read_secret(key):
        # Any failure to read the secret simply means "not configured".
        try:
            value = dbutils.secrets.get(scope=secret_scope, key=key)
        except Exception:
            value = None
        return value

    client_id = read_secret("sp_client_id")
    client_secret = read_secret("sp_client_secret")
    return resolve_auth(
        workspace_url,
        sp_client_id=client_id,
        sp_client_secret=client_secret,
        ambient_token=notebook_token,
    )
@@ -0,0 +1 @@
1
+ """External service clients."""
@@ -0,0 +1,183 @@
1
+ """Unified Databricks API client wrapping Jobs, Runs, Workspace, Clusters, UC Lineage, Model Serving."""
2
+
3
+ import base64
4
+ import time
5
+ from dataclasses import dataclass
6
+ from typing import Optional
7
+
8
+ import requests
9
+
10
+
11
@dataclass
class DatabricksConfig:
    """Connection settings for a Databricks workspace.

    The dataclass-generated ``repr`` would print the bearer token verbatim, so
    a custom ``__repr__`` masks it to keep the secret out of logs, tracebacks
    and the repr of any object that embeds this config.
    """

    # Workspace base URL.
    host: str
    # Bearer token placed in the Authorization header.
    token: str

    @property
    def headers(self) -> dict:
        """Return the HTTP headers (bearer auth + JSON content type) for API calls."""
        return {"Authorization": f"Bearer {self.token}", "Content-Type": "application/json"}

    def __repr__(self) -> str:
        """Return a repr that shows the host but never the token."""
        return f"{type(self).__name__}(host={self.host!r}, token='***')"
21
+
22
+
23
class DatabricksClient:
    """Thin wrapper over the Databricks REST endpoints used by the healer.

    Covers Jobs, Workspace, Clusters, Model Serving, SCIM and Unity Catalog
    lineage. Every request is authenticated with the bearer token carried by
    the supplied config; how that token was obtained (service principal, PAT
    or ambient notebook token) is the caller's concern.
    """

    # HTTP statuses that ``invoke_model_full`` treats as transient.
    _RETRYABLE_STATUSES = (429, 500, 502, 503, 504)

    def __init__(self, config: "DatabricksConfig"):
        self.config = config
        self.base_url = config.host.rstrip("/")

    def _get(self, path: str, params: Optional[dict] = None) -> dict:
        """GET ``path`` and return the decoded JSON body; raises on HTTP errors."""
        r = requests.get(f"{self.base_url}{path}", headers=self.config.headers, params=params, timeout=30)
        r.raise_for_status()
        return r.json()

    def _post(self, path: str, payload: Optional[dict] = None) -> dict:
        """POST ``payload`` as JSON to ``path``; returns ``{}`` for an empty body."""
        r = requests.post(f"{self.base_url}{path}", headers=self.config.headers, json=payload or {}, timeout=60)
        r.raise_for_status()
        return r.json() if r.content else {}

    # ─── Jobs API ───

    def list_jobs(self, limit: int = 50) -> list[dict]:
        """Return up to ``limit`` jobs from the Jobs list endpoint."""
        data = self._get("/api/2.1/jobs/list", {"limit": limit})
        return data.get("jobs", [])

    def get_job(self, job_id: int) -> dict:
        """Return the job description for ``job_id``."""
        return self._get("/api/2.1/jobs/get", {"job_id": job_id})

    def list_runs(self, job_id: int, limit: int = 5, expand_tasks: bool = True) -> list[dict]:
        """Return up to ``limit`` runs of ``job_id``."""
        # The flag is sent as the lowercase string "true"/"false".
        params = {"job_id": job_id, "limit": limit, "expand_tasks": str(expand_tasks).lower()}
        data = self._get("/api/2.1/jobs/runs/list", params)
        return data.get("runs", [])

    def get_run_output(self, run_id: int) -> dict:
        """Return the output payload of run ``run_id``."""
        return self._get("/api/2.1/jobs/runs/get-output", {"run_id": run_id})

    def repair_run(self, run_id: int, rerun_tasks: list[str]) -> dict:
        """Request a repair of ``run_id`` that reruns the given task keys."""
        return self._post("/api/2.1/jobs/runs/repair", {"run_id": run_id, "rerun_tasks": rerun_tasks})

    def create_job(self, settings: dict) -> int:
        """Create a job from ``settings`` and return its ``job_id``."""
        result = self._post("/api/2.1/jobs/create", settings)
        return result["job_id"]

    def reset_job(self, job_id: int, new_settings: dict) -> None:
        """Replace the settings of ``job_id`` with ``new_settings``."""
        self._post("/api/2.1/jobs/reset", {"job_id": job_id, "new_settings": new_settings})

    def run_job_now(self, job_id: int, params: Optional[dict] = None) -> Optional[int]:
        """Trigger ``job_id`` and return the new ``run_id``.

        ``params``, when non-empty, is sent as ``notebook_params``. Returns
        ``None`` if the response carries no ``run_id``.
        """
        payload = {"job_id": job_id}
        if params:
            payload["notebook_params"] = params
        result = self._post("/api/2.1/jobs/run-now", payload)
        return result.get("run_id")

    # ─── Workspace API ───

    def export_notebook(self, path: str) -> Optional[str]:
        """Export notebook source code. Returns decoded content or None on any failure."""
        try:
            data = self._get("/api/2.0/workspace/export", {"path": path, "format": "SOURCE"})
            return base64.b64decode(data.get("content", "")).decode("utf-8", errors="replace")
        except Exception:
            # Best-effort: callers treat a missing notebook as "no source available".
            return None

    def import_notebook(self, path: str, content: str, language: str = "PYTHON") -> bool:
        """Upload a notebook, overwriting any existing one. Returns True on success."""
        try:
            # Ensure the parent directory exists. A path without a parent
            # component has nothing to create, so the mkdirs call is skipped
            # instead of being sent with an empty path.
            parent = path.rpartition("/")[0]
            if parent:
                self._post("/api/2.0/workspace/mkdirs", {"path": parent})
            payload = {
                "path": path,
                "content": base64.b64encode(content.encode()).decode(),
                "language": language,
                "format": "SOURCE",
                "overwrite": True,
            }
            self._post("/api/2.0/workspace/import", payload)
            return True
        except Exception:
            # Best-effort: failure is reported through the return value.
            return False

    # ─── Clusters API ───

    def get_cluster_events(self, cluster_id: str, limit: int = 10) -> list[dict]:
        """Return up to ``limit`` events for ``cluster_id``; ``[]`` on any failure."""
        try:
            data = self._post("/api/2.0/clusters/events", {"cluster_id": cluster_id, "limit": limit})
            return data.get("events", [])
        except Exception:
            return []

    # ─── Model Serving ───

    def list_serving_endpoints(self) -> list[dict]:
        """Return the serving endpoints visible to the caller."""
        data = self._get("/api/2.0/serving-endpoints")
        return data.get("endpoints", [])

    def invoke_model(self, endpoint_name: str, messages: list[dict], max_tokens: int = 1500, temperature: float = 0.1) -> str:
        """Call a Model Serving chat endpoint. Returns the response content string."""
        content, _ = self.invoke_model_full(endpoint_name, messages, max_tokens, temperature)
        return content

    def invoke_model_full(self, endpoint_name: str, messages: list[dict], max_tokens: int = 1500,
                          temperature: float = 0.1, retries: int = 4):
        """Call a Model Serving chat endpoint with backoff on 429/5xx.

        Makes at most ``retries`` attempts. Between attempts that fail with a
        retryable status it sleeps, starting at 2 seconds and doubling each
        time; no sleep follows the final attempt.

        Returns:
            ``(content, usage_dict)`` where usage carries the token counts
            reported by the endpoint (empty dict when absent).

        Raises:
            requests.HTTPError: On a non-retryable error status, or when the
                last attempt still returned an error status.
            RuntimeError: If no attempt produced either a 200 or an error
                status (for example ``retries=0``).
        """
        payload = {"messages": messages, "max_tokens": max_tokens, "temperature": temperature}
        url = f"{self.base_url}/serving-endpoints/{endpoint_name}/invocations"
        delay, last = 2.0, None
        for attempt in range(retries):
            r = requests.post(url, headers=self.config.headers, json=payload, timeout=120)
            last = r
            if r.status_code == 200:
                data = r.json()
                return data["choices"][0]["message"]["content"], (data.get("usage", {}) or {})
            if r.status_code in self._RETRYABLE_STATUSES:
                # Only back off when another attempt will actually follow.
                if attempt < retries - 1:
                    time.sleep(delay)
                    delay *= 2
                continue
            r.raise_for_status()
        if last is not None:
            last.raise_for_status()
        raise RuntimeError("Model invocation failed with no response")

    # ─── SCIM / Identity ───

    @staticmethod
    def _scim_quote(value: str) -> str:
        """Return ``value`` as a double-quoted SCIM filter string literal.

        Backslashes and double quotes are escaped so the value cannot close the
        literal and inject additional filter clauses.
        """
        escaped = value.replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    def find_user_by_identity(self, identity: str) -> Optional[dict]:
        """Look up a workspace user by userName or email via SCIM.

        Returns the first matching SCIM user object, or None when the identity
        is blank or nothing matches. Used to authorize approvers (#3), so the
        identity is escaped before being embedded in the filter expression.
        """
        ident = (identity or "").strip()
        if not ident:
            return None
        quoted = self._scim_quote(ident)
        for flt in (f"userName eq {quoted}", f"emails.value eq {quoted}"):
            try:
                data = self._get("/api/2.0/preview/scim/v2/Users", {"filter": flt})
            except Exception:
                # A failed lookup on one attribute should not prevent trying the other.
                continue
            resources = data.get("Resources", [])
            if resources:
                return resources[0]
        return None

    def user_in_group(self, user: dict, group_name: str) -> bool:
        """True if the SCIM user object lists membership in group_name.

        An empty ``group_name`` means no group restriction and yields True.
        A group matches on either its ``display`` or its ``value`` field.
        """
        if not group_name:
            return True
        return any(
            g.get("display") == group_name or g.get("value") == group_name
            for g in user.get("groups", []) or []
        )

    # ─── Unity Catalog Lineage ───

    def get_table_lineage(self, table_name: str) -> dict:
        """Get upstream/downstream lineage for a table; ``{}`` on any failure."""
        try:
            return self._get("/api/2.1/unity-catalog/lineage/table-lineage", {"table_name": table_name})
        except Exception:
            return {}
@@ -0,0 +1,128 @@
1
+ """Microsoft Teams webhook client — sends full diagnostic report cards with Approve/Reject."""
2
+
3
+ from datetime import datetime
4
+
5
+ import requests
6
+
7
+ from healing_kit.utils.hmac_tokens import generate_token
8
+
9
+
10
class TeamsClient:
    """Sends comprehensive diagnostic report cards to Teams with approval buttons."""

    def __init__(self, webhook_url: str, approval_job_url: str = "", secret_key: bytes = b""):
        self.webhook_url = webhook_url
        self.approval_job_url = approval_job_url
        self.secret_key = secret_key

    @staticmethod
    def _action_url(base_url: str, token: str, action: str, task_key: str) -> str:
        """Append percent-encoded ``token``/``action``/``task`` parameters to ``base_url``.

        Encoding keeps the link intact when the token or task key contains
        characters such as ``+``, ``=``, ``&`` or spaces. If ``base_url``
        already has a query string the parameters are appended with ``&``.
        """
        from urllib.parse import urlencode

        query = urlencode({"token": token, "action": action, "task": task_key})
        separator = "&" if "?" in base_url else "?"
        return f"{base_url}{separator}{query}"

    @staticmethod
    def _confidence_pct(analysis: dict):
        """Return the confidence for display, scaling 0.0-1.0 floats to a percentage.

        ``round`` is used rather than ``int`` so that e.g. 0.29 is shown as 29,
        not 28 (0.29 * 100 is 28.999... in binary floating point).
        """
        confidence = analysis.get("confidence_score", 0)
        if isinstance(confidence, float) and confidence <= 1.0:
            confidence = round(confidence * 100)
        return confidence

    @staticmethod
    def _issue_blocks(index: int, task_key, analysis: dict, evidence, simulation: dict, confidence) -> list[dict]:
        """Build the card body blocks describing a single failure."""
        blocks = [
            # Section header
            {"type": "TextBlock", "size": "Medium", "weight": "Bolder",
             "text": f"\u26a0\ufe0f Issue {index}: {task_key}", "separator": True, "wrap": True},
            # Root cause
            {"type": "TextBlock", "weight": "Bolder", "text": "Root Cause:", "wrap": True},
            {"type": "TextBlock", "text": str(analysis.get("root_cause", "Unknown"))[:400], "wrap": True},
        ]

        # Data evidence — omitted when empty or when its text contains "0 rows" / "BLOCKED".
        evidence_text = str(evidence)
        if evidence and "0 rows" not in evidence_text and "BLOCKED" not in evidence_text:
            blocks.append({"type": "TextBlock", "weight": "Bolder", "text": "Data Evidence:", "wrap": True})
            blocks.append({"type": "TextBlock", "text": f"```\n{evidence_text[:350]}\n```", "wrap": True, "fontType": "Monospace"})

        # Proposed fix
        proposed = analysis.get("reasoning", analysis.get("suggested_fix", "N/A"))
        blocks.append({"type": "TextBlock", "weight": "Bolder", "text": "Proposed Fix:", "wrap": True})
        blocks.append({"type": "TextBlock", "text": str(proposed)[:300], "wrap": True})

        # Fix simulation / impact analysis
        if simulation:
            sim_lines = [
                f"\u2022 Action: {simulation.get('action_description', 'N/A')}",
                f"\u2022 Affected: {simulation.get('affected_tables', 'N/A')}",
                f"\u2022 Expected outcome: {simulation.get('expected_outcome', 'N/A')}",
                f"\u2022 Risk if fails: {simulation.get('risk_if_fails', 'N/A')}",
                f"\u2022 Reversible: {simulation.get('reversible', 'Unknown')}",
                f"\u2022 Confidence: {confidence}%",
            ]
            blocks.append({"type": "TextBlock", "weight": "Bolder", "text": "Impact Simulation:", "wrap": True})
            blocks.append({"type": "TextBlock", "text": "\n".join(sim_lines), "wrap": True})
        else:
            blocks.append({"type": "FactSet", "facts": [
                {"title": "Action", "value": str(analysis.get("action_id", "RETRY"))},
                {"title": "Confidence", "value": f"{confidence}%"},
                {"title": "Reversible", "value": "Yes (rerun is safe)"},
            ]})
        return blocks

    def _approval_actions(self, run_id: str, approver_email: str, analysis: dict, task_key) -> list[dict]:
        """Build the signed Approve/Reject buttons for one failure.

        Returns an empty list unless both a secret key and an approval job URL
        are configured.
        """
        if not (self.secret_key and self.approval_job_url):
            return []
        token = generate_token(
            run_id=run_id,
            approver_email=approver_email,
            action_id=str(analysis.get("action_id", "RETRY")),
            secret_key=self.secret_key,
        )
        approve_url = self._action_url(self.approval_job_url, token, "approve", task_key)
        reject_url = self._action_url(self.approval_job_url, token, "reject", task_key)
        return [
            {"type": "Action.OpenUrl", "title": f"\u2705 Approve Fix: {task_key}", "url": approve_url},
            {"type": "Action.OpenUrl", "title": f"\u274c Reject: {task_key}", "url": reject_url},
        ]

    def send_diagnosis_report(self, results: list[dict], run_id: str, pipeline_name: str = "",
                              approver_email: str = "team@company.com") -> bool:
        """
        Send ONE consolidated report card with full analysis + Approve/Reject per failure.
        NEVER auto-fixes. Always waits for human approval.

        Each failure includes:
        - Root cause analysis
        - Data evidence (diagnostic query results)
        - Proposed fix
        - Fix simulation (impact analysis)
        - Approve / Reject buttons (signed HMAC URLs)

        Each entry of ``results`` is a dict that may carry ``context``,
        ``analysis``, ``evidence`` and ``simulation``; missing or ``None``
        sections are treated as empty.

        Returns True when the webhook answers 200 or 202. Returns False when no
        webhook URL is configured, on any other status, or if posting fails.
        """
        if not self.webhook_url:
            return False

        from datetime import timezone

        # Aware UTC clock; the rendered text is the same as the naive UTC form.
        stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

        body_blocks = [
            {"type": "TextBlock", "size": "Large", "weight": "Bolder", "text": "\U0001f916 Pipeline Failure Report — Approval Required"},
            {"type": "TextBlock", "text": f"**Pipeline:** {pipeline_name} | **Run:** {run_id}", "isSubtle": True, "wrap": True},
            {"type": "TextBlock", "text": f"**Time:** {stamp} | **Failures:** {len(results)}", "isSubtle": True, "wrap": True},
            {"type": "TextBlock", "text": "---", "separator": True},
        ]
        actions = []

        for i, item in enumerate(results, 1):
            # ``or {}`` also covers keys that are present but explicitly None.
            ctx = item.get("context") or {}
            analysis = item.get("analysis") or {}
            evidence = item.get("evidence")
            simulation = item.get("simulation") or {}

            task_key = ctx.get("task_key", "unknown")
            confidence = self._confidence_pct(analysis)

            body_blocks.extend(self._issue_blocks(i, task_key, analysis, evidence, simulation, confidence))
            actions.extend(self._approval_actions(run_id, approver_email, analysis, task_key))

        # Summary footer
        body_blocks.append({"type": "TextBlock", "text": "---", "separator": True})
        body_blocks.append({"type": "TextBlock", "text": "\u23f0 **Approval tokens expire in 15 minutes.** No action is taken until you approve.", "wrap": True, "isSubtle": True})

        content = {
            "$schema": "http://adaptivecards.io/schemas/adaptive-card.json",
            "type": "AdaptiveCard",
            "version": "1.4",
            "body": body_blocks,
        }
        # The actions field is only included when there is at least one button.
        if actions:
            content["actions"] = actions

        card = {
            "type": "message",
            "attachments": [{
                "contentType": "application/vnd.microsoft.card.adaptive",
                "content": content,
            }],
        }

        try:
            r = requests.post(self.webhook_url, json=card, headers={"Content-Type": "application/json"}, timeout=15)
            return r.status_code in (200, 202)
        except Exception:
            # Notification is best-effort; failure is reported via the return value.
            return False
@@ -0,0 +1 @@
1
+ """Data models for the healing pipeline."""
@@ -0,0 +1,45 @@
1
+ """Diagnosis response model from the AI engine."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from enum import Enum
5
+
6
+
7
class ActionId(str, Enum):
    """Closed set of remediation action identifiers used to route a resolution.

    Members are also ``str`` instances, so each compares equal to its own name.
    """

    CLUSTER_RESIZE = "CLUSTER_RESIZE"
    RETRY = "RETRY"
    SCHEMA_FIX = "SCHEMA_FIX"
    CODE_PATCH = "CODE_PATCH"
    ESCALATE = "ESCALATE"
    UNKNOWN = "UNKNOWN"
    CACHE_HIT = "CACHE_HIT"
17
+
18
+
19
@dataclass
class DiagnosisResponse:
    """Structured response from the AI Diagnosis Engine.

    On construction the confidence score is coerced to ``float`` and clamped to
    the range 0.0-1.0, and a string ``action_id`` is converted to ``ActionId``.

    Raises:
        ValueError: If ``action_id`` is a string that names no ``ActionId``
            member, or ``confidence_score`` is a non-numeric string.
        TypeError: If ``confidence_score`` cannot be converted to ``float``.
    """

    root_cause: str
    confidence_score: float  # 0.0 to 1.0
    action_id: ActionId
    action_params: dict = field(default_factory=dict)
    reasoning: str = ""
    evidence_used: list[str] = field(default_factory=list)

    def __post_init__(self):
        """Validate constraints."""
        score = float(self.confidence_score)
        # NaN compares false against everything, so a plain min/max clamp would
        # turn it into 1.0 (full confidence). An unusable score is treated as
        # no confidence instead. ``score != score`` is true only for NaN.
        if score != score:
            score = 0.0
        self.confidence_score = max(0.0, min(1.0, score))
        if isinstance(self.action_id, str):
            self.action_id = ActionId(self.action_id)
35
+
36
+
37
@dataclass
class ExecutionResult:
    """Outcome record for one executed remediation action."""

    # Identifier of the action that was executed.
    action_id: str
    # Whether the action succeeded.
    success: bool
    # Run the action was executed for.
    run_id: str
    # Free-form extra information; each instance gets its own dict.
    details: dict = field(default_factory=dict)
    # Error text; empty by default.
    error: str = ""
@@ -0,0 +1,30 @@
1
+ """Event data models for failure detection and batching."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from datetime import datetime
5
+
6
+
7
@dataclass
class FailureEvent:
    """A single raw failure as received from a webhook or an API poll."""

    # Identifiers locating the failed task.
    run_id: str
    job_id: str
    task_key: str
    workspace_id: str
    # When the failure happened.
    failure_time: datetime
    # Error details; both default to the empty string.
    error_message: str = ""
    error_trace: str = ""
18
+
19
+
20
@dataclass
class RootCauseEvent:
    """A deduplicated root-cause failure produced by dependency graph analysis."""

    # Identifiers of the failure judged to be the root cause.
    root_run_id: str
    root_job_id: str
    root_task_key: str
    # Hash identifying the error.
    error_hash: str
    # Failures attributed to this root cause; each instance gets its own list.
    derived_failures: list[FailureEvent] = field(default_factory=list)
    total_downstream_impact: int = 0
    # Creation timestamp: a naive datetime holding the current UTC time.
    detected_at: datetime = field(default_factory=datetime.utcnow)
@@ -0,0 +1,83 @@
1
+ """Structured evidence package assembled by the Context Agent."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from datetime import datetime
5
+ from typing import Optional
6
+
7
+
8
@dataclass
class SparkLogEvidence:
    """Findings extracted from Spark event logs.

    Every field defaults to a fresh empty list.
    """

    first_errors: list[str] = field(default_factory=list)
    oom_events: list[str] = field(default_factory=list)
    task_failure_traces: list[str] = field(default_factory=list)
15
+
16
+
17
@dataclass
class DriverLogEvidence:
    """Findings extracted from the driver's stdout.

    Every field defaults to a fresh empty list.
    """

    schema_errors: list[str] = field(default_factory=list)
    data_source_exceptions: list[str] = field(default_factory=list)
    missing_table_errors: list[str] = field(default_factory=list)
24
+
25
+
26
@dataclass
class TaskMetricsEvidence:
    """Runtime metrics recorded for a task; every metric defaults to zero."""

    spill_to_disk_bytes: int = 0
    gc_overhead_pct: float = 0.0
    shuffle_read_bytes: int = 0
    shuffle_write_bytes: int = 0
34
+
35
+
36
@dataclass
class LineageEvidence:
    """Findings taken from Unity Catalog lineage."""

    # One dict per upstream table, shaped like {name, last_modified}.
    upstream_tables: list[dict] = field(default_factory=list)
41
+
42
+
43
@dataclass
class SchemaHistoryEvidence:
    """Findings taken from DESCRIBE HISTORY."""

    # Recent schema changes (the last 5, per the original field note).
    recent_changes: list[dict] = field(default_factory=list)
48
+
49
+
50
@dataclass
class GitContextEvidence:
    """Details of the last commit, taken from the job's git metadata."""

    last_commit_message: str = ""
    last_commit_author: str = ""
    # None when no commit timestamp is available.
    last_commit_timestamp: Optional[datetime] = None
57
+
58
+
59
@dataclass
class ClusterEventsEvidence:
    """Findings taken from the cluster events API."""

    termination_reason: str = ""
    # Each instance gets its own list.
    autoscaling_events: list[str] = field(default_factory=list)
    spot_preemptions: int = 0
66
+
67
+
68
@dataclass
class EvidencePackage:
    """Everything gathered about one failed task, bundled for AI diagnosis.

    Each evidence section is optional and defaults to None.
    """

    # Identifiers of the failed task the evidence belongs to.
    run_id: str
    job_id: str
    task_key: str

    # Individual evidence sections.
    spark_event_logs: Optional[SparkLogEvidence] = None
    driver_stdout: Optional[DriverLogEvidence] = None
    task_metrics: Optional[TaskMetricsEvidence] = None
    lineage: Optional[LineageEvidence] = None
    schema_history: Optional[SchemaHistoryEvidence] = None
    git_context: Optional[GitContextEvidence] = None
    cluster_events: Optional[ClusterEventsEvidence] = None

    # Presumably the names of sources that could not be collected — confirm against the collector.
    missing_sources: list[str] = field(default_factory=list)
    # Creation timestamp: a naive datetime holding the current UTC time.
    collected_at: datetime = field(default_factory=datetime.utcnow)
@@ -0,0 +1,6 @@
1
+ """Runtime orchestrators imported by the thin Databricks notebook drivers (#5).
2
+
3
+ All healing logic lives here in the packaged wheel so it is the single, tested
4
+ source of truth — the notebooks are now just entrypoints that pass `spark`/
5
+ `dbutils` in. This replaces the previously inlined copies in the notebooks.
6
+ """