shkit 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- healing_kit/__init__.py +3 -0
- healing_kit/auth.py +79 -0
- healing_kit/clients/__init__.py +1 -0
- healing_kit/clients/databricks_client.py +183 -0
- healing_kit/clients/teams_client.py +128 -0
- healing_kit/models/__init__.py +1 -0
- healing_kit/models/diagnosis.py +45 -0
- healing_kit/models/events.py +30 -0
- healing_kit/models/evidence.py +83 -0
- healing_kit/runtime/__init__.py +6 -0
- healing_kit/runtime/approval.py +141 -0
- healing_kit/runtime/maintenance.py +52 -0
- healing_kit/services/__init__.py +1 -0
- healing_kit/services/cache_service.py +120 -0
- healing_kit/services/circuit_breaker.py +114 -0
- healing_kit/services/context_agent.py +127 -0
- healing_kit/services/dependency_graph.py +141 -0
- healing_kit/services/diagnosis_engine.py +165 -0
- healing_kit/services/identity.py +61 -0
- healing_kit/services/model_router.py +52 -0
- healing_kit/services/query_guard.py +168 -0
- healing_kit/services/resolution_verifier.py +100 -0
- healing_kit/services/token_budget.py +137 -0
- healing_kit/utils/__init__.py +1 -0
- healing_kit/utils/error_hash.py +15 -0
- healing_kit/utils/hmac_tokens.py +86 -0
- healing_kit/utils/sql_safety.py +84 -0
- iic/__init__.py +51 -0
- iic/__main__.py +18 -0
- iic/_console.py +235 -0
- iic/_doctor.py +143 -0
- iic/change/__init__.py +7 -0
- iic/change/change_detector.py +154 -0
- iic/context/__init__.py +7 -0
- iic/context/context_builder.py +117 -0
- iic/dependency/__init__.py +7 -0
- iic/dependency/dependency_analyzer.py +93 -0
- iic/diagnosis/__init__.py +7 -0
- iic/diagnosis/diagnosis_engine.py +183 -0
- iic/dna/__init__.py +7 -0
- iic/dna/dna_builder.py +184 -0
- iic/impact/__init__.py +7 -0
- iic/impact/impact_engine.py +102 -0
- iic/ingestion/__init__.py +14 -0
- iic/ingestion/base.py +21 -0
- iic/ingestion/databricks_source.py +98 -0
- iic/ingestion/static_source.py +23 -0
- iic/ingestion/webhook_source.py +39 -0
- iic/models/__init__.py +44 -0
- iic/models/change.py +77 -0
- iic/models/context.py +46 -0
- iic/models/diagnosis.py +37 -0
- iic/models/dna.py +77 -0
- iic/models/event.py +78 -0
- iic/models/impact.py +60 -0
- iic/models/report.py +88 -0
- iic/models/routing.py +41 -0
- iic/notify/__init__.py +7 -0
- iic/notify/teams_notifier.py +112 -0
- iic/report/__init__.py +7 -0
- iic/report/report_generator.py +67 -0
- iic/routing/__init__.py +7 -0
- iic/routing/router.py +42 -0
- iic/runtime/__init__.py +10 -0
- iic/runtime/_sql.py +11 -0
- iic/runtime/agent_config.py +48 -0
- iic/runtime/agent_runtime.py +70 -0
- iic/runtime/antibodies.py +100 -0
- iic/runtime/bootstrap.py +157 -0
- iic/runtime/constants.py +40 -0
- iic/runtime/context.py +46 -0
- iic/runtime/detective.py +72 -0
- iic/runtime/hooks.py +85 -0
- iic/runtime/incident_engine.py +207 -0
- iic/runtime/inprocess.py +350 -0
- iic/runtime/ledger.py +120 -0
- iic/runtime/monitor.py +155 -0
- iic/runtime/pattern_store.py +53 -0
- iic/runtime/reconciler.py +139 -0
- iic/runtime/scope_config.py +127 -0
- iic/runtime/store.py +150 -0
- iic/runtime/wrapper.py +28 -0
- iic_autoload.pth +1 -0
- onboarding/__init__.py +1 -0
- onboarding/cli.py +168 -0
- onboarding/config_schema.py +62 -0
- onboarding/manifest.py +27 -0
- onboarding/preflight.py +129 -0
- onboarding/provisioner.py +573 -0
- onboarding/rollback.py +81 -0
- shkit-1.2.0.dist-info/METADATA +239 -0
- shkit-1.2.0.dist-info/RECORD +94 -0
- shkit-1.2.0.dist-info/WHEEL +4 -0
- shkit-1.2.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""Approval orchestrator (#3 + #5) — called by the approval_handler notebook /
|
|
2
|
+
relay when a user clicks Approve or Reject.
|
|
3
|
+
|
|
4
|
+
Flow:
|
|
5
|
+
1. Validate the HMAC token (signature + 15-min TTL).
|
|
6
|
+
2. Enforce one-time use (token signature recorded in approval_tokens).
|
|
7
|
+
3. Authorize the approver by governance — resolve their SSO login / Teams email
|
|
8
|
+
and confirm they have workspace access via SCIM; refuse otherwise (#3).
|
|
9
|
+
4. On approve: run the ResolutionVerifier (re-run + verify + rollback) so we
|
|
10
|
+
only mark RESOLVED when the fix is actually verified (#5).
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import hashlib
|
|
14
|
+
from datetime import datetime
|
|
15
|
+
|
|
16
|
+
from healing_kit.auth import build_client, resolve_auth_from_dbutils
|
|
17
|
+
from healing_kit.services.identity import ACCESS_DENIED_MESSAGE, authorize_approver
|
|
18
|
+
from healing_kit.services.resolution_verifier import ResolutionVerifier
|
|
19
|
+
from healing_kit.utils.hmac_tokens import validate_token
|
|
20
|
+
from healing_kit.utils.sql_safety import sql_lit
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _w(dbutils, name, default=""):
    """Declare the text widget ``name`` with ``default`` and return its current value."""
    widgets = dbutils.widgets
    # The widget is (re)declared first so that reading it never fails for a
    # notebook run that did not define it.
    widgets.text(name, default)
    return widgets.get(name)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _resolve_secret(dbutils, scope, host):
    """Return the HMAC key (bytes) used to validate approval tokens.

    The key is read from the ``hmac_key`` secret in ``scope``. If the secret
    cannot be read, the SHA-256 digest of ``"healing_kit::" + host`` is used
    instead and a warning is printed so the fallback is never silent.

    Args:
        dbutils: Object exposing ``secrets.get(scope=..., key=...)``.
        scope: Secret scope that should contain ``hmac_key``.
        host: Workspace host string used only for the fallback key.

    Returns:
        The key as ``bytes``.
    """
    try:
        return dbutils.secrets.get(scope=scope, key="hmac_key").encode()
    except Exception as ex:
        # NOTE(review): the fallback key is derived from the host alone, so
        # anyone who knows the host can compute it. The derivation is kept
        # unchanged because whatever issues the tokens presumably uses the
        # same one (confirm), but operators must be told it is in effect.
        print(f"  WARNING: hmac_key not readable from scope '{scope}' "
              f"({type(ex).__name__}); using host-derived fallback key")
        return hashlib.sha256(("healing_kit::" + host).encode()).digest()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _audit(spark, catalog, schema, cols):
    """Best-effort append of one row to ``healing_audit_log``.

    ``cols`` is the positional list of column values; each one is rendered
    with ``sql_lit``. Any failure is printed (truncated to 80 characters) and
    swallowed so that auditing never aborts the caller.
    """
    try:
        target = f"{catalog}.{schema}.healing_audit_log"
        rendered = ", ".join(map(sql_lit, cols))
        spark.sql(f"INSERT INTO {target} VALUES ({rendered})")
    except Exception as ex:
        print(f"  audit write failed: {str(ex)[:80]}")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _claim_token_once(spark, catalog, schema, jti, run_id) -> bool:
    """Record the token signature; return False if it was already used.

    Returns True only when no row with this ``jti`` existed and a new row was
    inserted. Any error (for example a missing table) is printed and treated
    as "not claimable", so the caller fails closed.
    """
    tokens_table = f"{catalog}.{schema}.approval_tokens"
    claimed_at = datetime.utcnow().isoformat()
    try:
        # NOTE(review): the existence check and the insert are two separate
        # statements, so two concurrent requests with the same token could
        # both pass the check — confirm whether callers can race here.
        count_rows = spark.sql(
            f"SELECT count(*) AS n FROM {tokens_table} WHERE jti = {sql_lit(jti)}"
        ).collect()
        prior_uses = count_rows[0]["n"]
        if prior_uses and prior_uses > 0:
            return False
        spark.sql(
            f"INSERT INTO {tokens_table} VALUES "
            f"({sql_lit(jti)}, {sql_lit(run_id)}, {sql_lit(claimed_at)})"
        )
    except Exception as ex:
        # If the table is missing we fail closed (treat as unusable) rather than
        # silently allowing replay.
        print(f"  one-time-use check failed: {str(ex)[:80]}")
        return False
    return True
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def process_approval(spark, dbutils):
    """Entry point called by the approval_handler notebook driver.

    Reads the request from notebook widgets and then:

    1. validates the HMAC token;
    2. checks that the requested action is ``approve`` or ``reject`` (anything
       else is refused *before* the one-time token is consumed, so a malformed
       request can neither trigger a fix nor burn the token);
    3. claims the token for one-time use;
    4. authorizes the approver;
    5. on reject marks the run ESCALATED; on approve runs the
       ResolutionVerifier and stores the resulting status.

    Args:
        spark: Session object exposing ``sql(...)``.
        dbutils: Databricks utilities object (widgets and secrets are used).

    Returns:
        A human-readable string describing the outcome.
    """
    token = _w(dbutils, "token", "")
    # Normalise so "Approve" / " reject " are accepted; validated further below.
    action = (_w(dbutils, "action", "") or "").strip().lower()  # approve | reject
    task_key = _w(dbutils, "task", "")
    catalog = _w(dbutils, "catalog", "dev_catalog")
    schema = _w(dbutils, "schema", "healing_schema")
    secret_scope = _w(dbutils, "secret_scope", "healing_kit")
    # Identity is supplied by the authenticated relay (SSO) or the clicker's Teams email.
    approver_identity = _w(dbutils, "approver_identity", "")
    approvers_group = _w(dbutils, "approvers_group", "")

    auth = resolve_auth_from_dbutils(dbutils, secret_scope)
    client = build_client(auth)
    host = auth.config.host
    secret = _resolve_secret(dbutils, secret_scope, host)
    now = datetime.utcnow().isoformat()

    # 1. Validate token.
    try:
        payload = validate_token(token, secret)
    except Exception as e:
        _audit(spark, catalog, schema,
               [hashlib.sha256(token.encode()).hexdigest()[:16], "unknown", "unknown", "",
                f"Token validation failed: {str(e)[:100]}", "REJECTED", 0, "none", "",
                False, False, None, 0, 0, 0, now])
        return f"REJECTED: {e}"

    run_id = payload.run_id
    action_id = payload.action_id
    jti = hashlib.sha256(token.encode()).hexdigest()

    # 2. Refuse unknown actions. Previously every value other than "reject"
    #    (including an empty string) fell through to the approve path.
    if action not in ("approve", "reject"):
        _audit(spark, catalog, schema,
               [jti[:16], run_id, "", "", f"Unknown action: {action[:40]!r}", "REJECTED", 0,
                "approval_handler", "", False, False, None, 0, 0, 0, now])
        return f"REJECTED: unknown action {action!r} (expected 'approve' or 'reject')"

    # 3. One-time use.
    if not _claim_token_once(spark, catalog, schema, jti, run_id):
        return "REJECTED: token already used or unusable"

    # 4. Authorize the approver by governance (#3).
    identity = approver_identity or payload.approver_email
    decision = authorize_approver(client, identity, approvers_group=approvers_group)
    if not decision.authorized:
        _audit(spark, catalog, schema,
               [jti[:16], run_id, "", "", f"Approver denied ({identity}): {decision.reason}",
                "ACCESS_DENIED", 0, "approval_handler", "", False, False, None, 0, 0, 0, now])
        print(f"ACCESS DENIED for '{identity}': {decision.reason}")
        return ACCESS_DENIED_MESSAGE if not decision.reason else decision.reason

    approver = decision.identity
    print(f"Approver authorized: {approver} ({decision.display_name})")

    # 5a. Reject path.
    if action == "reject":
        spark.sql(f"UPDATE {catalog}.{schema}.healing_state SET current_status = 'ESCALATED', "
                  f"resolved_by = {sql_lit(approver)}, updated_at = {sql_lit(now)} "
                  f"WHERE run_id = {sql_lit(run_id)}")
        _audit(spark, catalog, schema,
               [jti[:16], run_id, "", "", f"Rejected by {approver}", action_id, 0,
                "approval_handler", "", False, True, "user_rejected", 0, 0, 0, now])
        return f"REJECTED by {approver}: {task_key}"

    # 5b. Approve path → verify the fix.
    verifier = ResolutionVerifier(client)
    # apply_fn/rollback_fn for CODE_PATCH/SCHEMA_FIX are wired once fix-generation lands;
    # until then those actions return STAGED (never falsely marked resolved).
    result = verifier.verify(action_id, run_id, task_key)
    if result.verified:
        status = "RESOLVED"
    elif result.final_state in ("ESCALATE", "STAGED"):
        status = "ESCALATED"
    else:
        status = "EXECUTING"

    spark.sql(f"UPDATE {catalog}.{schema}.healing_state SET current_status = {sql_lit(status)}, "
              f"resolved_by = {sql_lit(approver)}, updated_at = {sql_lit(now)} "
              f"WHERE run_id = {sql_lit(run_id)}")
    _audit(spark, catalog, schema,
           [jti[:16], run_id, "", "", f"Approved by {approver}: {result.final_state} ({result.detail})",
            action_id, float(getattr(payload, "confidence", 0) or 0), "approval_handler", "",
            bool(result.verified), False, None, 0, 0, 0, now])

    summary = f"APPROVED by {approver} | {action_id} | {result.final_state} | verified={result.verified}"
    print(summary)
    return summary
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Audit retention / PII maintenance (#4) — called by the audit_retention notebook.
|
|
2
|
+
|
|
3
|
+
The audit log is append-only and may contain sensitive text copied from error
|
|
4
|
+
logs. This job anonymizes the free-text fields on rows older than the retention
|
|
5
|
+
window (kept for forensics but scrubbed of payload), and expires stale one-time
|
|
6
|
+
approval tokens. It does not DELETE audit rows (append-only), it overwrites the
|
|
7
|
+
free-text columns with a tombstone.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from datetime import datetime, timedelta
|
|
11
|
+
|
|
12
|
+
from healing_kit.utils.sql_safety import sql_lit
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _w(dbutils, name, default=""):
    """Return the value of notebook text widget ``name``, declaring it with ``default`` first."""
    widget_api = dbutils.widgets
    widget_api.text(name, default)
    value = widget_api.get(name)
    return value
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def run_retention(spark, dbutils):
    """Anonymize old audit rows and delete stale approval-token records.

    Widgets read: ``catalog``, ``schema``, ``retention_days`` (default 90).

    Rows of ``healing_audit_log`` older than the cutoff get their
    ``root_cause_predicted`` column overwritten with ``'[anonymized]'``; rows of
    ``approval_tokens`` whose ``used_at`` is older than the cutoff are deleted.
    Each of the two steps is best-effort: a failure is printed and the job
    continues.

    Returns:
        A summary string containing the cutoff and the total number of
        anonymized rows currently in the audit table.

    Raises:
        ValueError: if ``retention_days`` is not an integer >= 1.
    """
    catalog = _w(dbutils, "catalog", "dev_catalog")
    schema = _w(dbutils, "schema", "healing_schema")
    retention_days = int(_w(dbutils, "retention_days", "90"))
    # A zero or negative window puts the cutoff at/after "now": that would scrub
    # the entire audit log and delete the one-time-use records of approval
    # tokens that may still be presented, re-opening them to replay.
    if retention_days < 1:
        raise ValueError(f"retention_days must be >= 1, got {retention_days}")

    cutoff = (datetime.utcnow() - timedelta(days=retention_days)).isoformat()
    audit = f"{catalog}.{schema}.healing_audit_log"
    tokens = f"{catalog}.{schema}.approval_tokens"

    # Anonymize free-text payload on old rows (append-only table allows UPDATE of
    # non-key columns only if appendOnly is off; if appendOnly is enforced this
    # will no-op with a clear message — see provisioner notes).
    scrubbed = 0
    try:
        spark.sql(
            f"UPDATE {audit} SET root_cause_predicted = '[anonymized]' "
            f"WHERE created_at < {sql_lit(cutoff)} AND root_cause_predicted <> '[anonymized]'"
        )
        # This is the running total across the table, not only this pass.
        scrubbed = spark.sql(
            f"SELECT count(*) AS n FROM {audit} WHERE root_cause_predicted = '[anonymized]'"
        ).collect()[0]["n"]
        print(f"Anonymized audit rows older than {retention_days}d (total anonymized: {scrubbed})")
    except Exception as ex:
        print(f"Audit anonymization skipped (append-only or perms): {str(ex)[:100]}")

    # Expire used/stale approval tokens.
    try:
        spark.sql(f"DELETE FROM {tokens} WHERE used_at < {sql_lit(cutoff)}")
        print("Expired stale approval tokens.")
    except Exception as ex:
        print(f"Token cleanup skipped: {str(ex)[:100]}")

    return f"Retention pass complete (cutoff {cutoff}, anonymized {scrubbed})."
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Core business logic services."""
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
"""Resolution cache — zero-token fast path for previously resolved errors."""
|
|
2
|
+
|
|
3
|
+
import uuid
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from datetime import datetime, timedelta
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from healing_kit.models.diagnosis import DiagnosisResponse
|
|
9
|
+
from healing_kit.utils.error_hash import compute_error_hash
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class CachedResolution:
    """A cached successful resolution."""

    # Identifier of the cache row.
    cache_id: str
    # Error fingerprint the row is looked up by.
    error_hash: str
    # Identifier of the remediation action that resolved the error.
    action_id: str
    # Parameters of that action, as a plain dict.
    action_params: dict
    # When the resolution was recorded.
    resolved_at: datetime
    # When the entry stops being valid.
    expires_at: datetime
    # Number of times the entry has been served.
    hit_count: int
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class CacheService:
    """
    Resolution cache backed by Delta table.

    - Lookup by error_hash within 7-day TTL window.
    - Store new resolutions after successful fix.
    - Invalidate entries when referenced tables have schema changes.

    Every value interpolated into SQL goes through ``_sql_str`` so that quotes
    and backslashes in paths, table names or JSON parameters cannot break (or
    alter) the statement.
    """

    def __init__(self, spark_session, catalog: str, schema: str = "healing_schema"):
        self.spark = spark_session
        self.table = f"{catalog}.{schema}.resolution_cache"

    @staticmethod
    def _sql_str(value) -> str:
        """Render ``value`` as a single-quoted Spark SQL string literal.

        Backslashes are doubled first (Spark treats ``\\`` as an escape
        character inside string literals by default), then single quotes are
        backslash-escaped.
        """
        text = str(value).replace("\\", "\\\\").replace("'", "\\'")
        return f"'{text}'"

    def compute_hash(self, error_type: str, notebook_path: str, affected_tables: list[str]) -> str:
        """Compute the error fingerprint hash."""
        return compute_error_hash(error_type, notebook_path, affected_tables)

    def lookup(self, error_hash: str) -> Optional["CachedResolution"]:
        """
        Look up a cached resolution by error hash.

        Returns None if no valid (non-expired, non-invalidated) entry exists.
        On a hit the stored ``hit_count`` is incremented and the returned
        object already reflects the incremented value.
        """
        import json

        now = datetime.utcnow().isoformat()
        df = self.spark.sql(f"""
            SELECT cache_id, error_hash, action_id, action_params,
                   resolved_at, expires_at, hit_count
            FROM {self.table}
            WHERE error_hash = {self._sql_str(error_hash)}
              AND invalidated = FALSE
              AND expires_at > '{now}'
            ORDER BY resolved_at DESC
            LIMIT 1
        """)

        rows = df.collect()
        if not rows:
            return None

        row = rows[0]
        cache_id = row["cache_id"]
        # Increment hit count
        self.spark.sql(f"""
            UPDATE {self.table}
            SET hit_count = hit_count + 1
            WHERE cache_id = {self._sql_str(cache_id)}
        """)

        return CachedResolution(
            cache_id=cache_id,
            error_hash=row["error_hash"],
            action_id=row["action_id"],
            action_params=json.loads(row["action_params"]) if row["action_params"] else {},
            resolved_at=row["resolved_at"],
            expires_at=row["expires_at"],
            # A NULL counter is treated as zero instead of raising TypeError.
            hit_count=(row["hit_count"] or 0) + 1,
        )

    def store(self, error_hash: str, error_type: str, notebook_path: str,
              affected_tables: list[str], diagnosis: "DiagnosisResponse",
              ttl_days: int = 7) -> str:
        """Store a successful resolution in the cache. Returns cache_id.

        ``affected_tables`` is persisted as a sorted, ``|``-joined string and
        ``diagnosis.action_params`` as JSON text.
        """
        import json

        cache_id = str(uuid.uuid4())
        now = datetime.utcnow()
        expires = now + timedelta(days=ttl_days)
        tables_str = "|".join(sorted(affected_tables))
        params_json = json.dumps(diagnosis.action_params)
        q = self._sql_str

        self.spark.sql(f"""
            INSERT INTO {self.table} VALUES (
                {q(cache_id)}, {q(error_hash)}, {q(error_type)},
                {q(notebook_path)}, {q(tables_str)},
                {q(diagnosis.action_id.value)}, {q(params_json)},
                0, '{now.isoformat()}', '{expires.isoformat()}',
                FALSE, NULL, '{now.isoformat()}'
            )
        """)
        return cache_id

    def invalidate_by_table(self, table_name: str) -> int:
        """
        Invalidate all cache entries that reference the given table.
        Called when schema changes are detected on that table.
        Returns the number of entries invalidated.

        Matching is a literal substring test on the stored table list, so
        ``_`` and ``%`` in a table name are no longer treated as wildcards.
        """
        reason = self._sql_str(f"Schema change on {table_name}")
        result = self.spark.sql(f"""
            UPDATE {self.table}
            SET invalidated = TRUE,
                invalidated_reason = {reason}
            WHERE invalidated = FALSE
              AND instr(affected_tables, {self._sql_str(table_name)}) > 0
        """)
        # Return affected row count (Delta returns this in metrics)
        first_row = result.first()
        return first_row[0] if first_row else 0
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""Circuit breaker — prevents infinite remediation loops."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class CircuitBreakerState:
    """Current state of the circuit breaker for a run_id."""

    # Run the state belongs to.
    run_id: str
    # Number of remediation attempts recorded so far.
    attempt_count: int
    # True once the breaker has been tripped.
    breaker_open: bool
    # When the breaker was tripped, if it has been.
    opened_at: Optional[datetime] = None
    # Who reset/resolved the breaker, if anyone.
    resolved_by: Optional[str] = None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class CircuitBreaker:
    """
    State machine that blocks automated actions after N consecutive failures.

    With the default ``threshold`` of 2:

    - Attempt 1: AI diagnoses and applies fix.
    - Attempt 2: If still fails, retry with updated context; recording this
      attempt reaches the threshold and opens the breaker.
    - Attempt 3+: breaker_open = TRUE. Only human can reset.

    State is kept in the ``healing_state`` table, one row per ``run_id``.
    """

    def __init__(self, spark_session, catalog: str, schema: str = "healing_schema", threshold: int = 2):
        self.spark = spark_session
        self.table = f"{catalog}.{schema}.healing_state"
        self.threshold = threshold

    @staticmethod
    def _sql_str(value) -> str:
        """Render ``value`` as a single-quoted Spark SQL string literal.

        Backslashes are doubled and single quotes backslash-escaped so that a
        run id or e-mail address can never terminate the literal early.
        """
        text = str(value).replace("\\", "\\\\").replace("'", "\\'")
        return f"'{text}'"

    def check(self, run_id: str) -> "CircuitBreakerState":
        """Get current breaker state for a run_id.

        An unknown ``run_id`` yields a closed breaker with zero attempts.
        NULL ``attempt_count`` / ``breaker_open`` values are read as 0 / False.
        """
        df = self.spark.sql(f"""
            SELECT run_id, attempt_count, breaker_open, opened_at, resolved_by
            FROM {self.table}
            WHERE run_id = {self._sql_str(run_id)}
        """)
        rows = df.collect()
        if not rows:
            return CircuitBreakerState(run_id=run_id, attempt_count=0, breaker_open=False)

        row = rows[0]
        return CircuitBreakerState(
            run_id=row["run_id"],
            attempt_count=row["attempt_count"] or 0,
            breaker_open=bool(row["breaker_open"]),
            opened_at=row["opened_at"],
            resolved_by=row["resolved_by"],
        )

    def is_open(self, run_id: str) -> bool:
        """Check if breaker is tripped for this run."""
        return bool(self.check(run_id).breaker_open)

    def increment_attempt(self, run_id: str) -> int:
        """
        Increment attempt count. If threshold reached, open the breaker.
        Returns the new attempt count.

        The increment is performed by the database (``attempt_count + 1``
        inside a single MERGE) rather than by writing back a value computed
        from an earlier read, so a concurrent increment is not overwritten.
        """
        state = self.check(run_id)
        new_count = state.attempt_count + 1
        now = datetime.utcnow().isoformat()
        rid = self._sql_str(run_id)

        # One statement covers both "first time seeing this run" (insert) and
        # "run already tracked" (update).
        self.spark.sql(f"""
            MERGE INTO {self.table} AS target
            USING (SELECT {rid} AS run_id) AS source
            ON target.run_id = source.run_id
            WHEN MATCHED THEN UPDATE SET
                attempt_count = COALESCE(target.attempt_count, 0) + 1, updated_at = '{now}'
            WHEN NOT MATCHED THEN INSERT (run_id, attempt_count, breaker_open, updated_at)
                VALUES ({rid}, 1, FALSE, '{now}')
        """)

        # Open breaker if threshold hit
        if new_count >= self.threshold:
            self.open_breaker(run_id)

        return new_count

    def open_breaker(self, run_id: str) -> None:
        """Trip the circuit breaker. No more automated actions allowed."""
        now = datetime.utcnow().isoformat()
        self.spark.sql(f"""
            UPDATE {self.table}
            SET breaker_open = TRUE, opened_at = '{now}', updated_at = '{now}',
                current_status = 'ESCALATED'
            WHERE run_id = {self._sql_str(run_id)}
        """)

    def reset_breaker(self, run_id: str, resolved_by_email: str) -> None:
        """
        Reset the breaker. ONLY callable by authenticated human.
        Automated callers are rejected.

        Raises:
            PermissionError: if ``resolved_by_email`` is empty or has no ``@``.

        NOTE(review): the only check made here is that the value looks like an
        e-mail address; real authentication must happen in the caller.
        """
        if not resolved_by_email or "@" not in resolved_by_email:
            raise PermissionError("Circuit breaker can only be reset by an authenticated human (email required)")

        now = datetime.utcnow().isoformat()
        self.spark.sql(f"""
            UPDATE {self.table}
            SET breaker_open = FALSE, resolved_by = {self._sql_str(resolved_by_email)},
                attempt_count = 0, updated_at = '{now}', current_status = 'RESOLVED'
            WHERE run_id = {self._sql_str(run_id)}
        """)
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""Context Agent — assembles structured evidence from 7 Databricks API sources."""
|
|
2
|
+
|
|
3
|
+
from healing_kit.clients.databricks_client import DatabricksClient
|
|
4
|
+
from healing_kit.models.evidence import (
|
|
5
|
+
ClusterEventsEvidence,
|
|
6
|
+
DriverLogEvidence,
|
|
7
|
+
EvidencePackage,
|
|
8
|
+
GitContextEvidence,
|
|
9
|
+
LineageEvidence,
|
|
10
|
+
SchemaHistoryEvidence,
|
|
11
|
+
SparkLogEvidence,
|
|
12
|
+
TaskMetricsEvidence,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ContextAgent:
    """
    Assembles a structured evidence package before invoking the LLM.

    Each source fails independently: a collector that raises yields ``None``
    for its field instead of aborting the whole collection, and a notebook
    export that fails or returns nothing is recorded in
    ``package.missing_sources``.
    """

    def __init__(self, client: DatabricksClient, spark_session=None):
        self.client = client
        self.spark = spark_session

    def collect_evidence(self, run_id: str, job_id: str, task_key: str, notebook_path: str = "") -> EvidencePackage:
        """Gather all 7 evidence sources for a failed task.

        Args:
            run_id: Run identifier; converted with ``int()`` for API calls.
            job_id: Job identifier; converted with ``int()`` for API calls.
            task_key: Key of the failed task.
            notebook_path: Optional notebook path; when empty the notebook
                source is not fetched at all.

        Returns:
            The populated ``EvidencePackage``.
        """
        package = EvidencePackage(run_id=run_id, job_id=job_id, task_key=task_key)

        # 1. Driver stdout + error (via Runs Get Output API)
        package.driver_stdout = self._collect_driver_logs(run_id)

        # 2. Task metrics
        package.task_metrics = self._collect_task_metrics(run_id)

        # 3. Notebook source code
        if notebook_path:
            # The export is isolated like every other source: an API failure
            # is treated the same as "no source available".
            try:
                source = self.client.export_notebook(notebook_path)
            except Exception:
                source = None
            if source:
                # Store in spark_event_logs field (reusing for code context)
                package.spark_event_logs = SparkLogEvidence(
                    first_errors=[source[:3000]]
                )
            else:
                package.missing_sources.append("notebook_source")

        # 4. Unity Catalog Lineage
        package.lineage = self._collect_lineage(run_id)

        # 5. Schema history
        package.schema_history = self._collect_schema_history(run_id)

        # 6. Git context
        package.git_context = self._collect_git_context(job_id)

        # 7. Cluster events
        package.cluster_events = self._collect_cluster_events(run_id)

        return package

    def _collect_driver_logs(self, run_id: str) -> DriverLogEvidence | None:
        """Collect driver stdout/stderr from Runs Get Output API.

        Each line of the run's ``error`` and ``error_trace`` text is put into
        at most one bucket (schema, missing table, generic exception), first
        match wins, truncated to 200 characters. Returns None on any failure.
        """
        try:
            output = self.client.get_run_output(int(run_id))
            error = output.get("error", "") or ""
            trace = output.get("error_trace", "") or ""
            combined = error + "\n" + trace

            evidence = DriverLogEvidence()
            for line in combined.split("\n"):
                lower = line.lower()
                snippet = line.strip()[:200]
                if "schema" in lower or "column" in lower or "structtype" in lower:
                    evidence.schema_errors.append(snippet)
                elif "not found" in lower or "does not exist" in lower:
                    evidence.missing_table_errors.append(snippet)
                elif "exception" in lower or "error" in lower:
                    evidence.data_source_exceptions.append(snippet)

            return evidence
        except Exception:
            return None

    def _collect_task_metrics(self, run_id: str) -> TaskMetricsEvidence | None:
        """Extract task runtime metrics if available.

        Currently only probes the run output and returns an empty
        ``TaskMetricsEvidence``; returns None if the probe fails.
        """
        try:
            output = self.client.get_run_output(int(run_id))
            _ = output.get("metadata", {})
            # Metrics may be nested in cluster_instance or task details
            return TaskMetricsEvidence()
        except Exception:
            return None

    def _collect_lineage(self, run_id: str) -> LineageEvidence | None:
        """Collect Unity Catalog lineage for tables referenced in the run.

        Placeholder: returns an empty ``LineageEvidence``.
        """
        try:
            return LineageEvidence()
        except Exception:
            return None

    def _collect_schema_history(self, run_id: str) -> SchemaHistoryEvidence | None:
        """Run DESCRIBE HISTORY on affected tables.

        Placeholder: returns an empty ``SchemaHistoryEvidence``.
        """
        try:
            return SchemaHistoryEvidence()
        except Exception:
            return None

    def _collect_git_context(self, job_id: str) -> GitContextEvidence | None:
        """Extract git metadata from the job configuration.

        Returns None when the job has no ``git_source`` setting or when the
        lookup fails.
        """
        try:
            job = self.client.get_job(int(job_id))
            git_source = job.get("settings", {}).get("git_source", {})
            if git_source:
                # NOTE(review): this assumes ``git_commit`` is a mapping with
                # ``message``/``author`` keys; if the API returns a plain
                # commit hash the lookup raises and None is returned — confirm.
                commit = git_source.get("git_commit", {})
                return GitContextEvidence(
                    last_commit_message=commit.get("message", ""),
                    last_commit_author=commit.get("author", ""),
                )
            return None
        except Exception:
            return None

    def _collect_cluster_events(self, run_id: str) -> ClusterEventsEvidence | None:
        """Get cluster termination/autoscaling events.

        Placeholder: returns an empty ``ClusterEventsEvidence``.
        """
        try:
            return ClusterEventsEvidence()
        except Exception:
            return None
|