shkit 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- healing_kit/__init__.py +3 -0
- healing_kit/auth.py +79 -0
- healing_kit/clients/__init__.py +1 -0
- healing_kit/clients/databricks_client.py +183 -0
- healing_kit/clients/teams_client.py +128 -0
- healing_kit/models/__init__.py +1 -0
- healing_kit/models/diagnosis.py +45 -0
- healing_kit/models/events.py +30 -0
- healing_kit/models/evidence.py +83 -0
- healing_kit/runtime/__init__.py +6 -0
- healing_kit/runtime/approval.py +141 -0
- healing_kit/runtime/maintenance.py +52 -0
- healing_kit/services/__init__.py +1 -0
- healing_kit/services/cache_service.py +120 -0
- healing_kit/services/circuit_breaker.py +114 -0
- healing_kit/services/context_agent.py +127 -0
- healing_kit/services/dependency_graph.py +141 -0
- healing_kit/services/diagnosis_engine.py +165 -0
- healing_kit/services/identity.py +61 -0
- healing_kit/services/model_router.py +52 -0
- healing_kit/services/query_guard.py +168 -0
- healing_kit/services/resolution_verifier.py +100 -0
- healing_kit/services/token_budget.py +137 -0
- healing_kit/utils/__init__.py +1 -0
- healing_kit/utils/error_hash.py +15 -0
- healing_kit/utils/hmac_tokens.py +86 -0
- healing_kit/utils/sql_safety.py +84 -0
- iic/__init__.py +51 -0
- iic/__main__.py +18 -0
- iic/_console.py +235 -0
- iic/_doctor.py +143 -0
- iic/change/__init__.py +7 -0
- iic/change/change_detector.py +154 -0
- iic/context/__init__.py +7 -0
- iic/context/context_builder.py +117 -0
- iic/dependency/__init__.py +7 -0
- iic/dependency/dependency_analyzer.py +93 -0
- iic/diagnosis/__init__.py +7 -0
- iic/diagnosis/diagnosis_engine.py +183 -0
- iic/dna/__init__.py +7 -0
- iic/dna/dna_builder.py +184 -0
- iic/impact/__init__.py +7 -0
- iic/impact/impact_engine.py +102 -0
- iic/ingestion/__init__.py +14 -0
- iic/ingestion/base.py +21 -0
- iic/ingestion/databricks_source.py +98 -0
- iic/ingestion/static_source.py +23 -0
- iic/ingestion/webhook_source.py +39 -0
- iic/models/__init__.py +44 -0
- iic/models/change.py +77 -0
- iic/models/context.py +46 -0
- iic/models/diagnosis.py +37 -0
- iic/models/dna.py +77 -0
- iic/models/event.py +78 -0
- iic/models/impact.py +60 -0
- iic/models/report.py +88 -0
- iic/models/routing.py +41 -0
- iic/notify/__init__.py +7 -0
- iic/notify/teams_notifier.py +112 -0
- iic/report/__init__.py +7 -0
- iic/report/report_generator.py +67 -0
- iic/routing/__init__.py +7 -0
- iic/routing/router.py +42 -0
- iic/runtime/__init__.py +10 -0
- iic/runtime/_sql.py +11 -0
- iic/runtime/agent_config.py +48 -0
- iic/runtime/agent_runtime.py +70 -0
- iic/runtime/antibodies.py +100 -0
- iic/runtime/bootstrap.py +157 -0
- iic/runtime/constants.py +40 -0
- iic/runtime/context.py +46 -0
- iic/runtime/detective.py +72 -0
- iic/runtime/hooks.py +85 -0
- iic/runtime/incident_engine.py +207 -0
- iic/runtime/inprocess.py +350 -0
- iic/runtime/ledger.py +120 -0
- iic/runtime/monitor.py +155 -0
- iic/runtime/pattern_store.py +53 -0
- iic/runtime/reconciler.py +139 -0
- iic/runtime/scope_config.py +127 -0
- iic/runtime/store.py +150 -0
- iic/runtime/wrapper.py +28 -0
- iic_autoload.pth +1 -0
- onboarding/__init__.py +1 -0
- onboarding/cli.py +168 -0
- onboarding/config_schema.py +62 -0
- onboarding/manifest.py +27 -0
- onboarding/preflight.py +129 -0
- onboarding/provisioner.py +573 -0
- onboarding/rollback.py +81 -0
- shkit-1.2.0.dist-info/METADATA +239 -0
- shkit-1.2.0.dist-info/RECORD +94 -0
- shkit-1.2.0.dist-info/WHEEL +4 -0
- shkit-1.2.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""Token budget enforcer — cost governance for LLM calls."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from enum import Enum
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class BudgetMode(str, Enum):
    """Operating tier of a budget window.

    Members mix in ``str``, so each one compares equal to its lowercase value
    and can be rebuilt from that value with ``BudgetMode("normal")``.
    """

    # Below 70% of the hourly budget: full AI healing is active.
    NORMAL = "normal"

    # From 70% of the hourly budget: only the lightweight model may be invoked.
    WARNING = "warning"

    # From 90% of the hourly budget: no LLM calls are allowed.
    DEGRADED = "degraded"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class BudgetState:
    """Snapshot of a single budget window."""

    # Hourly window key, formatted as YYYY-MM-DD-HH.
    window_id: str
    # Tokens recorded against the window so far.
    tokens_used: int
    # Cost recorded against the window so far (presumably US dollars, per the name).
    cost_usd: float
    # Token allowance for the window.
    hourly_budget: int
    # Mode associated with the window.
    current_mode: BudgetMode
    # tokens_used as a percentage of hourly_budget.
    spend_pct: float
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class TokenBudgetEnforcer:
    """Track hourly LLM token spend and derive the operating mode from it.

    Modes, by percentage of the hourly budget consumed:
    - Normal (< 70%): full AI healing active
    - Warning (>= 70% and < 90%): lightweight model only
    - Degraded (>= 90%): no LLM calls, log + page on-call
    """

    def __init__(self, spark_session, catalog: str, schema: str = "healing_schema", hourly_budget: int = 50000):
        """Bind the enforcer to the ``{catalog}.{schema}.token_budget`` table.

        ``catalog`` and ``schema`` are interpolated into SQL as identifiers, so
        they must come from trusted configuration, never from user input.
        """
        self.spark = spark_session
        self.table = f"{catalog}.{schema}.token_budget"
        self.hourly_budget = hourly_budget

    @staticmethod
    def _utc_now() -> datetime:
        """Return the current UTC time as a naive ``datetime``.

        Replaces the deprecated ``datetime.utcnow()``. The tzinfo is dropped so
        ``strftime``/``isoformat`` output keeps the exact shape it had before.
        """
        from datetime import timezone  # local import: only ``datetime`` is imported at file level

        return datetime.now(timezone.utc).replace(tzinfo=None)

    @staticmethod
    def _window_id_for(moment: datetime) -> str:
        """Format ``moment`` as an hourly window ID (YYYY-MM-DD-HH)."""
        return moment.strftime("%Y-%m-%d-%H")

    def _current_window_id(self) -> str:
        """Get the current hourly window ID (YYYY-MM-DD-HH)."""
        return self._window_id_for(self._utc_now())

    def _load_state(self, window_id: str) -> BudgetState:
        """Read the row for ``window_id``; return an empty NORMAL state if absent."""
        df = self.spark.sql(f"""
            SELECT window_id, tokens_used, cost_usd, hourly_budget, current_mode
            FROM {self.table}
            WHERE window_id = '{window_id}'
        """)
        rows = df.collect()

        if not rows:
            return BudgetState(
                window_id=window_id,
                tokens_used=0,
                cost_usd=0.0,
                hourly_budget=self.hourly_budget,
                current_mode=BudgetMode.NORMAL,
                spend_pct=0.0,
            )

        row = rows[0]
        tokens = row["tokens_used"]
        budget = row["hourly_budget"]
        # A missing or non-positive budget cannot yield a percentage; report 0.0
        # (a float, matching the field type) instead of dividing.
        spend_pct = (tokens / budget * 100) if budget and budget > 0 else 0.0

        return BudgetState(
            window_id=row["window_id"],
            tokens_used=tokens,
            cost_usd=row["cost_usd"],
            hourly_budget=budget,
            current_mode=BudgetMode(row["current_mode"]),
            spend_pct=spend_pct,
        )

    def get_current_state(self) -> BudgetState:
        """Get the current budget window state."""
        return self._load_state(self._current_window_id())

    def get_current_mode(self) -> BudgetMode:
        """Return current mode based on spend percentage."""
        state = self.get_current_state()
        return self._compute_mode(state.spend_pct)

    def can_invoke_model(self, model_tier: str) -> bool:
        """
        Check if a model invocation is allowed under current budget mode.

        model_tier: 'lightweight' or 'powerful'

        Returns False for every tier in DEGRADED mode and for the 'powerful'
        tier in WARNING mode; True otherwise.
        """
        mode = self.get_current_mode()
        if mode == BudgetMode.DEGRADED:
            return False
        if mode == BudgetMode.WARNING and model_tier == "powerful":
            return False
        return True

    def record_usage(self, tokens_used: int, cost_usd: float) -> BudgetMode:
        """
        Record token usage and return the resulting mode.

        Creates the window entry if it doesn't exist.

        Raises:
            ValueError: if ``tokens_used`` is not integer-like or ``cost_usd``
                is not a finite number. Both values are spliced into SQL text,
                so they are validated before any statement is issued.
        """
        import math  # local import: not part of the file-level imports

        try:
            tokens = int(tokens_used)
            cost = float(cost_usd)
        except (TypeError, ValueError) as err:
            raise ValueError("tokens_used and cost_usd must be numeric") from err
        if not math.isfinite(cost):
            # NaN/inf would be rendered as bare ``nan``/``inf`` in the SQL text.
            raise ValueError("cost_usd must be a finite number")

        # One clock reading drives both the window key and the timestamps, so
        # an hour rollover mid-call cannot split them across two windows.
        moment = self._utc_now()
        window_id = self._window_id_for(moment)
        now = moment.isoformat()

        self.spark.sql(f"""
            MERGE INTO {self.table} AS target
            USING (SELECT '{window_id}' AS window_id) AS source
            ON target.window_id = source.window_id
            WHEN MATCHED THEN UPDATE SET
                tokens_used = target.tokens_used + {tokens},
                cost_usd = target.cost_usd + {cost},
                updated_at = '{now}'
            WHEN NOT MATCHED THEN INSERT
                (window_id, window_start, window_end, tokens_used, cost_usd, hourly_budget, current_mode, created_at, updated_at)
                VALUES ('{window_id}', '{now}', '{now}', {tokens}, {cost}, {self.hourly_budget}, 'normal', '{now}', '{now}')
        """)

        # Evaluate and update mode for the same window that was just written.
        state = self._load_state(window_id)
        new_mode = self._compute_mode(state.spend_pct)

        if new_mode != state.current_mode:
            self.spark.sql(f"""
                UPDATE {self.table}
                SET current_mode = '{new_mode.value}', mode_changed_at = '{now}', updated_at = '{now}'
                WHERE window_id = '{window_id}'
            """)

        return new_mode

    def _compute_mode(self, spend_pct: float) -> BudgetMode:
        """Determine mode from spend percentage (>= 90 degraded, >= 70 warning)."""
        if spend_pct >= 90:
            return BudgetMode.DEGRADED
        elif spend_pct >= 70:
            return BudgetMode.WARNING
        return BudgetMode.NORMAL
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Utility functions."""
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""SHA-256 error fingerprinting for resolution cache keys."""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def compute_error_hash(error_type: str, notebook_path: str, affected_tables: list[str]) -> str:
    """
    Compute a deterministic SHA-256 hash of the failure fingerprint.

    The hash is built from: error_type + notebook_path + sorted affected table names.
    Table names are stripped of surrounding whitespace and blank names are dropped,
    so ordering and padding never change the key.

    ``affected_tables`` may be ``None`` or empty; both hash the same as "no tables".

    Returns the lowercase hex digest.
    """
    # ``or ()`` lets callers pass None for "no tables" without a TypeError.
    cleaned = (name.strip() for name in (affected_tables or ()))
    sorted_tables = sorted(name for name in cleaned if name)
    payload = f"{error_type}||{notebook_path}||{'|'.join(sorted_tables)}"
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""HMAC-SHA256 signed one-time tokens for HITL approval flow."""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import hmac
|
|
5
|
+
import json
|
|
6
|
+
import time
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
|
|
9
|
+
TOKEN_TTL_SECONDS = 900 # 15 minutes
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class TokenPayload:
    """Fields carried by a decoded approval token."""

    # Identifier of the run the token refers to.
    run_id: str
    # Email address encoded into the token for the approver.
    approver_email: str
    # Identifier of the action the token refers to.
    action_id: str
    # Issue time as a ``time.time()`` epoch timestamp.
    issued_at: float
    # Expiry time as a ``time.time()`` epoch timestamp.
    expires_at: float
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def generate_token(run_id: str, approver_email: str, action_id: str, secret_key: bytes) -> str:
    """
    Build a signed approval token that expires TOKEN_TTL_SECONDS after issue.

    The claims (run_id, approver_email, action_id, issued_at, expires_at) are
    serialized as compact JSON and signed with HMAC-SHA256 under ``secret_key``.

    Returns ``"<hex of the JSON payload>.<hex HMAC signature>"``.
    """
    now = time.time()
    claims = {
        "run_id": run_id,
        "approver_email": approver_email,
        "action_id": action_id,
        "issued_at": now,
        "expires_at": now + TOKEN_TTL_SECONDS,
    }
    body = json.dumps(claims, separators=(",", ":")).encode()

    # Both halves are hex-encoded (not base64): payload first, then signature.
    mac = hmac.new(secret_key, body, hashlib.sha256)
    return ".".join((body.hex(), mac.hexdigest()))
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def validate_token(token: str, secret_key: bytes) -> TokenPayload:
    """
    Validate a signed token. Raises ValueError if invalid or expired.

    Checks, in order:
    1. Token format is valid (``<payload hex>.<signature hex>``)
    2. HMAC-SHA256 signature matches
    3. Payload is well-formed and the token has not expired

    Every rejection surfaces as ``ValueError`` so callers need a single
    ``except`` clause, whatever shape the untrusted input has.
    """
    if not isinstance(token, str):
        raise ValueError("Invalid token format")

    parts = token.split(".")
    if len(parts) != 2:
        raise ValueError("Invalid token format")

    payload_hex, signature = parts

    # Reconstruct payload
    try:
        payload_bytes = bytes.fromhex(payload_hex)
        payload_str = payload_bytes.decode()
    except (ValueError, UnicodeDecodeError):
        raise ValueError("Invalid token encoding") from None

    # Verify signature. compare_digest raises TypeError for str arguments
    # containing non-ASCII characters; such a signature can never be valid,
    # so treat it as a mismatch instead of leaking a TypeError.
    expected_sig = hmac.new(secret_key, payload_bytes, hashlib.sha256).hexdigest()
    try:
        signature_ok = hmac.compare_digest(signature, expected_sig)
    except TypeError:
        signature_ok = False
    if not signature_ok:
        raise ValueError("Invalid token signature — possible tampering")

    # Parse payload; a correctly signed but malformed payload is still rejected
    # with ValueError rather than KeyError/TypeError.
    try:
        data = json.loads(payload_str)
        expires_at = data["expires_at"]
        expired = time.time() > expires_at
        fields = {
            name: data[name]
            for name in ("run_id", "approver_email", "action_id", "issued_at", "expires_at")
        }
    except (KeyError, TypeError, ValueError) as err:
        raise ValueError("Invalid token payload") from err

    # Check expiry
    if expired:
        raise ValueError(f"Token expired at {expires_at}")

    return TokenPayload(**fields)
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Safe SQL helpers — literal escaping and a word-boundary SELECT guard.
|
|
2
|
+
|
|
3
|
+
These exist because the runtime previously (a) interpolated LLM/free-text output
|
|
4
|
+
directly into INSERT statements and (b) used a substring keyword blocklist that
|
|
5
|
+
both let crafted statements through and false-positived on legitimate columns
|
|
6
|
+
like ``created_at`` (contains "CREATE") and ``updated_at`` (contains "UPDATE").
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
|
|
11
|
+
# DDL/DML keywords that must never appear in a diagnostic query. Matched as whole
|
|
12
|
+
# words (see ``is_safe_select``) so ``created_at`` / ``updated_at`` are allowed.
|
|
13
|
+
_FORBIDDEN_KEYWORDS = (
|
|
14
|
+
"INSERT", "UPDATE", "DELETE", "DROP", "ALTER", "TRUNCATE",
|
|
15
|
+
"CREATE", "MERGE", "GRANT", "REVOKE", "REPLACE", "COPY",
|
|
16
|
+
"CALL", "EXECUTE", "SET", "USE",
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
_KEYWORD_RE = re.compile(
|
|
20
|
+
r"\b(" + "|".join(_FORBIDDEN_KEYWORDS) + r")\b", re.IGNORECASE
|
|
21
|
+
)
|
|
22
|
+
# A SQL line/block comment can be used to smuggle a second statement past naive
|
|
23
|
+
# checks; reject anything containing comment markers.
|
|
24
|
+
_COMMENT_RE = re.compile(r"(--|/\*|\*/|#)")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def sql_literal(value) -> str:
    """Render a Python value as a safe SQL literal.

    - ``None`` -> ``NULL``
    - ``bool`` -> ``TRUE``/``FALSE``
    - ``int``/``float`` -> bare numeric; NaN and +/-infinity become
      ``CAST('NaN' AS DOUBLE)`` etc., because bare ``nan``/``inf`` are not
      numeric literals
    - everything else -> single-quoted string with backslashes and quotes doubled

    NOTE(review): backslashes are doubled because the Spark SQL parser treats
    ``\\`` as an escape character inside string literals by default, so an
    unescaped ``\\'`` would swallow the doubled quote and end the literal early.
    Confirm no deployment sets ``spark.sql.parser.escapedStringLiterals=true``.
    """
    if value is None:
        return "NULL"
    # bool is a subclass of int, so it must be tested before the numeric branch.
    if isinstance(value, bool):
        return "TRUE" if value else "FALSE"
    if isinstance(value, int):
        # int() normalizes subclasses (e.g. IntEnum) whose repr is not numeric.
        return repr(int(value))
    if isinstance(value, float):
        number = float(value)  # normalizes float subclasses with a custom repr
        if number != number:
            return "CAST('NaN' AS DOUBLE)"
        if number == float("inf"):
            return "CAST('Infinity' AS DOUBLE)"
        if number == float("-inf"):
            return "CAST('-Infinity' AS DOUBLE)"
        return repr(number)
    # Backslashes first, so the quote doubling below is not itself re-escaped.
    text = str(value).replace("\\", "\\\\").replace("'", "''")
    return "'" + text + "'"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# Short alias used by the runtime / notebook code paths.
|
|
45
|
+
sql_lit = sql_literal
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def is_safe_select(query: str) -> bool:
    """Tell whether ``query`` is one read-only SELECT (or WITH ...) statement.

    Returns False for: empty or non-string input, text containing a SQL comment
    marker, more than one statement (any semicolon other than trailing ones),
    text that does not start with SELECT/WITH, and text containing a forbidden
    DDL/DML keyword as a whole word.
    """
    if not query or not isinstance(query, str):
        return False

    # Trailing semicolons are tolerated; anything left over is the statement.
    candidate = query.strip().rstrip(";").strip()
    if not candidate:
        return False

    # Comments could hide a second statement or smuggle keywords, and a
    # remaining semicolon means more than one statement.
    if _COMMENT_RE.search(candidate) or ";" in candidate:
        return False

    if not candidate.upper().startswith(("SELECT", "WITH")):
        return False

    return not _KEYWORD_RE.search(candidate)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _mask_nested_sql(sql: str) -> str:
    """Return ``sql`` with quoted text and parenthesised parts blanked out."""
    masked = []
    depth = 0
    quote = ""
    skip_next = False
    for ch in sql:
        if skip_next:
            # Character following a backslash inside a quoted span.
            skip_next = False
            masked.append(" ")
        elif quote:
            if ch == "\\":
                skip_next = True
            elif ch == quote:
                quote = ""
            masked.append(" ")
        elif ch in "'\"`":
            quote = ch
            masked.append(" ")
        elif ch == "(":
            depth += 1
            masked.append(" ")
        elif ch == ")":
            depth = max(depth - 1, 0)
            masked.append(" ")
        else:
            masked.append(ch if depth == 0 else " ")
    return "".join(masked)


def ensure_limit(query: str, limit: int = 20) -> str:
    """Append a LIMIT clause if the query doesn't already have one.

    Only a LIMIT at the top level of the statement counts. A LIMIT inside a
    subquery, a CTE body, or a quoted string/identifier does not bound the
    outer result, so the cap is still appended in those cases.

    Returns the query with surrounding whitespace and trailing semicolons
    removed, plus `` LIMIT <limit>`` when no top-level LIMIT was found.
    ``limit`` is coerced with ``int()`` before being written into the SQL text.
    """
    stripped = query.strip().rstrip(";").strip()
    if re.search(r"\bLIMIT\b", _mask_nested_sql(stripped), re.IGNORECASE):
        return stripped
    return f"{stripped} LIMIT {int(limit)}"
|
iic/__init__.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Incident Intelligence Core (IIC).
|
|
2
|
+
|
|
3
|
+
A deterministic + AI hybrid engine that converts raw data-pipeline failures into
|
|
4
|
+
structured, prioritized, explainable incident knowledge.
|
|
5
|
+
|
|
6
|
+
Design principles
|
|
7
|
+
-----------------
|
|
8
|
+
1. Deterministic first, AI second — the LLM only runs after the full structured
|
|
9
|
+
context (the :class:`~iic.models.IncidentDNA`) has been built.
|
|
10
|
+
2. Every failure becomes a structured object — one ``IncidentReport`` per root cause.
|
|
11
|
+
3. No external workflow dependencies — Databricks + logs only, no Jira/ServiceNow.
|
|
12
|
+
4. No "chatty AI" — the diagnosis engine returns structured output only.
|
|
13
|
+
5. Every decision is traceable from evidence.
|
|
14
|
+
|
|
15
|
+
The 11-stage pipeline lives in :mod:`iic.runtime.incident_engine`.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
__version__ = "1.2.0"
|
|
21
|
+
|
|
22
|
+
# NOTE: `import iic` is intentionally lightweight and does NOT arm anything.
|
|
23
|
+
# The self-arming tripwire is installed by `iic_autoload.pth` at interpreter
|
|
24
|
+
# startup (it runs `import iic.runtime.bootstrap`). See docs/SELF_ARMING.md.
|
|
25
|
+
|
|
26
|
+
# Operator/console entry points — thin lazy wrappers so `import iic` stays light
|
|
27
|
+
# (the heavy bits load only when these are actually called from a notebook).
|
|
28
|
+
|
|
29
|
+
def console(*args, **kwargs):
    """Fold pending occurrences and render the antibody ledger.

    Lazy wrapper: ``iic._console`` is imported on first call so that
    ``import iic`` stays light. All arguments are forwarded unchanged.
    """
    import iic._console as _console_mod

    return _console_mod.console(*args, **kwargs)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def console_record(*args, **kwargs):
    """Record a human-confirmed resolution for a pattern (append-only).

    Lazy wrapper: ``iic._console`` is imported on first call so that
    ``import iic`` stays light. All arguments are forwarded unchanged.
    """
    import iic._console as _console_mod

    return _console_mod.console_record(*args, **kwargs)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def onboard(*args, **kwargs):
    """One-time first-run setup: secret scope, keys, ACL, dirs, doctor, test card.

    Lazy wrapper: ``iic._console`` is imported on first call so that
    ``import iic`` stays light. All arguments are forwarded unchanged.
    """
    import iic._console as _console_mod

    return _console_mod.onboard(*args, **kwargs)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def doctor(*args, **kwargs):
    """Verify the secret scope, keys, volume write, and webhook. Exit 0 iff healthy.

    Lazy wrapper: ``iic._doctor`` is imported on first call so that
    ``import iic`` stays light. All arguments are forwarded unchanged.
    """
    import iic._doctor as _doctor_mod

    return _doctor_mod.doctor(*args, **kwargs)
|
|
51
|
+
|
iic/__main__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""``python -m iic doctor [--check-principal <name>]`` — the support entry point."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def main(argv=None) -> int:
    """Dispatch ``python -m iic <command>``.

    ``argv`` defaults to ``sys.argv[1:]``. The only command is ``doctor``,
    whose remaining arguments are handed to ``iic._doctor.main``. Anything
    else prints the usage line and returns 2.
    """
    args = sys.argv[1:] if argv is None else argv

    # Guard clause: no command, or an unknown one.
    if not args or args[0] != "doctor":
        print("usage: python -m iic doctor [--check-principal <name>]")
        return 2

    from iic._doctor import main as doctor_main

    return doctor_main(args[1:])
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
if __name__ == "__main__":
|
|
18
|
+
sys.exit(main())
|
iic/_console.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
"""In-workspace console (``iic.console``) and first-run onboarding (``iic.onboard``).
|
|
2
|
+
|
|
3
|
+
Runs from a Databricks notebook on serverless. The console is the ONLY writer of
|
|
4
|
+
``antibodies.yaml`` besides a human editing it: it folds ``.iic_pending/*`` into the
|
|
5
|
+
ledger with the canonical merge rule (:mod:`iic.runtime.ledger`), renders the ledger,
|
|
6
|
+
and records resolutions (append-only). ``onboard`` does the one-time setup: create
|
|
7
|
+
the ``iic`` secret scope, write the keys, grant READ, create the volume dirs, run
|
|
8
|
+
doctor, and send a test card. Everything is best-effort / fail-open.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import glob
|
|
14
|
+
import json
|
|
15
|
+
import os
|
|
16
|
+
|
|
17
|
+
from iic.runtime.antibodies import load_ledger
|
|
18
|
+
from iic.runtime.constants import (
|
|
19
|
+
ANTIBODIES_FILENAME,
|
|
20
|
+
DEFAULT_SECRET_SCOPE,
|
|
21
|
+
PENDING_DIRNAME,
|
|
22
|
+
)
|
|
23
|
+
from iic.runtime.ledger import dump_ledger, merge_ledgers
|
|
24
|
+
|
|
25
|
+
# ── ledger console ──
|
|
26
|
+
|
|
27
|
+
def _resolve_base(base_dir):
|
|
28
|
+
if base_dir:
|
|
29
|
+
return base_dir
|
|
30
|
+
try:
|
|
31
|
+
from iic.runtime.scope_config import load_settings
|
|
32
|
+
return (load_settings() or {}).get("volume_path")
|
|
33
|
+
except Exception:
|
|
34
|
+
return None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _read_markers(base_dir):
|
|
38
|
+
out = []
|
|
39
|
+
try:
|
|
40
|
+
for p in glob.glob(os.path.join(base_dir, PENDING_DIRNAME, "*.json")):
|
|
41
|
+
try:
|
|
42
|
+
with open(p) as f:
|
|
43
|
+
out.append((p, json.load(f)))
|
|
44
|
+
except Exception:
|
|
45
|
+
continue
|
|
46
|
+
except Exception:
|
|
47
|
+
pass
|
|
48
|
+
return out
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _write_ledger(base_dir, ledger):
    """Serialize ``ledger`` and write it to the antibodies file in ``base_dir``.

    The ledger is serialized before the file is opened: opening with ``"w"``
    truncates immediately, so a failure inside ``dump_ledger`` would otherwise
    leave an empty ledger file behind.
    """
    text = dump_ledger(ledger)
    with open(os.path.join(base_dir, ANTIBODIES_FILENAME), "w") as f:
        f.write(text)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def fold(base_dir=None) -> dict:
    """Merge the pending occurrence markers into the antibodies ledger file.

    Resolves the base directory, merges the loaded ledger with every readable
    marker, writes the result, then deletes the markers that were read
    (deletion failures are ignored). Prints a one-line summary.

    Returns the merged ledger.

    Raises:
        RuntimeError: if no base directory can be resolved.
    """
    base = _resolve_base(base_dir)
    if not base:
        raise RuntimeError("no volume_path — configure the 'iic' secret scope or pass base_dir")

    current = load_ledger(base)
    markers = _read_markers(base)
    payloads = [payload for _path, payload in markers]
    merged, stats = merge_ledgers(current, {}, payloads)
    _write_ledger(base, merged)

    # Markers are removed only after the merged ledger has been written.
    for path, _payload in markers:
        try:
            os.remove(path)
        except Exception:
            pass

    fresh = stats["new_keys"]
    summary = f"new pattern(s): {fresh}" if fresh else "no new patterns"
    print(f"[console] folded {len(markers)} occurrence(s); " + summary)
    return merged
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def render(ledger: dict) -> str:
    """Format the ledger as a human-readable, multi-line summary.

    Entries with a non-blank ``resolution`` are listed as resolved (sorted by
    pattern key); all others are listed as awaiting resolution, most
    frequently seen first. A ``resolution`` of ``None`` counts as unresolved.

    Returns a fixed hint sentence when the ledger is empty.
    """
    if not ledger:
        return "No patterns recorded yet. Break something to see your first incident."

    def _resolution_text(entry) -> str:
        # None must not be stringified: str(None) == "None" would look like a fix.
        value = (entry or {}).get("resolution", "")
        return "" if value is None else str(value).strip()

    resolved = [(k, e) for k, e in ledger.items() if _resolution_text(e)]
    unresolved = [(k, e) for k, e in ledger.items() if not _resolution_text(e)]

    lines = [f"Antibody ledger — {len(ledger)} pattern(s): "
             f"{len(resolved)} resolved, {len(unresolved)} awaiting a fix", ""]
    if unresolved:
        lines.append("⚠️ Awaiting resolution (record a fix for these):")
        # Most frequently seen patterns first.
        for k, e in sorted(unresolved, key=lambda kv: -_int((kv[1] or {}).get("times_seen"))):
            lines.append(f" • {k} (seen {_int((e or {}).get('times_seen'))}×) "
                         f"e.g. {(e or {}).get('example', '')}")
        lines.append("")
    if resolved:
        lines.append("♻️ Resolved (these fixes show on the card):")
        # Sort by pattern key only, so entry dicts are never compared.
        for k, e in sorted(resolved, key=lambda kv: kv[0]):
            lines.append(f" • {k} → {(e or {}).get('resolution', '')}")
    return "\n".join(lines)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def record_resolution(pattern_id, resolution, *, base_dir=None, overwrite=False) -> bool:
    """Append-only: set a resolution for a pattern.

    Refuses to replace an existing non-blank resolution unless
    ``overwrite=True``. A stored ``None`` resolution counts as "no resolution
    yet", so it never blocks the first write. An entry is created if
    ``pattern_id`` is not in the ledger.

    Returns True if the ledger was written, False if the write was refused.

    Raises:
        RuntimeError: if no base directory can be resolved.
    """
    base = _resolve_base(base_dir)
    if not base:
        raise RuntimeError("no volume_path — configure the 'iic' secret scope or pass base_dir")
    ledger = load_ledger(base)
    entry = dict(ledger.get(pattern_id) or {})

    # str(None) is the truthy text "None", so None is handled explicitly.
    current = entry.get("resolution", "")
    has_resolution = current is not None and bool(str(current).strip())
    if has_resolution and not overwrite:
        print(f"[console] '{pattern_id}' already has a resolution; pass overwrite=True to replace it")
        return False

    entry.setdefault("times_seen", 0)
    entry["resolution"] = resolution
    ledger[pattern_id] = entry
    _write_ledger(base, ledger)
    print(f"[console] recorded resolution for '{pattern_id}'")
    return True
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def console(base_dir=None) -> dict:
    """Fold pending occurrences, print the rendered ledger and a usage hint.

    Returns the folded ledger dict.
    """
    folded = fold(base_dir)
    report = render(folded)
    print(report)
    print("\nRecord a fix: iic.console_record('<pattern_id>', 'the fix that worked')")
    return folded
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def console_record(pattern_id, resolution, *, base_dir=None, overwrite=False) -> bool:
    """Alias of ``record_resolution``, named to match the hint printed by ``console()``."""
    written = record_resolution(
        pattern_id,
        resolution,
        base_dir=base_dir,
        overwrite=overwrite,
    )
    return written
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _int(x) -> int:
|
|
129
|
+
try:
|
|
130
|
+
return int(x or 0)
|
|
131
|
+
except Exception:
|
|
132
|
+
return 0
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
# ── onboarding ──
|
|
136
|
+
|
|
137
|
+
_SCOPE_KEYS = ("teams_webhook", "volume_path", "host", "pat",
|
|
138
|
+
"github_repo", "github_dispatch_token", "dedup_ttl_seconds")
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _workspace_client():
|
|
142
|
+
try:
|
|
143
|
+
from databricks.sdk import WorkspaceClient
|
|
144
|
+
return WorkspaceClient()
|
|
145
|
+
except Exception:
|
|
146
|
+
return None
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _prompt_answers() -> dict:  # pragma: no cover - interactive
    """Ask the onboarding questions on stdin and return the answers.

    Each answer is whitespace-stripped. ``read_principals`` is the
    comma-separated reply split into a list of non-empty names.
    """
    print("IIC onboarding — answer a few questions (blank to skip optional ones):")

    # (answer key, prompt text), asked in this order.
    questions = (
        ("teams_webhook", " Teams webhook URL (required): "),
        ("volume_path", " Volume path for memory, e.g. /Volumes/cat/sch/libs (required): "),
        ("host", " Workspace host for 'View Run' links (optional): "),
        ("pat", " Workspace PAT to enable enrichment (optional): "),
        ("github_repo", " GitHub owner/repo for incident archiving (optional): "),
        ("github_dispatch_token", " GitHub dispatch token (optional): "),
    )
    answers = {}
    for key, prompt in questions:
        answers[key] = input(prompt).strip()

    raw = input(" Group/SP names to grant READ (comma-separated): ").strip()
    names = (part.strip() for part in raw.split(","))
    answers["read_principals"] = [name for name in names if name]
    return answers
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _send_test_card(webhook) -> None:
|
|
165
|
+
if not webhook:
|
|
166
|
+
return
|
|
167
|
+
try:
|
|
168
|
+
import requests
|
|
169
|
+
card = {"type": "message", "attachments": [{
|
|
170
|
+
"contentType": "application/vnd.microsoft.card.adaptive",
|
|
171
|
+
"content": {"type": "AdaptiveCard", "version": "1.4",
|
|
172
|
+
"body": [{"type": "TextBlock", "weight": "Bolder",
|
|
173
|
+
"text": "✅ IIC connected — you'll see incident cards here."}]}}]}
|
|
174
|
+
requests.post(webhook, json=card, headers={"Content-Type": "application/json"}, timeout=5)
|
|
175
|
+
except Exception:
|
|
176
|
+
pass
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def onboard(*, scope=None, answers=None) -> bool:  # pragma: no cover - Databricks/interactive
    """One-time setup: create the secret scope, write keys, grant READ, create the
    volume dirs, run doctor, and send a test card. Pass ``answers`` to skip prompts.

    ``scope`` defaults to the ``IIC_SECRET_SCOPE`` environment variable, then
    to ``DEFAULT_SECRET_SCOPE``.

    Returns False if a required answer is missing or no WorkspaceClient can be
    built; otherwise True. Individual step failures are reported and skipped.
    """
    scope = scope or os.environ.get("IIC_SECRET_SCOPE", DEFAULT_SECRET_SCOPE)
    a = answers or _prompt_answers()
    if not (a.get("teams_webhook") and a.get("volume_path")):
        print("❌ teams_webhook and volume_path are required.")
        return False
    w = _workspace_client()
    if w is None:
        print("❌ could not construct a Databricks WorkspaceClient — run this in your workspace.")
        return False

    try:
        existing = [s.name for s in w.secrets.list_scopes()]
        if scope not in existing:
            w.secrets.create_scope(scope=scope)
            print(f"✅ created secret scope '{scope}'")
        else:
            print(f"✅ secret scope '{scope}' already exists")
    except Exception as ex:
        print(f"⚠️ could not list/create scope ({str(ex)[:120]}); assuming it exists")

    # Count attempts and successes so the summary line reflects what happened,
    # rather than claiming success after failed writes.
    attempted = 0
    written = 0
    for key in _SCOPE_KEYS:
        val = a.get(key)
        if not val:
            continue
        attempted += 1
        try:
            w.secrets.put_secret(scope=scope, key=key, string_value=str(val))
            written += 1
        except Exception as ex:
            print(f"⚠️ put_secret {key} failed: {str(ex)[:100]}")
    if written == attempted:
        print(f"✅ wrote secrets to scope '{scope}'")
    else:
        print(f"⚠️ wrote {written} of {attempted} secret(s) to scope '{scope}'")

    # ``or []`` tolerates answers that carry read_principals=None.
    for principal in a.get("read_principals") or []:
        try:
            from databricks.sdk.service.workspace import AclPermission
            w.secrets.put_acl(scope=scope, principal=principal, permission=AclPermission.READ)
            print(f"✅ granted READ on '{scope}' to {principal}")
        except Exception as ex:
            print(f"⚠️ put_acl {principal} failed: {str(ex)[:100]}")
    print("ℹ️ EVERY identity that runs monitored jobs must have READ on this scope, "
          "or the agent can't load config there.")

    vol = a.get("volume_path")
    for d in (vol, os.path.join(vol, PENDING_DIRNAME), os.path.join(vol, ".iic_seen")):
        try:
            os.makedirs(d, exist_ok=True)
        except Exception:
            pass

    try:
        from iic._doctor import doctor
        doctor()
    except Exception:
        pass
    _send_test_card(a.get("teams_webhook"))
    print("✅ onboarding complete — break something to see your first card.")
    return True
|