shkit 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. healing_kit/__init__.py +3 -0
  2. healing_kit/auth.py +79 -0
  3. healing_kit/clients/__init__.py +1 -0
  4. healing_kit/clients/databricks_client.py +183 -0
  5. healing_kit/clients/teams_client.py +128 -0
  6. healing_kit/models/__init__.py +1 -0
  7. healing_kit/models/diagnosis.py +45 -0
  8. healing_kit/models/events.py +30 -0
  9. healing_kit/models/evidence.py +83 -0
  10. healing_kit/runtime/__init__.py +6 -0
  11. healing_kit/runtime/approval.py +141 -0
  12. healing_kit/runtime/maintenance.py +52 -0
  13. healing_kit/services/__init__.py +1 -0
  14. healing_kit/services/cache_service.py +120 -0
  15. healing_kit/services/circuit_breaker.py +114 -0
  16. healing_kit/services/context_agent.py +127 -0
  17. healing_kit/services/dependency_graph.py +141 -0
  18. healing_kit/services/diagnosis_engine.py +165 -0
  19. healing_kit/services/identity.py +61 -0
  20. healing_kit/services/model_router.py +52 -0
  21. healing_kit/services/query_guard.py +168 -0
  22. healing_kit/services/resolution_verifier.py +100 -0
  23. healing_kit/services/token_budget.py +137 -0
  24. healing_kit/utils/__init__.py +1 -0
  25. healing_kit/utils/error_hash.py +15 -0
  26. healing_kit/utils/hmac_tokens.py +86 -0
  27. healing_kit/utils/sql_safety.py +84 -0
  28. iic/__init__.py +51 -0
  29. iic/__main__.py +18 -0
  30. iic/_console.py +235 -0
  31. iic/_doctor.py +143 -0
  32. iic/change/__init__.py +7 -0
  33. iic/change/change_detector.py +154 -0
  34. iic/context/__init__.py +7 -0
  35. iic/context/context_builder.py +117 -0
  36. iic/dependency/__init__.py +7 -0
  37. iic/dependency/dependency_analyzer.py +93 -0
  38. iic/diagnosis/__init__.py +7 -0
  39. iic/diagnosis/diagnosis_engine.py +183 -0
  40. iic/dna/__init__.py +7 -0
  41. iic/dna/dna_builder.py +184 -0
  42. iic/impact/__init__.py +7 -0
  43. iic/impact/impact_engine.py +102 -0
  44. iic/ingestion/__init__.py +14 -0
  45. iic/ingestion/base.py +21 -0
  46. iic/ingestion/databricks_source.py +98 -0
  47. iic/ingestion/static_source.py +23 -0
  48. iic/ingestion/webhook_source.py +39 -0
  49. iic/models/__init__.py +44 -0
  50. iic/models/change.py +77 -0
  51. iic/models/context.py +46 -0
  52. iic/models/diagnosis.py +37 -0
  53. iic/models/dna.py +77 -0
  54. iic/models/event.py +78 -0
  55. iic/models/impact.py +60 -0
  56. iic/models/report.py +88 -0
  57. iic/models/routing.py +41 -0
  58. iic/notify/__init__.py +7 -0
  59. iic/notify/teams_notifier.py +112 -0
  60. iic/report/__init__.py +7 -0
  61. iic/report/report_generator.py +67 -0
  62. iic/routing/__init__.py +7 -0
  63. iic/routing/router.py +42 -0
  64. iic/runtime/__init__.py +10 -0
  65. iic/runtime/_sql.py +11 -0
  66. iic/runtime/agent_config.py +48 -0
  67. iic/runtime/agent_runtime.py +70 -0
  68. iic/runtime/antibodies.py +100 -0
  69. iic/runtime/bootstrap.py +157 -0
  70. iic/runtime/constants.py +40 -0
  71. iic/runtime/context.py +46 -0
  72. iic/runtime/detective.py +72 -0
  73. iic/runtime/hooks.py +85 -0
  74. iic/runtime/incident_engine.py +207 -0
  75. iic/runtime/inprocess.py +350 -0
  76. iic/runtime/ledger.py +120 -0
  77. iic/runtime/monitor.py +155 -0
  78. iic/runtime/pattern_store.py +53 -0
  79. iic/runtime/reconciler.py +139 -0
  80. iic/runtime/scope_config.py +127 -0
  81. iic/runtime/store.py +150 -0
  82. iic/runtime/wrapper.py +28 -0
  83. iic_autoload.pth +1 -0
  84. onboarding/__init__.py +1 -0
  85. onboarding/cli.py +168 -0
  86. onboarding/config_schema.py +62 -0
  87. onboarding/manifest.py +27 -0
  88. onboarding/preflight.py +129 -0
  89. onboarding/provisioner.py +573 -0
  90. onboarding/rollback.py +81 -0
  91. shkit-1.2.0.dist-info/METADATA +239 -0
  92. shkit-1.2.0.dist-info/RECORD +94 -0
  93. shkit-1.2.0.dist-info/WHEEL +4 -0
  94. shkit-1.2.0.dist-info/entry_points.txt +2 -0
iic/_doctor.py ADDED
@@ -0,0 +1,143 @@
1
+ """``iic doctor`` — the #1 support tool. Verifies, in order, with a clear ✅/❌ line:
2
+ secret scope readable → required keys present → volume writable → webhook reachable
3
+ → optional-key consistency → (optionally) a named principal's READ ACL.
4
+
5
+ ``python -m iic doctor [--check-principal <name>]`` and ``iic.doctor()``. Exit code
6
+ is 0 only when all REQUIRED checks pass. Failure messages are actionable on purpose.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import os
12
+ import sys
13
+
14
+ from iic.runtime.constants import (
15
+ DEFAULT_SECRET_SCOPE,
16
+ SECRET_KEY_GITHUB_REPO,
17
+ SECRET_KEY_GITHUB_TOKEN,
18
+ SECRET_KEY_HOST,
19
+ SECRET_KEY_PAT,
20
+ SECRET_KEY_TEAMS_WEBHOOK,
21
+ SECRET_KEY_VOLUME_PATH,
22
+ TEAMS_TIMEOUT,
23
+ )
24
+
25
+ _LOAD = "__load__"
26
+
27
+
28
def doctor(check_principal=None, *, settings=_LOAD, probes=None) -> int:
    """Run the ``iic doctor`` checks in order, printing one status line per check.

    Order: secret scope readable, required keys present, volume writable,
    webhook reachable, optional-key consistency warnings, and (only when
    ``check_principal`` is given) that principal's ACL on the scope.

    Args:
        check_principal: Optional principal name whose ACL on the scope is probed.
        settings: Settings mapping to inspect. When left at the ``_LOAD``
            sentinel the settings are loaded via ``_safe_load_settings()``;
            passing ``None`` is treated as "scope not readable".
        probes: Optional mapping with ``"volume_write"``, ``"webhook"`` and
            ``"principal"`` callables, each returning ``(ok, message)``.
            Falls back to ``_default_probes()`` when falsy.

    Returns:
        ``0`` when no check failed, ``1`` otherwise. An unreadable scope or a
        missing required key returns ``1`` immediately, before any probe runs.
    """
    scope = os.environ.get("IIC_SECRET_SCOPE", DEFAULT_SECRET_SCOPE)
    probes = probes or _default_probes()
    failures = 0

    def _mark(passed):
        # Status glyph that prefixes every probe result line.
        return "✅" if passed else "❌"

    cfg = settings
    if settings == _LOAD:
        cfg = _safe_load_settings()
    if cfg is None:
        print(f"❌ secret scope '{scope}' not readable as the current identity")
        print(f" → grant READ: databricks secrets put-acl --scope {scope} "
              "--principal <you-or-your-SP> --permission READ")
        return 1
    print(f"✅ secret scope '{scope}' readable")

    required_keys = (SECRET_KEY_TEAMS_WEBHOOK, SECRET_KEY_VOLUME_PATH)
    missing = [key for key in required_keys if not cfg.get(key)]
    if missing:
        print(f"❌ missing required key(s): {missing}")
        print(f" → databricks secrets put-secret --scope {scope} --key {missing[0]} ...")
        return 1
    print("✅ required keys present (teams_webhook, volume_path)")

    # Volume probe: a failure is recorded but the remaining checks still run.
    passed, detail = probes["volume_write"](cfg.get(SECRET_KEY_VOLUME_PATH))
    print(_mark(passed) + f" volume writable: {detail}")
    if not passed:
        print(f" → ensure the run identity can write {cfg.get(SECRET_KEY_VOLUME_PATH)} "
              "(WRITE VOLUME / Unity Catalog grants)")
        failures += 1

    passed, detail = probes["webhook"](cfg.get(SECRET_KEY_TEAMS_WEBHOOK))
    print(_mark(passed) + f" webhook reachable: {detail}")
    if not passed:
        print(" → check the teams_webhook URL (is the Power Automate flow enabled?)")
        failures += 1

    # Optional keys come in pairs; a half-configured pair only warns.
    has_pat, has_host = bool(cfg.get(SECRET_KEY_PAT)), bool(cfg.get(SECRET_KEY_HOST))
    if has_pat != has_host:
        print("⚠️ pat/host should be set together — enrichment needs both (non-fatal)")
    has_repo = bool(cfg.get(SECRET_KEY_GITHUB_REPO))
    has_token = bool(cfg.get(SECRET_KEY_GITHUB_TOKEN))
    if has_repo != has_token:
        print("⚠️ github_repo/github_dispatch_token should be set together (non-fatal)")

    if check_principal:
        passed, detail = probes["principal"](scope, check_principal)
        print(_mark(passed) + f" principal '{check_principal}' can read '{scope}': {detail}")
        if not passed:
            print(f" → databricks secrets put-acl --scope {scope} "
                  f"--principal {check_principal} --permission READ")
            failures += 1

    if failures:
        print("❌ doctor found problems (see above)")
        return 1
    print("✅ all required checks passed")
    return 0
76
+
77
+
78
def _safe_load_settings():
    """Return ``load_settings()`` from ``iic.runtime.scope_config``, or ``None``.

    Both the import and the call are guarded: any exception yields ``None``,
    which ``doctor`` reports as an unreadable secret scope.
    """
    try:
        from iic.runtime import scope_config
        loaded = scope_config.load_settings()
    except Exception:
        return None
    else:
        return loaded
84
+
85
+
86
def _default_probes():
    """Return the probe table used by ``doctor`` when no ``probes`` are injected."""
    return dict(
        volume_write=_probe_volume_write,
        webhook=_probe_webhook,
        principal=_probe_principal,
    )
89
+
90
+
91
def _probe_volume_write(volume_path):
    """Check that ``volume_path`` is writable by creating and deleting a marker file.

    Returns:
        ``(True, message)`` when the write and delete both succeed, otherwise
        ``(False, message)`` where the message carries at most 120 characters
        of the exception text. A falsy path fails without touching the disk.
    """
    import uuid

    if not volume_path:
        return False, "no volume_path"
    try:
        # Random suffix so concurrent doctor runs never collide on the marker.
        marker = os.path.join(volume_path, f".iic_doctor_{uuid.uuid4().hex[:8]}")
        with open(marker, "w") as handle:
            handle.write("ok")
        os.remove(marker)
    except Exception as err:
        return False, f"{volume_path}: {str(err)[:120]}"
    else:
        return True, f"{volume_path} (probe write+delete ok)"
103
+
104
+
105
def _probe_webhook(webhook):
    """POST a small Adaptive Card to ``webhook`` and report the HTTP status.

    Returns:
        ``(ok, message)``; ``ok`` is true only for HTTP 200 or 202. A falsy
        URL fails immediately. Any exception (including ``requests`` being
        unavailable) is reported as a failure with up to 120 characters of
        the exception text.
    """
    if not webhook:
        return False, "no teams_webhook"
    try:
        import requests

        text_block = {"type": "TextBlock", "text": "✅ IIC doctor probe"}
        adaptive_card = {"type": "AdaptiveCard", "version": "1.4", "body": [text_block]}
        attachment = {
            "contentType": "application/vnd.microsoft.card.adaptive",
            "content": adaptive_card,
        }
        payload = {"type": "message", "attachments": [attachment]}
        response = requests.post(
            webhook,
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=TEAMS_TIMEOUT,
        )
        status = response.status_code
        return (status in (200, 202)), f"HTTP {status}"
    except Exception as err:
        return False, str(err)[:120]
119
+
120
+
121
def _probe_principal(scope, principal):
    """Look for an ACL entry for ``principal`` on secret scope ``scope``.

    Uses ``databricks.sdk.WorkspaceClient().secrets.list_acls``. The first
    entry whose ``principal`` attribute equals ``principal`` wins.

    Returns:
        ``(True, permission_text)`` when an entry is found,
        ``(False, "no ACL entry")`` when none matches, and
        ``(False, error_text)`` (at most 120 characters) on any exception.
    """
    try:
        from databricks.sdk import WorkspaceClient

        client = WorkspaceClient()
        for entry in client.secrets.list_acls(scope=scope):
            if getattr(entry, "principal", None) != principal:
                continue
            return True, str(getattr(entry, "permission", ""))
        return False, "no ACL entry"
    except Exception as err:
        return False, str(err)[:120]
131
+
132
+
133
def main(argv=None) -> int:
    """Command-line entry point for ``python -m iic doctor``.

    Recognizes ``--check-principal <name>`` and ``--check-principal=<name>``
    (first occurrence wins) and forwards the name to ``doctor``.

    Args:
        argv: Argument list; defaults to ``sys.argv[1:]`` when ``None``.

    Returns:
        The exit code from ``doctor``, or ``2`` (usage error) when the
        principal flag is present without a value. Previously that case was
        silently ignored, so the requested ACL check was skipped and the run
        could still report success.
    """
    args = list(sys.argv[1:] if argv is None else argv)
    flag = "--check-principal"
    principal = None
    requested = False

    for idx, arg in enumerate(args):
        if arg == flag:
            requested = True
            if idx + 1 < len(args):
                principal = args[idx + 1]
            break
        if arg.startswith(flag + "="):
            requested = True
            principal = arg.split("=", 1)[1]
            break

    if requested and not principal:
        # Refuse to run rather than pretend the principal check happened.
        print(f"usage: iic doctor [{flag} <name>] — {flag} requires a value",
              file=sys.stderr)
        return 2
    return doctor(check_principal=principal)
140
+
141
+
142
+ if __name__ == "__main__": # pragma: no cover
143
+ sys.exit(main())
iic/change/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """Stage 5 — change detection."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from iic.change.change_detector import ChangeDetector
6
+
7
+ __all__ = ["ChangeDetector"]
@@ -0,0 +1,154 @@
1
+ """Stage 5 — diff the failed run against the last successful run.
2
+
3
+ Most production failures correlate with a recent change, so this is one of the
4
+ highest-value signals. The comparison is split into a pure, fully-testable core
5
+ (``extract_profile`` + ``diff_profiles`` + ``diff_schemas``) and a thin I/O wrapper
6
+ (``ChangeDetector.detect``) that fetches the two runs via the client.
7
+
8
+ What we can compare from the Jobs/Runs API without extra infrastructure:
9
+ * config — task base_parameters
10
+ * runtime — spark_version, node_type_id, worker count
11
+ * deployment — git commit of the run's source
12
+ * code — notebook revision timestamps
13
+ * schema — only when before/after snapshots are supplied
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from iic.models.change import ChangeDiffObject, FieldChange
19
+
20
+
21
def extract_profile(run: dict) -> dict:
    """Flatten a Jobs/Runs API dict into a flat, comparable profile.

    Tasks and ``git_source`` are read from the top level first and from
    ``run["settings"]`` as a fallback, so both un-nested and
    settings-nested payloads are accepted.

    Returns:
        A dict with ``params`` (``"<task_key>.<param>" -> str``),
        ``spark_version``, ``node_type_id``, ``num_workers`` (stringified),
        ``git_commit`` (commit, else branch, else ``""``) and
        ``notebook_revisions`` (``task_key -> str``).
    """
    run = run or {}
    settings = run.get("settings", run)  # un-nested payloads act as their own settings
    tasks = run.get("tasks") or settings.get("tasks") or []

    params: dict = {}
    notebook_revisions: dict = {}
    for task in tasks:
        notebook = task.get("notebook_task", {}) or {}
        base_params = notebook.get("base_parameters") or {}
        for name, value in base_params.items():
            params[f"{task.get('task_key', '?')}.{name}"] = str(value)

        # Only notebook-backed tasks can carry a revision marker.
        if not (notebook.get("source") == "GIT" or notebook.get("notebook_path")):
            continue
        revision = task.get("git_source_revision") or notebook.get("revision_timestamp")
        if revision:
            notebook_revisions[task.get("task_key", "?")] = str(revision)

    cluster = _first_cluster(run, tasks)
    git = run.get("git_source") or settings.get("git_source") or {}

    profile = {
        "params": params,
        "spark_version": cluster.get("spark_version", ""),
        "node_type_id": cluster.get("node_type_id", ""),
        "num_workers": str(cluster.get("num_workers", "")),
        "git_commit": git.get("git_commit") or git.get("git_branch") or "",
        "notebook_revisions": notebook_revisions,
    }
    return profile
49
+
50
+
51
def _first_cluster(run: dict, tasks: list) -> dict:
    """Return the first non-empty ``new_cluster`` spec for a run or job.

    Lookup order:
      1. ``job_clusters`` at the top level of ``run``;
      2. ``job_clusters`` nested under ``run["settings"]`` — the same fallback
         ``extract_profile`` already applies to ``tasks`` and ``git_source``,
         without which settings-nested payloads lose their cluster shape;
      3. a per-task ``new_cluster``.

    Args:
        run: Run or job dict (may be empty or ``None``).
        tasks: Task dicts already resolved by the caller.

    Returns:
        The cluster spec dict, or ``{}`` when none is found.
    """
    run = run or {}
    settings = run.get("settings") or {}
    job_clusters = run.get("job_clusters") or settings.get("job_clusters") or []
    for jc in job_clusters:
        spec = jc.get("new_cluster") or {}
        if spec:
            return spec
    for t in tasks or []:
        spec = t.get("new_cluster") or {}
        if spec:
            return spec
    return {}
61
+
62
+
63
def diff_profiles(failed: dict, success: dict) -> ChangeDiffObject:
    """Diff two run profiles into a ``ChangeDiffObject`` (schema diff excluded).

    ``success`` supplies the "before" values and ``failed`` the "after"
    values. The result is always created with ``has_prior_success=True``.
    """
    result = ChangeDiffObject(has_prior_success=True)

    # config: per-task parameter values
    result.config_changes = _diff_maps(
        failed.get("params", {}), success.get("params", {}), "config"
    )

    # runtime: cluster shape, keyed by profile field -> reported label
    runtime_labels = {
        "spark_version": "spark_version",
        "node_type_id": "node_type",
        "num_workers": "num_workers",
    }
    for field_name, label in runtime_labels.items():
        old = success.get(field_name, "")
        new = failed.get(field_name, "")
        if old == new or not (old or new):
            continue
        result.runtime_changes.append(FieldChange("runtime", label, old, new))

    # deployment: the git commit moved
    old_commit = success.get("git_commit", "")
    new_commit = failed.get("git_commit", "")
    if old_commit != new_commit and (old_commit or new_commit):
        result.deployment_changes.append(
            FieldChange("deployment", "git_commit", old_commit, new_commit)
        )

    # code: notebook revisions changed
    result.code_changes = _diff_maps(
        failed.get("notebook_revisions", {}),
        success.get("notebook_revisions", {}),
        "code",
    )
    return result
88
+
89
+
90
def diff_schemas(before: dict, after: dict) -> list[FieldChange]:
    """Diff two ``{table: [columns]}`` snapshots into schema ``FieldChange`` items.

    Tables are visited in sorted order. For each table the removed columns
    (sorted) are emitted first with ``after=""``, then the added columns
    (sorted) with ``before=""``. Column order inside a snapshot is ignored.
    """
    changes: list[FieldChange] = []
    for table in sorted(set(before).union(after)):
        old_cols = set(before.get(table, []))
        new_cols = set(after.get(table, []))
        removed = sorted(old_cols.difference(new_cols))
        added = sorted(new_cols.difference(old_cols))
        changes.extend(
            FieldChange("schema", f"{table}.{col}", before=col, after="") for col in removed
        )
        changes.extend(
            FieldChange("schema", f"{table}.{col}", before="", after=col) for col in added
        )
    return changes
101
+
102
+
103
def _diff_maps(failed: dict, success: dict, category: str) -> list[FieldChange]:
    """Return one ``FieldChange`` per key whose value differs between the maps.

    Keys from both maps are visited in sorted order; a key missing on one
    side is compared as ``""``. ``success`` provides the "before" value and
    ``failed`` the "after" value, both stringified in the result.
    """
    all_keys = sorted(set(failed).union(success))
    pairs = ((key, success.get(key, ""), failed.get(key, "")) for key in all_keys)
    return [
        FieldChange(category, key, str(old), str(new))
        for key, old, new in pairs
        if old != new
    ]
110
+
111
+
112
class ChangeDetector:
    """Fetch the failed run and the last successful run, then diff them."""

    def __init__(self, client=None):
        # Client exposing ``_get(path, params)`` and ``list_runs(job_id, limit=...)``.
        self.client = client

    def detect(self, job_id: str, failed_run_id: str,
               failed_run: dict | None = None,
               prev_schema: dict | None = None,
               curr_schema: dict | None = None) -> ChangeDiffObject:
        """Build the change diff for a failed run.

        Without a client or a ``job_id`` an empty diff with
        ``has_prior_success=False`` is returned immediately. Schema changes
        are appended only when both snapshots are supplied.
        """
        if not self.client or not job_id:
            return ChangeDiffObject(has_prior_success=False)

        failed = failed_run or self._get_run(failed_run_id)
        success = self._last_success(job_id, exclude_run_id=failed_run_id)
        if success:
            diff = diff_profiles(extract_profile(failed), extract_profile(success))
            diff.last_success_run_id = str(success.get("run_id", ""))
        else:
            diff = ChangeDiffObject(has_prior_success=False)

        have_both_snapshots = prev_schema is not None and curr_schema is not None
        if have_both_snapshots:
            diff.schema_changes.extend(diff_schemas(prev_schema, curr_schema))
        return diff

    def _get_run(self, run_id: str) -> dict:
        """Fetch one run via ``/api/2.1/jobs/runs/get``; ``{}`` on any error."""
        try:
            query = {"run_id": int(run_id)}
            return self.client._get("/api/2.1/jobs/runs/get", query)
        except Exception:
            return {}

    def _last_success(self, job_id: str, exclude_run_id: str) -> dict:
        """Return the first listed SUCCESS run other than ``exclude_run_id``.

        Only the 20 runs returned by ``list_runs`` are inspected. The match
        is re-fetched for full task/cluster detail; if that fetch comes back
        empty the list entry itself is returned. ``{}`` when nothing matches
        or the listing fails.
        """
        try:
            candidates = self.client.list_runs(int(job_id), limit=20)
        except Exception:
            return {}

        excluded = str(exclude_run_id)
        for candidate in candidates:
            candidate_id = str(candidate.get("run_id", ""))
            if candidate_id == excluded:
                continue
            if candidate.get("state", {}).get("result_state") != "SUCCESS":
                continue
            return self._get_run(candidate_id) or candidate
        return {}
@@ -0,0 +1,7 @@
1
+ """Stage 3 — context building."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from iic.context.context_builder import ContextBuilder
6
+
7
+ __all__ = ["ContextBuilder"]
@@ -0,0 +1,117 @@
1
+ """Stage 3 — gather everything relevant about a failure.
2
+
3
+ Best-effort by design: each fetch is independently guarded so a missing notebook,
4
+ an unreachable cluster, or a lineage gap degrades the bundle rather than aborting
5
+ the pipeline. Table references are parsed from the notebook source so later stages
6
+ (schema snapshot, dependency analysis) have something to work with even when UC
7
+ lineage is unavailable.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import re
13
+
14
+ from iic.models.context import IncidentContextBundle
15
+ from iic.models.event import NormalizedFailureEvent
16
+
17
+ # Three-part (catalog.schema.table) or two-part names following FROM/JOIN/INTO/UPDATE/TABLE.
18
+ _TABLE_RE = re.compile(
19
+ r"\b(?:FROM|JOIN|INTO|UPDATE|TABLE)\s+([a-zA-Z_][\w]*(?:\.[a-zA-Z_][\w]*){1,2})",
20
+ re.IGNORECASE,
21
+ )
22
+ _MAX_NOTEBOOK_CHARS = 4000
23
+
24
+
25
class ContextBuilder:
    """Builds an :class:`IncidentContextBundle` from a normalized event.

    ``client`` is a DatabricksClient-like object; ``spark`` is optional and only
    used to snapshot the referenced tables' schemas. Both may be ``None`` in
    tests. Every fetch helper is best-effort: a failure or a malformed payload
    degrades that part of the bundle to an empty value instead of raising.
    """

    def __init__(self, client=None, spark=None):
        self.client = client
        self.spark = spark

    def build(self, event: NormalizedFailureEvent) -> IncidentContextBundle:
        """Assemble the context bundle for ``event``."""
        bundle = IncidentContextBundle(event_id=event.event_id)

        # Logs always include the failure text we already have.
        logs = []
        if event.error_message:
            logs.append(event.error_message)
        if event.error_trace:
            logs.append(event.error_trace)
        bundle.logs = logs

        bundle.notebook_source = self._notebook(event.notebook_path)
        bundle.referenced_tables = self._tables(bundle.notebook_source)
        bundle.cluster_state = self._cluster(event.cluster_id)
        bundle.job_metadata, dag = self._job_metadata(event.job_id, event.task)
        bundle.upstream_tasks, bundle.downstream_tasks = dag
        bundle.schema_snapshot = self._schema(bundle.referenced_tables)
        return bundle

    # ─── individual, independently-guarded fetches ───

    def _notebook(self, path: str) -> str:
        """Return up to ``_MAX_NOTEBOOK_CHARS`` of the notebook source, or ``""``."""
        if not (path and self.client):
            return ""

        def _fetch() -> str:
            try:
                return (self.client.export_notebook(path) or "")[:_MAX_NOTEBOOK_CHARS]
            except Exception:
                return ""

        # Memoized by path only when the session cache is active (i.e. on a
        # bootstrapped cluster). Off-cluster/tests → always a fresh fetch.
        from iic.runtime.context import cached
        return cached(f"notebook:{path}", _fetch)

    @staticmethod
    def _tables(notebook_source: str) -> list[str]:
        """Return the sorted, lower-cased, de-duplicated table names in the source."""
        if not notebook_source:
            return []
        found = {m.group(1).lower() for m in _TABLE_RE.finditer(notebook_source)}
        return sorted(found)

    def _cluster(self, cluster_id: str) -> dict:
        """Return the cluster id plus the types of its 5 most recent events.

        Returns ``{}`` when there is no cluster id or client, or when the
        fetch fails or yields a malformed payload. A ``None`` result from the
        client is treated as "no events".
        """
        if not (cluster_id and self.client):
            return {}
        try:
            events = self.client.get_cluster_events(cluster_id, limit=5) or []
            # Inside the guard so a malformed event cannot abort the pipeline.
            recent = [e.get("type", "") for e in events]
        except Exception:
            return {}
        return {"cluster_id": cluster_id, "recent_events": recent}

    def _job_metadata(self, job_id: str, failed_task: str):
        """Return (metadata_dict, (upstream_task_keys, downstream_task_keys)).

        Upstream keys are the failed task's direct ``depends_on`` entries;
        downstream keys are the tasks that directly depend on it. A missing
        or ``None`` ``depends_on`` is treated as "no dependencies".
        """
        if not (job_id and self.client):
            return {}, ([], [])
        try:
            job = self.client.get_job(int(job_id)) or {}
        except Exception:
            return {}, ([], [])
        settings = job.get("settings", {}) or {}
        tasks = settings.get("tasks", []) or []
        upstream, downstream = [], []
        for t in tasks:
            # ``depends_on`` may be absent or explicitly null in the payload.
            deps = t.get("depends_on") or []
            if t.get("task_key") == failed_task:
                upstream = [d.get("task_key", "") for d in deps]
            if any(d.get("task_key") == failed_task for d in deps):
                downstream.append(t.get("task_key", ""))
        meta = {"job_name": settings.get("name", ""), "task_count": len(tasks)}
        return meta, (upstream, downstream)

    def _schema(self, tables: list[str]) -> dict:
        """Return ``{table: [column names]}`` for the first three tables.

        Uses ``DESCRIBE TABLE`` through ``self.spark``. Tables whose describe
        fails are skipped; rows with an empty name or a name starting with
        ``#`` are dropped.
        """
        if not (tables and self.spark):
            return {}
        snapshot = {}
        for table in tables[:3]:  # cap the cost; first few referenced tables
            try:
                rows = self.spark.sql(f"DESCRIBE TABLE {table}").collect()
                snapshot[table] = [
                    r["col_name"] for r in rows
                    if r["col_name"] and not r["col_name"].startswith("#")
                ]
            except Exception:
                continue
        return snapshot
@@ -0,0 +1,7 @@
1
+ """Stage 4 — dependency & lineage analysis (blast radius)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from iic.dependency.dependency_analyzer import BlastRadius, DependencyAnalyzer
6
+
7
+ __all__ = ["DependencyAnalyzer", "BlastRadius"]
@@ -0,0 +1,93 @@
1
+ """Stage 4 — compute the blast radius of a failure.
2
+
3
+ Combines two graphs, both deterministic:
4
+ * the job DAG (task-level downstream tasks that are now blocked), reusing the
5
+ existing :class:`DependencyGraphBuilder`; and
6
+ * Unity Catalog table lineage (downstream tables / dashboards), when reachable.
7
+
8
+ The result feeds the impact engine. No LLM, no Spark.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from dataclasses import dataclass, field
14
+
15
+ from healing_kit.services.dependency_graph import DependencyGraphBuilder
16
+
17
+
18
@dataclass
class BlastRadius:
    """Downstream tasks, tables and dashboards put at risk by a failed task."""

    downstream_tasks: list[str] = field(default_factory=list)
    downstream_tables: list[str] = field(default_factory=list)
    downstream_dashboards: list[str] = field(default_factory=list)

    @property
    def downstream_jobs(self) -> int:
        """Number of downstream tasks."""
        return len(self.downstream_tasks)

    @property
    def affected_tables(self) -> int:
        """Number of downstream tables."""
        return len(self.downstream_tables)

    @property
    def dashboard_impact(self) -> int:
        """Number of downstream dashboards."""
        return len(self.downstream_dashboards)

    @property
    def total(self) -> int:
        """Sum of the three counts above."""
        counts = (self.downstream_jobs, self.affected_tables, self.dashboard_impact)
        return sum(counts)

    def to_dict(self) -> dict:
        """Return the three lists keyed by field name (the lists are not copied)."""
        names = ("downstream_tasks", "downstream_tables", "downstream_dashboards")
        return {name: getattr(self, name) for name in names}
48
+
49
+
50
class DependencyAnalyzer:
    """Computes a :class:`BlastRadius` for a failed task.

    The task-level part comes from the job DAG handed to :meth:`analyze`;
    ``client`` is optional and only consulted for table lineage.
    """

    def __init__(self, client=None):
        self.client = client
        self._builder = DependencyGraphBuilder()

    def analyze(self, tasks: list[dict], failed_task: str, referenced_tables: list[str] | None = None) -> BlastRadius:
        """Return the blast radius of ``failed_task`` within ``tasks``.

        Any error while building or walking the task graph leaves
        ``downstream_tasks`` empty instead of propagating.
        """
        radius = BlastRadius()

        # Task level: everything the graph builder reports as downstream.
        try:
            dag = self._builder.build_from_tasks(tasks or [])
            blocked = self._builder.get_all_downstream(dag, failed_task)
        except Exception:
            blocked = []
        radius.downstream_tasks = blocked

        # Table level: lineage for each referenced table.
        tables, dashboards = self._lineage(referenced_tables or [])
        radius.downstream_tables = tables
        radius.downstream_dashboards = dashboards
        return radius

    def _lineage(self, tables: list[str]):
        """Return (sorted downstream table names, sorted dashboard names).

        Only the first five tables are queried; a table whose lineage lookup
        raises is skipped. Dashboards are identified by ``name``, falling
        back to ``id``, and stringified. Without a client both lists are empty.
        """
        if not self.client:
            return [], []

        table_names: set[str] = set()
        dashboard_names: set[str] = set()
        for table in tables[:5]:
            try:
                lineage = self.client.get_table_lineage(table) or {}
            except Exception:
                continue
            for entry in lineage.get("downstreams", []) or []:
                info = entry.get("tableInfo") or {}
                table_name = info.get("name")
                if table_name:
                    table_names.add(table_name)
                labels = (
                    dash.get("name") or dash.get("id")
                    for dash in entry.get("dashboards", []) or []
                )
                dashboard_names.update(str(label) for label in labels if label)
        return sorted(table_names), sorted(dashboard_names)
@@ -0,0 +1,7 @@
1
+ """Stage 9 — diagnosis (LLM, structured output only)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from iic.diagnosis.diagnosis_engine import DiagnosisEngine
6
+
7
+ __all__ = ["DiagnosisEngine"]