steadystate 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. steadystate/__init__.py +7 -0
  2. steadystate/_http.py +33 -0
  3. steadystate/act/__init__.py +57 -0
  4. steadystate/act/ansible.py +106 -0
  5. steadystate/act/approve.py +144 -0
  6. steadystate/act/artifact.py +94 -0
  7. steadystate/act/base.py +45 -0
  8. steadystate/act/bounds.py +170 -0
  9. steadystate/act/breakglass.py +36 -0
  10. steadystate/act/catalog.py +298 -0
  11. steadystate/act/cleanup.py +209 -0
  12. steadystate/act/codify.py +142 -0
  13. steadystate/act/decide.py +353 -0
  14. steadystate/act/deliver/__init__.py +44 -0
  15. steadystate/act/deliver/base.py +40 -0
  16. steadystate/act/deliver/github_pr.py +189 -0
  17. steadystate/act/deliver/patch_file.py +37 -0
  18. steadystate/act/execute.py +93 -0
  19. steadystate/act/learn.py +284 -0
  20. steadystate/act/plan.py +112 -0
  21. steadystate/act/reflex.py +320 -0
  22. steadystate/act/solution_remedy.py +201 -0
  23. steadystate/act/terraform.py +130 -0
  24. steadystate/catalog.py +206 -0
  25. steadystate/classify.py +103 -0
  26. steadystate/cli.py +2525 -0
  27. steadystate/compliance.py +169 -0
  28. steadystate/config.py +46 -0
  29. steadystate/discover.py +1227 -0
  30. steadystate/domains/__init__.py +89 -0
  31. steadystate/domains/base.py +140 -0
  32. steadystate/domains/compliance.py +217 -0
  33. steadystate/domains/security.py +192 -0
  34. steadystate/domains/security_azure.py +270 -0
  35. steadystate/domains/security_gcp.py +224 -0
  36. steadystate/domains/security_k8s.py +238 -0
  37. steadystate/engine.py +202 -0
  38. steadystate/health.py +53 -0
  39. steadystate/inbound/__init__.py +43 -0
  40. steadystate/inbound/base.py +409 -0
  41. steadystate/inbound/discord.py +201 -0
  42. steadystate/inbound/mcp.py +403 -0
  43. steadystate/inbound/server.py +1365 -0
  44. steadystate/inbound/slack.py +141 -0
  45. steadystate/inbound/teams.py +112 -0
  46. steadystate/inbound/translate.py +249 -0
  47. steadystate/metrics.py +155 -0
  48. steadystate/model.py +85 -0
  49. steadystate/notify/__init__.py +62 -0
  50. steadystate/notify/base.py +21 -0
  51. steadystate/notify/console.py +185 -0
  52. steadystate/notify/discord.py +142 -0
  53. steadystate/notify/github.py +220 -0
  54. steadystate/notify/grafana.py +103 -0
  55. steadystate/notify/pagerduty.py +98 -0
  56. steadystate/notify/prometheus.py +150 -0
  57. steadystate/notify/servicenow.py +234 -0
  58. steadystate/notify/slack.py +137 -0
  59. steadystate/notify/teams.py +138 -0
  60. steadystate/notify/webhook.py +100 -0
  61. steadystate/onboarding.py +398 -0
  62. steadystate/plugins.py +81 -0
  63. steadystate/probe/__init__.py +99 -0
  64. steadystate/probe/ansible_health.py +318 -0
  65. steadystate/probe/argocd.py +75 -0
  66. steadystate/probe/base.py +69 -0
  67. steadystate/probe/custom.py +830 -0
  68. steadystate/probe/docker.py +169 -0
  69. steadystate/probe/kubectl.py +613 -0
  70. steadystate/probe/solutions.py +241 -0
  71. steadystate/reason/__init__.py +1 -0
  72. steadystate/reason/alert.py +119 -0
  73. steadystate/reason/correlate.py +154 -0
  74. steadystate/reason/cost.py +188 -0
  75. steadystate/reason/enrich.py +357 -0
  76. steadystate/reason/explain.py +64 -0
  77. steadystate/reason/llm.py +457 -0
  78. steadystate/reason/pipeline.py +397 -0
  79. steadystate/reason/report.py +74 -0
  80. steadystate/reconcile.py +54 -0
  81. steadystate/reconcile_state.py +230 -0
  82. steadystate/serialize.py +117 -0
  83. steadystate/silos.py +85 -0
  84. steadystate/sources/__init__.py +204 -0
  85. steadystate/sources/ansible.py +138 -0
  86. steadystate/sources/argocd.py +79 -0
  87. steadystate/sources/base.py +141 -0
  88. steadystate/sources/docker_compose.py +169 -0
  89. steadystate/sources/helm.py +96 -0
  90. steadystate/sources/k8s.py +757 -0
  91. steadystate/sources/rancher.py +105 -0
  92. steadystate/sources/terraform.py +119 -0
  93. steadystate/state.py +708 -0
  94. steadystate/sweep.py +176 -0
  95. steadystate/targets.py +140 -0
  96. steadystate-0.1.0.dist-info/METADATA +214 -0
  97. steadystate-0.1.0.dist-info/RECORD +100 -0
  98. steadystate-0.1.0.dist-info/WHEEL +4 -0
  99. steadystate-0.1.0.dist-info/entry_points.txt +2 -0
  100. steadystate-0.1.0.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,7 @@
1
+ """steadystate.ai — stateful monitoring.
2
+
3
+ Reconcile declared infrastructure state (Terraform, ArgoCD, ...) against observed
4
+ reality, reason about the drift, and surface only what's actionable.
5
+ """
6
+
7
+ __version__ = "0.1.0"
steadystate/_http.py ADDED
@@ -0,0 +1,33 @@
1
+ """Internal HTTP helper: every outbound request goes through one audited urlopen.
2
+
3
+ steadystate opens URLs the operator configures -- chat webhooks, a Prometheus/Grafana base,
4
+ the ArgoCD/Rancher APIs, an LLM endpoint. Routing them all through one place lets us enforce a
5
+ single invariant: we only ever speak http(s). That rejects ``file://``, ``ftp://``, ``gopher://``
6
+ and the other schemes ``urllib`` would otherwise honor (the local-file / SSRF surface), and it
7
+ fails fast with a clear error on a mistyped URL -- instead of silently reading a local file.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import urllib.request
13
+ from typing import Any
14
+ from urllib.parse import urlparse
15
+
16
# The only URL schemes this module will ever hand to urllib.
_ALLOWED_SCHEMES = frozenset({"http", "https"})


def _url_of(target: str | urllib.request.Request) -> str:
    """Return the URL string behind ``target`` (a plain URL or a ``Request``)."""
    if isinstance(target, urllib.request.Request):
        return target.full_url
    return target


def safe_urlopen(target: str | urllib.request.Request, *, timeout: float | None = None) -> Any:
    """Open ``target`` with ``urllib.request.urlopen``, but only if its scheme is http(s).

    The scheme is checked first, so a ``ValueError`` for any other scheme (or for a URL with
    no scheme at all) is raised before ``urlopen`` is called. ``timeout`` is passed through
    unchanged; errors raised by ``urlopen`` itself are left to the caller.
    """
    parsed_scheme = urlparse(_url_of(target)).scheme.lower()
    if parsed_scheme in _ALLOWED_SCHEMES:
        # B310: the scheme was just matched against the http(s) allow-list, so this call is
        # the audited gate.
        return urllib.request.urlopen(target, timeout=timeout)  # nosec B310
    raise ValueError(f"refusing to open a non-http(s) URL (scheme: {parsed_scheme or 'none'!r})")
@@ -0,0 +1,57 @@
1
+ """Executor plugins + guardrails -- the act seam, keyed by source.
2
+
3
+ Every remediation is apply-eligibility-checked, snapshotted, verified, and reversible; chat
4
+ (or any trigger) is a convenience, never a bypass of those guardrails. Executors register
5
+ here per source, mirroring DRIFT_SOURCES: a source with an executor can be *acted on*; a
6
+ source with none is **observe-only** -- steadystate detects its drift but cannot remediate it,
7
+ and build_executor returns None. Adding an in-tree backend's act half is one line in
8
+ _BUILTIN_EXECUTORS.
9
+
10
+ Out-of-tree executors register the same way without editing this file: a separately installed
11
+ package declares a `steadystate.executors` entry point (a factory(path) -> Executor) and
12
+ `merged()` overlays it on the built-ins (built-ins win a name clash). See plugins.py. A
13
+ discovered executor is bound by source name, so it pairs with a discovered (or built-in) source.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from collections.abc import Callable
19
+ from pathlib import Path
20
+
21
+ from ..plugins import merged
22
+ from .ansible import AnsibleExecutor
23
+ from .base import Executor
24
+ from .terraform import TerraformExecutor
25
+
26
+
27
def _terraform(path: Path) -> Executor:
    """Build the terraform executor for ``path``."""
    # Only a directory gives the executor somewhere to apply; a captured plan file has no
    # dir to run in, so it gets no working dir and can only plan.
    if path.is_file():
        return TerraformExecutor(working_dir=None)
    return TerraformExecutor(working_dir=path)
30
+
31
+
32
def _ansible(path: Path) -> Executor:
    """Build the ansible executor for ``path``."""
    # Playbook and inventory are read from env (STEADYSTATE_ANSIBLE_PLAYBOOK/_INVENTORY).
    # A directory path becomes the working dir the playbook runs in; a captured-check file
    # provides none.
    if path.is_file():
        return AnsibleExecutor(working_dir=None)
    return AnsibleExecutor(working_dir=path)
36
+
37
+
38
# Registry of in-tree act backends, keyed by source name; each value is a
# factory(path) -> Executor. A source missing from this table is observe-only.
# (k8s/compose are the next entries.)
_BUILTIN_EXECUTORS: dict[str, Callable[[Path], Executor]] = {
    "terraform": _terraform,
    "ansible": _ansible,
}

# The effective registry: the built-ins overlaid with any discovered
# `steadystate.executors` entry points.
EXECUTORS: dict[str, Callable[[Path], Executor]] = merged("executors", _BUILTIN_EXECUTORS)

__all__ = ["EXECUTORS", "Executor", "build_executor"]
49
+
50
+ # Note: reflex/hold (the control loop) lives in act.reflex and is imported directly by callers --
51
+ # kept out of this module's import graph to avoid a cycle (reflex -> approve -> act).
52
+
53
+
54
def build_executor(source: str, path: Path) -> Executor | None:
    """Build the Executor registered for ``source``, handing its factory ``path``.

    Returns None when ``source`` has no entry in EXECUTORS, i.e. it is observe-only."""
    make = EXECUTORS.get(source)
    if make is None:
        return None
    return make(path)
@@ -0,0 +1,106 @@
1
+ """Ansible executor: reconcile a drifted host back to its playbook, with guardrails.
2
+
3
+ A drift from the ansible source is `host:task` -- a task `ansible-playbook --check` said
4
+ would change on a host. Remediation is the natural inverse: run the playbook for real,
5
+ scoped to that host (`ansible-playbook --limit <host>`), which is reconcile-toward-declared
6
+ (the safe self-heal direction -- Ansible doesn't destroy undeclared resources). Live apply is
7
+ gated behind apply-eligibility AND `confirm=True`; nothing runs by default.
8
+
9
+ Ansible is not transactional, so there is no clean snapshot/auto-revert (unlike terraform's
10
+ plan). We're honest about that in the plan's revert guidance. Verify re-runs `--check` for the
11
+ host and reports whether the drift cleared.
12
+
13
+ The playbook + inventory are configured out of band (constructor or the env vars
14
+ STEADYSTATE_ANSIBLE_PLAYBOOK / STEADYSTATE_ANSIBLE_INVENTORY), since the drift input the CLI
15
+ passes is the captured check output, not the playbook itself.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import os
21
+ import subprocess
22
+ from pathlib import Path
23
+
24
+ from ..model import Drift
25
+ from .base import RemediationResult
26
+ from .plan import RemediationPlan, Risk
27
+
28
+
29
class AnsibleExecutor:
    """Executor for the ansible source: re-run the playbook, limited to the drifted host.

    A drift identity has the form ``host:task``. Nothing runs unless ``remediate`` is called
    with ``confirm=True`` and a playbook is configured.
    """

    name = "ansible"

    def __init__(
        self,
        playbook: str | None = None,
        inventory: str | None = None,
        working_dir: str | Path | None = None,
    ) -> None:
        """Store the playbook/inventory, falling back to the STEADYSTATE_ANSIBLE_* env vars."""
        env = os.environ
        self.playbook = playbook or env.get("STEADYSTATE_ANSIBLE_PLAYBOOK")
        self.inventory = inventory or env.get("STEADYSTATE_ANSIBLE_INVENTORY")
        self.working_dir = Path(working_dir) if working_dir else None

    def _host(self, drift: Drift) -> str:
        """The host part of a ``host:task`` identity (everything before the first colon)."""
        host, _, _ = drift.identity.partition(":")
        return host

    def plan_for(self, drift: Drift) -> RemediationPlan:
        """Describe the ``ansible-playbook --limit <host>`` run that would reconcile ``drift``."""
        host = self._host(drift)
        inventory_args = ["-i", self.inventory] if self.inventory else []
        playbook_args = [self.playbook] if self.playbook else []
        return RemediationPlan(
            drift_identity=drift.identity,
            # Always eligible: re-running the playbook converges the host toward declared
            # config (the safe self-heal direction) and does not destroy resources the
            # playbook doesn't mention.
            eligible=True,
            risk=Risk.MEDIUM,
            reason="Re-running the playbook on the host reconciles it to the declared config.",
            command=["ansible-playbook", "--limit", host, *inventory_args, *playbook_args],
            blast_radius=f"Runs the playbook against host {host}.",
            revert=(
                "Ansible is not transactional -- there is no automatic revert; restore from a "
                "known-good playbook state and re-run if needed."
            ),
        )

    def remediate(self, drift: Drift, *, confirm: bool = False) -> RemediationResult:
        """Run the plan for ``drift`` when confirmed, then re-check whether the drift cleared.

        Without ``confirm``, or without a configured playbook, nothing is run and the result
        is ``applied=False``. A failing playbook run raises from ``_run``.
        """
        plan = self.plan_for(drift)

        def _not_applied(detail: str) -> RemediationResult:
            return RemediationResult(plan=plan, applied=False, verified=False, detail=detail)

        if not confirm:
            return _not_applied("Dry run: pass confirm=True (or --apply) to reconcile.")
        if not self.playbook:
            return _not_applied(
                "No playbook configured; set STEADYSTATE_ANSIBLE_PLAYBOOK to apply."
            )

        self._run(plan.command)
        still_drifting = self._still_drifting(drift)
        if still_drifting:
            detail = "Applied, but the host still drifts on re-check."
        else:
            detail = "Applied and verified clear."
        return RemediationResult(
            plan=plan, applied=True, verified=not still_drifting, detail=detail
        )

    # --- live ansible (guarded; not exercised by unit tests) ---

    def _run(self, command: list[str]) -> None:
        """Run ``command`` in the working dir; ``check=True`` raises on a non-zero exit."""
        subprocess.run(command, cwd=self.working_dir, check=True, capture_output=True, text=True)

    def _still_drifting(self, drift: Drift) -> bool:
        """Re-collect drift from the ansible source and report whether ``drift`` is still there."""
        from ..sources.ansible import AnsibleSource

        source = AnsibleSource(
            playbook=self.playbook, inventory=self.inventory, working_dir=self.working_dir
        )
        for remaining in source.collect_drift():
            if remaining.identity == drift.identity:
                return True
        return False
@@ -0,0 +1,144 @@
1
+ """Shared remediation-approval core -- the CLI verbs and the chat listener both call here.
2
+
3
+ Approving rebuilds the source + executor from what the suggesting scan recorded, re-collects
4
+ to match the *live* drift by fingerprint (so the executor's snapshot/verify run against
5
+ reality, and an already-cleared drift is a clean no-op), then applies under the usual
6
+ guardrails. Decline marks it so a re-scan won't re-offer it.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from datetime import UTC, datetime
12
+ from pathlib import Path
13
+
14
+ from ..sources import build_drift_source
15
+ from ..state import (
16
+ APPLIED,
17
+ APPROVED,
18
+ BREAKGLASS,
19
+ DECLINED,
20
+ FAILED,
21
+ NOOP,
22
+ PENDING,
23
+ VERIFIED,
24
+ AuditEntry,
25
+ PendingAction,
26
+ StateStore,
27
+ )
28
+ from . import build_executor
29
+ from .base import RemediationResult
30
+ from .bounds import confirmation_tier
31
+ from .breakglass import BREAKGLASS_SOURCE, breakglass_allowed
32
+ from .catalog import action_for_command
33
+ from .cleanup import CLEANUP_SOURCE, run_cleanup
34
+ from .execute import CATALOG_SOURCE, run_catalog_action
35
+ from .solution_remedy import SOLUTION_SOURCE, run_solution, solution_named
36
+
37
+
38
def _audit(
    action: PendingAction, actor: str, decision: str, outcome: str, detail: str | None
) -> AuditEntry:
    """Assemble the append-only audit record for ``actor``'s ``decision`` on ``action``."""
    # Fields copied straight off the pending action being decided.
    subject = {
        "fingerprint": action.fingerprint,
        "source": action.source,
        "drift_identity": action.drift_identity,
        "environment": action.environment,
    }
    return AuditEntry(actor=actor, decision=decision, outcome=outcome, detail=detail, **subject)
52
+
53
+
54
# Shared reply for "nothing to approve": unknown fingerprint, not PENDING, or lost the claim.
_NOTHING_PENDING = "no pending remediation for that fingerprint."


def _settle(
    store: StateStore,
    action: PendingAction,
    actor: str,
    decision: str,
    result: RemediationResult,
    now: datetime,
) -> tuple[str, RemediationResult]:
    """Audit a finished remediation under ``decision`` and return ``(detail, result)``."""
    # VERIFIED outranks APPLIED; a result that is neither is recorded as FAILED.
    if result.verified:
        outcome = VERIFIED
    elif result.applied:
        outcome = APPLIED
    else:
        outcome = FAILED
    store.record_audit(_audit(action, actor, decision, outcome, result.detail), now)
    return result.detail, result


def apply_pending(
    store: StateStore,
    fingerprint: str,
    actor: str,
    now: datetime | None = None,
    *,
    token: str = "",
) -> tuple[str, RemediationResult | None]:
    """Approve and run the pending remediation for ``fingerprint``.

    Returns a human-readable message plus the RemediationResult when something ran, or None
    when there was nothing to do. Every decision that reaches a real remediation point is
    written to the append-only audit log. ``token`` is the break-glass confirmation (the
    target's name, for a strong-tier override) and is ignored on every other path.
    """
    now = now or datetime.now(UTC)
    action = store.get_pending(fingerprint)
    if action is None or action.status != PENDING:
        return _NOTHING_PENDING, None

    def _claim() -> bool:
        # pending -> approved, claimed BEFORE anything irreversible. Two approvers racing the
        # same fingerprint both read PENDING above; the conditional update lets exactly one
        # win, so the remediation runs at most once.
        return store.claim_pending(fingerprint, PENDING, APPROVED, actor)

    if action.source == BREAKGLASS_SOURCE:
        # An out-of-bound action awaiting a human override. The allowlist is re-checked AT
        # CONFIRM time, then the confirmation friction (strong tier: type the target's name,
        # stored as drift_identity); the run is audited as BREAKGLASS so `history` shows who
        # overrode the bound.
        if not breakglass_allowed(actor):
            return (
                f"break-glass not enabled for you ({actor}). Set STEADYSTATE_BREAKGLASS_USERS.",
                None,
            )
        matched = action_for_command(action.command)
        tier = 0 if matched is None else confirmation_tier(matched.envelope)
        if tier >= 2 and token != action.drift_identity:
            return (
                f"break-glass: type the target to confirm -- "
                f"approve {action.fingerprint} {action.drift_identity}",
                None,
            )
        if not _claim():
            return _NOTHING_PENDING, None
        result = run_catalog_action(action, break_glass=True)
        return _settle(store, action, actor, BREAKGLASS, result, now)

    if action.source == SOLUTION_SOURCE:
        # An authored runbook command: operator-vouched, so no content allow-pattern, but the
        # same race guard and audit. The named solution (in drift_identity) is looked up for
        # the run.
        if not _claim():
            return _NOTHING_PENDING, None
        result = run_solution(action, solution_named(action.drift_identity))
        return _settle(store, action, actor, APPROVED, result, now)

    if action.source in (CLEANUP_SOURCE, CATALOG_SOURCE):
        # A direct catalog command: no drift source or executor, the command is the action.
        # Both the cleanup and the general catalog actions pass through the same claim gate.
        if not _claim():
            return _NOTHING_PENDING, None
        runner = run_cleanup if action.source == CLEANUP_SOURCE else run_catalog_action
        return _settle(store, action, actor, APPROVED, runner(action), now)

    # Drift path: rebuild the executor and source from what the suggesting scan recorded.
    recorded_path = Path(action.path)
    executor = build_executor(action.source, recorded_path)
    if executor is None:
        return f"source '{action.source}' is observe-only; cannot remediate.", None
    if not _claim():
        return _NOTHING_PENDING, None
    # Re-collect and match the *live* drift by fingerprint, so an already-cleared drift is a
    # clean no-op instead of a blind apply.
    live_drifts = build_drift_source(action.source, recorded_path).collect_drift()
    drift = next((d for d in live_drifts if d.fingerprint == fingerprint), None)
    if drift is None:
        store.record_audit(_audit(action, actor, APPROVED, NOOP, "drift no longer present"), now)
        return "drift no longer present; nothing to do.", None
    result = executor.remediate(drift, confirm=True)
    return _settle(store, action, actor, APPROVED, result, now)
132
+
133
+
134
def decline_pending(
    store: StateStore, fingerprint: str, actor: str, now: datetime | None = None
) -> str:
    """Decline the pending remediation for ``fingerprint``. Returns a human message.

    Only an action that is still PENDING can be declined. One that is unknown, or that has
    already been claimed/approved, run, or declined, gets the same "no pending remediation"
    reply ``apply_pending`` gives, and neither its status nor the audit log is touched.
    """
    now = now or datetime.now(UTC)
    action = store.get_pending(fingerprint)
    # get_pending can return actions in any status (apply_pending checks the status too).
    # Without this guard a late decline would overwrite an already-approved or applied action
    # with DECLINED and audit a decline for something that actually ran.
    if action is None or action.status != PENDING:
        return "no pending remediation for that fingerprint."
    # NOTE(review): this check-then-set is not atomic; an approver claiming the action between
    # the read above and the write below can still race it. Confirm whether the store offers a
    # conditional transition suitable for PENDING -> DECLINED.
    store.set_pending_status(fingerprint, DECLINED, actor)
    store.record_audit(_audit(action, actor, DECLINED, DECLINED, None), now)
    return f"declined {fingerprint}"
@@ -0,0 +1,94 @@
1
+ """Remediation artifacts -- a remediation expressed as a *code change*, not a live apply.
2
+
3
+ The deterministic counterpart to the live executors. Where ``terraform.py`` reconciles by
4
+ changing reality to match the repo (``terraform apply``), an artifact reconciles the *other*
5
+ direction: it proposes a repo change a human reviews and merges. The canonical form is a
6
+ **patch** (a git-apply-able unified diff) -- auth-free, VCS-agnostic, and a pure string, so it
7
+ is fully testable and provably *not* model-authored. A branch / PR is a way to *deliver* a
8
+ patch (see ``act/deliver/``), never the artifact itself.
9
+
10
+ The artifact is honest about **state**: a code change for a resource that isn't in state can't
11
+ just edit files -- it must import the resource (the safe, non-destructive direction) or destroy
12
+ it (never automatic). ``state_ops`` records that effect in plain language and ``destructive``
13
+ flags the dangerous direction, so the dimension the apply path glosses over is explicit here.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import re
19
+ from dataclasses import dataclass, field
20
+
21
+ from ..model import ChangeType
22
+
23
+
24
@dataclass
class RemediationArtifact:
    """A drift's fix expressed as a repo change for a human to review and merge.

    ``patch`` (a unified diff) is the deterministic fix itself. ``state_ops`` and
    ``destructive`` spell out the state effect (an import vs a destroy), and ``title`` /
    ``body`` carry the narration -- deterministic today, LLM-authored later when reasoning
    is enabled (the seam is the same)."""

    # What the change concerns, and the change itself.
    drift_identity: str  # the resource, e.g. "aws_s3_bucket.logs"
    change_type: ChangeType
    path: str  # repo-relative file the patch creates or edits
    patch: str  # git-apply-able unified diff: the canonical, auth-free fix

    # How the change affects state, and how it is presented.
    state_ops: list[str] = field(default_factory=list)  # plain-language state effects (imports)
    destructive: bool = False  # True only for a destroy variant; gates labeling + delivery
    title: str = ""  # PR / commit title
    body: str = ""  # PR body: what changes, why, and the import/destroy implication

    @property
    def slug(self) -> str:
        """``drift_identity`` made filesystem-safe (used to name a delivered ``.patch``):
        every character outside ``A-Za-z0-9._-`` becomes an underscore."""
        unsafe_char = r"[^A-Za-z0-9._-]"
        return re.sub(unsafe_char, "_", self.drift_identity)
45
+
46
+
47
def files_from_patch(patch: str) -> dict[str, str]:
    """Reconstruct ``{repo_path: full_file_content}`` from a **whole-file-addition** unified
    diff (the ``diff --git`` / ``--- /dev/null`` / ``+++ b/<path>`` / ``@@`` form).

    This is what an API-based delivery needs when it builds a tree from file contents rather
    than from a diff. Only new-file sections are recovered; an edit or delete section
    contributes nothing for its path, so a caller can detect the gap and skip rather than ship
    a partial change. Deterministic, no git invoked.

    File headers (``--- /dev/null``, ``+++ b/<path>``) are only recognised *before* a
    section's first ``@@`` hunk header. Once inside a hunk, every ``+`` line is file content,
    including content that itself starts with ``++`` (which appears in the patch as ``+++...``
    and must not be mistaken for a header or dropped).
    """
    files: dict[str, str] = {}
    path: str | None = None
    is_new_file = False
    in_hunk = False  # True once the current section's first "@@" header has been seen
    body: list[str] = []

    def _flush() -> None:
        # Commit the section collected so far, but only if it was a new-file section.
        if path is not None and is_new_file:
            files[path] = "\n".join(body) + ("\n" if body else "")

    for line in patch.splitlines():
        if line.startswith("diff --git "):
            # A hunk line always starts with "+", "-", " " or "\", so this is a real header.
            _flush()
            path, is_new_file, in_hunk, body = None, False, False, []
        elif in_hunk:
            if line.startswith("+"):
                body.append(line[1:])
            # Context, removal, "\ No newline" and further "@@" lines add no content.
        elif line.startswith("@@"):
            in_hunk = True
        elif line == "--- /dev/null":
            is_new_file = True
        elif line.startswith("+++ b/"):
            path = line[len("+++ b/") :]
    _flush()
    return files
74
+
75
+
76
def new_file_patch(path: str, content: str) -> str:
    """Build a git-apply-able unified diff that *creates* ``path`` holding ``content``.

    This is the whole-file-addition form (``--- /dev/null`` -> ``+++ b/<path>``). ``content``
    is first normalized to end in a newline, so every added line -- the last one included --
    terminates cleanly and no "No newline at end of file" marker is needed. Pure string
    assembly: deterministic and unit-testable, no git invoked."""
    normalized = content if content.endswith("\n") else content + "\n"
    # Dropping the final newline before splitting avoids a spurious empty last element.
    added = normalized[:-1].split("\n")
    header = [
        f"diff --git a/{path} b/{path}\n",
        "new file mode 100644\n",
        "--- /dev/null\n",
        f"+++ b/{path}\n",
        f"@@ -0,0 +1,{len(added)} @@\n",
    ]
    return "".join(header) + "".join(f"+{text}\n" for text in added)
@@ -0,0 +1,45 @@
1
+ """The Executor plugin seam + its result type.
2
+
3
+ Every remediation must be apply-eligibility-checked, snapshotted, verified, and
4
+ reversible. Chat is a convenient trigger, never a bypass of these guardrails.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass, field
10
+ from typing import Protocol, runtime_checkable
11
+
12
+ from ..model import Drift
13
+ from .artifact import RemediationArtifact
14
+ from .plan import RemediationPlan
15
+
16
+
17
@dataclass
class RemediationResult:
    """What came of one remediation attempt, together with the plan it was made from."""

    plan: RemediationPlan
    applied: bool
    verified: bool  # the post-apply re-check: did the drift actually clear?
    detail: str = ""
    snapshot: dict | None = None  # pre-change state, kept for the record / a revert
24
+
25
+
26
@runtime_checkable
class Executor(Protocol):
    """Structural interface of an act backend: a ``name``, a planner, and a remediator."""

    name: str

    def plan_for(self, drift: Drift) -> RemediationPlan:
        """Describe the remediation for ``drift`` as a plan."""

    def remediate(self, drift: Drift, *, confirm: bool = False) -> RemediationResult:
        """Remediate ``drift`` and report the result; ``confirm`` defaults to False."""
33
+
34
+
35
@runtime_checkable
class Proposer(Protocol):
    """Optional executor capability: express a drift's fix as a reviewable code change rather
    than a live apply.

    Callers probe for it with ``isinstance(executor, Proposer)``. An executor that can render
    a fix as a patch implements ``propose``; one that can only apply live leaves it out, and
    the propose path degrades honestly for it.

    ``propose`` returns ``None`` for a drift it has no code change for (e.g. when the apply
    direction is the right fix), so artifacts are only offered where one genuinely exists."""

    def propose(self, drift: Drift) -> RemediationArtifact | None:
        """Return the artifact for ``drift``, or ``None`` when there is no code change for it."""
@@ -0,0 +1,170 @@
1
+ """Action envelopes + the bound: one impact-and-reversibility calculus for *all* infrastructure.
2
+
3
+ The decision a good operator makes before any change is always the same two questions -- *how much
4
+ does this touch, and can I undo it?* -- and it does not depend on whether the change is a
5
+ `kubectl delete`, a `terraform apply`, or an ansible play. So that calculus lives here, once,
6
+ backend-agnostic: every action declares an ``Envelope`` (its reversibility and its blast radius on
7
+ a generic scale), and a human declares the **bound** -- which envelopes may run unattended. The
8
+ gate (``within_bounds``) sees only the envelope, never a backend.
9
+
10
+ What the bound governs today: the autonomous paths -- reflexes (``hold``), the decider
11
+ (``propose``), and the drift ``--autonomy auto`` path for any executor that declares an envelope
12
+ (terraform does; ``can_run_unattended`` in act/plan.py is the gate). An executor that has not yet
13
+ declared envelopes (ansible) falls back to the older ``eligible`` boolean -- the migration is
14
+ incremental, and absence of an envelope only keeps prior behavior, never loosens it. So this is on
15
+ its way to "one grid governs every source", and is honest that it is not all the way there yet.
16
+
17
+ This is the spine the autonomy story stands on. A reflex today, an LLM tomorrow, decides *what to
18
+ do*; the bound decides *how much it is ever allowed to break*. The decider proposes an action and
19
+ its envelope; the gate checks that envelope against the human's bound; out-of-bound escalates no
20
+ matter how confident the decider is. The one decision that never goes to the code or the model is
21
+ the bound itself -- a human sets it (the conservative default, widened via ``STEADYSTATE_BOUND`` as
22
+ trust grows), and flipping a reflex to ``auto`` can never cross it.
23
+
24
+ The two axes are deliberately generic; each backend maps its own nouns onto them:
25
+
26
+ Impact   k8s          terraform              ansible              compose
27
+ ------   ---          ---------              -------              -------
28
+ ONE      a pod        one resource           one host/task        one container
29
+ SERVICE  a workload   a module               a role               a service
30
+ TENANT   a namespace  a workspace/state      an inventory group   a project/stack
31
+ NODE     a node       --                     a managed host       the docker host
32
+ FLEET    the cluster  a root/account/region  the whole inventory  the engine
33
+
34
+ Reversibility  example
35
+ -------------  -------
36
+ LOSSLESS       destroys nothing of value (delete an evicted pod, `docker rm` a dead container)
37
+ SELF_HEALING   the platform restores it (delete a Running pod, restart a service, cordon a node)
38
+ RECOVERABLE    a known inverse exists (scale down<->up, a re-appliable terraform change)
39
+ IRREVERSIBLE   real loss, no inverse (delete a PVC, `terraform destroy` a database)
40
+ """
41
+
42
+ from __future__ import annotations
43
+
44
+ import os
45
+ from dataclasses import dataclass
46
+ from enum import IntEnum
47
+
48
+ # Both axes are ordinal (IntEnum), worst last -- so a policy is just "the highest tier still
49
+ # allowed" and the gate is a comparison. Names render lowercased for humans; the order is the point.
50
+
51
+
52
class Reversibility(IntEnum):
    """How undoable an action is, and whether anything of value is lost when it is not.

    Ordinal: members compare as integers, least severe first, so a gate can use ``<=`` / ``>=``.
    """

    LOSSLESS = 0  # destroys nothing of value (e.g. delete an evicted pod)
    SELF_HEALING = 1  # the platform restores it (e.g. restart a service)
    RECOVERABLE = 2  # a known inverse exists (e.g. scale down<->up)
    IRREVERSIBLE = 3  # real loss, no inverse (e.g. delete a PVC)
59
+
60
+
61
class Impact(IntEnum):
    """Blast radius of an action on a generic scale that every backend maps its own nouns onto.

    Ordinal: members compare as integers, smallest radius first.
    """

    ONE = 0  # a pod / one resource / one host or task / one container
    SERVICE = 1  # a workload / a module / a role / a service
    TENANT = 2  # a namespace / a workspace or state / an inventory group / a project or stack
    NODE = 3  # a node / a managed host / the docker host
    FLEET = 4  # the cluster / a root, account or region / the whole inventory / the engine
69
+
70
+
71
@dataclass(frozen=True)
class Envelope:
    """An action described in the only two terms the bound looks at: how reversible it is and
    how far it reaches. Backend-agnostic -- a kubectl cleanup, a terraform apply, and an ansible
    play all describe themselves with the same pair. Immutable (frozen dataclass)."""

    reversibility: Reversibility
    impact: Impact

    @property
    def label(self) -> str:
        """Human-readable ``"<reversibility>/<impact>"`` tag, both member names lowercased."""
        parts = (self.reversibility.name, self.impact.name)
        return "/".join(part.lower() for part in parts)
82
+
83
+
84
# The bound maps each reversibility to the HIGHEST impact tier allowed to run unattended, with
# None meaning "never unattended". It is the operator's 3am judgement written down once, and it
# starts conservative: only lossless or self-healing actions, and only inside a small blast
# radius, run without a human. Anything recoverable-or-worse, or anything that reaches a node or
# the fleet, escalates. An operator may widen it as trust grows (the same graduation `hold`'s
# reflexes use), but widening is ALWAYS a human decision -- no decider, reflex or model, sets
# its own bound.
BoundPolicy = dict[Reversibility, "Impact | None"]

DEFAULT_BOUND: BoundPolicy = {
    # lossless: unattended up to a whole tenant (namespace/stack)
    Reversibility.LOSSLESS: Impact.TENANT,
    # self-healing: unattended up to one service
    Reversibility.SELF_HEALING: Impact.SERVICE,
    # a known inverse still needs a human, until trust is earned
    Reversibility.RECOVERABLE: None,
    # never autonomous, at any size
    Reversibility.IRREVERSIBLE: None,
}
98
+
99
+
100
def within_bounds(envelope: Envelope, policy: BoundPolicy = DEFAULT_BOUND) -> bool:
    """Decide whether ``envelope`` may run unattended under ``policy``.

    This is the gate every decider passes through; it looks only at the envelope, never at a
    backend, and does not modify ``policy``.

    Returns ``True`` when the policy holds a ceiling for the envelope's reversibility and the
    envelope's impact does not exceed it. Returns ``False`` (escalate) when the reversibility is
    missing from the policy or mapped to ``None``.
    """
    ceiling = policy.get(envelope.reversibility)
    if ceiling is None:
        # No permitted tier for this reversibility: the safe answer is to escalate.
        return False
    return envelope.impact <= ceiling
106
+
107
+
108
def confirmation_tier(envelope: Envelope, policy: BoundPolicy = DEFAULT_BOUND) -> int:
    """Return how much confirmation friction an action needs, judged from its envelope alone.

    * ``0`` -- inside the bound: autonomous-eligible, no confirmation (`fix`/`run` just runs it).
    * ``2`` -- out of bound and either IRREVERSIBLE or reaching a NODE/the FLEET: STRONG
      break-glass (type the target's name to confirm).
    * ``1`` -- any other out-of-bound action: light break-glass (a plain confirm).

    The most dangerous actions therefore get the most friction automatically.
    """
    if within_bounds(envelope, policy):
        return 0
    needs_strong_confirm = (
        envelope.reversibility >= Reversibility.IRREVERSIBLE or envelope.impact >= Impact.NODE
    )
    return 2 if needs_strong_confirm else 1
119
+
120
+
121
# Lowercased member name -> enum member, used to parse operator-supplied bound settings.
_REVERSIBILITY_BY_NAME = {
    name.lower(): member for name, member in Reversibility.__members__.items()
}
_IMPACT_BY_NAME = {name.lower(): member for name, member in Impact.__members__.items()}
123
+
124
+
125
def _config_bound_table() -> dict:
    """Return the ``bound`` table as provided by the package's ``config_table`` helper."""
    # Imported here, not at module top, so bounds.py stays importable without the config module.
    from ..config import config_table

    table = config_table("bound")
    return table
129
+
130
+
131
def _apply_bound_pair(policy: BoundPolicy, rev_name: str, impact_name: str) -> None:
    """Overlay a single ``reversibility=impact`` setting onto ``policy``, mutating it in place.

    Both names are stripped and lowercased before lookup.

    * Unknown reversibility name: nothing is changed.
    * Impact of ``none`` / ``never`` / empty: the reversibility is set to ``None``, i.e. auto is
      explicitly forbidden for it.
    * Known impact name: that tier becomes the ceiling for the reversibility.
    * Unknown impact name: nothing is changed.

    Unrecognised input is always skipped rather than guessed at, so a typo leaves the existing
    entry untouched and can never silently widen the bound.
    """
    rev = _REVERSIBILITY_BY_NAME.get(rev_name.strip().lower())
    if rev is None:
        return

    wanted = impact_name.strip().lower()
    if wanted in ("none", "never", ""):
        policy[rev] = None  # explicit "never unattended" for this reversibility
        return

    tier = _IMPACT_BY_NAME.get(wanted)
    if tier is not None:
        policy[rev] = tier
    # An unrecognised impact name falls through: the entry stays as it was.
144
+
145
+
146
def bound_from_env(raw: str | None = None) -> BoundPolicy:
    """Resolve the *active* bound policy.

    Starts from a copy of ``DEFAULT_BOUND`` and overlays settings in order, later ones winning:

    1. the committed ``[bound]`` config table (keys are reversibility names, values impact
       names) -- only when ``raw`` is ``None``;
    2. the ``STEADYSTATE_BOUND`` environment variable -- only when ``raw`` is ``None``;
       otherwise ``raw`` itself is parsed in its place.

    The bound is the one decision that should never be casual, so its home is the committed
    config, reviewed in PRs; the environment variable is the per-run override.

    Names: reversibility is one of ``lossless`` / ``self_healing`` / ``recoverable`` /
    ``irreversible``; impact is one of ``one`` / ``service`` / ``tenant`` / ``node`` / ``fleet``.
    Each pair names the HIGHEST impact tier that may run unattended for that reversibility, and
    ``none`` forbids it. The string format is ``reversibility=impact`` pairs separated by commas
    (semicolons are accepted as separators too), e.g. ``STEADYSTATE_BOUND="recoverable=service"``.

    Entries that cannot be parsed are skipped, so a typo never widens the bound. With an explicit
    ``raw`` the function reads neither the config nor the environment.

    Returns a new dict; ``DEFAULT_BOUND`` is not modified.
    """
    active = dict(DEFAULT_BOUND)

    if raw is None:
        # Live resolution: committed config is the baseline, the environment overrides it.
        for name, tier in _config_bound_table().items():
            _apply_bound_pair(active, name, str(tier))
        raw = os.environ.get("STEADYSTATE_BOUND", "")

    for entry in raw.replace(";", ",").split(","):
        if "=" not in entry:
            continue  # a bare token carries no decision
        name, _, tier = entry.partition("=")
        _apply_bound_pair(active, name, tier)

    return active