steadystate 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- steadystate/__init__.py +7 -0
- steadystate/_http.py +33 -0
- steadystate/act/__init__.py +57 -0
- steadystate/act/ansible.py +106 -0
- steadystate/act/approve.py +144 -0
- steadystate/act/artifact.py +94 -0
- steadystate/act/base.py +45 -0
- steadystate/act/bounds.py +170 -0
- steadystate/act/breakglass.py +36 -0
- steadystate/act/catalog.py +298 -0
- steadystate/act/cleanup.py +209 -0
- steadystate/act/codify.py +142 -0
- steadystate/act/decide.py +353 -0
- steadystate/act/deliver/__init__.py +44 -0
- steadystate/act/deliver/base.py +40 -0
- steadystate/act/deliver/github_pr.py +189 -0
- steadystate/act/deliver/patch_file.py +37 -0
- steadystate/act/execute.py +93 -0
- steadystate/act/learn.py +284 -0
- steadystate/act/plan.py +112 -0
- steadystate/act/reflex.py +320 -0
- steadystate/act/solution_remedy.py +201 -0
- steadystate/act/terraform.py +130 -0
- steadystate/catalog.py +206 -0
- steadystate/classify.py +103 -0
- steadystate/cli.py +2525 -0
- steadystate/compliance.py +169 -0
- steadystate/config.py +46 -0
- steadystate/discover.py +1227 -0
- steadystate/domains/__init__.py +89 -0
- steadystate/domains/base.py +140 -0
- steadystate/domains/compliance.py +217 -0
- steadystate/domains/security.py +192 -0
- steadystate/domains/security_azure.py +270 -0
- steadystate/domains/security_gcp.py +224 -0
- steadystate/domains/security_k8s.py +238 -0
- steadystate/engine.py +202 -0
- steadystate/health.py +53 -0
- steadystate/inbound/__init__.py +43 -0
- steadystate/inbound/base.py +409 -0
- steadystate/inbound/discord.py +201 -0
- steadystate/inbound/mcp.py +403 -0
- steadystate/inbound/server.py +1365 -0
- steadystate/inbound/slack.py +141 -0
- steadystate/inbound/teams.py +112 -0
- steadystate/inbound/translate.py +249 -0
- steadystate/metrics.py +155 -0
- steadystate/model.py +85 -0
- steadystate/notify/__init__.py +62 -0
- steadystate/notify/base.py +21 -0
- steadystate/notify/console.py +185 -0
- steadystate/notify/discord.py +142 -0
- steadystate/notify/github.py +220 -0
- steadystate/notify/grafana.py +103 -0
- steadystate/notify/pagerduty.py +98 -0
- steadystate/notify/prometheus.py +150 -0
- steadystate/notify/servicenow.py +234 -0
- steadystate/notify/slack.py +137 -0
- steadystate/notify/teams.py +138 -0
- steadystate/notify/webhook.py +100 -0
- steadystate/onboarding.py +398 -0
- steadystate/plugins.py +81 -0
- steadystate/probe/__init__.py +99 -0
- steadystate/probe/ansible_health.py +318 -0
- steadystate/probe/argocd.py +75 -0
- steadystate/probe/base.py +69 -0
- steadystate/probe/custom.py +830 -0
- steadystate/probe/docker.py +169 -0
- steadystate/probe/kubectl.py +613 -0
- steadystate/probe/solutions.py +241 -0
- steadystate/reason/__init__.py +1 -0
- steadystate/reason/alert.py +119 -0
- steadystate/reason/correlate.py +154 -0
- steadystate/reason/cost.py +188 -0
- steadystate/reason/enrich.py +357 -0
- steadystate/reason/explain.py +64 -0
- steadystate/reason/llm.py +457 -0
- steadystate/reason/pipeline.py +397 -0
- steadystate/reason/report.py +74 -0
- steadystate/reconcile.py +54 -0
- steadystate/reconcile_state.py +230 -0
- steadystate/serialize.py +117 -0
- steadystate/silos.py +85 -0
- steadystate/sources/__init__.py +204 -0
- steadystate/sources/ansible.py +138 -0
- steadystate/sources/argocd.py +79 -0
- steadystate/sources/base.py +141 -0
- steadystate/sources/docker_compose.py +169 -0
- steadystate/sources/helm.py +96 -0
- steadystate/sources/k8s.py +757 -0
- steadystate/sources/rancher.py +105 -0
- steadystate/sources/terraform.py +119 -0
- steadystate/state.py +708 -0
- steadystate/sweep.py +176 -0
- steadystate/targets.py +140 -0
- steadystate-0.1.0.dist-info/METADATA +214 -0
- steadystate-0.1.0.dist-info/RECORD +100 -0
- steadystate-0.1.0.dist-info/WHEEL +4 -0
- steadystate-0.1.0.dist-info/entry_points.txt +2 -0
- steadystate-0.1.0.dist-info/licenses/LICENSE +201 -0
steadystate/__init__.py
ADDED
steadystate/_http.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Internal HTTP helper: every outbound request goes through one audited urlopen.
|
|
2
|
+
|
|
3
|
+
steadystate opens URLs the operator configures -- chat webhooks, a Prometheus/Grafana base,
|
|
4
|
+
the ArgoCD/Rancher APIs, an LLM endpoint. Routing them all through one place lets us enforce a
|
|
5
|
+
single invariant: we only ever speak http(s). That rejects ``file://``, ``ftp://``, ``gopher://``
|
|
6
|
+
and the other schemes ``urllib`` would otherwise honor (the local-file / SSRF surface), and it
|
|
7
|
+
fails fast with a clear error on a mistyped URL -- instead of silently reading a local file.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import urllib.request
|
|
13
|
+
from typing import Any
|
|
14
|
+
from urllib.parse import urlparse
|
|
15
|
+
|
|
16
|
+
# The only URL schemes an outbound request is allowed to use.
_ALLOWED_SCHEMES = frozenset(("http", "https"))
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _url_of(target: str | urllib.request.Request) -> str:
    """Return the URL string behind ``target``.

    A ``urllib.request.Request`` is unwrapped to its ``full_url``; a plain string is returned
    unchanged.
    """
    if isinstance(target, urllib.request.Request):
        return target.full_url
    return target
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def safe_urlopen(target: str | urllib.request.Request, *, timeout: float | None = None) -> Any:
    """Open ``target`` via ``urllib.request.urlopen``, but only when its scheme is http(s).

    The scheme is inspected before ``urlopen`` is called, so any other scheme -- or a URL with
    no scheme at all -- raises ``ValueError`` without a request being attempted. ``timeout`` is
    forwarded unchanged; handling the response and any network error stays with the caller.
    """
    parsed = urlparse(_url_of(target))
    scheme = parsed.scheme.lower()
    if scheme in _ALLOWED_SCHEMES:
        # NOTE(review): only the *initial* URL is checked here. urllib's default redirect
        # handling may follow a server-issued redirect to another scheme -- confirm whether
        # that needs its own gate.
        # B310: the scheme was allow-listed to http(s) just above, so this is the audited gate.
        return urllib.request.urlopen(target, timeout=timeout)  # nosec B310
    raise ValueError(f"refusing to open a non-http(s) URL (scheme: {scheme or 'none'!r})")
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Executor plugins + guardrails -- the act seam, keyed by source.
|
|
2
|
+
|
|
3
|
+
Every remediation is apply-eligibility-checked, snapshotted, verified, and reversible; chat
|
|
4
|
+
(or any trigger) is a convenience, never a bypass of those guardrails. Executors register
|
|
5
|
+
here per source, mirroring DRIFT_SOURCES: a source with an executor can be *acted on*; a
|
|
6
|
+
source with none is **observe-only** -- steadystate detects its drift but cannot remediate it,
|
|
7
|
+
and build_executor returns None. Adding an in-tree backend's act half is one line in
|
|
8
|
+
_BUILTIN_EXECUTORS.
|
|
9
|
+
|
|
10
|
+
Out-of-tree executors register the same way without editing this file: a separately installed
|
|
11
|
+
package declares a `steadystate.executors` entry point (a factory(path) -> Executor) and
|
|
12
|
+
`merged()` overlays it on the built-ins (built-ins win a name clash). See plugins.py. A
|
|
13
|
+
discovered executor is bound by source name, so it pairs with a discovered (or built-in) source.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from collections.abc import Callable
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
from ..plugins import merged
|
|
22
|
+
from .ansible import AnsibleExecutor
|
|
23
|
+
from .base import Executor
|
|
24
|
+
from .terraform import TerraformExecutor
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _terraform(path: Path) -> Executor:
    """Build the terraform executor for ``path``.

    A directory is handed over as the executor's working dir (so it can apply). A file -- a
    captured plan -- gives ``working_dir=None``: there is no directory to run in, so it can
    only plan.
    """
    if path.is_file():
        return TerraformExecutor(working_dir=None)
    return TerraformExecutor(working_dir=path)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _ansible(path: Path) -> Executor:
    """Build the ansible executor for ``path``.

    The playbook and inventory are not taken from ``path``; they come from the environment
    (STEADYSTATE_ANSIBLE_PLAYBOOK / STEADYSTATE_ANSIBLE_INVENTORY). A directory ``path`` is
    the working dir the playbook runs in; a captured-check file has none, so ``None`` is passed.
    """
    if path.is_file():
        return AnsibleExecutor(working_dir=None)
    return AnsibleExecutor(working_dir=path)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Registry of the in-tree executor factories, keyed by source name: factory(path) -> Executor.
# A source that is absent from this table has no executor, so build_executor returns None for
# it and it stays observe-only. (k8s/compose are the next entries.)
_BUILTIN_EXECUTORS: dict[str, Callable[[Path], Executor]] = {
    "terraform": _terraform,
    "ansible": _ansible,
}

# The effective registry: the built-ins overlaid with any `steadystate.executors` entry points
# that `merged` discovers.
EXECUTORS: dict[str, Callable[[Path], Executor]] = merged("executors", _BUILTIN_EXECUTORS)
|
|
47
|
+
|
|
48
|
+
__all__ = ["EXECUTORS", "Executor", "build_executor"]
|
|
49
|
+
|
|
50
|
+
# Note: reflex/hold (the control loop) lives in act.reflex and is imported directly by callers --
|
|
51
|
+
# kept out of this module's import graph to avoid a cycle (reflex -> approve -> act).
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def build_executor(source: str, path: Path) -> Executor | None:
    """Look up the executor factory registered for ``source`` and build it from ``path``.

    Returns ``None`` when nothing is registered for ``source`` -- i.e. the source is
    observe-only and cannot be remediated.
    """
    factory = EXECUTORS.get(source)
    if factory is None:
        return None
    return factory(path)
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Ansible executor: reconcile a drifted host back to its playbook, with guardrails.
|
|
2
|
+
|
|
3
|
+
A drift from the ansible source is `host:task` -- a task `ansible-playbook --check` said
|
|
4
|
+
would change on a host. Remediation is the natural inverse: run the playbook for real,
|
|
5
|
+
scoped to that host (`ansible-playbook --limit <host>`), which is reconcile-toward-declared
|
|
6
|
+
(the safe self-heal direction -- Ansible doesn't destroy undeclared resources). Live apply is
|
|
7
|
+
gated behind apply-eligibility AND `confirm=True`; nothing runs by default.
|
|
8
|
+
|
|
9
|
+
Ansible is not transactional, so there is no clean snapshot/auto-revert (unlike terraform's
|
|
10
|
+
plan). We're honest about that in the plan's revert guidance. Verify re-runs `--check` for the
|
|
11
|
+
host and reports whether the drift cleared.
|
|
12
|
+
|
|
13
|
+
The playbook + inventory are configured out of band (constructor or the env vars
|
|
14
|
+
STEADYSTATE_ANSIBLE_PLAYBOOK / STEADYSTATE_ANSIBLE_INVENTORY), since the drift input the CLI
|
|
15
|
+
passes is the captured check output, not the playbook itself.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import os
|
|
21
|
+
import subprocess
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
from ..model import Drift
|
|
25
|
+
from .base import RemediationResult
|
|
26
|
+
from .plan import RemediationPlan, Risk
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class AnsibleExecutor:
    """Executor for the ansible source: re-run the playbook against one drifted host.

    A drift identity is ``host:task``. The plan is ``ansible-playbook --limit <host>`` (plus the
    configured inventory and playbook); nothing runs unless ``remediate`` is called with
    ``confirm=True``. After a live run the drift is re-collected to report whether it cleared.
    """

    name = "ansible"

    def __init__(
        self,
        playbook: str | None = None,
        inventory: str | None = None,
        working_dir: str | Path | None = None,
    ) -> None:
        """Store the playbook/inventory/working dir, falling back to the environment.

        An explicit ``playbook`` / ``inventory`` wins; otherwise STEADYSTATE_ANSIBLE_PLAYBOOK /
        STEADYSTATE_ANSIBLE_INVENTORY are read. A falsy ``working_dir`` is stored as ``None``.
        """
        self.playbook = playbook or os.environ.get("STEADYSTATE_ANSIBLE_PLAYBOOK")
        self.inventory = inventory or os.environ.get("STEADYSTATE_ANSIBLE_INVENTORY")
        self.working_dir = Path(working_dir) if working_dir else None

    def _host(self, drift: Drift) -> str:
        """Return the host part of a ``host:task`` drift identity (text before the first colon)."""
        return drift.identity.split(":", 1)[0]

    def plan_for(self, drift: Drift) -> RemediationPlan:
        """Describe the remediation for ``drift`` without running anything."""
        host = self._host(drift)
        command = ["ansible-playbook", "--limit", host]
        if self.inventory:
            command += ["-i", self.inventory]
        if self.playbook:
            command.append(self.playbook)
        return RemediationPlan(
            drift_identity=drift.identity,
            # Re-running the playbook reconciles the host to declared -- the safe self-heal
            # direction. Always eligible: Ansible converges toward the playbook, it doesn't
            # destroy resources the playbook doesn't mention.
            eligible=True,
            risk=Risk.MEDIUM,
            reason="Re-running the playbook on the host reconciles it to the declared config.",
            command=command,
            blast_radius=f"Runs the playbook against host {host}.",
            revert=(
                "Ansible is not transactional -- there is no automatic revert; restore from a "
                "known-good playbook state and re-run if needed."
            ),
        )

    def remediate(self, drift: Drift, *, confirm: bool = False) -> RemediationResult:
        """Reconcile ``drift`` by running the planned playbook command, then verify.

        Returns a non-applied result (never raises) when ``confirm`` is false, when no playbook
        is configured, or when ``ansible-playbook`` fails or cannot be started -- so callers that
        derive an outcome from ``applied`` / ``verified`` always get a result to record.
        """
        plan = self.plan_for(drift)
        if not confirm:
            return RemediationResult(
                plan=plan,
                applied=False,
                verified=False,
                detail="Dry run: pass confirm=True (or --apply) to reconcile.",
            )
        if not self.playbook:
            return RemediationResult(
                plan=plan,
                applied=False,
                verified=False,
                detail="No playbook configured; set STEADYSTATE_ANSIBLE_PLAYBOOK to apply.",
            )
        try:
            self._run(plan.command)
        except subprocess.CalledProcessError as exc:
            # check=True turns a non-zero exit into an exception; report it as a failed
            # remediation instead of letting it escape past the caller's outcome handling.
            return RemediationResult(
                plan=plan, applied=False, verified=False, detail=self._failure_detail(exc)
            )
        except OSError as exc:
            # ansible-playbook is not installed / not executable, or the working dir is unusable.
            return RemediationResult(
                plan=plan,
                applied=False,
                verified=False,
                detail=f"Could not start ansible-playbook: {exc}",
            )
        cleared = not self._still_drifting(drift)
        return RemediationResult(
            plan=plan,
            applied=True,
            verified=cleared,
            detail="Applied and verified clear."
            if cleared
            else "Applied, but the host still drifts on re-check.",
        )

    @staticmethod
    def _failure_detail(exc: subprocess.CalledProcessError) -> str:
        """Summarize a failed ansible-playbook run: exit code plus the tail of its output."""
        message = f"ansible-playbook exited {exc.returncode}; the host may be partially changed."
        # Output is captured as text by _run; keep only the tail so the detail stays readable.
        tail = (exc.stderr or exc.stdout or "").strip()[-500:]
        return f"{message} Output tail: {tail}" if tail else message

    # --- live ansible (guarded; not exercised by unit tests) ---

    def _run(self, command: list[str]) -> None:
        """Run ``command`` in the working dir; raises ``CalledProcessError`` on a non-zero exit."""
        subprocess.run(command, cwd=self.working_dir, check=True, capture_output=True, text=True)

    def _still_drifting(self, drift: Drift) -> bool:
        """Re-collect drift from the ansible source and report whether ``drift`` is still there."""
        # Imported here rather than at module top, as in the original.
        from ..sources.ansible import AnsibleSource

        residual = AnsibleSource(
            playbook=self.playbook, inventory=self.inventory, working_dir=self.working_dir
        ).collect_drift()
        return any(d.identity == drift.identity for d in residual)
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""Shared remediation-approval core -- the CLI verbs and the chat listener both call here.
|
|
2
|
+
|
|
3
|
+
Approving rebuilds the source + executor from what the suggesting scan recorded, re-collects
|
|
4
|
+
to match the *live* drift by fingerprint (so the executor's snapshot/verify run against
|
|
5
|
+
reality, and an already-cleared drift is a clean no-op), then applies under the usual
|
|
6
|
+
guardrails. Decline marks it so a re-scan won't re-offer it.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from datetime import UTC, datetime
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from ..sources import build_drift_source
|
|
15
|
+
from ..state import (
|
|
16
|
+
APPLIED,
|
|
17
|
+
APPROVED,
|
|
18
|
+
BREAKGLASS,
|
|
19
|
+
DECLINED,
|
|
20
|
+
FAILED,
|
|
21
|
+
NOOP,
|
|
22
|
+
PENDING,
|
|
23
|
+
VERIFIED,
|
|
24
|
+
AuditEntry,
|
|
25
|
+
PendingAction,
|
|
26
|
+
StateStore,
|
|
27
|
+
)
|
|
28
|
+
from . import build_executor
|
|
29
|
+
from .base import RemediationResult
|
|
30
|
+
from .bounds import confirmation_tier
|
|
31
|
+
from .breakglass import BREAKGLASS_SOURCE, breakglass_allowed
|
|
32
|
+
from .catalog import action_for_command
|
|
33
|
+
from .cleanup import CLEANUP_SOURCE, run_cleanup
|
|
34
|
+
from .execute import CATALOG_SOURCE, run_catalog_action
|
|
35
|
+
from .solution_remedy import SOLUTION_SOURCE, run_solution, solution_named
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _audit(
    action: PendingAction, actor: str, decision: str, outcome: str, detail: str | None
) -> AuditEntry:
    """Assemble the audit-log record for one decision taken on ``action``.

    The fingerprint, source, drift identity and environment are copied from ``action``; the
    actor, decision, outcome and detail are taken from the arguments.
    """
    entry = AuditEntry(
        fingerprint=action.fingerprint,
        source=action.source,
        drift_identity=action.drift_identity,
        actor=actor,
        decision=decision,
        outcome=outcome,
        environment=action.environment,
        detail=detail,
    )
    return entry
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def apply_pending(
|
|
55
|
+
store: StateStore,
|
|
56
|
+
fingerprint: str,
|
|
57
|
+
actor: str,
|
|
58
|
+
now: datetime | None = None,
|
|
59
|
+
*,
|
|
60
|
+
token: str = "",
|
|
61
|
+
) -> tuple[str, RemediationResult | None]:
|
|
62
|
+
"""Approve + run the pending remediation for ``fingerprint``. Returns a human message and
|
|
63
|
+
the RemediationResult when one ran (None when there was nothing to do). Every decision that
|
|
64
|
+
reaches a real remediation point is recorded to the append-only audit log. ``token`` is the
|
|
65
|
+
break-glass confirmation (the target's name for a strong-tier override); ignored otherwise."""
|
|
66
|
+
now = now or datetime.now(UTC)
|
|
67
|
+
action = store.get_pending(fingerprint)
|
|
68
|
+
if action is None or action.status != PENDING:
|
|
69
|
+
return "no pending remediation for that fingerprint.", None
|
|
70
|
+
if action.source == BREAKGLASS_SOURCE: # an out-of-bound action awaiting a human override
|
|
71
|
+
# Re-check the allowlist AT CONFIRM time, then the confirmation friction (strong tier: type
|
|
72
|
+
# the target's name, stored as drift_identity), then run with the bound overridden --
|
|
73
|
+
# audited as BREAKGLASS so `history` shows who overrode it.
|
|
74
|
+
if not breakglass_allowed(actor):
|
|
75
|
+
return (
|
|
76
|
+
f"break-glass not enabled for you ({actor}). Set STEADYSTATE_BREAKGLASS_USERS.",
|
|
77
|
+
None,
|
|
78
|
+
)
|
|
79
|
+
matched = action_for_command(action.command)
|
|
80
|
+
tier = confirmation_tier(matched.envelope) if matched is not None else 0
|
|
81
|
+
if tier >= 2 and token != action.drift_identity:
|
|
82
|
+
return (
|
|
83
|
+
f"break-glass: type the target to confirm -- "
|
|
84
|
+
f"approve {action.fingerprint} {action.drift_identity}",
|
|
85
|
+
None,
|
|
86
|
+
)
|
|
87
|
+
if not store.claim_pending(fingerprint, PENDING, APPROVED, actor):
|
|
88
|
+
return "no pending remediation for that fingerprint.", None
|
|
89
|
+
result = run_catalog_action(action, break_glass=True)
|
|
90
|
+
outcome = VERIFIED if result.verified else APPLIED if result.applied else FAILED
|
|
91
|
+
store.record_audit(_audit(action, actor, BREAKGLASS, outcome, result.detail), now)
|
|
92
|
+
return result.detail, result
|
|
93
|
+
if action.source == SOLUTION_SOURCE: # an authored runbook command -- operator-vouched, gated
|
|
94
|
+
# Same race guard + audit as the cleanup, but no content allow-pattern: the operator wrote
|
|
95
|
+
# and vouched for this command (it's IaC-grade runbook intent). Recover the bound from the
|
|
96
|
+
# named solution for the plan; the audit records the solution + author (in drift_identity).
|
|
97
|
+
if not store.claim_pending(fingerprint, PENDING, APPROVED, actor):
|
|
98
|
+
return "no pending remediation for that fingerprint.", None
|
|
99
|
+
result = run_solution(action, solution_named(action.drift_identity))
|
|
100
|
+
outcome = VERIFIED if result.verified else APPLIED if result.applied else FAILED
|
|
101
|
+
store.record_audit(_audit(action, actor, APPROVED, outcome, result.detail), now)
|
|
102
|
+
return result.detail, result
|
|
103
|
+
if action.source in (CLEANUP_SOURCE, CATALOG_SOURCE): # a direct, re-validated catalog command
|
|
104
|
+
# Claim before the irreversible step (same race guard as the drift path), then run the
|
|
105
|
+
# allow-listed command and audit it. No drift source/executor -- the command is it. Both
|
|
106
|
+
# the evicted cleanup and the general `fix`/`run` actions route here through the same gate.
|
|
107
|
+
if not store.claim_pending(fingerprint, PENDING, APPROVED, actor):
|
|
108
|
+
return "no pending remediation for that fingerprint.", None
|
|
109
|
+
result = (
|
|
110
|
+
run_cleanup(action) if action.source == CLEANUP_SOURCE else run_catalog_action(action)
|
|
111
|
+
)
|
|
112
|
+
outcome = VERIFIED if result.verified else APPLIED if result.applied else FAILED
|
|
113
|
+
store.record_audit(_audit(action, actor, APPROVED, outcome, result.detail), now)
|
|
114
|
+
return result.detail, result
|
|
115
|
+
executor = build_executor(action.source, Path(action.path))
|
|
116
|
+
if executor is None:
|
|
117
|
+
return f"source '{action.source}' is observe-only; cannot remediate.", None
|
|
118
|
+
# Atomically claim the action (pending -> approved) BEFORE anything irreversible. Two approvers
|
|
119
|
+
# racing the same fingerprint (two chat users) both read PENDING above; the conditional UPDATE
|
|
120
|
+
# lets exactly one win -- the loser bails here, so the remediation runs at most once.
|
|
121
|
+
if not store.claim_pending(fingerprint, PENDING, APPROVED, actor):
|
|
122
|
+
return "no pending remediation for that fingerprint.", None
|
|
123
|
+
drifts = build_drift_source(action.source, Path(action.path)).collect_drift()
|
|
124
|
+
drift = next((d for d in drifts if d.fingerprint == fingerprint), None)
|
|
125
|
+
if drift is None:
|
|
126
|
+
store.record_audit(_audit(action, actor, APPROVED, NOOP, "drift no longer present"), now)
|
|
127
|
+
return "drift no longer present; nothing to do.", None
|
|
128
|
+
result = executor.remediate(drift, confirm=True)
|
|
129
|
+
outcome = VERIFIED if result.verified else APPLIED if result.applied else FAILED
|
|
130
|
+
store.record_audit(_audit(action, actor, APPROVED, outcome, result.detail), now)
|
|
131
|
+
return result.detail, result
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def decline_pending(
|
|
135
|
+
store: StateStore, fingerprint: str, actor: str, now: datetime | None = None
|
|
136
|
+
) -> str:
|
|
137
|
+
"""Decline the pending remediation for ``fingerprint``. Returns a human message."""
|
|
138
|
+
now = now or datetime.now(UTC)
|
|
139
|
+
action = store.get_pending(fingerprint)
|
|
140
|
+
if action is None:
|
|
141
|
+
return "no pending remediation for that fingerprint."
|
|
142
|
+
store.set_pending_status(fingerprint, DECLINED, actor)
|
|
143
|
+
store.record_audit(_audit(action, actor, DECLINED, DECLINED, None), now)
|
|
144
|
+
return f"declined {fingerprint}"
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Remediation artifacts -- a remediation expressed as a *code change*, not a live apply.
|
|
2
|
+
|
|
3
|
+
The deterministic counterpart to the live executors. Where ``terraform.py`` reconciles by
|
|
4
|
+
changing reality to match the repo (``terraform apply``), an artifact reconciles the *other*
|
|
5
|
+
direction: it proposes a repo change a human reviews and merges. The canonical form is a
|
|
6
|
+
**patch** (a git-apply-able unified diff) -- auth-free, VCS-agnostic, and a pure string, so it
|
|
7
|
+
is fully testable and provably *not* model-authored. A branch / PR is a way to *deliver* a
|
|
8
|
+
patch (see ``act/deliver/``), never the artifact itself.
|
|
9
|
+
|
|
10
|
+
The artifact is honest about **state**: a code change for a resource that isn't in state can't
|
|
11
|
+
just edit files -- it must import the resource (the safe, non-destructive direction) or destroy
|
|
12
|
+
it (never automatic). ``state_ops`` records that effect in plain language and ``destructive``
|
|
13
|
+
flags the dangerous direction, so the dimension the apply path glosses over is explicit here.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import re
|
|
19
|
+
from dataclasses import dataclass, field
|
|
20
|
+
|
|
21
|
+
from ..model import ChangeType
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class RemediationArtifact:
    """One remediation, expressed as a repo change a human can review.

    The fix itself is ``patch`` (a unified diff). ``state_ops`` and ``destructive`` spell out
    the state effect (an import vs a destroy), and ``title`` / ``body`` carry the narration --
    deterministic today, LLM-authored later when reasoning is enabled (same seam)."""

    # The resource the change concerns, e.g. "aws_s3_bucket.logs".
    drift_identity: str
    change_type: ChangeType
    # Repo-relative file the patch creates or edits.
    path: str
    # Git-apply-able unified diff -- the canonical, auth-free fix.
    patch: str
    # Plain-language state effects (imports).
    state_ops: list[str] = field(default_factory=list)
    # True only for a destroy variant; gates labeling + delivery.
    destructive: bool = False
    # PR / commit title.
    title: str = ""
    # PR body: what changes, why, and the import/destroy implication.
    body: str = ""

    @property
    def slug(self) -> str:
        """The drift identity with every character outside ``[A-Za-z0-9._-]`` replaced by ``_``
        -- a filesystem-safe id (used to name a delivered ``.patch``)."""
        unsafe = re.compile(r"[^A-Za-z0-9._-]")
        return unsafe.sub("_", self.drift_identity)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def files_from_patch(patch: str) -> dict[str, str]:
    """Reconstruct ``{repo_path: full_file_content}`` from a **whole-file-addition** unified diff
    (the form ``new_file_patch`` produces) -- what an API-based delivery (github-pr) needs, since
    it builds a tree from file contents, not a diff.

    Only new-file sections are recovered; an edit or delete section contributes nothing for that
    path, so a caller can detect the gap and skip rather than ship a partial change. File
    headers (``--- /dev/null``, ``+++ b/<path>``) are recognized only *before* the section's
    first ``@@`` hunk header; after it every ``+`` line is content, so added lines that
    themselves start with ``++`` or look like a header are kept verbatim. The patch is split on
    ``\\n`` only -- the exact inverse of ``new_file_patch`` -- so content containing ``\\r``,
    form feeds or other Unicode line separators survives intact. Deterministic, no git invoked.
    """
    files: dict[str, str] = {}
    path: str | None = None
    is_new_file = False
    in_hunk = False
    body: list[str] = []

    def _flush() -> None:
        # Emit the section collected so far, but only when it was a whole-file addition.
        if path is not None and is_new_file:
            files[path] = "\n".join(body) + ("\n" if body else "")

    for raw in patch.split("\n"):
        # Tolerate a CRLF-terminated patch when matching headers; content lines stay untouched.
        header = raw.rstrip("\r")
        if header.startswith("diff --git "):
            # A new file section begins: finish the previous one and reset all state.
            _flush()
            path, is_new_file, in_hunk, body = None, False, False, []
        elif in_hunk:
            # Inside a hunk the first character is the diff marker; only additions are content.
            if raw.startswith("+"):
                body.append(raw[1:])
        elif header.startswith("@@"):
            in_hunk = True
        elif header == "--- /dev/null":
            is_new_file = True
        elif header.startswith("+++ b/"):
            path = header[len("+++ b/") :]
    _flush()
    return files
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def new_file_patch(path: str, content: str) -> str:
    """Build a git-apply-able unified diff that *creates* ``path`` holding ``content``.

    This is the whole-file-addition form (``--- /dev/null`` -> ``+++ b/<path>``) that
    ``git apply`` expects for a new file. A trailing newline is appended to ``content`` when it
    lacks one, so every added line -- the last included -- is newline-terminated and the
    "No newline at end of file" marker is never needed. Pure string assembly: deterministic
    and unit-testable, no git invoked."""
    text = content if content.endswith("\n") else content + "\n"
    # `text` always ends in "\n"; dropping that final character before splitting yields one
    # entry per added line.
    added = text[:-1].split("\n")
    header = [
        f"diff --git a/{path} b/{path}",
        "new file mode 100644",
        "--- /dev/null",
        f"+++ b/{path}",
        f"@@ -0,0 +1,{len(added)} @@",
    ]
    return "".join(entry + "\n" for entry in header + ["+" + row for row in added])
|
steadystate/act/base.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""The Executor plugin seam + its result type.
|
|
2
|
+
|
|
3
|
+
Every remediation must be apply-eligibility-checked, snapshotted, verified, and
|
|
4
|
+
reversible. Chat is a convenient trigger, never a bypass of these guardrails.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from typing import Protocol, runtime_checkable
|
|
11
|
+
|
|
12
|
+
from ..model import Drift
|
|
13
|
+
from .artifact import RemediationArtifact
|
|
14
|
+
from .plan import RemediationPlan
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class RemediationResult:
    """What a remediation attempt produced: the plan it followed and how far it got."""

    plan: RemediationPlan
    applied: bool
    # Post-apply re-check: did the drift actually clear?
    verified: bool
    detail: str = ""
    # Pre-change state, kept for the record / revert.
    snapshot: dict | None = None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@runtime_checkable
class Executor(Protocol):
    """Structural interface of an executor plugin: a ``name`` plus the two methods below."""

    name: str

    def plan_for(self, drift: Drift) -> RemediationPlan:
        """Return the remediation plan for ``drift``."""

    def remediate(self, drift: Drift, *, confirm: bool = False) -> RemediationResult:
        """Remediate ``drift`` and report the result; ``confirm`` defaults to False."""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@runtime_checkable
class Proposer(Protocol):
    """Optional executor capability: express a drift as a reviewable code change rather than a
    live apply.

    Callers probe for it with ``isinstance(executor, Proposer)`` (the same pattern as the
    inbound adapters' optional ``defer``/``complete``): an executor that can render a fix as a
    patch implements ``propose``; one that can only apply live leaves it out, and the propose
    path degrades honestly for it."""

    def propose(self, drift: Drift) -> RemediationArtifact | None:
        """Return the artifact for ``drift``, or ``None`` when there is no code change to offer
        for it (e.g. the apply direction is the right fix)."""
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""Action envelopes + the bound: one impact-and-reversibility calculus for *all* infrastructure.
|
|
2
|
+
|
|
3
|
+
The decision a good operator makes before any change is always the same two questions -- *how much
|
|
4
|
+
does this touch, and can I undo it?* -- and it does not depend on whether the change is a
|
|
5
|
+
`kubectl delete`, a `terraform apply`, or an ansible play. So that calculus lives here, once,
|
|
6
|
+
backend-agnostic: every action declares an ``Envelope`` (its reversibility and its blast radius on
|
|
7
|
+
a generic scale), and a human declares the **bound** -- which envelopes may run unattended. The
|
|
8
|
+
gate (``within_bounds``) sees only the envelope, never a backend.
|
|
9
|
+
|
|
10
|
+
What the bound governs today: the autonomous paths -- reflexes (``hold``), the decider
|
|
11
|
+
(``propose``), and the drift ``--autonomy auto`` path for any executor that declares an envelope
|
|
12
|
+
(terraform does; ``can_run_unattended`` in act/plan.py is the gate). An executor that has not yet
|
|
13
|
+
declared envelopes (ansible) falls back to the older ``eligible`` boolean -- the migration is
|
|
14
|
+
incremental, and absence of an envelope only keeps prior behavior, never loosens it. So this is on
|
|
15
|
+
its way to "one grid governs every source", and is honest that it is not all the way there yet.
|
|
16
|
+
|
|
17
|
+
This is the spine the autonomy story stands on. A reflex today, an LLM tomorrow, decides *what to
|
|
18
|
+
do*; the bound decides *how much it is ever allowed to break*. The decider proposes an action and
|
|
19
|
+
its envelope; the gate checks that envelope against the human's bound; out-of-bound escalates no
|
|
20
|
+
matter how confident the decider is. The one decision that never goes to the code or the model is
|
|
21
|
+
the bound itself -- a human sets it (the conservative default, widened via ``STEADYSTATE_BOUND`` as
|
|
22
|
+
trust grows), and flipping a reflex to ``auto`` can never cross it.
|
|
23
|
+
|
|
24
|
+
The two axes are deliberately generic; each backend maps its own nouns onto them:
|
|
25
|
+
|
|
26
|
+
Impact k8s terraform ansible compose
|
|
27
|
+
------ --- --------- ------- -------
|
|
28
|
+
ONE a pod one resource one host/task one container
|
|
29
|
+
SERVICE a workload a module a role a service
|
|
30
|
+
TENANT a namespace a workspace/state an inventory group a project/stack
|
|
31
|
+
NODE a node -- a managed host the docker host
|
|
32
|
+
FLEET the cluster a root/account/region the whole inventory the engine
|
|
33
|
+
|
|
34
|
+
Reversibility example
|
|
35
|
+
------------- -------
|
|
36
|
+
LOSSLESS destroys nothing of value (delete an evicted pod, `docker rm` a dead container)
|
|
37
|
+
SELF_HEALING the platform restores it (delete a Running pod, restart a service, cordon a node)
|
|
38
|
+
RECOVERABLE a known inverse exists (scale down<->up, a re-appliable terraform change)
|
|
39
|
+
IRREVERSIBLE real loss, no inverse (delete a PVC, `terraform destroy` a database)
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
from __future__ import annotations
|
|
43
|
+
|
|
44
|
+
import os
|
|
45
|
+
from dataclasses import dataclass
|
|
46
|
+
from enum import IntEnum
|
|
47
|
+
|
|
48
|
+
# Both axes are ordinal (IntEnum), worst last -- so a policy is just "the highest tier still
|
|
49
|
+
# allowed" and the gate is a comparison. Names render lowercased for humans; the order is the point.
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class Reversibility(IntEnum):
    """Undo-ability axis of an action: can it be reversed, and what is lost if it cannot?

    Members are ordinal and listed from least to most severe, so two values compare directly
    (a larger value is a worse outcome).
    """

    LOSSLESS = 0  # destroys nothing of value
    SELF_HEALING = 1  # the platform restores it
    RECOVERABLE = 2  # a known inverse exists
    IRREVERSIBLE = 3  # real loss, no inverse
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class Impact(IntEnum):
    """Blast-radius axis on a backend-neutral scale; every backend maps its own nouns onto it.

    Members are ordinal and rise with the size of what is affected, so two values compare
    directly.
    """

    ONE = 0  # smallest unit (k8s: a pod)
    SERVICE = 1  # k8s: a workload
    TENANT = 2  # k8s: a namespace
    NODE = 3  # k8s: a node
    FLEET = 4  # k8s: the cluster
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass(frozen=True)
class Envelope:
    """An action described purely by the two axes the bound inspects.

    Deliberately backend-agnostic: a kubectl cleanup, a terraform apply, and an ansible play all
    describe themselves with the same pair of values. Instances are immutable (frozen).
    """

    # How undoable the action is.
    reversibility: Reversibility
    # How far the action reaches.
    impact: Impact

    @property
    def label(self) -> str:
        """Return ``"<reversibility>/<impact>"`` with both member names lowercased."""
        axes = (self.reversibility, self.impact)
        return "/".join(axis.name.lower() for axis in axes)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# The bound maps every reversibility to the HIGHEST impact tier that may run unattended; ``None``
# means "never unattended". It is the human's 3am calculus, written down once.
#
# The default is conservative: only lossless or self-healing actions, and only within a small
# blast radius, run without a human. Anything recoverable-or-worse, or anything reaching a
# node/the fleet, escalates. An operator may widen it as trust grows (the same graduation
# `hold`'s reflexes use), but widening is ALWAYS a human's decision: the bound is the one thing a
# decider -- reflex or model -- never sets for itself.
BoundPolicy = dict[Reversibility, "Impact | None"]

DEFAULT_BOUND: BoundPolicy = {
    # Lossless: auto up to a whole tenant (namespace/stack).
    Reversibility.LOSSLESS: Impact.TENANT,
    # Self-healing: auto up to one service.
    Reversibility.SELF_HEALING: Impact.SERVICE,
    # A known inverse still needs a human, until trust is earned.
    Reversibility.RECOVERABLE: None,
    # Never autonomous, at any size.
    Reversibility.IRREVERSIBLE: None,
}
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def within_bounds(envelope: Envelope, policy: BoundPolicy = DEFAULT_BOUND) -> bool:
    """Decide whether ``envelope`` may run unattended under ``policy``.

    This is the gate every decider goes through; it looks only at the envelope, never at a
    backend, and has no side effects.

    Returns:
        ``True`` when the policy has a ceiling for the envelope's reversibility and the
        envelope's impact does not exceed it. ``False`` (escalate) otherwise, including when the
        reversibility is absent from the policy or mapped to ``None``.
    """
    limit = policy.get(envelope.reversibility)
    if limit is None:
        # No ceiling configured (or explicitly forbidden): escalating is the safe default.
        return False
    return envelope.impact <= limit
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def confirmation_tier(envelope: Envelope, policy: BoundPolicy = DEFAULT_BOUND) -> int:
    """Grade the confirmation friction an action needs, from its envelope alone. Pure.

    Returns:
        ``0`` -- within the bound: autonomous-eligible, no confirmation (`fix`/`run` just runs
        it).
        ``1`` -- out of bound, light break-glass: a plain confirm.
        ``2`` -- out of bound, STRONG break-glass (type the target's name to confirm): the
        action is IRREVERSIBLE or reaches a NODE/the FLEET.

    The most dangerous actions therefore get the most friction automatically.
    """
    if within_bounds(envelope, policy):
        return 0

    no_way_back = envelope.reversibility >= Reversibility.IRREVERSIBLE
    wide_reach = envelope.impact >= Impact.NODE
    return 2 if (no_way_back or wide_reach) else 1
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
# Lowercased member name -> enum member, used to parse bound settings written as text.
_REVERSIBILITY_BY_NAME = {member.name.lower(): member for member in Reversibility}
_IMPACT_BY_NAME = {member.name.lower(): member for member in Impact}
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _config_bound_table() -> dict:
    """Return the ``bound`` table obtained from the package's ``config_table`` helper."""
    # Imported inside the function so that importing this module does not pull in the config.
    from ..config import config_table

    bound_table = config_table("bound")
    return bound_table
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _apply_bound_pair(policy: BoundPolicy, rev_name: str, impact_name: str) -> None:
    """Overlay one ``reversibility=impact`` decision onto ``policy`` (mutated in place).

    Both names are matched case-insensitively after stripping surrounding whitespace.

    Args:
        policy: The bound being built; updated in place.
        rev_name: A reversibility name (``lossless`` / ``self_healing`` / ``recoverable`` /
            ``irreversible``).
        impact_name: An impact name (``one`` / ``service`` / ``tenant`` / ``node`` / ``fleet``),
            or a "forbid" word: ``none``, ``never``, the empty string, ``false``, ``off`` or
            ``no``.

    An unknown reversibility or impact is **skipped**, never applied -- a typo leaves the entry
    exactly as it was and can never silently widen the bound.

    The negative spellings ``false`` / ``off`` / ``no`` are accepted as "forbid" because callers
    may pass stringified config values: a boolean ``false`` arrives here as ``"False"``, and
    skipping it would ignore an explicit request to narrow the bound. Treating it as "forbid"
    can only narrow, never widen.
    """
    reversibility = _REVERSIBILITY_BY_NAME.get(rev_name.strip().lower())
    if reversibility is None:
        return  # unknown reversibility name -> leave the policy untouched
    impact = impact_name.strip().lower()
    if impact in ("none", "never", "", "false", "off", "no"):
        policy[reversibility] = None  # explicitly forbid auto for this reversibility
    elif impact in _IMPACT_BY_NAME:
        policy[reversibility] = _IMPACT_BY_NAME[impact]
    # else: an unknown impact name -> skip (stay conservative; never widen on a typo)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def bound_from_env(raw: str | None = None) -> BoundPolicy:
    """Resolve the *active* bound that every autonomy gate reads.

    Layers, later ones winning: ``DEFAULT_BOUND``, then the committed ``[bound]`` config table,
    then the ``STEADYSTATE_BOUND`` environment variable. Having a single resolver means one
    source governs the whole tool: drift auto-apply, reflexes, solution auto-apply, the decider.

    The bound is the one decision that should never be casual, so its home is the **committed
    config**, reviewed in PRs (a ``[bound]`` table with ``reversibility = "impact"`` keys); the
    env var is the per-run override.

    Vocabulary:
        reversibility: ``lossless`` / ``self_healing`` / ``recoverable`` / ``irreversible``
        impact: ``one`` / ``service`` / ``tenant`` / ``node`` / ``fleet``; each pair names the
        HIGHEST impact tier that may run unattended for that reversibility, and ``none``
        forbids it.

    The env format mirrors the table: comma-separated ``reversibility=impact`` pairs (``;`` is
    accepted as a separator too), e.g. ``STEADYSTATE_BOUND="recoverable=service"``.

    Args:
        raw: Pair string to parse instead of the live sources. When given, the function is pure
            and neither the config table nor the environment is consulted. When ``None``, the
            committed ``[bound]`` table is applied first and ``STEADYSTATE_BOUND`` second.

    Returns:
        A fresh policy dict; ``DEFAULT_BOUND`` itself is never mutated.

    Unparseable entries are skipped (never widen on a typo).
    """
    active = dict(DEFAULT_BOUND)

    if raw is None:
        # Live resolution: the committed config is the baseline, the env var the override.
        for rev_name, ceiling in _config_bound_table().items():
            _apply_bound_pair(active, rev_name, str(ceiling))
        raw = os.environ.get("STEADYSTATE_BOUND", "")

    for entry in raw.replace(";", ",").split(","):
        name, equals, ceiling = entry.partition("=")
        if not equals:
            continue  # a bare token without `=` is not a pair; skip it
        _apply_bound_pair(active, name, ceiling)

    return active
|