steadystate 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- steadystate-0.1.0/.claude/settings.json +7 -0
- steadystate-0.1.0/.dockerignore +16 -0
- steadystate-0.1.0/.gitattributes +3 -0
- steadystate-0.1.0/.github/dependabot.yml +12 -0
- steadystate-0.1.0/.github/workflows/ci.yml +47 -0
- steadystate-0.1.0/.github/workflows/codeql.yml +41 -0
- steadystate-0.1.0/.github/workflows/release.yml +69 -0
- steadystate-0.1.0/.github/workflows/security.yml +56 -0
- steadystate-0.1.0/.gitignore +17 -0
- steadystate-0.1.0/.mcp.json +8 -0
- steadystate-0.1.0/ARCHITECTURE.md +184 -0
- steadystate-0.1.0/CHANGELOG.md +319 -0
- steadystate-0.1.0/CONFIG.md +130 -0
- steadystate-0.1.0/CONTRIBUTING.md +101 -0
- steadystate-0.1.0/Dockerfile +33 -0
- steadystate-0.1.0/LICENSE +201 -0
- steadystate-0.1.0/LLM_SAFETY.md +141 -0
- steadystate-0.1.0/PKG-INFO +214 -0
- steadystate-0.1.0/README.md +190 -0
- steadystate-0.1.0/SECURITY.md +57 -0
- steadystate-0.1.0/demo/README.md +127 -0
- steadystate-0.1.0/demo/ansible-fleet-drift.json +90 -0
- steadystate-0.1.0/demo/argocd-incident.json +19 -0
- steadystate-0.1.0/demo/k8s-insecure.json +14 -0
- steadystate-0.1.0/deploy/README.md +24 -0
- steadystate-0.1.0/deploy/discord/README.md +75 -0
- steadystate-0.1.0/deploy/discord/command.json +118 -0
- steadystate-0.1.0/deploy/discord/register.py +76 -0
- steadystate-0.1.0/deploy/github-actions/drift.yml +53 -0
- steadystate-0.1.0/deploy/kubernetes/cronjob.yaml +54 -0
- steadystate-0.1.0/deploy/kubernetes/listener.yaml +112 -0
- steadystate-0.1.0/deploy/kubernetes/rbac.yaml +54 -0
- steadystate-0.1.0/deploy/teams/README.md +60 -0
- steadystate-0.1.0/docs/repo-native-posture.md +125 -0
- steadystate-0.1.0/examples/README.md +56 -0
- steadystate-0.1.0/examples/bastion-host/README.md +151 -0
- steadystate-0.1.0/examples/brokered-creds/README.md +80 -0
- steadystate-0.1.0/examples/chat-listener/README.md +83 -0
- steadystate-0.1.0/examples/ci-terraform/README.md +51 -0
- steadystate-0.1.0/examples/contained-agent/README.md +104 -0
- steadystate-0.1.0/examples/custom-checks/README.md +105 -0
- steadystate-0.1.0/examples/fleet-health/README.md +127 -0
- steadystate-0.1.0/examples/k8s-cronjob/README.md +63 -0
- steadystate-0.1.0/examples/mcp-copilot/README.md +163 -0
- steadystate-0.1.0/examples/repo-native/README.md +47 -0
- steadystate-0.1.0/examples/repo-native/steadystate/config.toml +24 -0
- steadystate-0.1.0/examples/sample-plan.json +33 -0
- steadystate-0.1.0/examples/solutions/README.md +95 -0
- steadystate-0.1.0/examples/solutions/solutions.json +41 -0
- steadystate-0.1.0/pyproject.toml +83 -0
- steadystate-0.1.0/src/steadystate/__init__.py +7 -0
- steadystate-0.1.0/src/steadystate/_http.py +33 -0
- steadystate-0.1.0/src/steadystate/act/__init__.py +57 -0
- steadystate-0.1.0/src/steadystate/act/ansible.py +106 -0
- steadystate-0.1.0/src/steadystate/act/approve.py +144 -0
- steadystate-0.1.0/src/steadystate/act/artifact.py +94 -0
- steadystate-0.1.0/src/steadystate/act/base.py +45 -0
- steadystate-0.1.0/src/steadystate/act/bounds.py +170 -0
- steadystate-0.1.0/src/steadystate/act/breakglass.py +36 -0
- steadystate-0.1.0/src/steadystate/act/catalog.py +298 -0
- steadystate-0.1.0/src/steadystate/act/cleanup.py +209 -0
- steadystate-0.1.0/src/steadystate/act/codify.py +142 -0
- steadystate-0.1.0/src/steadystate/act/decide.py +353 -0
- steadystate-0.1.0/src/steadystate/act/deliver/__init__.py +44 -0
- steadystate-0.1.0/src/steadystate/act/deliver/base.py +40 -0
- steadystate-0.1.0/src/steadystate/act/deliver/github_pr.py +189 -0
- steadystate-0.1.0/src/steadystate/act/deliver/patch_file.py +37 -0
- steadystate-0.1.0/src/steadystate/act/execute.py +93 -0
- steadystate-0.1.0/src/steadystate/act/learn.py +284 -0
- steadystate-0.1.0/src/steadystate/act/plan.py +112 -0
- steadystate-0.1.0/src/steadystate/act/reflex.py +320 -0
- steadystate-0.1.0/src/steadystate/act/solution_remedy.py +201 -0
- steadystate-0.1.0/src/steadystate/act/terraform.py +130 -0
- steadystate-0.1.0/src/steadystate/catalog.py +206 -0
- steadystate-0.1.0/src/steadystate/classify.py +103 -0
- steadystate-0.1.0/src/steadystate/cli.py +2525 -0
- steadystate-0.1.0/src/steadystate/compliance.py +169 -0
- steadystate-0.1.0/src/steadystate/config.py +46 -0
- steadystate-0.1.0/src/steadystate/discover.py +1227 -0
- steadystate-0.1.0/src/steadystate/domains/__init__.py +89 -0
- steadystate-0.1.0/src/steadystate/domains/base.py +140 -0
- steadystate-0.1.0/src/steadystate/domains/compliance.py +217 -0
- steadystate-0.1.0/src/steadystate/domains/security.py +192 -0
- steadystate-0.1.0/src/steadystate/domains/security_azure.py +270 -0
- steadystate-0.1.0/src/steadystate/domains/security_gcp.py +224 -0
- steadystate-0.1.0/src/steadystate/domains/security_k8s.py +238 -0
- steadystate-0.1.0/src/steadystate/engine.py +202 -0
- steadystate-0.1.0/src/steadystate/health.py +53 -0
- steadystate-0.1.0/src/steadystate/inbound/__init__.py +43 -0
- steadystate-0.1.0/src/steadystate/inbound/base.py +409 -0
- steadystate-0.1.0/src/steadystate/inbound/discord.py +201 -0
- steadystate-0.1.0/src/steadystate/inbound/mcp.py +403 -0
- steadystate-0.1.0/src/steadystate/inbound/server.py +1365 -0
- steadystate-0.1.0/src/steadystate/inbound/slack.py +141 -0
- steadystate-0.1.0/src/steadystate/inbound/teams.py +112 -0
- steadystate-0.1.0/src/steadystate/inbound/translate.py +249 -0
- steadystate-0.1.0/src/steadystate/metrics.py +155 -0
- steadystate-0.1.0/src/steadystate/model.py +85 -0
- steadystate-0.1.0/src/steadystate/notify/__init__.py +62 -0
- steadystate-0.1.0/src/steadystate/notify/base.py +21 -0
- steadystate-0.1.0/src/steadystate/notify/console.py +185 -0
- steadystate-0.1.0/src/steadystate/notify/discord.py +142 -0
- steadystate-0.1.0/src/steadystate/notify/github.py +220 -0
- steadystate-0.1.0/src/steadystate/notify/grafana.py +103 -0
- steadystate-0.1.0/src/steadystate/notify/pagerduty.py +98 -0
- steadystate-0.1.0/src/steadystate/notify/prometheus.py +150 -0
- steadystate-0.1.0/src/steadystate/notify/servicenow.py +234 -0
- steadystate-0.1.0/src/steadystate/notify/slack.py +137 -0
- steadystate-0.1.0/src/steadystate/notify/teams.py +138 -0
- steadystate-0.1.0/src/steadystate/notify/webhook.py +100 -0
- steadystate-0.1.0/src/steadystate/onboarding.py +398 -0
- steadystate-0.1.0/src/steadystate/plugins.py +81 -0
- steadystate-0.1.0/src/steadystate/probe/__init__.py +99 -0
- steadystate-0.1.0/src/steadystate/probe/ansible_health.py +318 -0
- steadystate-0.1.0/src/steadystate/probe/argocd.py +75 -0
- steadystate-0.1.0/src/steadystate/probe/base.py +69 -0
- steadystate-0.1.0/src/steadystate/probe/custom.py +830 -0
- steadystate-0.1.0/src/steadystate/probe/docker.py +169 -0
- steadystate-0.1.0/src/steadystate/probe/kubectl.py +613 -0
- steadystate-0.1.0/src/steadystate/probe/solutions.py +241 -0
- steadystate-0.1.0/src/steadystate/reason/__init__.py +1 -0
- steadystate-0.1.0/src/steadystate/reason/alert.py +119 -0
- steadystate-0.1.0/src/steadystate/reason/correlate.py +154 -0
- steadystate-0.1.0/src/steadystate/reason/cost.py +188 -0
- steadystate-0.1.0/src/steadystate/reason/enrich.py +357 -0
- steadystate-0.1.0/src/steadystate/reason/explain.py +64 -0
- steadystate-0.1.0/src/steadystate/reason/llm.py +457 -0
- steadystate-0.1.0/src/steadystate/reason/pipeline.py +397 -0
- steadystate-0.1.0/src/steadystate/reason/report.py +74 -0
- steadystate-0.1.0/src/steadystate/reconcile.py +54 -0
- steadystate-0.1.0/src/steadystate/reconcile_state.py +230 -0
- steadystate-0.1.0/src/steadystate/serialize.py +117 -0
- steadystate-0.1.0/src/steadystate/silos.py +85 -0
- steadystate-0.1.0/src/steadystate/sources/__init__.py +204 -0
- steadystate-0.1.0/src/steadystate/sources/ansible.py +138 -0
- steadystate-0.1.0/src/steadystate/sources/argocd.py +79 -0
- steadystate-0.1.0/src/steadystate/sources/base.py +141 -0
- steadystate-0.1.0/src/steadystate/sources/docker_compose.py +169 -0
- steadystate-0.1.0/src/steadystate/sources/helm.py +96 -0
- steadystate-0.1.0/src/steadystate/sources/k8s.py +757 -0
- steadystate-0.1.0/src/steadystate/sources/rancher.py +105 -0
- steadystate-0.1.0/src/steadystate/sources/terraform.py +119 -0
- steadystate-0.1.0/src/steadystate/state.py +708 -0
- steadystate-0.1.0/src/steadystate/sweep.py +176 -0
- steadystate-0.1.0/src/steadystate/targets.py +140 -0
- steadystate-0.1.0/tests/test_action_catalog.py +140 -0
- steadystate-0.1.0/tests/test_ansible.py +113 -0
- steadystate-0.1.0/tests/test_ansible_executor.py +66 -0
- steadystate-0.1.0/tests/test_ansible_health_probe.py +209 -0
- steadystate-0.1.0/tests/test_ansible_live.py +111 -0
- steadystate-0.1.0/tests/test_argocd.py +60 -0
- steadystate-0.1.0/tests/test_argocd_probe.py +67 -0
- steadystate-0.1.0/tests/test_artifact.py +126 -0
- steadystate-0.1.0/tests/test_audit.py +108 -0
- steadystate-0.1.0/tests/test_bounds.py +112 -0
- steadystate-0.1.0/tests/test_breakglass.py +151 -0
- steadystate-0.1.0/tests/test_capabilities.py +78 -0
- steadystate-0.1.0/tests/test_catalog.py +98 -0
- steadystate-0.1.0/tests/test_catalog_actions.py +121 -0
- steadystate-0.1.0/tests/test_chat_cli.py +222 -0
- steadystate-0.1.0/tests/test_chat_fix_run.py +118 -0
- steadystate-0.1.0/tests/test_chat_fp_resolution.py +117 -0
- steadystate-0.1.0/tests/test_chat_homeostat_reads.py +56 -0
- steadystate-0.1.0/tests/test_chat_nl.py +167 -0
- steadystate-0.1.0/tests/test_chat_unmute_snooze.py +93 -0
- steadystate-0.1.0/tests/test_ci.py +58 -0
- steadystate-0.1.0/tests/test_classify.py +59 -0
- steadystate-0.1.0/tests/test_cleanup.py +165 -0
- steadystate-0.1.0/tests/test_cli.py +89 -0
- steadystate-0.1.0/tests/test_cohere.py +72 -0
- steadystate-0.1.0/tests/test_compliance.py +149 -0
- steadystate-0.1.0/tests/test_compliance_domain.py +156 -0
- steadystate-0.1.0/tests/test_config.py +97 -0
- steadystate-0.1.0/tests/test_console_state.py +143 -0
- steadystate-0.1.0/tests/test_correlate.py +75 -0
- steadystate-0.1.0/tests/test_correlators.py +109 -0
- steadystate-0.1.0/tests/test_cost.py +272 -0
- steadystate-0.1.0/tests/test_custom_checks.py +581 -0
- steadystate-0.1.0/tests/test_decide.py +159 -0
- steadystate-0.1.0/tests/test_decider_auto.py +71 -0
- steadystate-0.1.0/tests/test_decider_to_pending.py +129 -0
- steadystate-0.1.0/tests/test_deliver.py +211 -0
- steadystate-0.1.0/tests/test_deterministic_correlate.py +268 -0
- steadystate-0.1.0/tests/test_discord.py +204 -0
- steadystate-0.1.0/tests/test_discord_inbound.py +300 -0
- steadystate-0.1.0/tests/test_discover.py +1333 -0
- steadystate-0.1.0/tests/test_docker_compose.py +60 -0
- steadystate-0.1.0/tests/test_docker_probe.py +138 -0
- steadystate-0.1.0/tests/test_engine.py +109 -0
- steadystate-0.1.0/tests/test_enrich.py +275 -0
- steadystate-0.1.0/tests/test_enrich_sentinel.py +133 -0
- steadystate-0.1.0/tests/test_executor.py +270 -0
- steadystate-0.1.0/tests/test_explain.py +143 -0
- steadystate-0.1.0/tests/test_github_surface.py +174 -0
- steadystate-0.1.0/tests/test_grafana.py +190 -0
- steadystate-0.1.0/tests/test_health.py +73 -0
- steadystate-0.1.0/tests/test_helm.py +100 -0
- steadystate-0.1.0/tests/test_http.py +40 -0
- steadystate-0.1.0/tests/test_http_source_failures.py +102 -0
- steadystate-0.1.0/tests/test_inbound.py +1119 -0
- steadystate-0.1.0/tests/test_k8s.py +456 -0
- steadystate-0.1.0/tests/test_kubeconfig.py +108 -0
- steadystate-0.1.0/tests/test_learn.py +243 -0
- steadystate-0.1.0/tests/test_llm_egress_gate.py +112 -0
- steadystate-0.1.0/tests/test_llm_mock.py +107 -0
- steadystate-0.1.0/tests/test_llm_provider.py +228 -0
- steadystate-0.1.0/tests/test_mcp.py +365 -0
- steadystate-0.1.0/tests/test_metrics.py +152 -0
- steadystate-0.1.0/tests/test_model.py +15 -0
- steadystate-0.1.0/tests/test_onboarding.py +180 -0
- steadystate-0.1.0/tests/test_pending.py +263 -0
- steadystate-0.1.0/tests/test_pipeline.py +54 -0
- steadystate-0.1.0/tests/test_plugins.py +222 -0
- steadystate-0.1.0/tests/test_policy_pipeline.py +139 -0
- steadystate-0.1.0/tests/test_probe.py +634 -0
- steadystate-0.1.0/tests/test_prometheus.py +225 -0
- steadystate-0.1.0/tests/test_rancher.py +90 -0
- steadystate-0.1.0/tests/test_reconcile.py +114 -0
- steadystate-0.1.0/tests/test_reflex.py +286 -0
- steadystate-0.1.0/tests/test_registry.py +125 -0
- steadystate-0.1.0/tests/test_remediability.py +121 -0
- steadystate-0.1.0/tests/test_report.py +83 -0
- steadystate-0.1.0/tests/test_scan_k8s_live.py +103 -0
- steadystate-0.1.0/tests/test_scan_state.py +480 -0
- steadystate-0.1.0/tests/test_security_azure_domain.py +385 -0
- steadystate-0.1.0/tests/test_security_domain.py +326 -0
- steadystate-0.1.0/tests/test_security_gcp_domain.py +312 -0
- steadystate-0.1.0/tests/test_security_k8s_domain.py +122 -0
- steadystate-0.1.0/tests/test_serialize.py +146 -0
- steadystate-0.1.0/tests/test_servicenow.py +229 -0
- steadystate-0.1.0/tests/test_silos.py +124 -0
- steadystate-0.1.0/tests/test_slack.py +103 -0
- steadystate-0.1.0/tests/test_solution_remedy.py +198 -0
- steadystate-0.1.0/tests/test_solutions.py +249 -0
- steadystate-0.1.0/tests/test_source_failures.py +136 -0
- steadystate-0.1.0/tests/test_source_robustness_guard.py +97 -0
- steadystate-0.1.0/tests/test_state.py +357 -0
- steadystate-0.1.0/tests/test_state_concurrency.py +96 -0
- steadystate-0.1.0/tests/test_suggest_codify.py +98 -0
- steadystate-0.1.0/tests/test_summary.py +154 -0
- steadystate-0.1.0/tests/test_surfaces.py +100 -0
- steadystate-0.1.0/tests/test_sweep.py +354 -0
- steadystate-0.1.0/tests/test_symptom_pipeline.py +169 -0
- steadystate-0.1.0/tests/test_targets.py +281 -0
- steadystate-0.1.0/tests/test_teams.py +246 -0
- steadystate-0.1.0/tests/test_teams_inbound.py +135 -0
- steadystate-0.1.0/tests/test_terraform.py +108 -0
- steadystate-0.1.0/tests/test_terraform_state_source.py +69 -0
- steadystate-0.1.0/tests/test_verify.py +169 -0
- steadystate-0.1.0/tests/test_webhook_pagerduty.py +123 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
version: 2
|
|
2
|
+
updates:
|
|
3
|
+
# Keep pyproject runtime/dev deps fresh.
|
|
4
|
+
- package-ecosystem: "pip"
|
|
5
|
+
directory: "/"
|
|
6
|
+
schedule:
|
|
7
|
+
interval: "weekly"
|
|
8
|
+
# Keep the pinned action majors fresh, so a pin such as actions/checkout@v6 doesn't silently age.
|
|
9
|
+
- package-ecosystem: "github-actions"
|
|
10
|
+
directory: "/"
|
|
11
|
+
schedule:
|
|
12
|
+
interval: "weekly"
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
name: ci
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
# Least privilege: the default GITHUB_TOKEN only needs to read the repo to
|
|
9
|
+
# check out + run tests. Shrinks blast radius if a step is ever compromised.
|
|
10
|
+
permissions:
|
|
11
|
+
contents: read
|
|
12
|
+
|
|
13
|
+
# Per-branch concurrency: a newer push to the same ref cancels the older run,
|
|
14
|
+
# so rapid pushes don't queue up redundant CI.
|
|
15
|
+
concurrency:
|
|
16
|
+
group: ci-${{ github.ref }}
|
|
17
|
+
cancel-in-progress: true
|
|
18
|
+
|
|
19
|
+
jobs:
|
|
20
|
+
check:
|
|
21
|
+
# GitHub-hosted + isolated per job, so fork PRs run safely (no shared box).
|
|
22
|
+
runs-on: ubuntu-latest
|
|
23
|
+
timeout-minutes: 15
|
|
24
|
+
steps:
|
|
25
|
+
- uses: actions/checkout@v6
|
|
26
|
+
- uses: actions/setup-python@v6
|
|
27
|
+
with:
|
|
28
|
+
# The supported floor (pyproject requires-python >=3.11); passing here
|
|
29
|
+
# means we don't lean on any 3.12+-only behavior.
|
|
30
|
+
python-version: "3.11"
|
|
31
|
+
- name: Set up venv + install
|
|
32
|
+
run: |
|
|
33
|
+
python3 -m venv .venv
|
|
34
|
+
.venv/bin/python -m pip install --upgrade pip
|
|
35
|
+
.venv/bin/python -m pip install -e ".[dev]"
|
|
36
|
+
- name: Lint
|
|
37
|
+
run: .venv/bin/ruff check src tests
|
|
38
|
+
- name: Format check
|
|
39
|
+
run: .venv/bin/ruff format --check src tests
|
|
40
|
+
- name: Types
|
|
41
|
+
run: .venv/bin/mypy src
|
|
42
|
+
- name: Test
|
|
43
|
+
run: .venv/bin/pytest -q --cov=steadystate --cov-report=term-missing --cov-fail-under=75
|
|
44
|
+
- name: Build check
|
|
45
|
+
run: |
|
|
46
|
+
.venv/bin/python -m build
|
|
47
|
+
.venv/bin/twine check dist/*
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
name: codeql
|
|
2
|
+
|
|
3
|
+
# Static analysis (SAST) for the Python code: CodeQL taint-tracks untrusted input to
|
|
4
|
+
# dangerous sinks (command/path injection, SSRF, ...). It's smart about our subprocess use --
|
|
5
|
+
# it flags a shell-out only if attacker-controlled data can reach the argv, not just for
|
|
6
|
+
# calling out to terraform/kubectl/docker. Results land in the repo's Security tab.
|
|
7
|
+
# (Code scanning is free on public repos -- which this is.)
|
|
8
|
+
|
|
9
|
+
on:
|
|
10
|
+
push:
|
|
11
|
+
branches: [main]
|
|
12
|
+
pull_request:
|
|
13
|
+
branches: [main]
|
|
14
|
+
schedule:
|
|
15
|
+
- cron: "27 4 * * 1" # weekly (Mon 04:27 UTC) -- catch newly-shipped query patterns
|
|
16
|
+
|
|
17
|
+
permissions:
|
|
18
|
+
contents: read
|
|
19
|
+
actions: read
|
|
20
|
+
security-events: write # upload the SARIF results to the Security tab
|
|
21
|
+
|
|
22
|
+
concurrency:
|
|
23
|
+
group: codeql-${{ github.ref }}
|
|
24
|
+
cancel-in-progress: true
|
|
25
|
+
|
|
26
|
+
jobs:
|
|
27
|
+
analyze:
|
|
28
|
+
runs-on: ubuntu-latest
|
|
29
|
+
timeout-minutes: 15
|
|
30
|
+
steps:
|
|
31
|
+
- uses: actions/checkout@v6
|
|
32
|
+
- name: Initialize CodeQL
|
|
33
|
+
uses: github/codeql-action/init@v4
|
|
34
|
+
with:
|
|
35
|
+
languages: python
|
|
36
|
+
# security-extended adds the deeper security queries beyond the default set,
|
|
37
|
+
# without the style/quality noise of security-and-quality.
|
|
38
|
+
queries: security-extended
|
|
39
|
+
# Python is interpreted, so no build step -- CodeQL extracts the source directly.
|
|
40
|
+
- name: Analyze
|
|
41
|
+
uses: github/codeql-action/analyze@v4
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
name: release
|
|
2
|
+
|
|
3
|
+
# Cut a release when a vX.Y.Z tag is pushed: build once, then publish to PyPI
|
|
4
|
+
# (Trusted Publishing -- no stored token) AND attach the artifacts to a GitHub Release.
|
|
5
|
+
#
|
|
6
|
+
# ONE-TIME setup before the first tag (PyPI side, repo owner only):
|
|
7
|
+
# PyPI -> the `steadystate` project -> Publishing -> add a Trusted Publisher:
|
|
8
|
+
# owner = jedi12many, repo = steadystate.ai, workflow = release.yml, environment = pypi
|
|
9
|
+
# (For the very first publish, register it as a "pending publisher" -- the name is reserved
|
|
10
|
+
# on the first successful upload.) No API token is ever stored; PyPI verifies the OIDC identity
|
|
11
|
+
# of this workflow. Make sure __version__ matches the tag (e.g. v0.1.0 -> "0.1.0").
|
|
12
|
+
on:
|
|
13
|
+
push:
|
|
14
|
+
tags:
|
|
15
|
+
- "v*"
|
|
16
|
+
|
|
17
|
+
permissions:
|
|
18
|
+
contents: read # escalated per-job below (least privilege)
|
|
19
|
+
|
|
20
|
+
jobs:
|
|
21
|
+
build:
|
|
22
|
+
runs-on: ubuntu-latest
|
|
23
|
+
steps:
|
|
24
|
+
- uses: actions/checkout@v6
|
|
25
|
+
- uses: actions/setup-python@v6
|
|
26
|
+
with:
|
|
27
|
+
python-version: "3.12"
|
|
28
|
+
- name: Build + check
|
|
29
|
+
run: |
|
|
30
|
+
python -m pip install --upgrade pip
|
|
31
|
+
pip install build twine
|
|
32
|
+
python -m build
|
|
33
|
+
twine check dist/* # sanity-check sdist+wheel metadata before publishing
|
|
34
|
+
- uses: actions/upload-artifact@v4
|
|
35
|
+
with:
|
|
36
|
+
name: dist
|
|
37
|
+
path: dist/
|
|
38
|
+
|
|
39
|
+
pypi:
|
|
40
|
+
needs: build
|
|
41
|
+
runs-on: ubuntu-latest
|
|
42
|
+
# GitHub-hosted, isolated; the `pypi` environment is the trusted-publisher identity PyPI checks.
|
|
43
|
+
environment:
|
|
44
|
+
name: pypi
|
|
45
|
+
url: https://pypi.org/project/steadystate/
|
|
46
|
+
permissions:
|
|
47
|
+
id-token: write # OIDC for Trusted Publishing -- no API token stored
|
|
48
|
+
steps:
|
|
49
|
+
- uses: actions/download-artifact@v4
|
|
50
|
+
with:
|
|
51
|
+
name: dist
|
|
52
|
+
path: dist/
|
|
53
|
+
- name: Publish to PyPI
|
|
54
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
55
|
+
|
|
56
|
+
github-release:
|
|
57
|
+
needs: build
|
|
58
|
+
runs-on: ubuntu-latest
|
|
59
|
+
permissions:
|
|
60
|
+
contents: write # softprops/action-gh-release creates the Release
|
|
61
|
+
steps:
|
|
62
|
+
- uses: actions/download-artifact@v4
|
|
63
|
+
with:
|
|
64
|
+
name: dist
|
|
65
|
+
path: dist/
|
|
66
|
+
- name: Publish GitHub Release
|
|
67
|
+
uses: softprops/action-gh-release@v3
|
|
68
|
+
with:
|
|
69
|
+
files: dist/*
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
name: security
|
|
2
|
+
|
|
3
|
+
# Two security gates that don't fit the main CI lint/type/test flow:
|
|
4
|
+
# audit -- SCA: pip-audit the deps we actually ship (runtime + [llm]/[discord] extras)
|
|
5
|
+
# against the PyPA/OSV advisory DB. The weekly schedule matters: a CVE can be
|
|
6
|
+
# disclosed against a pinned dep long after the last commit. Dependabot
|
|
7
|
+
# (.github/dependabot.yml) then opens the bump PR that turns this green.
|
|
8
|
+
# bandit -- SAST: scan our own source for insecure patterns (reviewed skips in pyproject).
|
|
9
|
+
|
|
10
|
+
on:
|
|
11
|
+
push:
|
|
12
|
+
branches: [main]
|
|
13
|
+
pull_request:
|
|
14
|
+
schedule:
|
|
15
|
+
- cron: "27 4 * * 1" # weekly (Mon 04:27 UTC)
|
|
16
|
+
|
|
17
|
+
permissions:
|
|
18
|
+
contents: read
|
|
19
|
+
|
|
20
|
+
concurrency:
|
|
21
|
+
group: security-${{ github.ref }}
|
|
22
|
+
cancel-in-progress: true
|
|
23
|
+
|
|
24
|
+
jobs:
|
|
25
|
+
audit:
|
|
26
|
+
name: dependency audit (pip-audit)
|
|
27
|
+
runs-on: ubuntu-latest
|
|
28
|
+
timeout-minutes: 10
|
|
29
|
+
steps:
|
|
30
|
+
- uses: actions/checkout@v6
|
|
31
|
+
- uses: actions/setup-python@v6
|
|
32
|
+
with:
|
|
33
|
+
python-version: "3.11" # the supported floor (pyproject requires-python >=3.11)
|
|
34
|
+
- name: Install shipped deps (+ optional extras) and pip-audit
|
|
35
|
+
run: |
|
|
36
|
+
python -m pip install --upgrade pip
|
|
37
|
+
pip install -e ".[llm,discord]"
|
|
38
|
+
pip install pip-audit
|
|
39
|
+
# No --strict: the local editable package isn't on PyPI and is skipped, which --strict
|
|
40
|
+
# would (wrongly) treat as a failure. Plain pip-audit exits non-zero only on a real CVE.
|
|
41
|
+
- name: Audit installed packages for known CVEs
|
|
42
|
+
run: pip-audit --desc
|
|
43
|
+
|
|
44
|
+
bandit:
|
|
45
|
+
name: static analysis (bandit)
|
|
46
|
+
runs-on: ubuntu-latest
|
|
47
|
+
timeout-minutes: 10
|
|
48
|
+
steps:
|
|
49
|
+
- uses: actions/checkout@v6
|
|
50
|
+
- uses: actions/setup-python@v6
|
|
51
|
+
with:
|
|
52
|
+
python-version: "3.11"
|
|
53
|
+
- run: python -m pip install "bandit[toml]"
|
|
54
|
+
# Reads [tool.bandit] from pyproject -- the reviewed skips + the rationale for each.
|
|
55
|
+
- name: Scan source for insecure patterns
|
|
56
|
+
run: bandit -r src -c pyproject.toml
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
.venv/
|
|
4
|
+
venv/
|
|
5
|
+
*.egg-info/
|
|
6
|
+
dist/
|
|
7
|
+
build/
|
|
8
|
+
.pytest_cache/
|
|
9
|
+
.ruff_cache/
|
|
10
|
+
.steadystate.tfplan
|
|
11
|
+
.steadystate/
|
|
12
|
+
.env
|
|
13
|
+
.coverage
|
|
14
|
+
htmlcov/
|
|
15
|
+
|
|
16
|
+
# Leftover agent worktrees (never commit the embedded repos)
|
|
17
|
+
.claude/worktrees/
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
# steadystate.ai — Architecture
|
|
2
|
+
|
|
3
|
+
> **The operational substrate for IT-Ops — human- or agent-driven.** You declared a desired steady state (Terraform, ArgoCD, Kubernetes/Rancher, Ansible, Docker, Helm). steadystate.ai watches your *deployed* infrastructure — live, and in CI — for any departure from it (**drift**: config diverged; **malfunction**: it's failing *now*), answers **is it working?**, carries your team's **runbook**, correlates a malfunction to the drift that caused it, and **closes the loop** — but only ever within a **bound you commit**.
|
|
4
|
+
|
|
5
|
+
Status: **the loop is built, in two postures.** A **live watcher** (a server/CLI next to a deployment, or driven by an agent over MCP) and a **repo-native GitOps** mode (`steadystate ci` — stateless, in the IaC repo, opening a PR/issue) share one deterministic core: drift + **malfunction** detection across the sources (terraform · terraform-state · ansible · kubernetes · rancher · argocd · docker-compose · helm), a **function-first verdict** (`WORKING | DEGRADED | DOWN` via `http` smoke tests + live symptoms), custom health checks, security/compliance domain packs, metric enrichment, an authored **runbook** (`problem → fix`, matched/offered/auto-applied/surfaced), and a guardrailed **observe → suggest → approve → act** loop — gated by an impact×reversibility **bound you commit** in `config.toml`, approvable from terminal / chat (Slack · Discord · Teams) / an agent (MCP), with an append-only audit. This doc describes the design those seams realize; the roadmap (§11) tracks what's done vs next.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## 1. Thesis
|
|
10
|
+
|
|
11
|
+
A system in *steady state* is running **as declared** and running **healthy**: `declared == observed`, *and* the observed system is actually working. A system leaves steady state in one of two ways:
|
|
12
|
+
|
|
13
|
+
- **Drift** — declared ≠ observed *config* (someone changed the firewall; the image isn't what you pinned).
|
|
14
|
+
- **Malfunction** — the config is fine, but the system is *failing*: a crashloop, an OOMKill, a restart storm, an expiring cert.
|
|
15
|
+
|
|
16
|
+
Both are **departures from steady state**, and *reasoning about any departure* — which ones matter, why, and what to do — is the product. Drift is the first-class signal and where the engine began; **malfunction** is the second (the `Symptom` type, §4; shipped via `--probe`). The name was always *steadystate*, not *driftfinder*: a problem that never touched your config is still a problem with your system, and blinding ourselves to it to protect a definition would serve the definition, not the operator.
|
|
17
|
+
|
|
18
|
+
steadystate.ai is **not** a security tool that happens to read config, nor a monitor that happens to know your config. It's a **steady-state reasoning engine** — security and compliance are the first *plugins*; the core never knows what "security" is, a domain pack teaches it.
|
|
19
|
+
|
|
20
|
+
This is a deliberate, lean reframe of an earlier custom-everything system (own agent + own brain + own dashboard). The lesson learned: **only the reasoning is differentiated.** Build that; rent the rest — *including the detection of a malfunction*. The platform already knows a pod is CrashLoopBackOff; we read that verdict and reason about it (correlate it to drift, remember it, explain it, fix it), we do not re-build alerting.
|
|
21
|
+
|
|
22
|
+
## 2. Principles
|
|
23
|
+
|
|
24
|
+
1. **Build the reasoning + the guardrails. Rent everything else.** Collection, storage, dashboards, and execution already exist, and the existing tools are better than anything we would want to build and maintain ourselves.
|
|
25
|
+
2. **Modular from day one.** Five plugin seams + an enricher + a probe (below). Security/compliance/cost are packs, not core.
|
|
26
|
+
3. **Chat-first, thin UI.** Operators live in Slack/Teams. The tool comes to them and talks back. The web UI is config + a read-only view, nothing more.
|
|
27
|
+
4. **Default-quiet.** Steady state = silence. We only surface *departures* (drift, policy violations, malfunction) that clear the bar. (Borrowed, hard-won, from the predecessor.)
|
|
28
|
+
5. **The operator is authoritative.** If a human says "that drift is intentional," we believe them and stop nagging.
|
|
29
|
+
6. **No action without a guardrail.** Every remediation is apply-eligibility-checked, snapshotted, verified, and reversible — whether triggered from chat or anywhere else.
|
|
30
|
+
|
|
31
|
+
## 3. The pipeline
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
DECLARED state (plugins — must be EASY) OBSERVED state (rented)
|
|
35
|
+
terraform · argocd · docker-compose the real cloud/cluster/host, via
|
|
36
|
+
ansible · helm · k8s · pulumi each tool's own diff where possible
|
|
37
|
+
│ via each tool's OWN machine-readable │ (terraform plan, argocd live, …)
|
|
38
|
+
│ output, never raw-file parsing │
|
|
39
|
+
▼ ▼
|
|
40
|
+
┌──────────────────────────────────────────────────┐
|
|
41
|
+
│ Canonical State Model (desired ⇄ observed) │ ← the spine (§4)
|
|
42
|
+
└──────────────────────────────────────────────────┘
|
|
43
|
+
▼
|
|
44
|
+
Departures from steady state — the inputs the engine reasons about (one normalized shape):
|
|
45
|
+
DRIFT declared ≠ observed config (reconciler) [built]
|
|
46
|
+
POLICY declared violates a baseline (domain.evaluate — CIS/STIG) [built]
|
|
47
|
+
SYMPTOM observed unhealthy right now (health probes — crashloop/…) [built §4]
|
|
48
|
+
▼
|
|
49
|
+
Reasoning core (BUILD — the IP)
|
|
50
|
+
Signals → Events → Alerts (3-tier scoring) · correlate ACROSS types · honest LLM "why this matters"
|
|
51
|
+
▼
|
|
52
|
+
Domain packs score it: [security] [compliance] [cost] [reliability] … (plugins, §6)
|
|
53
|
+
▼
|
|
54
|
+
Guardrailed executor (BUILD the guardrails / rent the execution)
|
|
55
|
+
apply-eligibility · snapshot→verify→revert · would-break → your CD/Ansible/terraform
|
|
56
|
+
▼
|
|
57
|
+
Operator (Slack/Teams ChatOps, §7) · read-only UI / Grafana · API
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## 4. The spine: a canonical State Model
|
|
61
|
+
|
|
62
|
+
Everything reduces to one model so the core stays source-agnostic:
|
|
63
|
+
|
|
64
|
+
- **Resource** — `{ kind, identity, properties, provenance, observed_at }`. `provenance` traces back to the source + the declaring file/line.
|
|
65
|
+
- **DeclaredState** / **ObservedState** — sets of Resources from the desired and actual sides.
|
|
66
|
+
|
|
67
|
+
The model carries one entry per *departure from steady state*. Three kinds, all reduced to the same downstream shape:
|
|
68
|
+
|
|
69
|
+
- **Drift** *(built)* — a reconciled divergence: `{ identity, change_type (added/removed/modified), declared, observed, detected_at, provenance }`.
|
|
70
|
+
- **PolicyFinding** *(built — Docker CIS, k8s security)* — a standing-baseline violation generated from declared inventory, *not* from drift: `{ rule_id, identity, severity, references, provenance }`.
|
|
71
|
+
- **Symptom** *(built — `--probe`)* — an operational malfunction of a *declared* resource, observed now: `{ identity, kind, category (CrashLoopBackOff / Restarting / Unhealthy / Exited …), severity, evidence (last log line, restart count), provenance, detected_at }`. The parallel to Drift: where Drift says *config diverged*, Symptom says *config is fine but it's failing*.
|
|
72
|
+
|
|
73
|
+
All three normalize to a **Signal** that the 3-tier scorer, the correlator, the memory store (new/recurring/resolved), the surfaces, and the act loop already handle (`Alert` already carries `drifts` + `findings`; Symptom adds `symptoms`). Adding Symptom is the **same move PolicyFinding already made** — a new input type, not a new pipeline. The payoff is **correlation across types**: a Symptom (`web` crashlooping) grouped with a co-located Drift (its image changed) becomes one root-caused Alert no monitor produces. Two boundaries keep this from becoming a monitor: Symptoms are scoped to **declared resources** (we watch *your* system's steady state, not the whole cluster), and their **detection is rented** — we read the verdict the platform already computes (kubectl pod status, docker state, a firing Prometheus alert), we don't store metrics or scrape all logs.
|
|
74
|
+
|
|
75
|
+
Conventions (learned the hard way): **stable/idempotent resource IDs** (re-ingest is a no-op, not a duplicate), **source ranking** when two sources disagree, and provenance on everything so an Alert can point at the exact line that declared a thing.
|
|
76
|
+
|
|
77
|
+
## 5. Build vs rent
|
|
78
|
+
|
|
79
|
+
| Layer | Decision | Notes |
|
|
80
|
+
|---|---|---|
|
|
81
|
+
| **Collect** | **rent** (thin plugin per source) | Use each tool's own output: `terraform show/plan -json`, ArgoCD API (it already diffs!), `docker compose config`, `ansible-inventory`. |
|
|
82
|
+
| **Reason** | **BUILD — the IP** | Canonical model, reconciler, 3-tier scoring (Signal/Event/Alert), correlation, honest LLM analysis. |
|
|
83
|
+
| **Decide/Act** | **BUILD the guardrails / rent execution** | apply-eligibility + snapshot/revert + would-break; the actual change runs via your CD/terraform/ansible. |
|
|
84
|
+
| **Store** | rent / embed | SQLite when standalone; otherwise the host store. |
|
|
85
|
+
| **Surface** | **rent** | Slack/Teams (primary), API, optional Grafana app. No custom dashboard. |
|
|
86
|
+
|
|
87
|
+
## 6. Plugin model (five seams + an enricher + a probe)
|
|
88
|
+
|
|
89
|
+
The core defines the interfaces; everything domain- or vendor-specific is a plugin, registered in a one-line registry so adding one never edits the core or the CLI.
|
|
90
|
+
|
|
91
|
+
1. **StateSource** — declared state in: `terraform · ansible · kubernetes · rancher · argocd · docker-compose · helm` (`DRIFT_SOURCES`). Each rides its tool's own machine-readable diff and declares its **observe** (read-only, pre-approved) vs **destructive** (needs approval) commands — the per-plugin permission manifest (`steadystate commands`).
|
|
92
|
+
2. **Domain** — what drift *means* (`DEFAULT_DOMAINS`): the AWS/GCP/Azure security packs map exposure-increasing drift to ATT&CK techniques; the Docker CIS pack evaluates a standing-policy baseline. A pack `score`s a drift and/or `evaluate`s the declared inventory, and attaches framework `references`. **This is how security & compliance enter — as packs, not core.**
|
|
93
|
+
3. **Surface** — push Alerts out, and take operator input back. Outbound: `console · slack · teams · discord · prometheus · grafana` (`SURFACES`). Inbound has its own registry (`INBOUND`, mirroring `SURFACES`) over a signed HTTP listener — `listen --from <channel>` — so accepting approvals from a new chat provider is an adapter (verify/handshake/parse/respond, plus an optional defer/complete for async — §11 item 16), not a fork. Slack (HMAC, buttons + slash command), Discord (Ed25519, slash command), and Teams (HMAC, @mention command) ship; the `verify`/`handshake` split is what lets three very different signing + handshake protocols share one listener. Each adapter parses its payload down to a provider-agnostic **Command** (`verb + actor + argument`) over a shared verb grammar — so the listener takes `help`, `pending` (read-only discovery), and `probe <target>` (an on-demand scan — §7) alongside `approve`/`decline`, and a new verb is one registry entry, not a change in every adapter. The listener is the long-lived counterpart to the scheduled scan ([deploy/kubernetes/listener.yaml](deploy/kubernetes/listener.yaml)): the CronJob pushes alerts out, the listener lets chat talk back.
|
|
94
|
+
4. **Executor** — perform a guardrailed remediation, keyed by source (`EXECUTORS`): `terraform`, `ansible`. A source with no executor is observe-only by declaration.
|
|
95
|
+
5. **Correlator** — group Events into Alerts (`CORRELATORS`): `llm` (root cause) or `deterministic` (shared attribute); `auto` chooses by whether a provider is configured.
|
|
96
|
+
|
|
97
|
+
Plus an **Enricher** (`ENRICHERS`): an optional step that cross-references an Alert against live metrics (`prometheus`) and escalates a drift whose resource breaches a PromQL bar right now — a metric threshold, distinct from a health verdict.
|
|
98
|
+
|
|
99
|
+
A **health probe** (`PROBES`, `--probe` / `--probe auto`) — *shipped: `kubectl`, `docker`, `argocd`* — is the producer of `Symptom`s (§4), the operational counterpart to a StateSource. Live probes (kubectl, docker) shell out for health; a snapshot probe (argocd) reads the *same* document the source rides — ArgoCD's per-resource `health.status`, separate from its sync status, so OutOfSync (Drift) + Degraded (Symptom) diagnose into one Alert. Where a StateSource reconciles declared vs observed *config* into Drift, a health probe reads the live *health* of declared resources into Symptoms. Same access the source uses; detection rented, reasoning ours. Each probe declares its read-only **observe** commands the way a source does (`PROBE_CAPABILITIES`, surfaced in `steadystate commands` + the catalog) — so the kubectl probe's `kubectl logs` (the failing pod's evidence) is in the permission contract, and a least-privilege RBAC can be derived from it (`pods` *and* `pods/log`). This is the seam that lets the engine see malfunction with no drift, without becoming a monitor. **It retired the `kubectl`/`docker` enrichers:** once a Symptom is a first-class peer, a Symptom + a co-located Drift *correlate* into one root-caused Alert automatically — which is what "the enricher escalates a drift" was hand-rolling, but stronger (the symptom is evidence in the root cause, not just a severity bump). The pod/container-health detection now lives in the probe; only `--enrich prometheus` (metric-threshold escalation, a different shape) remains an enricher.
|
|
100
|
+
|
|
101
|
+
**Out-of-tree plugins (entry points).** Every registry above is built-in-by-default but *extensible across the packaging boundary*: a separately installed package contributes to a seam by declaring an [entry point](https://packaging.python.org/en/latest/specifications/entry-points/) — `steadystate.sources`, `steadystate.domains`, `steadystate.surfaces`, `steadystate.inbound`, `steadystate.executors`, `steadystate.correlators` — that loads the same shape the in-tree registry holds (a source factory, a `Surface` factory, a `Domain` class, …). At startup each registry overlays what it discovers (`plugins.py`, stdlib `importlib.metadata`) onto its built-ins, with two guarantees: a plugin that fails to import is logged and **skipped** (a broken third-party package never crashes the host or hides the plugins that load), and **built-ins win** every name clash (installing a package can *add* `--source pulumi` but never silently redirect `--source terraform` at its own code). So "add a pack, never edit core" holds for third parties too, not only within this repo.
|
|
102
|
+
|
|
103
|
+
```toml
|
|
104
|
+
# pyproject.toml of some third-party package — no fork, no PR to steadystate
|
|
105
|
+
[project.entry-points."steadystate.sources"]
|
|
106
|
+
pulumi = "acme_steadystate.pulumi:make_source" # make_source(path) -> DriftSource
|
|
107
|
+
[project.entry-points."steadystate.domains"]
|
|
108
|
+
pci = "acme_steadystate.pci:PCIDomain" # zero-arg -> a Domain
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
The engine itself is **not** a plugin inside someone else's product (it's a stateful service + an embeddable library). The *integrations* are the plugins.
|
|
112
|
+
|
|
113
|
+
## 7. Operator communication (ChatOps) — first-class
|
|
114
|
+
|
|
115
|
+
The tool **lives in Slack/Teams**, not in a dashboard you must remember to open.
|
|
116
|
+
|
|
117
|
+
1. **Detect → reason → push.** Drift → an Alert (what drifted, why it matters, recommended fix) → posted to the right channel/thread. Default-quiet.
|
|
118
|
+
2. **Summon — probe on demand** *(shipped).* `@steadystate probe <target>` dispatches a scan/probe of a named target *right now*, **regardless of the schedule** — someone just pinged you about prod, so you send the on-call agent to look rather than SSH-ing in yourself. It runs the full pipeline (drift + Symptoms + correlation) through the shared engine (`engine.build_report`, the same path the `scan` CLI runs) and posts the result back to the thread. This is the operator-**initiated** counterpart to the scheduled run; scheduling itself stays rented (cron / CI / a CronJob), and chat is the out-of-band trigger *and* the result surface. A `target` is a named, pre-registered scan config (`source` + `path` + `label`) in the listener's `STEADYSTATE_TARGETS` file, so `probe prod-k8s` resolves to "k8s, these manifests, env=prod-k8s." It runs **read-only**: it reports drift + health and never records or applies — so chat is a trigger, never a bypass. "Stateless" is softened to "reads, never writes" for one reason: the probe **honors the operator's mutes/snoozes** by default (the same `is_suppressed` rule the reconcile uses, read-only), so silenced noise stays quiet on demand too. It says how many findings it withheld, and `unmute` bypasses suppression for that run so a stale mute can never hide a live incident.
|
|
119
|
+
3. **Converse** (operator replies in-thread, to the generative AI):
|
|
120
|
+
- **Understand** — "what changed?", "why does it matter?", "show the diff", "who declared this?" → grounded answers (declared vs observed + git provenance).
|
|
121
|
+
- **Acknowledge / declare intent** — "that was me, intentional" → acked, trusted, won't re-nag. Also snooze / false-positive.
|
|
122
|
+
- **Remediate** — "fix it / bring it back to declared" → guardrailed executor (apply-eligibility → snapshot → apply → verify → offer revert). The AI states the action + blast radius and waits for go.
|
|
123
|
+
- **Escalate** — page / open a ticket.
|
|
124
|
+
4. **Record.** The thread + actions become the Alert's audit trail. The conversation *is* the documentation.
|
|
125
|
+
|
|
126
|
+
**The inbound seam carries commands, not just approvals.** The `INBOUND` adapters (§6) normalize an operator message to a provider-agnostic **Command** (`verb + actor + argument`), not a fixed approve/decline pair — so `listen` is a ChatOps command surface, not only an approval listener. *Shipped:* the `Command` type and a shared verb grammar, with the read-only discovery commands **`help`** (renders itself from the command registry, so an operator who didn't set up the deployment can ask what this listener accepts) and **`pending`** (the open remediations + their fingerprints), plus **`probe <target>`** (Summon — resolves a named target and runs the read-only scan engine) — across all three providers (Teams @mention · Slack slash command · Discord slash subcommand).
|
|
127
|
+
|
|
128
|
+
**Bidirectional** (Events API + bot), not outbound-only. **Chat is a trigger, not a bypass:** operator identity → role, high-blast-radius needs explicit confirm, same guardrails as everywhere.
|
|
129
|
+
|
|
130
|
+
This is why the **web UI is thin**: onboarding/config, a read-only Alerts list, settings/audit. (Could even be a Grafana app → zero owned frontend.)
|
|
131
|
+
|
|
132
|
+
## 8. Decisions (locked)
|
|
133
|
+
|
|
134
|
+
- **Language: Python.** The maintainers' language; the AI ecosystem is Python-first; there's no hot-path agent anymore (collection is rented), so Rust/Go's perf isn't needed; integrations are subprocess + JSON + HTTP. Ship via `pip`/`uv`/Docker.
|
|
135
|
+
- **First source: Terraform.** Others as StateSource plugins later.
|
|
136
|
+
- **v0 = drift only.** Nail declared-vs-observed drift + reasoning before any domain pack. Then add packs (CIS, STIG, …) one at a time — modularity lets us try styles.
|
|
137
|
+
- **Positioning: separate / adjacent** to the predecessor product; clean-room, its own domain.
|
|
138
|
+
- **Thesis scope: drift + malfunction, not "monitoring."** Steady state includes *health*, not just config, so operational malfunction is a first-class departure (§1, §4) — the product is *steadystate*, not *driftfinder*. The boundary that keeps this from drifting into Datadog/Loki territory: Symptoms are scoped to **declared resources** and their **detection is rented** (we read existing health verdicts and reason about them; we don't store metrics, scrape all logs, or run alerting rules). *Decision: yes. Built: the `Symptom` type, the kubectl/docker/argocd probes, cross-type diagnosis (§11), and chat-summoned probes (§7).*
|
|
139
|
+
|
|
140
|
+
## 9. v0 scope (the thinnest thing that proves the spine)
|
|
141
|
+
|
|
142
|
+
`steadystate scan ./infra` →
|
|
143
|
+
1. **Terraform StateSource**: run `terraform plan -json` (terraform already diffs declared vs real cloud state) → parse resource changes.
|
|
144
|
+
2. **Reconcile** those into **Drift** records (canonical model).
|
|
145
|
+
3. **Reason**: 3-tier scoring (signals → events → alerts) + an honest LLM "why this drift matters" → **Alerts**.
|
|
146
|
+
4. **Surface**: print to console (and a Slack push behind a flag).
|
|
147
|
+
|
|
148
|
+
No domain packs, no executor, no UI yet. Proves: ingest → reconcile → reason → surface, and the plugin seams.
|
|
149
|
+
|
|
150
|
+
## 10. Open decisions
|
|
151
|
+
|
|
152
|
+
- **License** — MIT vs Apache-2.0 (patent grant; common for infra OSS) vs a source-available license if open-core protection matters. *Owner: you.*
|
|
153
|
+
- **Surface order** — Slack first, then Teams? (Slack's bot/Events API is the faster build.)
|
|
154
|
+
- **Plugin mechanism for out-of-tree packs** — in-process Python entry points to start; gRPC/WASM later if we want language-agnostic third-party packs.
|
|
155
|
+
- **Observed-state beyond tf-plan** — ride each tool's native diff (tf plan, argocd live) first; a generic cloud-API observer is a later, bigger build.
|
|
156
|
+
|
|
157
|
+
## 11. Roadmap
|
|
158
|
+
|
|
159
|
+
**Done:**
|
|
160
|
+
1. Drift v0 — Terraform → Alerts (console / Slack / Teams).
|
|
161
|
+
2. Three-tier scoring + Brain Tuning; LLM **and** deterministic correlation (a registered seam).
|
|
162
|
+
3. Domain packs — AWS/GCP/Azure security (+ ATT&CK references) and Docker CIS compliance.
|
|
163
|
+
4. Memoryful scan — SQLite store: new/recurring/resolved, mute/snooze.
|
|
164
|
+
5. More sources — ArgoCD, docker-compose, Kubernetes, Rancher (Fleet), Ansible.
|
|
165
|
+
6. Observability — Prometheus/Grafana surfaces + Prometheus enrichment; LLM spend visibility + kill switch.
|
|
166
|
+
7. Guardrailed executor, per-plugin (terraform + ansible) and the **observe → suggest → approve → act** loop, approvable from the terminal or a Slack button.
|
|
167
|
+
8. `--autonomy auto` — self-apply every eligible remediation through that same guardrailed core; the apply gate is deterministic (the LLM never decides), so a REMOVED drift is never eligible and auto reconciles toward declared config without destroying.
|
|
168
|
+
9. Generalized **inbound seam** — the approval listener is a registry (`INBOUND`) of provider adapters over one stdlib HTTP shell; a new chat provider is an adapter, not a fork. Slack, Discord, and Teams ship.
|
|
169
|
+
10. Alerts self-identify (*which* resource drifted, and `--label` for *which* environment); a remediation **audit log** (`history`) — the append-only accountability trail every approve/decline/auto-apply writes to, the floor under turning `--autonomy auto` on for real.
|
|
170
|
+
11. Kubernetes security pack (`security-k8s`) — a standing Pod Security baseline (privileged, host namespaces, capabilities, hostPath) over declared manifests, mapped to CIS Kubernetes + ATT&CK; the same `evaluate`-the-baseline rail the Docker CIS pack rides.
|
|
171
|
+
12. Live-health enrichers — a drift-anchored kubectl/docker correlation (CrashLoopBackOff / restarts / unhealthy container + the failing pod/container's last log line) that escalated *"failing since it drifted."* This was the detection the `Symptom` probe (below) promoted from "escalate" to "originate" — and then **retired** the enrichers (item 14).
|
|
172
|
+
13. **Operational malfunction as a first-class departure** (the thesis evolution, §1/§4) — the `Symptom` type and the `probe/` seam (`--probe auto | kubectl | docker | argocd`) that produces Symptoms for declared resources even with no drift, riding the same Signal/Event/Alert pipeline, and **cross-type diagnosis**: a Symptom co-located with a Drift folds into one root-caused Alert. A probe exists wherever health is distinct from drift (k8s pods, compose containers, ArgoCD's health field); terraform/ansible/rancher have none, by design. Scope guardrails held: declared resources only, detection rented.
|
|
173
|
+
14. **Retired the kubectl/docker enrichers** now that the probes subsume them — correlation does the escalation, stronger (the symptom is evidence in the root cause, not a severity bump). The pod/container-health detection moved into `probe/{kubectl,docker}.py`; `--enrich prometheus` (metric-threshold, a different shape) stays.
|
|
174
|
+
15. **Chat is two-way — the inbound `Command` seam + Summon** (§7): the `INBOUND` adapters parse a provider-agnostic `Command` (`verb + actor + flags`) over one shared grammar, so the listener takes `help` · `targets` · `pending` · `findings` · `history` (read-only discovery), **`probe <target> [verbose|cost]`** (Summon — an on-demand scan of a named target from `STEADYSTATE_TARGETS`, read-only, with `verbose` showing the declared→observed evidence), and `mute`/`approve`/`decline` (writes), across all three providers + a local `chat` REPL. Summon runs through the shared `engine.build_report` the `scan` CLI uses, so there's one reasoning path. The persistent listener ships as a Deployment ([deploy/kubernetes/listener.yaml](deploy/kubernetes/listener.yaml)) — the long-lived counterpart to the scheduled CronJob.
|
|
175
|
+
16. **Async summons** — a slow scan (a live `terraform plan`) can exceed a chat provider's ~3s interaction window. `dispatch` now returns an optional *deferred work* item alongside the immediate reply: for a `probe` on a provider that supports it, the listener ACKs at once and runs the scan in a background thread, posting the result back through the provider's channel (Discord: edit the deferred message; Slack: POST the `response_url`). This is an optional `defer`/`complete` adapter capability (probed by attribute, like the rest of the seam), so Teams — which has no `response_url` — stays synchronous and unchanged. The result post is read-only and best-effort; a flaky post never crashes the listener.
|
|
176
|
+
17. **Out-of-tree plugin discovery** (§6) — the registries stop being repo-bound: every seam (sources · domains · surfaces · inbound · executors · correlators) now overlays `importlib.metadata` entry points (`steadystate.<seam>`) on its built-ins, so a *separately installed* package extends steadystate without a fork (`plugins.py`, stdlib only). Discovery is isolated (a plugin that fails to import is logged and skipped) and safe (built-ins win every name clash — a package can add a backend, never hijack a shipped name). This closes the last in-tree-only seam: "add a pack, never edit core" is now true for third parties too.
|
|
177
|
+
18. **Function-first verdict — "is it *working*?"** The `health` command answers `WORKING | DEGRADED | DOWN`: an `http` **smoke test** check kind (exercise the endpoint — a service that won't answer IS down) plus the live symptoms, scoped to a workload and **correlated** with the drift that likely caused them. `summary` leads with what's *impaired* (a live malfunction) over mere drift/posture, so neither a human nor an agent chases a red herring — but a high-severity drift (an opened firewall) is **flagged for review**, never buried. Custom **health checks** (`define-check`/`add-check`) and **metric enrichment** (a pluggable Prometheus adapter, consumed as context, never reimplemented) round out the live picture; **silos** (`--silo`, like `git -C`) name per-deployment walls.
|
|
178
|
+
19. **The honest gate, made legible** — a `posture` verb that states plainly what steadystate bounds *and where that ends* (a shell-enabled agent's real limit is its RBAC, not us); the **sole-actuator** (contained-agent) model where steadystate is an agent's only tool; and a middle MCP grant tier (**`--author`**) that lets an agent write checks + runbook solutions without the power to touch infra.
|
|
179
|
+
20. **The runbook (solutions)** — operator-vouched `problem → fix` entries (`solutions.json`), authored (`add-solution`/`define-solution`, signed), **learned** (`learn` surfaces a fix you keep applying by hand and hands you the capture), **matched** to a finding (category or title regex), **offered** as a one-`approve` remediation, optionally **auto-applied** within the bound (`STEADYSTATE_SOLUTION_AUTO`), and **surfaced** wherever the problem lands (`show`, a CI-opened issue). Your tribal knowledge as a first-class, gated, auditable artifact — the catalog you grow yourself.
|
|
180
|
+
21. **Two postures + config as code** — a **repo-native GitOps** mode alongside the live watcher: `steadystate ci` (stateless, deterministic, no creds) scans the IaC, gates the merge, and opens a PR/issue (the github-issues surface dedups, auto-closes, and carries the matched runbook fix). A **`terraform-state`** source diffs config-vs-state with `-refresh=false` (state-bucket read, no broad cloud creds). And a committed **`steadystate/config.toml`** — `[defaults]` (source/path), `[bound]` (the autonomy envelope, *reviewed in a PR*), `[ci]` — unifies config beside `checks.json`/`solutions.json`, 12-factor (`flag > env > config > default`). See [docs/repo-native-posture.md](docs/repo-native-posture.md).
|
|
181
|
+
|
|
182
|
+
**Next:**
|
|
183
|
+
- More `config.toml` tables (`[autonomy]`, surfaces) · the matched runbook fix in more surfaces (ServiceNow/Slack) · a live kube-prometheus enrichment run.
|
|
184
|
+
- More sources (Pulumi) · more domain packs (STIG, cost) · Dockerfile reader for the CIS pack.
|