argus-sre 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. argus_sre-0.2.0/.dockerignore +8 -0
  2. argus_sre-0.2.0/.github/workflows/ci.yml +66 -0
  3. argus_sre-0.2.0/.github/workflows/publish.yml +21 -0
  4. argus_sre-0.2.0/.github/workflows/release.yml +25 -0
  5. argus_sre-0.2.0/.gitignore +6 -0
  6. argus_sre-0.2.0/CHANGELOG.md +44 -0
  7. argus_sre-0.2.0/Dockerfile +12 -0
  8. argus_sre-0.2.0/PKG-INFO +17 -0
  9. argus_sre-0.2.0/README.md +148 -0
  10. argus_sre-0.2.0/argus/__init__.py +0 -0
  11. argus_sre-0.2.0/argus/clients/__init__.py +0 -0
  12. argus_sre-0.2.0/argus/clients/alertmanager.py +11 -0
  13. argus_sre-0.2.0/argus/clients/kubernetes.py +413 -0
  14. argus_sre-0.2.0/argus/clients/ollama.py +26 -0
  15. argus_sre-0.2.0/argus/clients/prometheus.py +44 -0
  16. argus_sre-0.2.0/argus/clients/qdrant.py +79 -0
  17. argus_sre-0.2.0/argus/commands/__init__.py +0 -0
  18. argus_sre-0.2.0/argus/commands/agent.py +548 -0
  19. argus_sre-0.2.0/argus/commands/alert.py +41 -0
  20. argus_sre-0.2.0/argus/commands/ask.py +121 -0
  21. argus_sre-0.2.0/argus/commands/diagnose.py +177 -0
  22. argus_sre-0.2.0/argus/commands/incident.py +170 -0
  23. argus_sre-0.2.0/argus/commands/model.py +64 -0
  24. argus_sre-0.2.0/argus/commands/runbook.py +128 -0
  25. argus_sre-0.2.0/argus/commands/status.py +75 -0
  26. argus_sre-0.2.0/argus/config.py +34 -0
  27. argus_sre-0.2.0/argus/main.py +137 -0
  28. argus_sre-0.2.0/argus/mcp_server.py +127 -0
  29. argus_sre-0.2.0/argus/scripts/consume_vault.py +153 -0
  30. argus_sre-0.2.0/argus/scripts/ingest_vault.py +166 -0
  31. argus_sre-0.2.0/argus/scripts/vault_git_publisher.py +141 -0
  32. argus_sre-0.2.0/argus/scripts/vault_watcher.py +115 -0
  33. argus_sre-0.2.0/pyproject.toml +44 -0
@@ -0,0 +1,8 @@
1
+ .git
2
+ __pycache__
3
+ *.pyc
4
+ *.pyo
5
+ .pytest_cache
6
+ .ruff_cache
7
+ dist
8
+ *.egg-info
@@ -0,0 +1,66 @@
1
+ name: CI
2
+
3
+ on:
4
+ pull_request:
5
+ branches: [main]
6
+ push:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ check-commits:
11
+ name: Conventional Commits
12
+ runs-on: ubuntu-latest
13
+ if: github.event_name == 'pull_request'
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ with:
17
+ fetch-depth: 0
18
+ - uses: actions/setup-python@v5
19
+ with:
20
+ python-version: "3.11"
21
+ - run: pip install commitizen
22
+ - run: cz check --rev-range origin/${{ github.base_ref }}..HEAD
23
+
24
+ lint:
25
+ name: Lint
26
+ runs-on: ubuntu-latest
27
+ steps:
28
+ - uses: actions/checkout@v4
29
+ - uses: actions/setup-python@v5
30
+ with:
31
+ python-version: "3.11"
32
+ - run: pip install ruff
33
+ - run: ruff check argus/
34
+
35
+ smoke-test:
36
+ name: Import Smoke Test
37
+ runs-on: ubuntu-latest
38
+ steps:
39
+ - uses: actions/checkout@v4
40
+ - uses: actions/setup-python@v5
41
+ with:
42
+ python-version: "3.11"
43
+ - run: pip install -e .
44
+ - run: python -c "from argus.main import app; from argus.commands import agent, diagnose, incident; print('OK')"
45
+
46
+ docker:
47
+ name: Build & Push Image
48
+ runs-on: ubuntu-latest
49
+ if: github.event_name == 'push' && github.ref == 'refs/heads/main'
50
+ needs: [lint, smoke-test]
51
+ permissions:
52
+ contents: read
53
+ packages: write
54
+ steps:
55
+ - uses: actions/checkout@v4
56
+ - uses: docker/login-action@v3
57
+ with:
58
+ registry: ghcr.io
59
+ username: ${{ github.actor }}
60
+ password: ${{ secrets.GITHUB_TOKEN }}
61
+ - uses: docker/build-push-action@v5
62
+ with:
63
+ context: .
64
+ push: true
65
+ tags: ghcr.io/t12-pybash/argus:latest
66
+ labels: org.opencontainers.image.revision=${{ github.sha }}
@@ -0,0 +1,21 @@
1
+ name: Publish
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*.*.*"
7
+
8
+ jobs:
9
+ publish:
10
+ name: Publish to PyPI
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ - uses: actions/setup-python@v5
15
+ with:
16
+ python-version: "3.11"
17
+ - run: pip install hatch
18
+ - run: hatch build
19
+ - uses: pypa/gh-action-pypi-publish@release/v1
20
+ with:
21
+ password: ${{ secrets.PYPI_API_TOKEN }}
@@ -0,0 +1,25 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*.*.*"
7
+
8
+ jobs:
9
+ release:
10
+ name: Build & Publish to PyPI
11
+ runs-on: ubuntu-latest
12
+ environment: pypi
13
+ permissions:
14
+ id-token: write
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ - uses: actions/setup-python@v5
18
+ with:
19
+ python-version: "3.11"
20
+ - name: Install hatch
21
+ run: pip install hatch
22
+ - name: Build
23
+ run: hatch build
24
+ - name: Publish to PyPI
25
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,6 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ .venv/
5
+ dist/
6
+ *.egg-info/
@@ -0,0 +1,44 @@
1
+ ## v0.2.0 (2026-06-27)
2
+
3
+ ### Feat
4
+
5
+ - **diagnose**: RAG over Obsidian vault — search Qdrant for runbooks before LLM call
6
+ - **diagnose**: use deepseek-r1:7b for root cause analysis, strip <think> blocks from output
7
+ - **argus**: add 'argus model' command family (status, rollout, rollback)
8
+ - **status**: add KServe InferenceService health to argus status
9
+ - expand agent remediation action space
10
+ - rename package to argus, add agent command and CI/CD pipeline
11
+ - direct Qdrant upsert on save — bypass vault sync pipeline
12
+ - add sre diagnose command and enrich ask with pod exit reasons
13
+ - add problem pods, deployment readiness, disk usage, and Flux sync context to sre ask
14
+ - inject running pod context into sre ask
15
+ - replace rsync CronJob with git-based vault publisher
16
+ - add CPU and memory context to sre ask with node name mapping
17
+ - add Dockerfile and SRE_CONFIG_FILE support
18
+ - add incremental re-indexing to ingest_vault
19
+ - add sre incident command with vault note creation and cluster snapshot
20
+ - add sre runbook command with semantic vault search and optional LLM synthesis
21
+ - initial sre CLI with status, alert, ask commands
22
+
23
+ ### Fix
24
+
25
+ - **diagnose**: exit early when pod has no evidence of problems
26
+ - remove unused subprocess import from agent
27
+ - update Dockerfile and CI for argus rename
28
+ - rename all sre references to argus
29
+ - strip bold markers from Slack remediation plan
30
+ - rename sre agent strings to argus agent
31
+ - escape jsonpath braces in diagnose f-string prompt
32
+ - improve diagnose prompt accuracy and filter low-score chunks from runbook --ask
33
+ - add safe.directory config for PVC ownership mismatch
34
+ - use deterministic MD5 point IDs in both vault ingest scripts
35
+ - replace qdrant-client with httpx REST calls for search_vault
36
+ - qdrant port 443, verify=False, batch upsert for vault ingestion
37
+ - **ask**: add optional api key auth, fix default model name to mistral
38
+ - **ask**: use LiteLLM OpenAI-compatible API instead of internal Ollama URL
39
+
40
+ ### Perf
41
+
42
+ - parallelise status/diagnose/incident and add HTTP connection pooling
43
+ - batch pod kubectl calls and parallelise flux sync checks
44
+ - parallelise context gathering in sre ask
@@ -0,0 +1,12 @@
1
+ FROM python:3.12-slim
2
+
3
+ WORKDIR /app
4
+
5
+ COPY pyproject.toml README.md ./
6
+ COPY argus ./argus
7
+
8
+ RUN apt-get update && apt-get install -y --no-install-recommends git openssh-client && \
9
+ rm -rf /var/lib/apt/lists/* && \
10
+ pip install --no-cache-dir -e .
11
+
12
+ CMD ["python", "-m", "argus.scripts.ingest_vault"]
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.4
2
+ Name: argus-sre
3
+ Version: 0.2.0
4
+ Summary: Autonomous SRE intelligence for sovereign infrastructure
5
+ License: Apache-2.0
6
+ Requires-Python: >=3.11
7
+ Requires-Dist: confluent-kafka>=2.3
8
+ Requires-Dist: httpx>=0.27
9
+ Requires-Dist: mcp[cli]>=1.0
10
+ Requires-Dist: qdrant-client>=1.9
11
+ Requires-Dist: rich>=13
12
+ Requires-Dist: tomllib; python_version < '3.11'
13
+ Requires-Dist: typer>=0.12
14
+ Requires-Dist: watchdog>=4.0
15
+ Provides-Extra: dev
16
+ Requires-Dist: commitizen>=3.0; extra == 'dev'
17
+ Requires-Dist: ruff>=0.4; extra == 'dev'
@@ -0,0 +1,148 @@
1
+ # argus
2
+
3
+ Autonomous SRE agent for Kubernetes — sovereign, on-premises, no data leaves the cluster.
4
+
5
+ Built by a Lead SRE with 14 years of production infrastructure experience. Argus combines live cluster observability, semantic search over a local knowledge base, LLM-powered diagnosis, and an autonomous remediation agent with human-in-the-loop control.
6
+
7
+ Every competitor (Datadog Bits AI, Dynatrace Davis, PagerDuty AI) sends your operational data to an external cloud. Argus doesn't.
8
+
9
+ ## Architecture
10
+
11
+ ```
12
+ ┌─────────────────────────────────────────────────┐
13
+ │ Argus │
14
+ │ │
15
+ │ argus status ──▶ Prometheus │
16
+ │ argus alert ──▶ Alertmanager │
17
+ │ argus ask ──▶ LiteLLM / Ollama │
18
+ │ argus diagnose ──▶ kubectl + LLM │
19
+ │ argus runbook ──▶ Qdrant (vector search) │
20
+ │ argus incident ──▶ Qdrant + vault │
21
+ │ argus agent ──▶ Alertmanager + kubectl │
22
+ │ + LLM + Slack │
23
+ └─────────────────────────────────────────────────┘
24
+
25
+
26
+ Qdrant knowledge base
27
+ (runbooks, incident history, architecture docs)
28
+ ingested in real-time via Redpanda
29
+ ```
30
+
31
+ ## Commands
32
+
33
+ ```
34
+ argus status Node health, firing alerts, high-restart pods
35
+ argus alert Active Alertmanager alerts in plain English
36
+ argus ask "<question>" LLM query with live Prometheus context
37
+ argus diagnose <namespace/pod> Logs + events + LLM root cause analysis
38
+ argus diagnose <namespace/pod> --save Diagnose and index result to Qdrant
39
+ argus runbook <query> Semantic search over runbooks and vault notes
40
+ argus runbook <query> --ask Same, with LLM synthesis of top results
41
+ argus incident start "<title>" Open incident, snapshot cluster state
42
+ argus incident note "<message>" Append timestamped observation
43
+ argus incident close "<summary>" Close with resolution summary
44
+ argus incident show Show active incident
45
+ argus agent start Start autonomous daemon (human-in-the-loop)
46
+ argus agent start --auto Start in full-auto remediation mode
47
+ argus agent approve Approve pending remediation action
48
+ argus agent reject Reject pending remediation action
49
+ argus agent skip Skip current alert
50
+ argus agent status Show agent state and pending action
51
+ ```
52
+
53
+ ## Agent
54
+
55
+ The agent daemon watches Alertmanager, diagnoses alerts automatically, and proposes remediation:
56
+
57
+ ```
58
+ Alert fires → agent wakes → argus diagnose → search runbooks
59
+ → LLM picks action → notifies Slack → awaits approval
60
+ → executes → indexes outcome to Qdrant
61
+ ```
62
+
63
+ Supported remediation actions:
64
+
65
+ | Action | Trigger |
66
+ |---|---|
67
+ | `rollout_restart` | Pod crashlooping or stuck |
68
+ | `rollout_undo` | New rollout caused the alert |
69
+ | `force_delete_pod` | Pod stuck in Terminating or Unknown |
70
+ | `scale_deployment` | Resource pressure |
71
+ | `expand_pvc` | PVC near capacity |
72
+ | `delete_failed_jobs` | Failed batch jobs accumulating |
73
+ | `cordon_node` | Node unhealthy |
74
+ | `drain_node` | Node needs evacuating |
75
+ | `flux_reconcile` | Flux sync stuck or drifted |
76
+ | `flux_suspend` | HelmRelease looping upgrade failures |
77
+ | `restart_coredns` | DNS resolution failures cluster-wide |
78
+
79
+ Human approval is required by default. Use `--auto` to execute immediately.
80
+
81
+ ## Installation
82
+
83
+ ```bash
84
+ pip install argus-sre
85
+ ```
86
+
87
+ Or from source (requires Python 3.11+ and [uv](https://github.com/astral-sh/uv)):
88
+
89
+ ```bash
90
+ git clone https://github.com/t12-pybash/argus
91
+ cd argus
92
+ uv pip install -e .
93
+ ```
94
+
95
+ ## Configuration
96
+
97
+ Config is loaded from `~/.config/argus/config.toml`. All values can be overridden with `ARGUS_` environment variables.
98
+
99
+ ```toml
100
+ prometheus_url = "https://prometheus.your-cluster.example"
101
+ alertmanager_url = "https://alertmanager.your-cluster.example"
102
+ qdrant_url = "https://qdrant.your-cluster.example"
103
+ ollama_url = "https://litellm.your-cluster.example"
104
+ ollama_model = "llama3.2"
105
+ embed_ollama_url = "http://localhost:11434"
106
+ embed_model = "nomic-embed-text"
107
+ vault_path = "/home/user/Documents/obsidian-vault"
108
+ agent_webhook_url = "https://hooks.slack.com/services/..."
109
+ agent_approval_timeout = "300"
110
+ ```
111
+
112
+ ## Knowledge Base
113
+
114
+ Argus maintains a Qdrant vector knowledge base of runbooks, incident history, and architecture docs. The `argus runbook` command searches this knowledge base and `argus diagnose --save` indexes new diagnoses into it, so the agent gets smarter with every incident.
115
+
116
+ Real-time ingestion via Redpanda keeps the index in sync as vault documents are updated.
117
+
118
+ ## MCP Server
119
+
120
+ Argus exposes an MCP server for use with Claude Code and other AI assistants:
121
+
122
+ ```bash
123
+ claude mcp add argus -- argus-mcp
124
+ ```
125
+
126
+ | Tool | Description |
127
+ |---|---|
128
+ | `get_cluster_status` | Node status, firing alerts, high-restart pods |
129
+ | `get_alerts` | Active Alertmanager alerts with severity |
130
+ | `prometheus_query` | Arbitrary PromQL execution |
131
+ | `search_vault` | Semantic search over the Qdrant knowledge base |
132
+
133
+ ## Stack
134
+
135
+ - [Typer](https://typer.tiangolo.com/) — CLI framework
136
+ - [Rich](https://rich.readthedocs.io/) — terminal output
137
+ - [FastMCP](https://github.com/jlowin/fastmcp) — MCP server
138
+ - [httpx](https://www.python-httpx.org/) — HTTP client
139
+ - [Qdrant](https://qdrant.tech/) — vector database
140
+ - [Ollama](https://ollama.ai/) / [LiteLLM](https://github.com/BerriAI/litellm) — local LLM inference
141
+
142
+ ## Related
143
+
144
+ - [homelab-gitops](https://github.com/t12-pybash/homelab-gitops) — the cluster this operates (Flux, Cilium, private AI platform)
145
+
146
+ ---
147
+
148
+ *Ciarán Donegan — Lead SRE | [LinkedIn](https://www.linkedin.com/in/cdonegan7/) | [t-12.io](https://t-12.io)*
File without changes
File without changes
@@ -0,0 +1,11 @@
1
+ import httpx
2
+
3
+
4
class AlertmanagerClient:
    """Minimal HTTP client for the Alertmanager v2 REST API.

    Args:
        url: Base URL of the Alertmanager instance. A trailing slash is
            stripped so endpoint paths can be appended directly.
        verify: Passed through to ``httpx.get`` as ``verify``. Defaults to
            ``False`` (TLS certificate verification disabled) to preserve the
            previous hard-coded behaviour; pass ``True`` to verify
            certificates.
        timeout: Request timeout in seconds, passed through to ``httpx.get``.
            Defaults to ``10``, the previously hard-coded value.
    """

    def __init__(self, url: str, *, verify: bool = False, timeout: float = 10):
        self.url = url.rstrip("/")
        # NOTE(review): verify=False skips TLS certificate checks. It stays
        # the default only for backward compatibility with existing callers.
        self.verify = verify
        self.timeout = timeout

    def alerts(self) -> list:
        """Fetch ``/api/v2/alerts`` and return the decoded JSON body.

        Raises:
            httpx.HTTPStatusError: If the response has a 4xx/5xx status
                (raised by ``raise_for_status``).
        """
        r = httpx.get(
            f"{self.url}/api/v2/alerts",
            verify=self.verify,
            timeout=self.timeout,
        )
        r.raise_for_status()
        return r.json()