argus-sre 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- argus_sre-0.2.0/.dockerignore +8 -0
- argus_sre-0.2.0/.github/workflows/ci.yml +66 -0
- argus_sre-0.2.0/.github/workflows/publish.yml +21 -0
- argus_sre-0.2.0/.github/workflows/release.yml +25 -0
- argus_sre-0.2.0/.gitignore +6 -0
- argus_sre-0.2.0/CHANGELOG.md +44 -0
- argus_sre-0.2.0/Dockerfile +12 -0
- argus_sre-0.2.0/PKG-INFO +17 -0
- argus_sre-0.2.0/README.md +148 -0
- argus_sre-0.2.0/argus/__init__.py +0 -0
- argus_sre-0.2.0/argus/clients/__init__.py +0 -0
- argus_sre-0.2.0/argus/clients/alertmanager.py +11 -0
- argus_sre-0.2.0/argus/clients/kubernetes.py +413 -0
- argus_sre-0.2.0/argus/clients/ollama.py +26 -0
- argus_sre-0.2.0/argus/clients/prometheus.py +44 -0
- argus_sre-0.2.0/argus/clients/qdrant.py +79 -0
- argus_sre-0.2.0/argus/commands/__init__.py +0 -0
- argus_sre-0.2.0/argus/commands/agent.py +548 -0
- argus_sre-0.2.0/argus/commands/alert.py +41 -0
- argus_sre-0.2.0/argus/commands/ask.py +121 -0
- argus_sre-0.2.0/argus/commands/diagnose.py +177 -0
- argus_sre-0.2.0/argus/commands/incident.py +170 -0
- argus_sre-0.2.0/argus/commands/model.py +64 -0
- argus_sre-0.2.0/argus/commands/runbook.py +128 -0
- argus_sre-0.2.0/argus/commands/status.py +75 -0
- argus_sre-0.2.0/argus/config.py +34 -0
- argus_sre-0.2.0/argus/main.py +137 -0
- argus_sre-0.2.0/argus/mcp_server.py +127 -0
- argus_sre-0.2.0/argus/scripts/consume_vault.py +153 -0
- argus_sre-0.2.0/argus/scripts/ingest_vault.py +166 -0
- argus_sre-0.2.0/argus/scripts/vault_git_publisher.py +141 -0
- argus_sre-0.2.0/argus/scripts/vault_watcher.py +115 -0
- argus_sre-0.2.0/pyproject.toml +44 -0
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
pull_request:
|
|
5
|
+
branches: [main]
|
|
6
|
+
push:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
check-commits:
|
|
11
|
+
name: Conventional Commits
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
if: github.event_name == 'pull_request'
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
with:
|
|
17
|
+
fetch-depth: 0
|
|
18
|
+
- uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: "3.11"
|
|
21
|
+
- run: pip install commitizen
|
|
22
|
+
- run: cz check --rev-range origin/${{ github.base_ref }}..HEAD
|
|
23
|
+
|
|
24
|
+
lint:
|
|
25
|
+
name: Lint
|
|
26
|
+
runs-on: ubuntu-latest
|
|
27
|
+
steps:
|
|
28
|
+
- uses: actions/checkout@v4
|
|
29
|
+
- uses: actions/setup-python@v5
|
|
30
|
+
with:
|
|
31
|
+
python-version: "3.11"
|
|
32
|
+
- run: pip install ruff
|
|
33
|
+
- run: ruff check argus/
|
|
34
|
+
|
|
35
|
+
smoke-test:
|
|
36
|
+
name: Import Smoke Test
|
|
37
|
+
runs-on: ubuntu-latest
|
|
38
|
+
steps:
|
|
39
|
+
- uses: actions/checkout@v4
|
|
40
|
+
- uses: actions/setup-python@v5
|
|
41
|
+
with:
|
|
42
|
+
python-version: "3.11"
|
|
43
|
+
- run: pip install -e .
|
|
44
|
+
- run: python -c "from argus.main import app; from argus.commands import agent, diagnose, incident; print('OK')"
|
|
45
|
+
|
|
46
|
+
docker:
|
|
47
|
+
name: Build & Push Image
|
|
48
|
+
runs-on: ubuntu-latest
|
|
49
|
+
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
|
50
|
+
needs: [lint, smoke-test]
|
|
51
|
+
permissions:
|
|
52
|
+
contents: read
|
|
53
|
+
packages: write
|
|
54
|
+
steps:
|
|
55
|
+
- uses: actions/checkout@v4
|
|
56
|
+
- uses: docker/login-action@v3
|
|
57
|
+
with:
|
|
58
|
+
registry: ghcr.io
|
|
59
|
+
username: ${{ github.actor }}
|
|
60
|
+
password: ${{ secrets.GITHUB_TOKEN }}
|
|
61
|
+
- uses: docker/build-push-action@v5
|
|
62
|
+
with:
|
|
63
|
+
context: .
|
|
64
|
+
push: true
|
|
65
|
+
tags: ghcr.io/t12-pybash/argus:latest
|
|
66
|
+
labels: org.opencontainers.image.revision=${{ github.sha }}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
name: Publish
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*.*.*"
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
publish:
|
|
10
|
+
name: Publish to PyPI
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
- uses: actions/setup-python@v5
|
|
15
|
+
with:
|
|
16
|
+
python-version: "3.11"
|
|
17
|
+
- run: pip install hatch
|
|
18
|
+
- run: hatch build
|
|
19
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
20
|
+
with:
|
|
21
|
+
password: ${{ secrets.PYPI_API_TOKEN }}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*.*.*"
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
release:
|
|
10
|
+
name: Build & Publish to PyPI
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
environment: pypi
|
|
13
|
+
permissions:
|
|
14
|
+
id-token: write
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: "3.11"
|
|
20
|
+
- name: Install hatch
|
|
21
|
+
run: pip install hatch
|
|
22
|
+
- name: Build
|
|
23
|
+
run: hatch build
|
|
24
|
+
- name: Publish to PyPI
|
|
25
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
## v0.2.0 (2026-06-27)
|
|
2
|
+
|
|
3
|
+
### Feat
|
|
4
|
+
|
|
5
|
+
- **diagnose**: RAG over Obsidian vault — search Qdrant for runbooks before LLM call
|
|
6
|
+
- **diagnose**: use deepseek-r1:7b for root cause analysis, strip <think> blocks from output
|
|
7
|
+
- **argus**: add 'argus model' command family (status, rollout, rollback)
|
|
8
|
+
- **status**: add KServe InferenceService health to argus status
|
|
9
|
+
- expand agent remediation action space
|
|
10
|
+
- rename package to argus, add agent command and CI/CD pipeline
|
|
11
|
+
- direct Qdrant upsert on save — bypass vault sync pipeline
|
|
12
|
+
- add sre diagnose command and enrich ask with pod exit reasons
|
|
13
|
+
- add problem pods, deployment readiness, disk usage, and Flux sync context to sre ask
|
|
14
|
+
- inject running pod context into sre ask
|
|
15
|
+
- replace rsync CronJob with git-based vault publisher
|
|
16
|
+
- add CPU and memory context to sre ask with node name mapping
|
|
17
|
+
- add Dockerfile and SRE_CONFIG_FILE support
|
|
18
|
+
- add incremental re-indexing to ingest_vault
|
|
19
|
+
- add sre incident command with vault note creation and cluster snapshot
|
|
20
|
+
- add sre runbook command with semantic vault search and optional LLM synthesis
|
|
21
|
+
- initial sre CLI with status, alert, ask commands
|
|
22
|
+
|
|
23
|
+
### Fix
|
|
24
|
+
|
|
25
|
+
- **diagnose**: exit early when pod has no evidence of problems
|
|
26
|
+
- remove unused subprocess import from agent
|
|
27
|
+
- update Dockerfile and CI for argus rename
|
|
28
|
+
- rename all sre references to argus
|
|
29
|
+
- strip bold markers from Slack remediation plan
|
|
30
|
+
- rename sre agent strings to argus agent
|
|
31
|
+
- escape jsonpath braces in diagnose f-string prompt
|
|
32
|
+
- improve diagnose prompt accuracy and filter low-score chunks from runbook --ask
|
|
33
|
+
- add safe.directory config for PVC ownership mismatch
|
|
34
|
+
- use deterministic MD5 point IDs in both vault ingest scripts
|
|
35
|
+
- replace qdrant-client with httpx REST calls for search_vault
|
|
36
|
+
- qdrant port 443, verify=False, batch upsert for vault ingestion
|
|
37
|
+
- **ask**: add optional api key auth, fix default model name to mistral
|
|
38
|
+
- **ask**: use LiteLLM OpenAI-compatible API instead of internal Ollama URL
|
|
39
|
+
|
|
40
|
+
### Perf
|
|
41
|
+
|
|
42
|
+
- parallelise status/diagnose/incident and add HTTP connection pooling
|
|
43
|
+
- batch pod kubectl calls and parallelise flux sync checks
|
|
44
|
+
- parallelise context gathering in sre ask
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
FROM python:3.12-slim
|
|
2
|
+
|
|
3
|
+
WORKDIR /app
|
|
4
|
+
|
|
5
|
+
COPY pyproject.toml README.md ./
|
|
6
|
+
COPY argus ./argus
|
|
7
|
+
|
|
8
|
+
RUN apt-get update && apt-get install -y --no-install-recommends git openssh-client && \
|
|
9
|
+
rm -rf /var/lib/apt/lists/* && \
|
|
10
|
+
pip install --no-cache-dir -e .
|
|
11
|
+
|
|
12
|
+
CMD ["python", "-m", "argus.scripts.ingest_vault"]
|
argus_sre-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: argus-sre
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Autonomous SRE intelligence for sovereign infrastructure
|
|
5
|
+
License: Apache-2.0
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Requires-Dist: confluent-kafka>=2.3
|
|
8
|
+
Requires-Dist: httpx>=0.27
|
|
9
|
+
Requires-Dist: mcp[cli]>=1.0
|
|
10
|
+
Requires-Dist: qdrant-client>=1.9
|
|
11
|
+
Requires-Dist: rich>=13
|
|
12
|
+
Requires-Dist: tomllib; python_version < '3.11'
|
|
13
|
+
Requires-Dist: typer>=0.12
|
|
14
|
+
Requires-Dist: watchdog>=4.0
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: commitizen>=3.0; extra == 'dev'
|
|
17
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# argus
|
|
2
|
+
|
|
3
|
+
Autonomous SRE agent for Kubernetes — sovereign, on-premises, no data leaves the cluster.
|
|
4
|
+
|
|
5
|
+
Built by a Lead SRE with 14 years of production infrastructure experience. Argus combines live cluster observability, semantic search over a local knowledge base, LLM-powered diagnosis, and an autonomous remediation agent with human-in-the-loop control.
|
|
6
|
+
|
|
7
|
+
Every competitor (Datadog Bits AI, Dynatrace Davis, PagerDuty AI) sends your operational data to an external cloud. Argus doesn't.
|
|
8
|
+
|
|
9
|
+
## Architecture
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
┌─────────────────────────────────────────────────┐
|
|
13
|
+
│ Argus │
|
|
14
|
+
│ │
|
|
15
|
+
│ argus status ──▶ Prometheus │
|
|
16
|
+
│ argus alert ──▶ Alertmanager │
|
|
17
|
+
│ argus ask ──▶ LiteLLM / Ollama │
|
|
18
|
+
│ argus diagnose ──▶ kubectl + LLM │
|
|
19
|
+
│ argus runbook ──▶ Qdrant (vector search) │
|
|
20
|
+
│ argus incident ──▶ Qdrant + vault │
|
|
21
|
+
│ argus agent ──▶ Alertmanager + kubectl │
|
|
22
|
+
│ + LLM + Slack │
|
|
23
|
+
└─────────────────────────────────────────────────┘
|
|
24
|
+
│
|
|
25
|
+
▼
|
|
26
|
+
Qdrant knowledge base
|
|
27
|
+
(runbooks, incident history, architecture docs)
|
|
28
|
+
ingested in real-time via Redpanda
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Commands
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
argus status Node health, firing alerts, high-restart pods
|
|
35
|
+
argus alert Active Alertmanager alerts in plain English
|
|
36
|
+
argus ask "<question>" LLM query with live Prometheus context
|
|
37
|
+
argus diagnose <namespace/pod> Logs + events + LLM root cause analysis
|
|
38
|
+
argus diagnose <namespace/pod> --save Diagnose and index result to Qdrant
|
|
39
|
+
argus runbook <query> Semantic search over runbooks and vault notes
|
|
40
|
+
argus runbook <query> --ask Same, with LLM synthesis of top results
|
|
41
|
+
argus incident start "<title>" Open incident, snapshot cluster state
|
|
42
|
+
argus incident note "<message>" Append timestamped observation
|
|
43
|
+
argus incident close "<summary>" Close with resolution summary
|
|
44
|
+
argus incident show Show active incident
|
|
45
|
+
argus agent start Start autonomous daemon (human-in-the-loop)
|
|
46
|
+
argus agent start --auto Start in full-auto remediation mode
|
|
47
|
+
argus agent approve Approve pending remediation action
|
|
48
|
+
argus agent reject Reject pending remediation action
|
|
49
|
+
argus agent skip Skip current alert
|
|
50
|
+
argus agent status Show agent state and pending action
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Agent
|
|
54
|
+
|
|
55
|
+
The agent daemon watches Alertmanager, diagnoses alerts automatically, and proposes remediation:
|
|
56
|
+
|
|
57
|
+
```
|
|
58
|
+
Alert fires → agent wakes → argus diagnose → search runbooks
|
|
59
|
+
→ LLM picks action → notifies Slack → awaits approval
|
|
60
|
+
→ executes → indexes outcome to Qdrant
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Supported remediation actions:
|
|
64
|
+
|
|
65
|
+
| Action | Trigger |
|
|
66
|
+
|---|---|
|
|
67
|
+
| `rollout_restart` | Pod crashlooping or stuck |
|
|
68
|
+
| `rollout_undo` | New rollout caused the alert |
|
|
69
|
+
| `force_delete_pod` | Pod stuck in Terminating or Unknown |
|
|
70
|
+
| `scale_deployment` | Resource pressure |
|
|
71
|
+
| `expand_pvc` | PVC near capacity |
|
|
72
|
+
| `delete_failed_jobs` | Failed batch jobs accumulating |
|
|
73
|
+
| `cordon_node` | Node unhealthy |
|
|
74
|
+
| `drain_node` | Node needs evacuating |
|
|
75
|
+
| `flux_reconcile` | Flux sync stuck or drifted |
|
|
76
|
+
| `flux_suspend` | HelmRelease looping upgrade failures |
|
|
77
|
+
| `restart_coredns` | DNS resolution failures cluster-wide |
|
|
78
|
+
|
|
79
|
+
Human approval is required by default. Start the agent with `argus agent start --auto` to execute remediation actions immediately, without waiting for approval.
|
|
80
|
+
|
|
81
|
+
## Installation
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
pip install argus-sre
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Or from source (requires Python 3.11+ and [uv](https://github.com/astral-sh/uv)):
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
git clone https://github.com/t12-pybash/argus
|
|
91
|
+
cd argus
|
|
92
|
+
uv pip install -e .
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## Configuration
|
|
96
|
+
|
|
97
|
+
Config is loaded from `~/.config/argus/config.toml`. All values can be overridden with `ARGUS_`-prefixed environment variables.
|
|
98
|
+
|
|
99
|
+
```toml
|
|
100
|
+
prometheus_url = "https://prometheus.your-cluster.example"
|
|
101
|
+
alertmanager_url = "https://alertmanager.your-cluster.example"
|
|
102
|
+
qdrant_url = "https://qdrant.your-cluster.example"
|
|
103
|
+
ollama_url = "https://litellm.your-cluster.example"
|
|
104
|
+
ollama_model = "llama3.2"
|
|
105
|
+
embed_ollama_url = "http://localhost:11434"
|
|
106
|
+
embed_model = "nomic-embed-text"
|
|
107
|
+
vault_path = "/home/user/Documents/obsidian-vault"
|
|
108
|
+
agent_webhook_url = "https://hooks.slack.com/services/..."
|
|
109
|
+
agent_approval_timeout = "300"
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## Knowledge Base
|
|
113
|
+
|
|
114
|
+
Argus maintains a Qdrant vector knowledge base of runbooks, incident history, and architecture docs. `argus runbook` searches this knowledge base and `argus diagnose --save` writes new diagnoses to it, so the agent gets smarter with every incident.
|
|
115
|
+
|
|
116
|
+
Real-time ingestion via Redpanda keeps the index in sync as vault documents are updated.
|
|
117
|
+
|
|
118
|
+
## MCP Server
|
|
119
|
+
|
|
120
|
+
Argus exposes an MCP server for use with Claude Code and other AI assistants:
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
claude mcp add argus -- argus-mcp
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
| Tool | Description |
|
|
127
|
+
|---|---|
|
|
128
|
+
| `get_cluster_status` | Node status, firing alerts, high-restart pods |
|
|
129
|
+
| `get_alerts` | Active Alertmanager alerts with severity |
|
|
130
|
+
| `prometheus_query` | Arbitrary PromQL execution |
|
|
131
|
+
| `search_vault` | Semantic search over the Qdrant knowledge base |
|
|
132
|
+
|
|
133
|
+
## Stack
|
|
134
|
+
|
|
135
|
+
- [Typer](https://typer.tiangolo.com/) — CLI framework
|
|
136
|
+
- [Rich](https://rich.readthedocs.io/) — terminal output
|
|
137
|
+
- [FastMCP](https://github.com/jlowin/fastmcp) — MCP server
|
|
138
|
+
- [httpx](https://www.python-httpx.org/) — HTTP client
|
|
139
|
+
- [Qdrant](https://qdrant.tech/) — vector database
|
|
140
|
+
- [Ollama](https://ollama.ai/) / [LiteLLM](https://github.com/BerriAI/litellm) — local LLM inference
|
|
141
|
+
|
|
142
|
+
## Related
|
|
143
|
+
|
|
144
|
+
- [homelab-gitops](https://github.com/t12-pybash/homelab-gitops) — the cluster this operates (Flux, Cilium, private AI platform)
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
*Ciarán Donegan — Lead SRE | [LinkedIn](https://www.linkedin.com/in/cdonegan7/) | [t-12.io](https://t-12.io)*
|
|
File without changes
|
|
File without changes
|