argus-ops 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. argus_ops-0.3.0/LICENSE +21 -0
  2. argus_ops-0.3.0/PKG-INFO +384 -0
  3. argus_ops-0.3.0/README.md +339 -0
  4. argus_ops-0.3.0/pyproject.toml +79 -0
  5. argus_ops-0.3.0/setup.cfg +4 -0
  6. argus_ops-0.3.0/src/argus_ops/__init__.py +3 -0
  7. argus_ops-0.3.0/src/argus_ops/ai/__init__.py +1 -0
  8. argus_ops-0.3.0/src/argus_ops/ai/base.py +25 -0
  9. argus_ops-0.3.0/src/argus_ops/ai/cost.py +62 -0
  10. argus_ops-0.3.0/src/argus_ops/ai/provider.py +234 -0
  11. argus_ops-0.3.0/src/argus_ops/analyzers/__init__.py +9 -0
  12. argus_ops-0.3.0/src/argus_ops/analyzers/base.py +43 -0
  13. argus_ops-0.3.0/src/argus_ops/analyzers/node_health.py +128 -0
  14. argus_ops-0.3.0/src/argus_ops/analyzers/pod_health.py +192 -0
  15. argus_ops-0.3.0/src/argus_ops/analyzers/resource.py +166 -0
  16. argus_ops-0.3.0/src/argus_ops/cli.py +504 -0
  17. argus_ops-0.3.0/src/argus_ops/collectors/__init__.py +1 -0
  18. argus_ops-0.3.0/src/argus_ops/collectors/base.py +49 -0
  19. argus_ops-0.3.0/src/argus_ops/collectors/k8s.py +430 -0
  20. argus_ops-0.3.0/src/argus_ops/config.py +235 -0
  21. argus_ops-0.3.0/src/argus_ops/engine/__init__.py +4 -0
  22. argus_ops-0.3.0/src/argus_ops/engine/pipeline.py +290 -0
  23. argus_ops-0.3.0/src/argus_ops/healers/__init__.py +1 -0
  24. argus_ops-0.3.0/src/argus_ops/logging_config.py +83 -0
  25. argus_ops-0.3.0/src/argus_ops/models.py +106 -0
  26. argus_ops-0.3.0/src/argus_ops/notifiers/__init__.py +1 -0
  27. argus_ops-0.3.0/src/argus_ops/reporters/__init__.py +29 -0
  28. argus_ops-0.3.0/src/argus_ops/reporters/console.py +173 -0
  29. argus_ops-0.3.0/src/argus_ops/reporters/json_reporter.py +62 -0
  30. argus_ops-0.3.0/src/argus_ops/store.py +297 -0
  31. argus_ops-0.3.0/src/argus_ops/web/__init__.py +1 -0
  32. argus_ops-0.3.0/src/argus_ops/web/api.py +176 -0
  33. argus_ops-0.3.0/src/argus_ops/web/watch_service.py +232 -0
  34. argus_ops-0.3.0/src/argus_ops.egg-info/PKG-INFO +384 -0
  35. argus_ops-0.3.0/src/argus_ops.egg-info/SOURCES.txt +45 -0
  36. argus_ops-0.3.0/src/argus_ops.egg-info/dependency_links.txt +1 -0
  37. argus_ops-0.3.0/src/argus_ops.egg-info/entry_points.txt +2 -0
  38. argus_ops-0.3.0/src/argus_ops.egg-info/requires.txt +22 -0
  39. argus_ops-0.3.0/src/argus_ops.egg-info/top_level.txt +1 -0
  40. argus_ops-0.3.0/tests/test_analyzers.py +290 -0
  41. argus_ops-0.3.0/tests/test_api.py +293 -0
  42. argus_ops-0.3.0/tests/test_cli.py +274 -0
  43. argus_ops-0.3.0/tests/test_config.py +82 -0
  44. argus_ops-0.3.0/tests/test_models.py +116 -0
  45. argus_ops-0.3.0/tests/test_pipeline.py +241 -0
  46. argus_ops-0.3.0/tests/test_reporters.py +64 -0
  47. argus_ops-0.3.0/tests/test_store.py +178 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Mason Kim
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,384 @@
1
+ Metadata-Version: 2.4
2
+ Name: argus-ops
3
+ Version: 0.3.0
4
+ Summary: AI-powered infrastructure monitoring CLI that detects, diagnoses, and remediates issues across Kubernetes, VMs, and bare metal
5
+ Author-email: Mason Kim <ehehwnwjs5052@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/mason5052/argus-ops
8
+ Project-URL: Repository, https://github.com/mason5052/argus-ops
9
+ Project-URL: Issues, https://github.com/mason5052/argus-ops/issues
10
+ Keywords: aiops,monitoring,kubernetes,infrastructure,devops,sre,observability
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: System Administrators
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: System :: Monitoring
20
+ Classifier: Topic :: System :: Systems Administration
21
+ Requires-Python: >=3.10
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: click>=8.0
25
+ Requires-Dist: pydantic>=2.0.0
26
+ Requires-Dist: pyyaml>=6.0
27
+ Requires-Dist: rich>=13.0.0
28
+ Requires-Dist: litellm>=1.0.0
29
+ Requires-Dist: kubernetes>=28.0.0
30
+ Requires-Dist: jinja2>=3.0
31
+ Requires-Dist: tenacity>=8.2.0
32
+ Provides-Extra: dev
33
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
34
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
35
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
36
+ Requires-Dist: httpx>=0.27.0; extra == "dev"
37
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
38
+ Requires-Dist: bandit[toml]>=1.7.0; extra == "dev"
39
+ Requires-Dist: safety>=3.0.0; extra == "dev"
40
+ Provides-Extra: web
41
+ Requires-Dist: fastapi>=0.110.0; extra == "web"
42
+ Requires-Dist: uvicorn[standard]>=0.27.0; extra == "web"
43
+ Requires-Dist: httpx>=0.27.0; extra == "web"
44
+ Dynamic: license-file
45
+
46
+ # Argus-Ops
47
+
48
+ [![CI](https://github.com/mason5052/argus-ops/actions/workflows/ci.yml/badge.svg)](https://github.com/mason5052/argus-ops/actions/workflows/ci.yml)
49
+ [![PyPI version](https://badge.fury.io/py/argus-ops.svg)](https://badge.fury.io/py/argus-ops)
50
+ [![Docker Pulls](https://img.shields.io/docker/pulls/mason530984/argus-ops.svg)](https://hub.docker.com/r/mason530984/argus-ops)
51
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
52
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
53
+ [![codecov](https://codecov.io/gh/mason5052/argus-ops/graph/badge.svg)](https://codecov.io/gh/mason5052/argus-ops)
54
+
55
+ AI-powered infrastructure monitoring CLI that detects issues, diagnoses root causes,
56
+ and (coming soon) executes remediation across Kubernetes, VMs, and bare metal servers.
57
+
58
+ ```bash
59
+ pip install argus-ops
60
+ argus-ops scan
61
+ argus-ops diagnose
62
+ ```
63
+
64
+ ## The Problem
65
+
66
+ Modern infrastructure runs dozens of services across hundreds of nodes. Existing tools
67
+ either surface alerts without context (Prometheus), require deep K8s expertise to interpret
68
+ (kubectl), or lock you into expensive SaaS platforms. There is no open-source CLI tool that:
69
+
70
+ - Scans your entire infrastructure in seconds
71
+ - Uses AI to explain *why* something is broken in plain English
72
+ - Proposes specific remediation steps based on your actual cluster state
73
+ - Works with any LLM (OpenAI, Anthropic, Ollama, 100+ providers)
74
+
75
+ Argus-Ops fills that gap.
76
+
77
+ ## Architecture
78
+
79
+ ```mermaid
80
+ flowchart LR
81
+ subgraph Collect
82
+ K8s[K8s Collector\npods / nodes / events\ndeployments / CronJobs]
83
+ end
84
+
85
+ subgraph Analyze
86
+ PA[PodHealth\nAnalyzer]
87
+ NA[NodeHealth\nAnalyzer]
88
+ RA[Resource\nAnalyzer]
89
+ end
90
+
91
+ subgraph AI Diagnose
92
+ CB[Circuit Breaker\n+ Retry]
93
+ LLM[LiteLLM\n100+ providers]
94
+ PV[Pydantic\nvalidation]
95
+ end
96
+
97
+ subgraph Report
98
+ RC[Rich Console\n+ JSON]
99
+ API[FastAPI\nDashboard]
100
+ DB[(SQLite\nIncidentStore)]
101
+ end
102
+
103
+ K8s --> PA & NA & RA
104
+ PA & NA & RA --> CB
105
+ CB --> LLM --> PV --> RC
106
+ PV --> DB --> API
107
+ ```
108
+
109
+ The pipeline is modular -- each stage uses pluggable abstract base classes,
110
+ so you can add custom collectors, analyzers, or AI providers without touching core code.
111
+
112
+ ## Quick Start
113
+
114
+ ### Install
115
+
116
+ ```bash
117
+ # Core CLI
118
+ pip install argus-ops
119
+
120
+ # With web dashboard
121
+ pip install "argus-ops[web]"
122
+ ```
123
+
124
+ ### Docker
125
+
126
+ ```bash
127
+ docker run --rm -it \
128
+ -v ~/.kube:/home/argus/.kube:ro \
129
+ -v ~/.argus-ops:/home/argus/.argus-ops \
130
+ -e OPENAI_API_KEY=sk-... \
131
+ mason530984/argus-ops:latest \
132
+ argus-ops scan
133
+ ```
134
+
135
+ ### Helm (Kubernetes)
136
+
137
+ ```bash
138
+ helm repo add argus-ops https://mason5052.github.io/argus-ops
139
+ helm repo update
140
+
141
+ # Install into the monitoring namespace
142
+ helm install argus-ops argus-ops/argus-ops \
143
+ --namespace monitoring --create-namespace \
144
+ --set existingSecret.name=argus-ops-secrets
145
+
146
+ # Note: create the API key secret before running the install command above:
147
+ kubectl create secret generic argus-ops-secrets \
148
+ --from-literal=openai-api-key=sk-... \
149
+ -n monitoring
150
+ ```
151
+
152
+ ### Configure
153
+
154
+ ```bash
155
+ argus-ops config init
156
+ # Edit ~/.argus-ops/config.yaml to set your AI provider
157
+ export OPENAI_API_KEY=sk-...
158
+ ```
159
+
160
+ ### Run
161
+
162
+ ```bash
163
+ # Scan only (no AI, fast, free)
164
+ argus-ops scan
165
+
166
+ # Scan + AI root cause analysis
167
+ argus-ops diagnose
168
+
169
+ # Filter by severity
170
+ argus-ops scan --severity high
171
+
172
+ # Specific namespaces
173
+ argus-ops scan --namespace rpa --namespace zrpa-demo
174
+
175
+ # JSON output
176
+ argus-ops scan --output json | jq '.[] | select(.severity == "critical")'
177
+
178
+ # Use a different AI model
179
+ argus-ops diagnose --model gpt-4o
180
+ argus-ops diagnose --model claude-sonnet-4-6
181
+ argus-ops diagnose --model ollama/llama3.2 # local, no API key needed
182
+
183
+ # Web dashboard (requires argus-ops[web])
184
+ argus-ops serve
185
+ ```
186
+
187
+ ## Supported AI Providers
188
+
189
+ Argus-Ops uses [LiteLLM](https://github.com/BerriAI/litellm) for unified LLM access.
190
+ Set your preferred model in config.yaml and export the API key:
191
+
192
+ | Provider | Model Example | API Key Env Var |
193
+ |------------|--------------------------------|-----------------------|
194
+ | OpenAI | gpt-4o-mini, gpt-4o | OPENAI_API_KEY |
195
+ | Anthropic | claude-haiku-4-5-20251001 | ANTHROPIC_API_KEY |
196
+ | Ollama | ollama/llama3.2 (local) | (none required) |
197
+ | Azure | azure/gpt-4o | AZURE_API_KEY |
198
+ | Bedrock | bedrock/claude-3-haiku | AWS credentials |
199
+ | Gemini | gemini/gemini-1.5-flash | GEMINI_API_KEY |
200
+
201
+ For Ollama (fully local, no data sent to cloud):
202
+
203
+ ```yaml
204
+ # ~/.argus-ops/config.yaml
205
+ ai:
206
+ model: ollama/llama3.2
207
+ base_url: http://localhost:11434
208
+ ```
209
+
210
+ ## Built-in Detections
211
+
212
+ | Analyzer | Detections |
213
+ |---------------|---------------------------------------------------------------------|
214
+ | pod_health | CrashLoopBackOff, OOMKilled, ImagePullBackOff, Pending timeout, Failed |
215
+ | node_health | NotReady, MemoryPressure, DiskPressure, PIDPressure, Cordoned |
216
+ | resource | Containers without CPU/memory limits, memory allocation ratio |
217
+
218
+ ## Configuration
219
+
220
+ ```yaml
221
+ # ~/.argus-ops/config.yaml
222
+
223
+ ai:
224
+ provider: openai
225
+ model: gpt-4o-mini
226
+ api_key_env: OPENAI_API_KEY # env var containing the API key
227
+ base_url: null # custom URL (Ollama: http://localhost:11434)
228
+ temperature: 0.3
229
+ max_tokens: 4096
230
+ cost_limit_per_run: 0.50 # USD -- stops AI calls if exceeded
231
+
232
+ targets:
233
+ kubernetes:
234
+ enabled: true
235
+ kubeconfig: null # null = default ~/.kube/config
236
+ context: null # null = current context
237
+ namespaces: [] # empty = scan all
238
+ exclude_namespaces:
239
+ - kube-system
240
+
241
+ analyzers:
242
+ resource:
243
+ memory_warning: 85 # % allocation to trigger warning
244
+ memory_critical: 95
245
+ pod_health:
246
+ crashloop_restart_threshold: 5
247
+ pending_timeout_minutes: 10
248
+ ```
249
+
250
+ Environment variables override the config file (they take priority over values in config.yaml):
251
+
252
+ ```bash
253
+ ARGUS_OPS_AI_MODEL=gpt-4o argus-ops diagnose
254
+ ARGUS_OPS_AI_BASE_URL=http://localhost:11434 argus-ops diagnose --model ollama/llama3.2
255
+ ```
256
+
257
+ ## Repository Structure
258
+
259
+ ```
260
+ src/argus_ops/
261
+ cli.py - Click CLI (scan, diagnose, config, serve commands)
262
+ config.py - YAML + env var config loader
263
+ models.py - Pydantic data models (Finding, Diagnosis, Incident)
264
+ store.py - SQLite incident history (WAL mode, survives restarts)
265
+ logging_config.py - JSON-structured logging + RotatingFileHandler
266
+ collectors/ - Infrastructure data collection
267
+ base.py - BaseCollector ABC
268
+ k8s.py - Kubernetes collector (timeout, event redaction)
269
+ analyzers/ - Rule-based anomaly detection
270
+ base.py - BaseAnalyzer ABC
271
+ pod_health.py - CrashLoopBackOff, OOMKilled, Pending, ImagePullBackOff
272
+ node_health.py - NotReady, pressure conditions, cordoned nodes
273
+ resource.py - Missing CPU/memory limits, allocation ratios
274
+ ai/ - LiteLLM AI provider + Jinja2 prompt templates
275
+ provider.py - LiteLLM with Pydantic validation, 32 KB response limit
276
+ cost.py - Token/cost tracking (Decimal arithmetic)
277
+ engine/
278
+ pipeline.py - Collect->Analyze->Diagnose with retry + circuit breaker
279
+ reporters/ - Rich console + JSON output formatters
280
+ web/
281
+ api.py - FastAPI endpoints
282
+ watch_service.py - Background scan loop + DiagnoseStatus enum
283
+ tests/
284
+ conftest.py - Shared fixtures (mock K8s snapshots, findings)
285
+ fixtures/ - JSON mock data for offline testing
286
+ test_analyzers.py - 15 analyzer tests
287
+ test_models.py - 9 model tests
288
+ test_config.py - 9 config tests
289
+ test_pipeline.py - 15 pipeline tests (incl. circuit breaker)
290
+ test_reporters.py - 5 reporter tests
291
+ test_store.py - 16 SQLite store tests
292
+ test_cli.py - 19 CLI tests
293
+ test_api.py - 23 API endpoint tests
294
+ deploy/k8s/ - Kubernetes deployment manifests
295
+ ```
296
+
297
+ ## Reliability Features
298
+
299
+ Argus-Ops is built for production use:
300
+
301
+ - **Exponential backoff retry** (tenacity): each collector retries up to 3 times with exponentially increasing waits (2s -> 4s -> 8s) before reporting failure
302
+ - **Circuit breaker** per collector: after 3 consecutive failures, the circuit opens for 60 seconds to prevent thundering-herd API calls against an unreachable cluster
303
+ - **K8s API timeouts**: all Kubernetes API calls use a 30-second timeout
304
+ - **LLM timeouts**: all LiteLLM completion calls use a 60-second timeout
305
+ - **LLM response validation**: responses are limited to 32 KB and validated with a Pydantic model
306
+ - **Event message redaction**: Bearer tokens, private registry credentials, and RFC-1918 IPs are stripped from event messages before they are sent to AI providers
307
+ - **SQLite incident history**: incidents persist across restarts with WAL journal mode
308
+ - **JSON structured logging**: machine-readable logs with RotatingFileHandler (10 MB / 5 backups)
309
+
310
+ ## Extending Argus-Ops
311
+
312
+ ### Add a custom collector
313
+
314
+ ```python
315
+ from argus_ops.collectors.base import BaseCollector
316
+ from argus_ops.models import HealthSnapshot, InfraType
317
+
318
+ class MyCollector(BaseCollector):
319
+ @property
320
+ def name(self) -> str:
321
+ return "my_collector"
322
+
323
+ @property
324
+ def infra_type(self) -> InfraType:
325
+ return InfraType.KUBERNETES
326
+
327
+ def is_available(self) -> bool:
328
+ return True
329
+
330
+ def collect(self) -> list[HealthSnapshot]:
331
+ return [HealthSnapshot(
332
+ collector_name=self.name,
333
+ infra_type=self.infra_type,
334
+ target="my://target",
335
+ data={"custom": "data"},
336
+ )]
337
+ ```
338
+
339
+ ### Add a custom analyzer
340
+
341
+ ```python
342
+ from argus_ops.analyzers.base import BaseAnalyzer
343
+ from argus_ops.models import Finding, FindingCategory, HealthSnapshot, Severity, InfraType
344
+
345
+ class MyAnalyzer(BaseAnalyzer):
346
+ @property
347
+ def name(self) -> str:
348
+ return "my_analyzer"
349
+
350
+ def analyze(self, snapshots: list[HealthSnapshot]) -> list[Finding]:
351
+ findings = []
352
+ for snapshot in snapshots:
353
+ # your detection logic here
354
+ pass
355
+ return findings
356
+ ```
357
+
358
+ ## Roadmap
359
+
360
+ - [x] v0.1.0 - K8s scan + AI diagnosis + Rich console output
361
+ - [x] v0.2.0 - Security hardening (timeouts, event redaction, LLM response validation)
362
+ - [x] v0.3.0 - Reliability hardening (circuit breaker, retry, SQLite history, 109 tests)
363
+ - [ ] v0.4.0 - SSH collector (bare metal/VMs) + Slack/Teams notifications
364
+ - [ ] v0.5.0 - Remediation engine with human approval gate (K8s Healer)
365
+ - [ ] v1.0.0 - Helm chart, Docker Hub image, stable API
366
+
367
+ ## Contributing
368
+
369
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for the full guide: dev environment setup,
370
+ how to add collectors and analyzers, test requirements, and PR checklist.
371
+
372
+ ## Security
373
+
374
+ To report a vulnerability, see [SECURITY.md](SECURITY.md).
375
+
376
+ ## Author
377
+
378
+ Mason Kim - DevSecOps Engineer
379
+ GitHub: [mason5052](https://github.com/mason5052)
380
+ LinkedIn: [linkedin.com/in/mason-kim-devops](https://linkedin.com/in/mason-kim-devops)
381
+
382
+ ## License
383
+
384
+ MIT License - see [LICENSE](LICENSE) for details.