opspilot_ai-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opspilot/__init__.py +0 -0
- opspilot/agents/fixer.py +46 -0
- opspilot/agents/planner.py +74 -0
- opspilot/agents/remediation.py +200 -0
- opspilot/agents/verifier.py +67 -0
- opspilot/cli.py +360 -0
- opspilot/config.py +22 -0
- opspilot/context/__init__.py +26 -0
- opspilot/context/deployment_history.py +347 -0
- opspilot/context/deps.py +14 -0
- opspilot/context/docker.py +17 -0
- opspilot/context/env.py +17 -0
- opspilot/context/logs.py +16 -0
- opspilot/context/production_logs.py +262 -0
- opspilot/context/project.py +19 -0
- opspilot/diffs/redis.py +23 -0
- opspilot/graph/engine.py +33 -0
- opspilot/graph/nodes.py +41 -0
- opspilot/memory.py +24 -0
- opspilot/memory_redis.py +322 -0
- opspilot/state.py +18 -0
- opspilot/tools/__init__.py +52 -0
- opspilot/tools/dep_tools.py +5 -0
- opspilot/tools/env_tools.py +5 -0
- opspilot/tools/log_tools.py +11 -0
- opspilot/tools/pattern_analysis.py +194 -0
- opspilot/utils/__init__.py +1 -0
- opspilot/utils/llm.py +23 -0
- opspilot/utils/llm_providers.py +499 -0
- opspilot_ai-0.1.0.dist-info/METADATA +408 -0
- opspilot_ai-0.1.0.dist-info/RECORD +35 -0
- opspilot_ai-0.1.0.dist-info/WHEEL +5 -0
- opspilot_ai-0.1.0.dist-info/entry_points.txt +2 -0
- opspilot_ai-0.1.0.dist-info/licenses/LICENSE +21 -0
- opspilot_ai-0.1.0.dist-info/top_level.txt +1 -0
opspilot/__init__.py
ADDED
File without changes
opspilot/agents/fixer.py
ADDED
@@ -0,0 +1,46 @@
from typing import Dict
import json
from opspilot.utils.llm import call_llama, safe_json_parse


SYSTEM_PROMPT = """
You are a senior engineer generating SAFE, DRY-RUN fix suggestions.

Rules:
- Output ONLY valid JSON
- Do NOT include explanations outside JSON
- Do NOT apply changes
- Diffs must be strings
- If unsure, return an empty suggestions list

STRICT JSON FORMAT:
{
  "suggestions": [
    {
      "file": "path",
      "diff": "unified diff as a string",
      "rationale": "why this helps"
    }
  ]
}

Return ONLY JSON. No text before or after.
"""


def suggest(hypothesis: str, evidence: Dict) -> Dict:
    prompt = SYSTEM_PROMPT + f"""

HYPOTHESIS:
{hypothesis}

EVIDENCE:
{json.dumps(evidence, indent=2)}
"""

    try:
        raw_output = call_llama(prompt)
        return safe_json_parse(raw_output)
    except Exception:
        # Silent failure: template-based fixes will still apply
        return {"suggestions": []}
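For orientation only (this snippet is not part of the wheel), a minimal sketch of how fixer.suggest could be invoked. The hypothesis and evidence values are invented, and a working LLM backend must be reachable through call_llama for the call to return suggestions:

from opspilot.agents import fixer

# Illustrative inputs, not taken from the package.
hypothesis = "Redis connection pool exhaustion is causing request timeouts"
evidence = {"uses_redis": True, "timeout_errors": 42, "severity": "P1"}

# Returns {"suggestions": [...]} on success; {"suggestions": []} if the LLM call fails.
result = fixer.suggest(hypothesis, evidence)
for s in result.get("suggestions", []):
    print(s.get("file"), s.get("rationale"))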
opspilot/agents/planner.py
ADDED
@@ -0,0 +1,74 @@
from typing import Dict
import json
from opspilot.utils.llm import call_llama, safe_json_parse


SYSTEM_PROMPT = """
You are a senior site reliability engineer.

Your task is to analyze project context and form a hypothesis
about the root cause of runtime issues.

Rules:
- Do NOT suggest fixes
- Do NOT use tools
- Base reasoning ONLY on provided context
- If information is missing, say so
- Output STRICT JSON only

CRITICAL: Your response must be ONLY valid JSON with this exact format:
{
  "hypothesis": "...",
  "confidence": 0.0,
  "possible_causes": ["..."],
  "required_checks": ["..."]
}

Do not include any text before the opening { or after the closing }.
"""


def plan(context: Dict) -> Dict:
    """
    Planner agent:
    - Summarizes context
    - Calls LLM exactly once
    - Enforces strict JSON output
    """

    summarized_context = {
        "logs": context.get("logs", "")[:2000] if context.get("logs") else None,
        "env_vars": list(context.get("env", {}).keys()),
        "docker_present": bool(context.get("docker")),
        "dependencies": context.get("dependencies", [])[:20],
        "structure": str(context.get("structure", ""))[:1000],
    }

    user_prompt = f"""
PROJECT CONTEXT:
{json.dumps(summarized_context, indent=2)}

Analyze the context and produce a hypothesis.
"""

    full_prompt = SYSTEM_PROMPT + "\n" + user_prompt

    try:
        raw_output = call_llama(full_prompt)
        result = safe_json_parse(raw_output)

        return {
            "hypothesis": result.get("hypothesis"),
            "confidence": float(result.get("confidence", 0.0)),
            "possible_causes": result.get("possible_causes", []),
            "required_checks": result.get("required_checks", []),
        }

    except Exception:
        # Production-grade failure semantics
        return {
            "hypothesis": None,
            "confidence": 0.0,
            "possible_causes": [],
            "required_checks": [],
        }
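As a usage illustration (again, not shipped in the package), a sketch of the context dict shape that plan() reads. All values below are hypothetical, an LLM backend behind call_llama is assumed, and on any failure the function falls back to the empty-hypothesis dict shown above:

from opspilot.agents import planner

# Hypothetical project context; the keys mirror what plan() reads via context.get(...).
context = {
    "logs": "ERROR redis.exceptions.TimeoutError: Timeout reading from socket",
    "env": {"REDIS_URL": "redis://localhost:6379", "REDIS_TIMEOUT": "1"},
    "docker": "FROM python:3.11-slim",
    "dependencies": ["redis==5.0.1", "fastapi==0.110.0"],
    "structure": "app/main.py, app/config/redis.py",
}

result = planner.plan(context)
print(result["hypothesis"], result["confidence"])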
opspilot/agents/remediation.py
ADDED
@@ -0,0 +1,200 @@
"""Remediation plan generator for production incidents."""

from typing import Dict, List


def generate_remediation_plan(hypothesis: str, evidence: Dict, suggestions: List[Dict]) -> Dict:
    """
    Generate manager-friendly remediation plan with immediate, short-term, and long-term actions.

    Args:
        hypothesis: Root cause hypothesis
        evidence: Collected evidence
        suggestions: LLM-generated fix suggestions

    Returns:
        Structured remediation plan with actions, commands, risks, and time estimates
    """
    plan = {
        "immediate_actions": [],
        "short_term_fixes": [],
        "long_term_fixes": [],
        "verification_steps": []
    }

    # Determine issue type from evidence
    severity = evidence.get("severity", "P3")
    uses_redis = evidence.get("uses_redis", False)
    has_timeouts = evidence.get("timeout_errors", 0) > 0
    has_http_5xx = any(
        code.startswith('5')
        for code in evidence.get("error_patterns", {}).get("http_errors", {}).keys()
    )

    # Generate immediate actions based on severity and evidence
    if severity in ["P0", "P1"]:
        if uses_redis and has_timeouts:
            plan["immediate_actions"].extend([
                {
                    "step": 1,
                    "action": "Check Redis service status",
                    "command": "redis-cli ping || systemctl status redis",
                    "risk": "LOW",
                    "estimated_time": "30 seconds",
                    "rationale": "Verify Redis is running and responsive"
                },
                {
                    "step": 2,
                    "action": "Check Redis connection count",
                    "command": "redis-cli info clients | grep connected_clients",
                    "risk": "LOW",
                    "estimated_time": "30 seconds",
                    "rationale": "Identify if connection pool is exhausted"
                },
                {
                    "step": 3,
                    "action": "Restart application service (if safe)",
                    "command": "systemctl restart app-service",
                    "risk": "MEDIUM",
                    "estimated_time": "2 minutes",
                    "rationale": "Clear stale connections and refresh connection pool"
                }
            ])

        if has_http_5xx:
            plan["immediate_actions"].append({
                "step": len(plan["immediate_actions"]) + 1,
                "action": "Check application logs for recent errors",
                "command": "tail -100 /var/log/app/error.log",
                "risk": "LOW",
                "estimated_time": "1 minute",
                "rationale": "Identify immediate cause of 5xx errors"
            })

    # Convert LLM suggestions to short-term fixes
    for idx, suggestion in enumerate(suggestions, 1):
        plan["short_term_fixes"].append({
            "step": idx,
            "action": f"Update {suggestion.get('file', 'configuration')}",
            "file": suggestion.get("file"),
            "diff": suggestion.get("diff"),
            "rationale": suggestion.get("rationale"),
            "risk": "LOW",
            "requires_restart": True
        })

    # Add standard short-term fixes based on evidence
    if uses_redis and has_timeouts:
        plan["short_term_fixes"].extend([
            {
                "step": len(plan["short_term_fixes"]) + 1,
                "action": "Increase Redis timeout",
                "file": ".env or config file",
                "change": "REDIS_TIMEOUT=1 → REDIS_TIMEOUT=5",
                "rationale": "Reduce timeout errors under load",
                "risk": "LOW"
            },
            {
                "step": len(plan["short_term_fixes"]) + 1,
                "action": "Enable Redis connection pooling",
                "file": "app/config/redis.py",
                "change": "Add max_connections=50, socket_timeout=5",
                "rationale": "Prevent connection exhaustion",
                "risk": "LOW"
            }
        ])

    # Generate long-term fixes
    if uses_redis:
        plan["long_term_fixes"].extend([
            {
                "action": "Implement circuit breaker pattern",
                "estimated_effort": "2-3 days",
                "rationale": "Prevent cascading failures when Redis is unavailable"
            },
            {
                "action": "Add Redis connection monitoring",
                "estimated_effort": "1 day",
                "rationale": "Alert on connection pool exhaustion before it causes issues"
            },
            {
                "action": "Implement Redis failover/HA setup",
                "estimated_effort": "1 week",
                "rationale": "Ensure Redis availability with automatic failover"
            }
        ])

    # Generate verification steps
    if severity in ["P0", "P1"]:
        plan["verification_steps"] = [
            "Monitor error rate (should drop to <5/min within 5 minutes)",
            "Check Redis connection count (should stabilize below 80% of max)",
            "Verify application response time (should return to normal)",
            "Check for any new errors in logs"
        ]
    else:
        plan["verification_steps"] = [
            "Monitor error rate over next 24 hours",
            "Review application metrics for improvements",
            "Check logs for recurring patterns"
        ]

    return plan


def format_remediation_output(plan: Dict) -> str:
    """
    Format remediation plan as human-readable text.

    Args:
        plan: Remediation plan dictionary

    Returns:
        Formatted multi-line string
    """
    output = []

    if plan.get("immediate_actions"):
        output.append("\n[!] IMMEDIATE ACTIONS (0-5 min):")
        output.append("=" * 60)
        for action in plan["immediate_actions"]:
            output.append(f"\n{action['step']}. {action['action']}")
            if action.get("command"):
                output.append(f"   Command: {action['command']}")
            output.append(f"   Risk: {action['risk']} | Time: {action['estimated_time']}")
            if action.get("rationale"):
                output.append(f"   Why: {action['rationale']}")

    if plan.get("short_term_fixes"):
        output.append("\n\n[~] SHORT-TERM FIXES (1-24 hours):")
        output.append("=" * 60)
        for fix in plan["short_term_fixes"]:
            output.append(f"\n{fix['step']}. {fix['action']}")
            if fix.get("file"):
                output.append(f"   File: {fix['file']}")
            if fix.get("change"):
                output.append(f"   Change: {fix['change']}")
            elif fix.get("diff"):
                output.append(f"   Diff: [See above]")
            if fix.get("rationale"):
                output.append(f"   Why: {fix['rationale']}")
            if fix.get("risk"):
                output.append(f"   Risk: {fix['risk']}")

    if plan.get("long_term_fixes"):
        output.append("\n\n[+] LONG-TERM FIXES (1-4 weeks):")
        output.append("=" * 60)
        for idx, fix in enumerate(plan["long_term_fixes"], 1):
            output.append(f"\n{idx}. {fix['action']}")
            if fix.get("estimated_effort"):
                output.append(f"   Effort: {fix['estimated_effort']}")
            if fix.get("rationale"):
                output.append(f"   Why: {fix['rationale']}")

    if plan.get("verification_steps"):
        output.append("\n\n[v] VERIFICATION STEPS:")
        output.append("=" * 60)
        for idx, step in enumerate(plan["verification_steps"], 1):
            output.append(f"{idx}. {step}")

    return "\n".join(output)
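Unlike the other agents, this module makes no LLM call, so the plan generator can be exercised directly. A small sketch with invented evidence (the key names match what generate_remediation_plan reads above; it is not part of the package):

from opspilot.agents.remediation import generate_remediation_plan, format_remediation_output

# Invented evidence plus one invented suggestion shaped like fixer.suggest output.
evidence = {
    "severity": "P1",
    "uses_redis": True,
    "timeout_errors": 17,
    "error_patterns": {"http_errors": {"500": 120, "404": 3}},
}
suggestions = [
    {"file": ".env", "diff": "-REDIS_TIMEOUT=1\n+REDIS_TIMEOUT=5", "rationale": "Raise Redis timeout"}
]

plan = generate_remediation_plan("Redis timeouts under load", evidence, suggestions)
print(format_remediation_output(plan))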
opspilot/agents/verifier.py
ADDED
@@ -0,0 +1,67 @@
from typing import Dict
import json
from opspilot.utils.llm import call_llama, safe_json_parse

SYSTEM_PROMPT = """
You are a senior reliability engineer.

You are given:
- A hypothesis
- Evidence collected from tools

Your task:
- Decide if the hypothesis is supported
- Explain briefly
- Update confidence
- Output STRICT JSON only

Format:
{
  "supported": true/false,
  "confidence": 0.0,
  "reason": "..."
}
"""


def verify(hypothesis: str, evidence: Dict) -> Dict:
    prompt = SYSTEM_PROMPT + f"""
HYPOTHESIS:
{hypothesis}

EVIDENCE:
{json.dumps(evidence, indent=2)}
"""

    try:
        # Use multi-provider LLM system with automatic fallback
        raw_output = call_llama(prompt, timeout=120)

        if not raw_output:
            print(f"[ERROR] Verifier LLM returned empty response")
            return {
                "supported": False,
                "confidence": 0.0,
                "reason": "LLM verification failed"
            }

        # Parse JSON response using safe parser
        result = safe_json_parse(raw_output)

        if result and "supported" in result:
            return result
        else:
            print(f"[ERROR] Invalid verifier response format: {raw_output}")
            return {
                "supported": False,
                "confidence": 0.0,
                "reason": "Unable to verify hypothesis"
            }

    except Exception as e:
        print(f"[ERROR] Verifier failed: {e}")
        return {
            "supported": False,
            "confidence": 0.0,
            "reason": f"Verification error: {str(e)}"
        }
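The four agent modules shown above appear designed to be chained. The package's actual orchestration lives in opspilot/graph/ and opspilot/cli.py, which are not included in this excerpt, so the following end-to-end sketch is only an assumption about how the pieces fit together; all input values are hypothetical and an LLM backend behind call_llama is required:

from opspilot.agents import planner, fixer, verifier
from opspilot.agents.remediation import generate_remediation_plan, format_remediation_output

# Hypothetical inputs; a real run would gather these from opspilot/context/.
context = {"logs": "ERROR: upstream timed out", "env": {}, "dependencies": []}
evidence = {"severity": "P1", "uses_redis": True, "timeout_errors": 5, "error_patterns": {}}

planned = planner.plan(context)
if planned["hypothesis"]:
    check = verifier.verify(planned["hypothesis"], evidence)
    if check.get("supported"):
        fixes = fixer.suggest(planned["hypothesis"], evidence)
        plan = generate_remediation_plan(
            planned["hypothesis"], evidence, fixes.get("suggestions", [])
        )
        print(format_remediation_output(plan))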