opspilot-ai 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
opspilot/__init__.py ADDED
File without changes
@@ -0,0 +1,46 @@
1
+ from typing import Dict
2
+ import json
3
+ from opspilot.utils.llm import call_llama, safe_json_parse
4
+
5
+
6
+ SYSTEM_PROMPT = """
7
+ You are a senior engineer generating SAFE, DRY-RUN fix suggestions.
8
+
9
+ Rules:
10
+ - Output ONLY valid JSON
11
+ - Do NOT include explanations outside JSON
12
+ - Do NOT apply changes
13
+ - Diffs must be strings
14
+ - If unsure, return an empty suggestions list
15
+
16
+ STRICT JSON FORMAT:
17
+ {
18
+ "suggestions": [
19
+ {
20
+ "file": "path",
21
+ "diff": "unified diff as a string",
22
+ "rationale": "why this helps"
23
+ }
24
+ ]
25
+ }
26
+
27
+ Return ONLY JSON. No text before or after.
28
+ """
29
+
30
+
31
def suggest(hypothesis: str, evidence: Dict) -> Dict:
    """
    Fixer agent: ask the LLM for SAFE, dry-run fix suggestions.

    Args:
        hypothesis: Root-cause hypothesis to generate fixes for.
        evidence: Evidence dictionary collected from tools.

    Returns:
        Dict with a "suggestions" list (possibly empty); per SYSTEM_PROMPT each
        suggestion carries "file", "diff" and "rationale" keys.
    """
    prompt = SYSTEM_PROMPT + f"""

HYPOTHESIS:
{hypothesis}

EVIDENCE:
{json.dumps(evidence, indent=2)}
"""

    try:
        raw_output = call_llama(prompt)
        return safe_json_parse(raw_output)
    except Exception as e:
        # Best-effort: log the failure (consistent with the verifier agent)
        # but do not crash -- template-based fixes will still apply downstream.
        print(f"[ERROR] Fixer failed: {e}")
        return {"suggestions": []}
@@ -0,0 +1,74 @@
1
+ from typing import Dict
2
+ import json
3
+ from opspilot.utils.llm import call_llama, safe_json_parse
4
+
5
+
6
+ SYSTEM_PROMPT = """
7
+ You are a senior site reliability engineer.
8
+
9
+ Your task is to analyze project context and form a hypothesis
10
+ about the root cause of runtime issues.
11
+
12
+ Rules:
13
+ - Do NOT suggest fixes
14
+ - Do NOT use tools
15
+ - Base reasoning ONLY on provided context
16
+ - If information is missing, say so
17
+ - Output STRICT JSON only
18
+
19
+ CRITICAL: Your response must be ONLY valid JSON with this exact format:
20
+ {
21
+ "hypothesis": "...",
22
+ "confidence": 0.0,
23
+ "possible_causes": ["..."],
24
+ "required_checks": ["..."]
25
+ }
26
+
27
+ Do not include any text before the opening { or after the closing }.
28
+ """
29
+
30
+
31
def plan(context: Dict) -> Dict:
    """
    Planner agent:
    - Summarizes context (truncated to keep the prompt small)
    - Calls LLM exactly once
    - Enforces strict JSON output

    Args:
        context: Raw project context (logs, env, docker, dependencies, structure).

    Returns:
        Dict with "hypothesis", "confidence", "possible_causes" and
        "required_checks"; a safe empty result is returned on any failure so
        callers always receive the full shape.
    """
    # Truncate free-form fields so the prompt stays within model limits.
    summarized_context = {
        "logs": context.get("logs", "")[:2000] if context.get("logs") else None,
        "env_vars": list(context.get("env", {}).keys()),
        "docker_present": bool(context.get("docker")),
        "dependencies": context.get("dependencies", [])[:20],
        "structure": str(context.get("structure", ""))[:1000],
    }

    user_prompt = f"""
PROJECT CONTEXT:
{json.dumps(summarized_context, indent=2)}

Analyze the context and produce a hypothesis.
"""

    full_prompt = SYSTEM_PROMPT + "\n" + user_prompt

    try:
        raw_output = call_llama(full_prompt)
        result = safe_json_parse(raw_output)

        # Re-shape explicitly so missing keys become safe defaults and
        # confidence is always a float.
        return {
            "hypothesis": result.get("hypothesis"),
            "confidence": float(result.get("confidence", 0.0)),
            "possible_causes": result.get("possible_causes", []),
            "required_checks": result.get("required_checks", []),
        }

    except Exception as e:
        # Log the failure (consistent with the verifier agent) but degrade
        # gracefully: downstream code relies on this exact result shape.
        print(f"[ERROR] Planner failed: {e}")
        return {
            "hypothesis": None,
            "confidence": 0.0,
            "possible_causes": [],
            "required_checks": [],
        }
@@ -0,0 +1,200 @@
1
+ """Remediation plan generator for production incidents."""
2
+
3
+ from typing import Dict, List
4
+
5
+
6
def generate_remediation_plan(hypothesis: str, evidence: Dict, suggestions: List[Dict]) -> Dict:
    """
    Generate manager-friendly remediation plan with immediate, short-term, and long-term actions.

    Args:
        hypothesis: Root cause hypothesis
        evidence: Collected evidence
        suggestions: LLM-generated fix suggestions

    Returns:
        Structured remediation plan with actions, commands, risks, and time estimates
    """
    plan = {
        "immediate_actions": [],
        "short_term_fixes": [],
        "long_term_fixes": [],
        "verification_steps": []
    }

    # Determine issue type from evidence
    severity = evidence.get("severity", "P3")
    uses_redis = evidence.get("uses_redis", False)
    has_timeouts = evidence.get("timeout_errors", 0) > 0
    # str() guards against http_errors keyed by int status codes.
    has_http_5xx = any(
        str(code).startswith('5')
        for code in evidence.get("error_patterns", {}).get("http_errors", {}).keys()
    )

    # Generate immediate actions based on severity and evidence
    if severity in ["P0", "P1"]:
        if uses_redis and has_timeouts:
            plan["immediate_actions"].extend([
                {
                    "step": 1,
                    "action": "Check Redis service status",
                    "command": "redis-cli ping || systemctl status redis",
                    "risk": "LOW",
                    "estimated_time": "30 seconds",
                    "rationale": "Verify Redis is running and responsive"
                },
                {
                    "step": 2,
                    "action": "Check Redis connection count",
                    "command": "redis-cli info clients | grep connected_clients",
                    "risk": "LOW",
                    "estimated_time": "30 seconds",
                    "rationale": "Identify if connection pool is exhausted"
                },
                {
                    "step": 3,
                    "action": "Restart application service (if safe)",
                    "command": "systemctl restart app-service",
                    "risk": "MEDIUM",
                    "estimated_time": "2 minutes",
                    "rationale": "Clear stale connections and refresh connection pool"
                }
            ])

        if has_http_5xx:
            plan["immediate_actions"].append({
                "step": len(plan["immediate_actions"]) + 1,
                "action": "Check application logs for recent errors",
                "command": "tail -100 /var/log/app/error.log",
                "risk": "LOW",
                "estimated_time": "1 minute",
                "rationale": "Identify immediate cause of 5xx errors"
            })

    # Convert LLM suggestions to short-term fixes
    for idx, suggestion in enumerate(suggestions, 1):
        plan["short_term_fixes"].append({
            "step": idx,
            "action": f"Update {suggestion.get('file', 'configuration')}",
            "file": suggestion.get("file"),
            "diff": suggestion.get("diff"),
            "rationale": suggestion.get("rationale"),
            "risk": "LOW",
            "requires_restart": True
        })

    # Add standard short-term fixes based on evidence
    if uses_redis and has_timeouts:
        # BUG FIX: capture the length BEFORE building the list literal.
        # Previously both dicts evaluated len(...) + 1 pre-extend, producing
        # duplicate step numbers.
        base_step = len(plan["short_term_fixes"])
        plan["short_term_fixes"].extend([
            {
                "step": base_step + 1,
                "action": "Increase Redis timeout",
                "file": ".env or config file",
                "change": "REDIS_TIMEOUT=1 → REDIS_TIMEOUT=5",
                "rationale": "Reduce timeout errors under load",
                "risk": "LOW"
            },
            {
                "step": base_step + 2,
                "action": "Enable Redis connection pooling",
                "file": "app/config/redis.py",
                "change": "Add max_connections=50, socket_timeout=5",
                "rationale": "Prevent connection exhaustion",
                "risk": "LOW"
            }
        ])

    # Generate long-term fixes
    if uses_redis:
        plan["long_term_fixes"].extend([
            {
                "action": "Implement circuit breaker pattern",
                "estimated_effort": "2-3 days",
                "rationale": "Prevent cascading failures when Redis is unavailable"
            },
            {
                "action": "Add Redis connection monitoring",
                "estimated_effort": "1 day",
                "rationale": "Alert on connection pool exhaustion before it causes issues"
            },
            {
                "action": "Implement Redis failover/HA setup",
                "estimated_effort": "1 week",
                "rationale": "Ensure Redis availability with automatic failover"
            }
        ])

    # Generate verification steps: tighter monitoring for high-severity incidents
    if severity in ["P0", "P1"]:
        plan["verification_steps"] = [
            "Monitor error rate (should drop to <5/min within 5 minutes)",
            "Check Redis connection count (should stabilize below 80% of max)",
            "Verify application response time (should return to normal)",
            "Check for any new errors in logs"
        ]
    else:
        plan["verification_steps"] = [
            "Monitor error rate over next 24 hours",
            "Review application metrics for improvements",
            "Check logs for recurring patterns"
        ]

    return plan
143
+
144
+
145
def format_remediation_output(plan: Dict) -> str:
    """
    Render a remediation plan as a human-readable, sectioned report.

    Args:
        plan: Remediation plan dictionary

    Returns:
        Formatted multi-line string
    """
    lines = []
    add = lines.append
    divider = "=" * 60

    immediate = plan.get("immediate_actions")
    if immediate:
        add("\n[!] IMMEDIATE ACTIONS (0-5 min):")
        add(divider)
        for item in immediate:
            add(f"\n{item['step']}. {item['action']}")
            command = item.get("command")
            if command:
                add(f" Command: {command}")
            add(f" Risk: {item['risk']} | Time: {item['estimated_time']}")
            why = item.get("rationale")
            if why:
                add(f" Why: {why}")

    short_term = plan.get("short_term_fixes")
    if short_term:
        add("\n\n[~] SHORT-TERM FIXES (1-24 hours):")
        add(divider)
        for item in short_term:
            add(f"\n{item['step']}. {item['action']}")
            target = item.get("file")
            if target:
                add(f" File: {target}")
            change = item.get("change")
            if change:
                add(f" Change: {change}")
            elif item.get("diff"):
                add(" Diff: [See above]")
            why = item.get("rationale")
            if why:
                add(f" Why: {why}")
            risk = item.get("risk")
            if risk:
                add(f" Risk: {risk}")

    long_term = plan.get("long_term_fixes")
    if long_term:
        add("\n\n[+] LONG-TERM FIXES (1-4 weeks):")
        add(divider)
        for number, item in enumerate(long_term, 1):
            add(f"\n{number}. {item['action']}")
            effort = item.get("estimated_effort")
            if effort:
                add(f" Effort: {effort}")
            why = item.get("rationale")
            if why:
                add(f" Why: {why}")

    checks = plan.get("verification_steps")
    if checks:
        add("\n\n[v] VERIFICATION STEPS:")
        add(divider)
        for number, step in enumerate(checks, 1):
            add(f"{number}. {step}")

    return "\n".join(lines)
@@ -0,0 +1,67 @@
1
+ from typing import Dict
2
+ import json
3
+ from opspilot.utils.llm import call_llama, safe_json_parse
4
+
5
+ SYSTEM_PROMPT = """
6
+ You are a senior reliability engineer.
7
+
8
+ You are given:
9
+ - A hypothesis
10
+ - Evidence collected from tools
11
+
12
+ Your task:
13
+ - Decide if the hypothesis is supported
14
+ - Explain briefly
15
+ - Update confidence
16
+ - Output STRICT JSON only
17
+
18
+ Format:
19
+ {
20
+ "supported": true/false,
21
+ "confidence": 0.0,
22
+ "reason": "..."
23
+ }
24
+ """
25
+
26
+
27
def verify(hypothesis: str, evidence: Dict) -> Dict:
    """
    Verifier agent: ask the LLM whether the evidence supports the hypothesis.

    Args:
        hypothesis: Root-cause hypothesis under test.
        evidence: Evidence dictionary collected from tools.

    Returns:
        Dict with "supported" (bool), "confidence" (float) and "reason" (str);
        a conservative not-supported result on any failure.
    """
    prompt = SYSTEM_PROMPT + f"""
HYPOTHESIS:
{hypothesis}

EVIDENCE:
{json.dumps(evidence, indent=2)}
"""

    try:
        # Use multi-provider LLM system with automatic fallback
        raw_output = call_llama(prompt, timeout=120)

        if not raw_output:
            # Plain string: nothing to interpolate (was a placeholder-less f-string).
            print("[ERROR] Verifier LLM returned empty response")
            return {
                "supported": False,
                "confidence": 0.0,
                "reason": "LLM verification failed"
            }

        # Parse JSON response using safe parser
        result = safe_json_parse(raw_output)

        if result and "supported" in result:
            return result

        print(f"[ERROR] Invalid verifier response format: {raw_output}")
        return {
            "supported": False,
            "confidence": 0.0,
            "reason": "Unable to verify hypothesis"
        }

    except Exception as e:
        print(f"[ERROR] Verifier failed: {e}")
        return {
            "supported": False,
            "confidence": 0.0,
            "reason": f"Verification error: {str(e)}"
        }