devops-ai-agent 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent/__init__.py +1 -0
- agent/classifier.py +135 -0
- agent/core.py +658 -0
- agent/prompts.py +150 -0
- api/__init__.py +1 -0
- api/server.py +222 -0
- collectors/__init__.py +0 -0
- collectors/argocd.py +238 -0
- collectors/aws.py +741 -0
- collectors/azure.py +647 -0
- collectors/azure_devops.py +94 -0
- collectors/bamboo.py +98 -0
- collectors/cloud_registry.py +103 -0
- collectors/database_policy.py +72 -0
- collectors/docker.py +7 -0
- collectors/gcp.py +508 -0
- collectors/github.py +81 -0
- collectors/gitlab.py +81 -0
- collectors/jenkins.py +110 -0
- collectors/k8s.py +164 -0
- collectors/security_scanner.py +343 -0
- collectors/server.py +40 -0
- collectors/server_enhanced.py +265 -0
- devops_agent/__init__.py +3 -0
- devops_agent/cli.py +64 -0
- devops_ai_agent-1.0.0.dist-info/METADATA +952 -0
- devops_ai_agent-1.0.0.dist-info/RECORD +43 -0
- devops_ai_agent-1.0.0.dist-info/WHEEL +5 -0
- devops_ai_agent-1.0.0.dist-info/entry_points.txt +2 -0
- devops_ai_agent-1.0.0.dist-info/licenses/LICENSE +21 -0
- devops_ai_agent-1.0.0.dist-info/top_level.txt +5 -0
- tools/__init__.py +1 -0
- tools/argocd_tools.py +111 -0
- tools/cicd_tools.py +301 -0
- tools/cloud_tools.py +337 -0
- tools/documentation_generator.py +452 -0
- tools/email_notifier.py +453 -0
- tools/executor.py +142 -0
- tools/fix_verifier.py +349 -0
- tools/github_tools.py +107 -0
- tools/k8s_tools.py +123 -0
- tools/notify.py +133 -0
- tools/safe_executor_enhanced.py +423 -0
agent/core.py
ADDED
|
@@ -0,0 +1,658 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DevOps AI Agent — Core Agent Loop
|
|
3
|
+
Uses Claude's tool_use to reason, decide, and act on infrastructure incidents.
|
|
4
|
+
"""
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import anthropic
|
|
10
|
+
import structlog
|
|
11
|
+
|
|
12
|
+
from agent.classifier import classify_issue, get_cicd_platform
|
|
13
|
+
from agent.prompts import get_system_prompt
|
|
14
|
+
from collectors.k8s import K8sCollector
|
|
15
|
+
from collectors.github import GitHubCollector
|
|
16
|
+
from collectors.gitlab import GitLabCollector
|
|
17
|
+
from collectors.jenkins import JenkinsCollector
|
|
18
|
+
from collectors.bamboo import BambooCollector
|
|
19
|
+
from collectors.azure_devops import AzureDevOpsCollector
|
|
20
|
+
from collectors.argocd import ArgoCDCollector
|
|
21
|
+
from collectors.aws import AWSCollector
|
|
22
|
+
from collectors.gcp import GCPCollector
|
|
23
|
+
from collectors.azure import AzureCollector
|
|
24
|
+
from collectors.server import ServerCollector
|
|
25
|
+
from tools.executor import SafeExecutor
|
|
26
|
+
from tools.k8s_tools import K8sTools
|
|
27
|
+
from tools.github_tools import GitHubTools
|
|
28
|
+
from tools.cicd_tools import CICDTools
|
|
29
|
+
from tools.argocd_tools import ArgoCDTools
|
|
30
|
+
from tools.cloud_tools import CloudTools
|
|
31
|
+
from tools.notify import SlackNotifier
|
|
32
|
+
from collectors.database_policy import check_database_access
|
|
33
|
+
|
|
34
|
+
# Module-level structlog logger; the agent methods below emit their events through it.
log = structlog.get_logger()
|
|
35
|
+
|
|
36
|
+
# Tool specifications handed to the model on every `messages.create` call.
# Each entry has a `name`, a `description`, and a JSON-Schema `input_schema`
# describing the arguments the model may supply. The names here must match the
# branches in `DevOpsAgent._execute_tool`, which routes each call by name.
AGENT_TOOLS = [
    # ─── Kubernetes / GitHub / server tools ───────────────────────────────────
    {
        "name": "get_k8s_context",
        "description": "Fetch Kubernetes pod logs, events, describe output, and resource usage for a failing pod.",
        "input_schema": {
            "type": "object",
            "properties": {
                "namespace": {"type": "string", "description": "K8s namespace"},
                "pod_name": {"type": "string", "description": "Pod name or prefix"},
                "include_previous": {"type": "boolean", "description": "Include logs from previous crashed container"},
            },
            "required": ["namespace"],
        },
    },
    {
        "name": "get_github_logs",
        "description": "Fetch the full failed job logs from a GitHub Actions workflow run.",
        "input_schema": {
            "type": "object",
            "properties": {
                "repo": {"type": "string", "description": "owner/repo"},
                "run_id": {"type": "integer", "description": "Workflow run ID"},
            },
            "required": ["repo", "run_id"],
        },
    },
    {
        "name": "apply_k8s_manifest",
        "description": "Apply a fixed Kubernetes YAML manifest. Always dry_run=true first.",
        "input_schema": {
            "type": "object",
            "properties": {
                "manifest_yaml": {"type": "string", "description": "Complete YAML manifest"},
                "dry_run": {"type": "boolean", "description": "If true, validate only (don't apply). Default true."},
                "namespace": {"type": "string"},
            },
            "required": ["manifest_yaml"],
        },
    },
    {
        "name": "run_kubectl",
        "description": "Run a safe kubectl command (restart, scale, rollout). No delete commands.",
        "input_schema": {
            "type": "object",
            "properties": {
                "command": {"type": "string", "description": "kubectl command without 'kubectl' prefix, e.g. 'rollout restart deployment/api -n production'"},
            },
            "required": ["command"],
        },
    },
    {
        "name": "run_shell_command",
        "description": "Run a safe server remediation command (systemctl, nginx, df, ps). No rm or destructive commands.",
        "input_schema": {
            "type": "object",
            "properties": {
                "command": {"type": "string", "description": "Shell command to execute"},
                "host": {"type": "string", "description": "Target host (optional, defaults to localhost)"},
            },
            "required": ["command"],
        },
    },
    {
        "name": "create_github_pr",
        "description": "Create a GitHub PR with a config/Dockerfile fix applied to a branch.",
        "input_schema": {
            "type": "object",
            "properties": {
                "repo": {"type": "string"},
                "file_path": {"type": "string", "description": "File to update e.g. '.github/workflows/deploy.yml'"},
                "new_content": {"type": "string", "description": "Full new file content"},
                "pr_title": {"type": "string"},
                "pr_body": {"type": "string"},
            },
            "required": ["repo", "file_path", "new_content", "pr_title", "pr_body"],
        },
    },
    {
        "name": "rollback_deployment",
        "description": "Roll back a Kubernetes deployment to the previous revision.",
        "input_schema": {
            "type": "object",
            "properties": {
                "deployment": {"type": "string"},
                "namespace": {"type": "string"},
                "revision": {"type": "integer", "description": "Specific revision number, or omit for previous"},
            },
            "required": ["deployment", "namespace"],
        },
    },
    {
        "name": "notify_slack",
        "description": "Send a message to Slack with diagnosis and actions taken.",
        "input_schema": {
            "type": "object",
            "properties": {
                "message": {"type": "string", "description": "Human-readable summary"},
                "severity": {"type": "string", "enum": ["info", "warning", "critical"]},
                "resolved": {"type": "boolean"},
                "requires_approval": {"type": "boolean"},
                "approval_command": {"type": "string", "description": "Command that needs human approval"},
            },
            "required": ["message", "severity"],
        },
    },
    # ─── Multi-platform CI/CD Tools ───────────────────────────────────────────
    {
        "name": "get_cicd_logs",
        "description": "Fetch CI/CD pipeline/build logs. Supports: GitHub Actions, GitLab CI, Jenkins, Bamboo, Azure DevOps.",
        "input_schema": {
            "type": "object",
            "properties": {
                "platform": {"type": "string", "enum": ["github", "gitlab", "jenkins", "bamboo", "azure_devops"]},
                "project_id": {"type": "string", "description": "Project/repo/job identifier"},
                "pipeline_id": {"type": "string", "description": "Pipeline/build/run ID"},
                "additional_params": {"type": "object", "description": "Platform-specific params (e.g., zone, cluster)"},
            },
            "required": ["platform", "project_id"],
        },
    },
    {
        "name": "retry_cicd_pipeline",
        "description": "Retry a failed CI/CD pipeline. Supports: GitLab, Jenkins, Bamboo, Azure DevOps.",
        "input_schema": {
            "type": "object",
            "properties": {
                "platform": {"type": "string", "enum": ["gitlab", "jenkins", "bamboo", "azure_devops"]},
                "project_id": {"type": "string", "description": "Project/job identifier"},
                "pipeline_id": {"type": "string", "description": "Pipeline/build ID"},
                "additional_params": {"type": "object", "description": "Platform-specific params"},
            },
            "required": ["platform", "project_id"],
        },
    },
    {
        "name": "create_cicd_pr",
        "description": "Create a PR/MR with a CI/CD config fix. Supports: GitHub (via create_github_pr), GitLab, Azure DevOps.",
        "input_schema": {
            "type": "object",
            "properties": {
                "platform": {"type": "string", "enum": ["github", "gitlab", "azure_devops"]},
                "repo": {"type": "string", "description": "Repository identifier"},
                "file_path": {"type": "string"},
                "new_content": {"type": "string"},
                "pr_title": {"type": "string"},
                "pr_body": {"type": "string"},
                "additional_params": {"type": "object"},
            },
            "required": ["platform", "repo", "file_path", "new_content", "pr_title", "pr_body"],
        },
    },
    # ─── ArgoCD Tools ─────────────────────────────────────────────────────────
    {
        "name": "get_argocd_status",
        "description": "Get ArgoCD application status, health, sync status, and resource details.",
        "input_schema": {
            "type": "object",
            "properties": {
                "app_name": {"type": "string", "description": "ArgoCD application name"},
            },
            "required": ["app_name"],
        },
    },
    {
        "name": "sync_argocd_app",
        "description": "Sync an ArgoCD application. Always use dry_run=true first to preview changes.",
        "input_schema": {
            "type": "object",
            "properties": {
                "app_name": {"type": "string"},
                "prune": {"type": "boolean", "description": "Remove resources not in Git. Default false."},
                "dry_run": {"type": "boolean", "description": "Preview only. Default true."},
            },
            "required": ["app_name"],
        },
    },
    {
        "name": "rollback_argocd_app",
        "description": "Rollback an ArgoCD application to a previous git revision.",
        "input_schema": {
            "type": "object",
            "properties": {
                "app_name": {"type": "string"},
                "revision": {"type": "string", "description": "Git commit SHA. If omitted, rolls back to previous revision."},
            },
            "required": ["app_name"],
        },
    },
    {
        "name": "get_argocd_history",
        "description": "Get deployment history for an ArgoCD application.",
        "input_schema": {
            "type": "object",
            "properties": {
                "app_name": {"type": "string"},
                "limit": {"type": "integer", "description": "Max history items. Default 10."},
            },
            "required": ["app_name"],
        },
    },
    # ─── Cloud Provider Tools ─────────────────────────────────────────────────
    {
        "name": "get_cloud_resource",
        "description": (
            "Get diagnostic info for cloud resources. Supports AWS, GCP, Azure compute, "
            "containers, K8s (EKS/GKE/AKS), load balancers, and more. "
            "Database resources (RDS, Cloud SQL, Azure SQL, Redis, DynamoDB) are OPTIONAL "
            "and disabled by default (ENABLE_DATABASE_COLLECTION=false) for security."
        ),
        "input_schema": {
            "type": "object",
            "properties": {
                "cloud": {"type": "string", "enum": ["aws", "gcp", "azure"]},
                "resource_type": {"type": "string", "description": "e.g., 'ec2', 'ecs', 'gce', 'vm', 'cloud_run'"},
                "resource_id": {"type": "string", "description": "Instance ID, name, etc."},
                "additional_params": {"type": "object", "description": "Cloud-specific params (region, zone, resource_group, cluster)"},
            },
            "required": ["cloud", "resource_type", "resource_id"],
        },
    },
    {
        "name": "restart_cloud_resource",
        "description": "Restart a cloud instance or service (safe operation). Supports: AWS EC2/ECS, GCP GCE/Cloud Run, Azure VM/App Service/Function.",
        "input_schema": {
            "type": "object",
            "properties": {
                "cloud": {"type": "string", "enum": ["aws", "gcp", "azure"]},
                "resource_type": {"type": "string", "description": "e.g., 'ec2', 'ecs', 'gce', 'vm', 'cloud_run', 'app_service', 'function'"},
                "resource_id": {"type": "string"},
                "additional_params": {"type": "object"},
            },
            "required": ["cloud", "resource_type", "resource_id"],
        },
    },
    {
        "name": "scale_cloud_service",
        "description": "Scale a cloud service. Supports: AWS ECS, Azure App Service. (GCP Cloud Run autoscales)",
        "input_schema": {
            "type": "object",
            "properties": {
                "cloud": {"type": "string", "enum": ["aws", "gcp", "azure"]},
                "service_type": {"type": "string"},
                "service_id": {"type": "string"},
                "desired_count": {"type": "integer", "description": "Target instance count (minimum 1)"},
                "additional_params": {"type": "object"},
            },
            "required": ["cloud", "service_type", "service_id", "desired_count"],
        },
    },
]
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
class DevOpsAgent:
    """Incident-response agent built around a Claude tool-use loop.

    `run` enriches an incident context with collector data, sends it to the
    model together with `AGENT_TOOLS`, executes every tool call the model
    requests through `_execute_tool`, and feeds the results back until the
    model ends its turn or the step budget is exhausted.

    Configuration is read from the environment in `__init__`:
      * ANTHROPIC_API_KEY (required)
      * ANTHROPIC_MODEL   (optional, defaults to `DEFAULT_MODEL`)
      * MAX_AGENT_STEPS   (optional, default 10)
      * AUTO_APPLY        (optional, "true" enables it; default false)
    """

    # Model id used when ANTHROPIC_MODEL is not set.
    DEFAULT_MODEL = "claude-sonnet-4-20250514"

    # Maps the incident `type` values handled by `_collect_context` to the
    # cloud identifier used by `check_database_access` and the collectors.
    _CLOUD_ISSUE_TYPES = {"cloud_aws": "aws", "cloud_gcp": "gcp", "cloud_azure": "azure"}

    def __init__(self):
        """Build the API client, tools, notifier and collectors.

        Raises:
            KeyError: if ANTHROPIC_API_KEY is not present in the environment.
        """
        self.client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
        self.executor = SafeExecutor()

        # K8s and existing tools
        self.k8s_tools = K8sTools()
        self.github_tools = GitHubTools()

        # New CI/CD and cloud tools
        self.cicd_tools = CICDTools()
        self.argocd_tools = ArgoCDTools()
        self.cloud_tools = CloudTools()

        self.notifier = SlackNotifier()

        # Existing collectors
        self.k8s_collector = K8sCollector()
        self.github_collector = GitHubCollector()
        self.server_collector = ServerCollector()

        # New CI/CD collectors
        self.gitlab_collector = GitLabCollector()
        self.jenkins_collector = JenkinsCollector()
        self.bamboo_collector = BambooCollector()
        self.azure_devops_collector = AzureDevOpsCollector()

        # ArgoCD collector
        self.argocd_collector = ArgoCDCollector()

        # Cloud collectors
        self.aws_collector = AWSCollector()
        self.gcp_collector = GCPCollector()
        self.azure_collector = AzureCollector()

        self.model = os.getenv("ANTHROPIC_MODEL", self.DEFAULT_MODEL)
        self.max_steps = int(os.getenv("MAX_AGENT_STEPS", "10"))
        self.auto_apply = os.getenv("AUTO_APPLY", "false").lower() == "true"
        # NOTE(review): nothing in this class reads or writes this mapping.
        self._pending_approvals: dict[str, str] = {}

    async def run(self, context: dict) -> dict:
        """Main agent loop: collect context → reason → act → return result.

        Args:
            context: Incident description. `type` selects the system prompt
                (defaulting to "server") and which collector enriches it.

        Returns:
            A dict with `resolved`, `diagnosis`, `actions`, `steps` and
            `reasoning`. `resolved` is True only when the model ended its turn
            within `max_steps`.
        """
        # Local import so this block does not depend on the module import list.
        import asyncio

        log.info("Agent starting", type=context.get("type"), source=context.get("source"))

        # Enrich context with collected data
        full_context = await self._collect_context(context)

        # Build initial message
        issue_type = context.get("type", "server")
        system_prompt = get_system_prompt(issue_type)

        # default=str keeps serialisation from raising on values json cannot
        # encode natively (collector payloads are not guaranteed JSON-clean).
        context_json = json.dumps(full_context, indent=2, default=str)
        messages = [
            {
                "role": "user",
                "content": (
                    "Incident detected. Diagnose and fix this:\n\n"
                    f"```json\n{context_json}\n```\n\n"
                    "Use tools to gather more context if needed, then apply the fix. "
                    "Always notify Slack with what you found and what you did."
                ),
            }
        ]

        actions_taken = []
        steps = 0
        unexpected_stop = None

        # Agentic loop
        while steps < self.max_steps:
            steps += 1
            log.info("Agent step", step=steps)

            # The SDK client is synchronous; run it in a worker thread so the
            # event loop is not blocked for the duration of the API call.
            response = await asyncio.to_thread(
                self.client.messages.create,
                model=self.model,
                max_tokens=4096,
                system=system_prompt,
                tools=AGENT_TOOLS,
                messages=messages,
            )

            # Collect assistant message
            messages.append({"role": "assistant", "content": response.content})

            if response.stop_reason == "end_turn":
                # Done — the first text block is used as the final answer
                final_text = next(
                    (b.text for b in response.content if hasattr(b, "text")), ""
                )
                return {
                    "resolved": True,
                    "diagnosis": final_text,
                    "actions": actions_taken,
                    "steps": steps,
                    "reasoning": final_text,
                }

            if response.stop_reason == "tool_use":
                tool_results = []
                for block in response.content:
                    if block.type != "tool_use":
                        continue

                    log.info("Tool call", tool=block.name, input=block.input)
                    result = await self._execute_tool(block.name, block.input, context)
                    actions_taken.append({"tool": block.name, "input": block.input, "result": result})

                    tool_results.append({
                        "type": "tool_result",
                        "tool_use_id": block.id,
                        "content": json.dumps(result, default=str),
                    })

                messages.append({"role": "user", "content": tool_results})
                continue

            # Any other stop reason ends the loop; remember it so the result
            # does not misreport it as an exhausted step budget.
            unexpected_stop = response.stop_reason
            log.warning("Unexpected stop reason", stop_reason=unexpected_stop)
            break

        if unexpected_stop is None:
            diagnosis = "Max steps reached without resolution"
        else:
            diagnosis = f"Agent stopped early: unexpected stop reason '{unexpected_stop}'"

        return {
            "resolved": False,
            "diagnosis": diagnosis,
            "actions": actions_taken,
            "steps": steps,
            "reasoning": diagnosis,
        }

    async def execute_approved_action(self, incident_id: str, command: str):
        """Execute a command that was approved via Slack and report the result.

        NOTE(review): the command is executed exactly as given; nothing visible
        here checks it against `self._pending_approvals` — confirm the caller
        validates the approval.
        """
        log.info("Executing approved action", incident_id=incident_id, command=command)
        result = await self.executor.run(command)
        await self.notifier.send_message(
            f"✅ Approved action executed for `{incident_id}`:\n```{command}```\nResult: {result.get('stdout', '')}"
        )

    def _cloud_collector(self, cloud: str):
        """Return the collector for `cloud` ("aws", "gcp", "azure"), or None."""
        return {
            "aws": self.aws_collector,
            "gcp": self.gcp_collector,
            "azure": self.azure_collector,
        }.get(cloud)

    async def _collect_cloud(self, cloud: str, resource_type: str, resource_id: str, params: dict) -> dict:
        """Collect diagnostics for one cloud resource.

        Returns whatever `check_database_access` returns when that is truthy,
        without calling a collector; otherwise the collector's result, or an
        error dict when `cloud` is not one of the supported providers.
        """
        blocked = check_database_access(resource_type, cloud=cloud)
        if blocked:
            return blocked

        collector = self._cloud_collector(cloud)
        if collector is None:
            return {"error": f"Unsupported cloud: {cloud}"}
        return await collector.collect(resource_type, resource_id, **params)

    async def _collect_context(self, context: dict) -> dict:
        """Enrich the incident context with collected data.

        Works on a shallow copy of `context`. Any exception raised while
        collecting is logged and stored under `collection_error`, so the agent
        can still run with partial data.

        NOTE(review): a missing `type` collects nothing here, while `run`
        treats a missing `type` as "server" for the prompt — confirm intended.
        """
        enriched = dict(context)
        issue_type = context.get("type")

        try:
            if issue_type == "k8s" and context.get("namespace"):
                enriched["k8s_data"] = await self.k8s_collector.collect(
                    context.get("namespace"), context.get("pod")
                )
            elif issue_type == "cicd":
                # Determine CI/CD platform
                platform = get_cicd_platform(context.get("labels", {}), context)
                enriched["cicd_platform"] = platform

                if platform == "github" and context.get("repo") and context.get("run_id"):
                    enriched["ci_logs"] = await self.github_collector.collect(
                        context["repo"], context["run_id"]
                    )
                elif platform == "gitlab" and context.get("project_id") and context.get("pipeline_id"):
                    enriched["ci_logs"] = await self.gitlab_collector.collect(
                        context["project_id"], context["pipeline_id"]
                    )
                elif platform == "jenkins" and context.get("job_name"):
                    enriched["ci_logs"] = await self.jenkins_collector.collect(
                        context["job_name"], context.get("build_number")
                    )
                elif platform == "bamboo" and context.get("plan_key"):
                    enriched["ci_logs"] = await self.bamboo_collector.collect(
                        context["plan_key"], context.get("build_number")
                    )
                elif platform == "azure_devops" and context.get("project") and context.get("pipeline_id"):
                    enriched["ci_logs"] = await self.azure_devops_collector.collect(
                        context["project"], context["pipeline_id"], context.get("run_id")
                    )
            elif issue_type == "argocd" and context.get("app_name"):
                enriched["argocd_data"] = await self.argocd_collector.collect(context["app_name"])
            elif (
                issue_type in self._CLOUD_ISSUE_TYPES
                and context.get("resource_type")
                and context.get("resource_id")
            ):
                enriched["cloud_data"] = await self._collect_cloud(
                    self._CLOUD_ISSUE_TYPES[issue_type],
                    context["resource_type"],
                    context["resource_id"],
                    context.get("params") or {},
                )
            elif issue_type == "server":
                enriched["server_data"] = await self.server_collector.collect()
        except Exception as e:
            log.warning("Context collection partial failure", error=str(e))
            enriched["collection_error"] = str(e)

        return enriched

    async def _fetch_cicd_logs(self, inputs: dict) -> dict:
        """Handle the `get_cicd_logs` tool by dispatching on `platform`.

        `pipeline_id` is optional in the tool schema. GitHub, GitLab and Azure
        DevOps convert it with `int()`, so a missing value is reported as an
        explicit error instead of an opaque `int(None)` TypeError. Jenkins and
        Bamboo pass None to their collector when it is absent.
        """
        platform = inputs["platform"]
        project_id = inputs["project_id"]
        pipeline_id = inputs.get("pipeline_id")
        params = inputs.get("additional_params") or {}

        if platform in ("github", "gitlab"):
            if pipeline_id is None:
                return {"error": f"pipeline_id is required to fetch {platform} logs"}
            collector = self.github_collector if platform == "github" else self.gitlab_collector
            return await collector.collect(project_id, int(pipeline_id))

        if platform in ("jenkins", "bamboo"):
            collector = self.jenkins_collector if platform == "jenkins" else self.bamboo_collector
            return await collector.collect(project_id, int(pipeline_id) if pipeline_id else None)

        if platform == "azure_devops":
            # additional_params may override project / pipeline_id / run_id;
            # both ids fall back to the top-level pipeline_id.
            pipeline = params.get("pipeline_id", pipeline_id)
            run_id = params.get("run_id", pipeline_id)
            if pipeline is None:
                return {"error": "pipeline_id is required to fetch azure_devops logs"}
            return await self.azure_devops_collector.collect(
                params.get("project", project_id),
                int(pipeline),
                int(run_id) if run_id is not None else None,
            )

        return {"error": f"Unsupported CI/CD platform: {platform}"}

    async def _execute_tool(self, name: str, inputs: dict, context: dict) -> dict:
        """Route a tool call to the appropriate handler.

        Args:
            name: Tool name as declared in `AGENT_TOOLS`.
            inputs: Arguments supplied by the model for that tool.
            context: The original incident context (not used by any branch).

        Returns:
            The handler's result. An unknown tool name, or any exception raised
            by a handler, yields `{"error": ...}` instead of propagating, so
            the failure is reported back to the model.
        """
        try:
            # ─── Existing K8s Tools ───────────────────────────────────────────
            if name == "get_k8s_context":
                return await self.k8s_collector.collect(
                    inputs.get("namespace", "default"),
                    inputs.get("pod_name"),
                    inputs.get("include_previous", True),
                )
            elif name == "apply_k8s_manifest":
                return await self.k8s_tools.apply_manifest(
                    inputs["manifest_yaml"],
                    dry_run=inputs.get("dry_run", True),
                    namespace=inputs.get("namespace"),
                    auto_apply=self.auto_apply,
                    notifier=self.notifier,
                )
            elif name == "run_kubectl":
                return await self.k8s_tools.run_kubectl(inputs["command"], auto_apply=self.auto_apply)
            elif name == "rollback_deployment":
                return await self.k8s_tools.rollback(
                    inputs["deployment"], inputs["namespace"],
                    revision=inputs.get("revision"),
                    auto_apply=self.auto_apply,
                    notifier=self.notifier,
                )

            # ─── Existing GitHub/Server Tools ─────────────────────────────────
            elif name == "get_github_logs":
                return await self.github_collector.collect(inputs["repo"], inputs["run_id"])
            elif name == "create_github_pr":
                return await self.github_tools.create_fix_pr(
                    inputs["repo"], inputs["file_path"],
                    inputs["new_content"], inputs["pr_title"], inputs["pr_body"],
                )
            elif name == "run_shell_command":
                return await self.executor.run_safe(inputs["command"], host=inputs.get("host"))

            # ─── Multi-platform CI/CD Tools ────────────────────────────────────
            elif name == "get_cicd_logs":
                return await self._fetch_cicd_logs(inputs)

            elif name == "retry_cicd_pipeline":
                return await self.cicd_tools.retry_pipeline(
                    inputs["platform"], inputs["project_id"], inputs.get("pipeline_id"),
                    **(inputs.get("additional_params") or {})
                )

            elif name == "create_cicd_pr":
                platform = inputs["platform"]
                if platform == "github":
                    # Use existing GitHub tool
                    return await self.github_tools.create_fix_pr(
                        inputs["repo"], inputs["file_path"],
                        inputs["new_content"], inputs["pr_title"], inputs["pr_body"]
                    )
                return await self.cicd_tools.create_fix_pr(
                    platform, inputs["repo"], inputs["file_path"],
                    inputs["new_content"], inputs["pr_title"], inputs["pr_body"],
                    **(inputs.get("additional_params") or {})
                )

            # ─── ArgoCD Tools ──────────────────────────────────────────────────
            elif name == "get_argocd_status":
                return await self.argocd_tools.get_application_status(inputs["app_name"])

            elif name == "sync_argocd_app":
                return await self.argocd_tools.sync_application(
                    inputs["app_name"],
                    prune=inputs.get("prune", False),
                    dry_run=inputs.get("dry_run", True),
                    auto_apply=self.auto_apply,
                    notifier=self.notifier
                )

            elif name == "rollback_argocd_app":
                return await self.argocd_tools.rollback_application(
                    inputs["app_name"],
                    revision=inputs.get("revision"),
                    auto_apply=self.auto_apply,
                    notifier=self.notifier
                )

            elif name == "get_argocd_history":
                return await self.argocd_tools.get_application_history(
                    inputs["app_name"],
                    limit=inputs.get("limit", 10)
                )

            # ─── Cloud Provider Tools ──────────────────────────────────────────
            elif name == "get_cloud_resource":
                return await self._collect_cloud(
                    inputs["cloud"],
                    inputs["resource_type"],
                    inputs["resource_id"],
                    inputs.get("additional_params") or {},
                )

            elif name == "restart_cloud_resource":
                cloud = inputs["cloud"]
                resource_type = inputs["resource_type"]
                resource_id = inputs["resource_id"]
                params = inputs.get("additional_params") or {}

                # Instance-like resource types use restart_instance; every
                # other type goes through restart_service.
                if resource_type in ["instance", "ec2", "gce", "vm"]:
                    return await self.cloud_tools.restart_instance(cloud, resource_id, **params)
                return await self.cloud_tools.restart_service(cloud, resource_type, resource_id, **params)

            elif name == "scale_cloud_service":
                return await self.cloud_tools.scale_service(
                    inputs["cloud"],
                    inputs["service_type"],
                    inputs["service_id"],
                    inputs["desired_count"],
                    **(inputs.get("additional_params") or {})
                )

            # ─── Notifications ─────────────────────────────────────────────────
            elif name == "notify_slack":
                await self.notifier.send_message(
                    inputs["message"],
                    severity=inputs.get("severity", "info"),
                    resolved=inputs.get("resolved", False),
                    requires_approval=inputs.get("requires_approval", False),
                    approval_command=inputs.get("approval_command"),
                )
                return {"sent": True}

            else:
                return {"error": f"Unknown tool: {name}"}

        except Exception as e:
            log.error("Tool execution error", tool=name, error=str(e))
            return {"error": str(e)}
|