kubectl-mcp-server 1.16.0__py3-none-any.whl → 1.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kubectl_mcp_server-1.16.0.dist-info → kubectl_mcp_server-1.17.0.dist-info}/METADATA +1 -1
- {kubectl_mcp_server-1.16.0.dist-info → kubectl_mcp_server-1.17.0.dist-info}/RECORD +28 -14
- kubectl_mcp_tool/__init__.py +1 -1
- kubectl_mcp_tool/cli/cli.py +83 -9
- kubectl_mcp_tool/cli/output.py +14 -0
- kubectl_mcp_tool/config/__init__.py +46 -0
- kubectl_mcp_tool/config/loader.py +386 -0
- kubectl_mcp_tool/config/schema.py +184 -0
- kubectl_mcp_tool/mcp_server.py +219 -8
- kubectl_mcp_tool/observability/__init__.py +59 -0
- kubectl_mcp_tool/observability/metrics.py +223 -0
- kubectl_mcp_tool/observability/stats.py +255 -0
- kubectl_mcp_tool/observability/tracing.py +335 -0
- kubectl_mcp_tool/prompts/__init__.py +43 -0
- kubectl_mcp_tool/prompts/builtin.py +695 -0
- kubectl_mcp_tool/prompts/custom.py +298 -0
- kubectl_mcp_tool/prompts/prompts.py +180 -4
- kubectl_mcp_tool/safety.py +155 -0
- kubectl_mcp_tool/tools/cluster.py +384 -0
- tests/test_config.py +386 -0
- tests/test_mcp_integration.py +251 -0
- tests/test_observability.py +521 -0
- tests/test_prompts.py +716 -0
- tests/test_safety.py +218 -0
- {kubectl_mcp_server-1.16.0.dist-info → kubectl_mcp_server-1.17.0.dist-info}/WHEEL +0 -0
- {kubectl_mcp_server-1.16.0.dist-info → kubectl_mcp_server-1.17.0.dist-info}/entry_points.txt +0 -0
- {kubectl_mcp_server-1.16.0.dist-info → kubectl_mcp_server-1.17.0.dist-info}/licenses/LICENSE +0 -0
- {kubectl_mcp_server-1.16.0.dist-info → kubectl_mcp_server-1.17.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,695 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Built-in prompts for kubectl-mcp-server.
|
|
3
|
+
|
|
4
|
+
These prompts provide comprehensive workflows for common Kubernetes tasks.
|
|
5
|
+
Users can override any of these by defining a prompt with the same name
|
|
6
|
+
in their custom configuration.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .custom import CustomPrompt, PromptArgument, PromptMessage
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Cluster Health Check Prompt
|
|
13
|
+
CLUSTER_HEALTH_CHECK = CustomPrompt(
|
|
14
|
+
name="cluster-health-check",
|
|
15
|
+
title="Cluster Health Check",
|
|
16
|
+
description="Comprehensive health assessment of your Kubernetes cluster",
|
|
17
|
+
arguments=[
|
|
18
|
+
PromptArgument(
|
|
19
|
+
name="namespace",
|
|
20
|
+
description="Limit to specific namespace (optional)",
|
|
21
|
+
required=False
|
|
22
|
+
),
|
|
23
|
+
PromptArgument(
|
|
24
|
+
name="check_events",
|
|
25
|
+
description="Include recent events in the check (true/false)",
|
|
26
|
+
required=False,
|
|
27
|
+
default="true"
|
|
28
|
+
),
|
|
29
|
+
PromptArgument(
|
|
30
|
+
name="check_metrics",
|
|
31
|
+
description="Include resource metrics (true/false)",
|
|
32
|
+
required=False,
|
|
33
|
+
default="true"
|
|
34
|
+
),
|
|
35
|
+
],
|
|
36
|
+
messages=[
|
|
37
|
+
PromptMessage(
|
|
38
|
+
role="user",
|
|
39
|
+
content="""Perform a comprehensive health check of the Kubernetes cluster{{#namespace}} in namespace {{namespace}}{{/namespace}}.
|
|
40
|
+
|
|
41
|
+
## Check Categories
|
|
42
|
+
|
|
43
|
+
### 1. Node Health
|
|
44
|
+
- Check node status and conditions (Ready, MemoryPressure, DiskPressure, PIDPressure)
|
|
45
|
+
- Verify all nodes are reporting Ready
|
|
46
|
+
- Check node resource capacity and allocatable resources
|
|
47
|
+
{{#check_metrics}}- Review node CPU and memory utilization{{/check_metrics}}
|
|
48
|
+
|
|
49
|
+
### 2. Pod Health{{#namespace}} in {{namespace}}{{/namespace}}
|
|
50
|
+
- Identify pods in problematic states:
|
|
51
|
+
- CrashLoopBackOff: Application crash on start
|
|
52
|
+
- ImagePullBackOff: Image not found or registry auth issues
|
|
53
|
+
- Pending: Scheduling issues or resource constraints
|
|
54
|
+
- OOMKilled: Memory limit exceeded
|
|
55
|
+
- Error: Container exited with error
|
|
56
|
+
- Check restart counts for running pods
|
|
57
|
+
{{#check_metrics}}- Review pod resource usage vs limits{{/check_metrics}}
|
|
58
|
+
|
|
59
|
+
### 3. Workload Status{{#namespace}} in {{namespace}}{{/namespace}}
|
|
60
|
+
- Deployment replica status (desired vs ready)
|
|
61
|
+
- StatefulSet replica status
|
|
62
|
+
- DaemonSet desired vs scheduled vs ready
|
|
63
|
+
- Job completion status
|
|
64
|
+
- CronJob schedule health
|
|
65
|
+
|
|
66
|
+
### 4. Storage Health
|
|
67
|
+
- PVC binding status (Bound, Pending, Lost)
|
|
68
|
+
- PV availability and reclaim status
|
|
69
|
+
- StorageClass availability
|
|
70
|
+
|
|
71
|
+
### 5. Networking Health
|
|
72
|
+
- Service endpoint health (services with no endpoints)
|
|
73
|
+
- Ingress configuration validity
|
|
74
|
+
- NetworkPolicy presence and coverage
|
|
75
|
+
|
|
76
|
+
{{#check_events}}### 6. Recent Events{{#namespace}} in {{namespace}}{{/namespace}}
|
|
77
|
+
- Warning events in the last hour
|
|
78
|
+
- Error events indicating failures
|
|
79
|
+
- Event patterns suggesting issues{{/check_events}}
|
|
80
|
+
|
|
81
|
+
## Output Required
|
|
82
|
+
|
|
83
|
+
Provide:
|
|
84
|
+
1. **Overall Health Status**: Healthy, Warning, or Critical
|
|
85
|
+
2. **Summary Statistics**:
|
|
86
|
+
- Total nodes and healthy count
|
|
87
|
+
- Total pods and healthy count
|
|
88
|
+
- Workloads with issues
|
|
89
|
+
3. **Critical Issues** (require immediate attention)
|
|
90
|
+
4. **Warnings** (should be addressed soon)
|
|
91
|
+
5. **Recommendations** for improvements
|
|
92
|
+
|
|
93
|
+
Start the health check now using the appropriate kubectl tools."""
|
|
94
|
+
),
|
|
95
|
+
]
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# Debug Workload Prompt
|
|
100
|
+
DEBUG_WORKLOAD = CustomPrompt(
|
|
101
|
+
name="debug-workload",
|
|
102
|
+
title="Debug Workload Issues",
|
|
103
|
+
description="Diagnose and troubleshoot Kubernetes workload problems",
|
|
104
|
+
arguments=[
|
|
105
|
+
PromptArgument(
|
|
106
|
+
name="workload_name",
|
|
107
|
+
description="Name of the workload to debug",
|
|
108
|
+
required=True
|
|
109
|
+
),
|
|
110
|
+
PromptArgument(
|
|
111
|
+
name="namespace",
|
|
112
|
+
description="Namespace of the workload",
|
|
113
|
+
required=False,
|
|
114
|
+
default="default"
|
|
115
|
+
),
|
|
116
|
+
PromptArgument(
|
|
117
|
+
name="workload_type",
|
|
118
|
+
description="Type of workload (deployment, statefulset, daemonset, pod)",
|
|
119
|
+
required=False,
|
|
120
|
+
default="deployment"
|
|
121
|
+
),
|
|
122
|
+
PromptArgument(
|
|
123
|
+
name="include_related",
|
|
124
|
+
description="Check related resources (services, configmaps, secrets)",
|
|
125
|
+
required=False,
|
|
126
|
+
default="true"
|
|
127
|
+
),
|
|
128
|
+
],
|
|
129
|
+
messages=[
|
|
130
|
+
PromptMessage(
|
|
131
|
+
role="user",
|
|
132
|
+
content="""Debug the {{workload_type}} '{{workload_name}}' in namespace '{{namespace}}'.
|
|
133
|
+
|
|
134
|
+
## Debugging Steps
|
|
135
|
+
|
|
136
|
+
### Step 1: Identify the Workload
|
|
137
|
+
- Get the {{workload_type}} details
|
|
138
|
+
- List all pods belonging to this workload
|
|
139
|
+
- Note the current status and any error conditions
|
|
140
|
+
|
|
141
|
+
### Step 2: Pod Status Analysis
|
|
142
|
+
For each pod:
|
|
143
|
+
1. Check phase (Running, Pending, Failed, Unknown)
|
|
144
|
+
2. Check container readiness
|
|
145
|
+
3. Count restarts
|
|
146
|
+
4. Identify the problematic state if any
|
|
147
|
+
|
|
148
|
+
### Step 3: Event Investigation
|
|
149
|
+
- Get events for the {{workload_type}}
|
|
150
|
+
- Get events for each pod
|
|
151
|
+
- Look for:
|
|
152
|
+
- FailedScheduling
|
|
153
|
+
- FailedCreate
|
|
154
|
+
- FailedMount
|
|
155
|
+
- Unhealthy (probe failures)
|
|
156
|
+
- BackOff (crash loops)
|
|
157
|
+
|
|
158
|
+
### Step 4: Log Analysis
|
|
159
|
+
For pods not in Running/Ready state:
|
|
160
|
+
- Get current logs (last 100 lines)
|
|
161
|
+
- If crashed, get previous container logs
|
|
162
|
+
- Look for:
|
|
163
|
+
- Stack traces / exceptions
|
|
164
|
+
- Connection errors
|
|
165
|
+
- Configuration errors
|
|
166
|
+
- Permission issues
|
|
167
|
+
|
|
168
|
+
### Step 5: Resource Analysis
|
|
169
|
+
- Check resource requests vs limits
|
|
170
|
+
- Verify if pods are OOMKilled
|
|
171
|
+
- Check if resources are available on nodes
|
|
172
|
+
|
|
173
|
+
{{#include_related}}### Step 6: Related Resources
|
|
174
|
+
Check dependencies:
|
|
175
|
+
- Services pointing to this workload
|
|
176
|
+
- ConfigMaps mounted by pods
|
|
177
|
+
- Secrets referenced by pods
|
|
178
|
+
- PVCs used by pods
|
|
179
|
+
- ServiceAccount permissions{{/include_related}}
|
|
180
|
+
|
|
181
|
+
## Output Required
|
|
182
|
+
|
|
183
|
+
For each issue found, provide:
|
|
184
|
+
1. **Issue Description**: What is wrong
|
|
185
|
+
2. **Evidence**: Specific log lines, events, or status
|
|
186
|
+
3. **Root Cause**: Likely reason for the issue
|
|
187
|
+
4. **Resolution**: Steps to fix the problem
|
|
188
|
+
5. **Verification**: How to confirm the fix worked
|
|
189
|
+
|
|
190
|
+
Start debugging now."""
|
|
191
|
+
),
|
|
192
|
+
]
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
# Resource Usage Analysis Prompt
|
|
197
|
+
RESOURCE_USAGE = CustomPrompt(
|
|
198
|
+
name="resource-usage",
|
|
199
|
+
title="Resource Usage Analysis",
|
|
200
|
+
description="Analyze resource consumption and identify optimization opportunities",
|
|
201
|
+
arguments=[
|
|
202
|
+
PromptArgument(
|
|
203
|
+
name="namespace",
|
|
204
|
+
description="Namespace to analyze (optional, defaults to all)",
|
|
205
|
+
required=False
|
|
206
|
+
),
|
|
207
|
+
PromptArgument(
|
|
208
|
+
name="threshold_cpu",
|
|
209
|
+
description="CPU utilization threshold percentage for alerts",
|
|
210
|
+
required=False,
|
|
211
|
+
default="80"
|
|
212
|
+
),
|
|
213
|
+
PromptArgument(
|
|
214
|
+
name="threshold_memory",
|
|
215
|
+
description="Memory utilization threshold percentage for alerts",
|
|
216
|
+
required=False,
|
|
217
|
+
default="80"
|
|
218
|
+
),
|
|
219
|
+
PromptArgument(
|
|
220
|
+
name="include_recommendations",
|
|
221
|
+
description="Include right-sizing recommendations",
|
|
222
|
+
required=False,
|
|
223
|
+
default="true"
|
|
224
|
+
),
|
|
225
|
+
],
|
|
226
|
+
messages=[
|
|
227
|
+
PromptMessage(
|
|
228
|
+
role="user",
|
|
229
|
+
content="""Analyze resource usage{{#namespace}} in namespace {{namespace}}{{/namespace}} and identify optimization opportunities.
|
|
230
|
+
|
|
231
|
+
## Analysis Areas
|
|
232
|
+
|
|
233
|
+
### 1. Node Resource Utilization
|
|
234
|
+
- Current CPU and memory usage per node
|
|
235
|
+
- Identify nodes exceeding {{threshold_cpu}}% CPU
|
|
236
|
+
- Identify nodes exceeding {{threshold_memory}}% memory
|
|
237
|
+
- Node capacity vs allocatable resources
|
|
238
|
+
|
|
239
|
+
### 2. Pod Resource Consumption{{#namespace}} in {{namespace}}{{/namespace}}
|
|
240
|
+
- Top CPU consumers
|
|
241
|
+
- Top memory consumers
|
|
242
|
+
- Pods near resource limits
|
|
243
|
+
- Pods with no resource limits (risk)
|
|
244
|
+
|
|
245
|
+
### 3. Request vs Usage Analysis
|
|
246
|
+
Compare for each workload:
|
|
247
|
+
- CPU requested vs actual usage
|
|
248
|
+
- Memory requested vs actual usage
|
|
249
|
+
- Identify over-provisioned resources (waste)
|
|
250
|
+
- Identify under-provisioned resources (risk)
|
|
251
|
+
|
|
252
|
+
### 4. Resource Efficiency
|
|
253
|
+
Calculate:
|
|
254
|
+
- Overall cluster utilization
|
|
255
|
+
- Namespace-level utilization{{#namespace}} for {{namespace}}{{/namespace}}
|
|
256
|
+
- Cost of unused/reserved resources
|
|
257
|
+
|
|
258
|
+
{{#include_recommendations}}### 5. Right-sizing Recommendations
|
|
259
|
+
For each over/under-provisioned workload:
|
|
260
|
+
- Current requests/limits
|
|
261
|
+
- Recommended requests/limits
|
|
262
|
+
- Estimated savings/risk reduction
|
|
263
|
+
- Priority (high/medium/low){{/include_recommendations}}
|
|
264
|
+
|
|
265
|
+
### 6. Autoscaling Candidates
|
|
266
|
+
Identify workloads that would benefit from:
|
|
267
|
+
- Horizontal Pod Autoscaler (HPA)
|
|
268
|
+
- Vertical Pod Autoscaler (VPA)
|
|
269
|
+
- Cluster Autoscaler configuration
|
|
270
|
+
|
|
271
|
+
## Output Required
|
|
272
|
+
|
|
273
|
+
Provide:
|
|
274
|
+
1. **Summary Dashboard**:
|
|
275
|
+
- Cluster utilization overview
|
|
276
|
+
- Resource efficiency score
|
|
277
|
+
- Potential savings estimate
|
|
278
|
+
|
|
279
|
+
2. **Alerts** (immediate attention):
|
|
280
|
+
- Workloads exceeding thresholds
|
|
281
|
+
- Workloads without limits
|
|
282
|
+
- Nodes at capacity
|
|
283
|
+
|
|
284
|
+
3. **Optimization Actions** (prioritized list):
|
|
285
|
+
- Quick wins (immediate impact)
|
|
286
|
+
- Medium-term improvements
|
|
287
|
+
- Long-term architecture changes
|
|
288
|
+
|
|
289
|
+
Start the resource analysis now."""
|
|
290
|
+
),
|
|
291
|
+
]
|
|
292
|
+
)
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
# Security Posture Review Prompt
|
|
296
|
+
SECURITY_POSTURE = CustomPrompt(
|
|
297
|
+
name="security-posture",
|
|
298
|
+
title="Security Posture Review",
|
|
299
|
+
description="Comprehensive security assessment of your Kubernetes cluster",
|
|
300
|
+
arguments=[
|
|
301
|
+
PromptArgument(
|
|
302
|
+
name="namespace",
|
|
303
|
+
description="Namespace to analyze (optional, defaults to all)",
|
|
304
|
+
required=False
|
|
305
|
+
),
|
|
306
|
+
PromptArgument(
|
|
307
|
+
name="check_rbac",
|
|
308
|
+
description="Include RBAC analysis",
|
|
309
|
+
required=False,
|
|
310
|
+
default="true"
|
|
311
|
+
),
|
|
312
|
+
PromptArgument(
|
|
313
|
+
name="check_network",
|
|
314
|
+
description="Include network policy analysis",
|
|
315
|
+
required=False,
|
|
316
|
+
default="true"
|
|
317
|
+
),
|
|
318
|
+
PromptArgument(
|
|
319
|
+
name="check_secrets",
|
|
320
|
+
description="Include secrets management analysis",
|
|
321
|
+
required=False,
|
|
322
|
+
default="true"
|
|
323
|
+
),
|
|
324
|
+
],
|
|
325
|
+
messages=[
|
|
326
|
+
PromptMessage(
|
|
327
|
+
role="user",
|
|
328
|
+
content="""Perform a security posture review{{#namespace}} for namespace {{namespace}}{{/namespace}}.
|
|
329
|
+
|
|
330
|
+
## Security Assessment Areas
|
|
331
|
+
|
|
332
|
+
{{#check_rbac}}### 1. RBAC Analysis
|
|
333
|
+
- Review ClusterRoles with elevated privileges
|
|
334
|
+
- Identify ClusterRoleBindings to cluster-admin
|
|
335
|
+
- Check for overly permissive roles (wildcards)
|
|
336
|
+
- ServiceAccount analysis:
|
|
337
|
+
- Default SA usage
|
|
338
|
+
- Token automounting
|
|
339
|
+
- Unnecessary privileges{{/check_rbac}}
|
|
340
|
+
|
|
341
|
+
### 2. Pod Security Standards{{#namespace}} in {{namespace}}{{/namespace}}
|
|
342
|
+
Check for security anti-patterns:
|
|
343
|
+
- [ ] Privileged containers
|
|
344
|
+
- [ ] Running as root
|
|
345
|
+
- [ ] Host namespace usage (hostNetwork, hostPID, hostIPC)
|
|
346
|
+
- [ ] Host path mounts
|
|
347
|
+
- [ ] Privilege escalation allowed
|
|
348
|
+
- [ ] Missing security contexts
|
|
349
|
+
- [ ] Writable root filesystem
|
|
350
|
+
|
|
351
|
+
### 3. Image Security
|
|
352
|
+
- Images using 'latest' tag
|
|
353
|
+
- Images from untrusted registries
|
|
354
|
+
- Missing imagePullPolicy
|
|
355
|
+
- Missing imagePullSecrets
|
|
356
|
+
|
|
357
|
+
{{#check_network}}### 4. Network Security
|
|
358
|
+
- Namespaces without NetworkPolicies
|
|
359
|
+
- Missing default deny policies
|
|
360
|
+
- Overly permissive ingress rules
|
|
361
|
+
- Unprotected egress traffic{{/check_network}}
|
|
362
|
+
|
|
363
|
+
{{#check_secrets}}### 5. Secrets Management
|
|
364
|
+
- Secrets mounted as environment variables (less secure)
|
|
365
|
+
- Secrets in pod specs (should use references)
|
|
366
|
+
- Unused secrets
|
|
367
|
+
- Secrets without encryption at rest{{/check_secrets}}
|
|
368
|
+
|
|
369
|
+
### 6. Supply Chain Security
|
|
370
|
+
- Image vulnerability scanning status
|
|
371
|
+
- Admission controller configuration
|
|
372
|
+
- Pod Security Standards enforcement
|
|
373
|
+
|
|
374
|
+
## Risk Categories
|
|
375
|
+
|
|
376
|
+
For each finding, classify as:
|
|
377
|
+
- **Critical**: Immediate exploitation risk
|
|
378
|
+
- **High**: Significant security gap
|
|
379
|
+
- **Medium**: Best practice violation
|
|
380
|
+
- **Low**: Hardening opportunity
|
|
381
|
+
|
|
382
|
+
## Output Required
|
|
383
|
+
|
|
384
|
+
Provide:
|
|
385
|
+
1. **Security Score**: Overall assessment (A-F grade)
|
|
386
|
+
|
|
387
|
+
2. **Critical Findings** (fix immediately):
|
|
388
|
+
- Issue description
|
|
389
|
+
- Affected resources
|
|
390
|
+
- Remediation steps
|
|
391
|
+
|
|
392
|
+
3. **High Priority Issues**:
|
|
393
|
+
- Issue description
|
|
394
|
+
- Risk explanation
|
|
395
|
+
- Fix recommendation
|
|
396
|
+
|
|
397
|
+
4. **Compliance Checklist**:
|
|
398
|
+
- Pod Security Standards
|
|
399
|
+
- CIS Kubernetes Benchmark highlights
|
|
400
|
+
- Network segmentation
|
|
401
|
+
|
|
402
|
+
5. **Remediation Plan** (prioritized):
|
|
403
|
+
- Quick wins
|
|
404
|
+
- Medium-term hardening
|
|
405
|
+
- Long-term architecture
|
|
406
|
+
|
|
407
|
+
Start the security review now."""
|
|
408
|
+
),
|
|
409
|
+
]
|
|
410
|
+
)
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
# Deployment Checklist Prompt
|
|
414
|
+
DEPLOYMENT_CHECKLIST = CustomPrompt(
|
|
415
|
+
name="deployment-checklist",
|
|
416
|
+
title="Production Deployment Checklist",
|
|
417
|
+
description="Pre-deployment verification checklist for production workloads",
|
|
418
|
+
arguments=[
|
|
419
|
+
PromptArgument(
|
|
420
|
+
name="app_name",
|
|
421
|
+
description="Application name being deployed",
|
|
422
|
+
required=True
|
|
423
|
+
),
|
|
424
|
+
PromptArgument(
|
|
425
|
+
name="namespace",
|
|
426
|
+
description="Target namespace for deployment",
|
|
427
|
+
required=True
|
|
428
|
+
),
|
|
429
|
+
PromptArgument(
|
|
430
|
+
name="image",
|
|
431
|
+
description="Container image being deployed",
|
|
432
|
+
required=True
|
|
433
|
+
),
|
|
434
|
+
PromptArgument(
|
|
435
|
+
name="replicas",
|
|
436
|
+
description="Number of replicas",
|
|
437
|
+
required=False,
|
|
438
|
+
default="2"
|
|
439
|
+
),
|
|
440
|
+
],
|
|
441
|
+
messages=[
|
|
442
|
+
PromptMessage(
|
|
443
|
+
role="user",
|
|
444
|
+
content="""Verify deployment readiness for '{{app_name}}' ({{image}}) in namespace '{{namespace}}' with {{replicas}} replicas.
|
|
445
|
+
|
|
446
|
+
## Pre-Deployment Checklist
|
|
447
|
+
|
|
448
|
+
### 1. Namespace Preparation
|
|
449
|
+
- [ ] Namespace '{{namespace}}' exists
|
|
450
|
+
- [ ] Resource quotas are set
|
|
451
|
+
- [ ] LimitRanges are configured
|
|
452
|
+
- [ ] Network policies exist
|
|
453
|
+
|
|
454
|
+
### 2. Image Verification
|
|
455
|
+
- [ ] Image '{{image}}' uses specific tag (not :latest)
|
|
456
|
+
- [ ] Image exists and is pullable
|
|
457
|
+
- [ ] Image has been scanned for vulnerabilities
|
|
458
|
+
- [ ] imagePullSecrets are configured if needed
|
|
459
|
+
|
|
460
|
+
### 3. Resource Configuration
|
|
461
|
+
- [ ] CPU requests and limits are set
|
|
462
|
+
- [ ] Memory requests and limits are set
|
|
463
|
+
- [ ] Resources are appropriate for workload
|
|
464
|
+
- [ ] Pod Disruption Budget is configured for {{replicas}} replicas
|
|
465
|
+
|
|
466
|
+
### 4. Health Checks
|
|
467
|
+
- [ ] Liveness probe is configured
|
|
468
|
+
- [ ] Readiness probe is configured
|
|
469
|
+
- [ ] Startup probe (if slow starting)
|
|
470
|
+
- [ ] Probe endpoints are implemented
|
|
471
|
+
|
|
472
|
+
### 5. Security Configuration
|
|
473
|
+
- [ ] runAsNonRoot: true
|
|
474
|
+
- [ ] readOnlyRootFilesystem: true
|
|
475
|
+
- [ ] allowPrivilegeEscalation: false
|
|
476
|
+
- [ ] Capabilities dropped
|
|
477
|
+
- [ ] SecurityContext is set
|
|
478
|
+
|
|
479
|
+
### 6. Configuration Management
|
|
480
|
+
- [ ] ConfigMaps are created
|
|
481
|
+
- [ ] Secrets are created
|
|
482
|
+
- [ ] Environment variables are set
|
|
483
|
+
- [ ] Volume mounts are configured
|
|
484
|
+
|
|
485
|
+
### 7. Service Configuration
|
|
486
|
+
- [ ] Service is created
|
|
487
|
+
- [ ] Ports are correctly mapped
|
|
488
|
+
- [ ] Service selectors match pod labels
|
|
489
|
+
- [ ] Service type is appropriate
|
|
490
|
+
|
|
491
|
+
### 8. Scaling Configuration
|
|
492
|
+
- [ ] HPA is configured (if needed)
|
|
493
|
+
- [ ] Scaling metrics are appropriate
|
|
494
|
+
- [ ] Min/max replicas are set correctly
|
|
495
|
+
|
|
496
|
+
### 9. Observability
|
|
497
|
+
- [ ] Logging is configured
|
|
498
|
+
- [ ] Metrics endpoints exposed
|
|
499
|
+
- [ ] Tracing enabled (if applicable)
|
|
500
|
+
- [ ] Alerts are configured
|
|
501
|
+
|
|
502
|
+
### 10. Rollout Strategy
|
|
503
|
+
- [ ] RollingUpdate strategy configured
|
|
504
|
+
- [ ] maxSurge and maxUnavailable set
|
|
505
|
+
- [ ] Rollback plan documented
|
|
506
|
+
|
|
507
|
+
## Verification Commands
|
|
508
|
+
|
|
509
|
+
Run these checks before deployment:
|
|
510
|
+
1. Verify namespace: `get_namespaces()`
|
|
511
|
+
2. Check existing resources: `get_deployments(namespace="{{namespace}}")`
|
|
512
|
+
3. Verify configmaps: `get_configmaps(namespace="{{namespace}}")`
|
|
513
|
+
4. Check services: `get_services(namespace="{{namespace}}")`
|
|
514
|
+
|
|
515
|
+
## Post-Deployment Verification
|
|
516
|
+
|
|
517
|
+
After deploying:
|
|
518
|
+
1. Watch rollout: `kubectl_rollout_status("deployment", "{{app_name}}", "{{namespace}}")`
|
|
519
|
+
2. Check pods: `get_pods(namespace="{{namespace}}")`
|
|
520
|
+
3. Get logs: `get_logs(pod_name, "{{namespace}}")`
|
|
521
|
+
4. Verify endpoints: Check service has endpoints
|
|
522
|
+
|
|
523
|
+
Start the deployment checklist verification now."""
|
|
524
|
+
),
|
|
525
|
+
]
|
|
526
|
+
)
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
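# Incident Response Prompt
# NOTE(review): the message template wraps all five investigation guides
# (pod crash, high latency, OOM, network, storage) in a single
# {{#incident_type}} section. incident_type is a required argument, so that
# section presumably renders in full for every incident type instead of
# selecting the guide that matches it - confirm against the template renderer.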
|
|
530
|
+
INCIDENT_RESPONSE = CustomPrompt(
|
|
531
|
+
name="incident-response",
|
|
532
|
+
title="Incident Response Guide",
|
|
533
|
+
description="Structured incident response workflow for Kubernetes issues",
|
|
534
|
+
arguments=[
|
|
535
|
+
PromptArgument(
|
|
536
|
+
name="incident_type",
|
|
537
|
+
description="Type of incident (pod-crash, high-latency, oom, network, storage)",
|
|
538
|
+
required=True
|
|
539
|
+
),
|
|
540
|
+
PromptArgument(
|
|
541
|
+
name="affected_service",
|
|
542
|
+
description="Name of affected service/workload",
|
|
543
|
+
required=True
|
|
544
|
+
),
|
|
545
|
+
PromptArgument(
|
|
546
|
+
name="namespace",
|
|
547
|
+
description="Namespace of affected resources",
|
|
548
|
+
required=True
|
|
549
|
+
),
|
|
550
|
+
PromptArgument(
|
|
551
|
+
name="severity",
|
|
552
|
+
description="Incident severity (critical, high, medium, low)",
|
|
553
|
+
required=False,
|
|
554
|
+
default="high"
|
|
555
|
+
),
|
|
556
|
+
],
|
|
557
|
+
messages=[
|
|
558
|
+
PromptMessage(
|
|
559
|
+
role="user",
|
|
560
|
+
content="""## INCIDENT RESPONSE: {{incident_type}}
|
|
561
|
+
|
|
562
|
+
**Affected**: {{affected_service}} in namespace {{namespace}}
|
|
563
|
+
**Severity**: {{severity}}
|
|
564
|
+
|
|
565
|
+
## Phase 1: Triage (First 5 minutes)
|
|
566
|
+
|
|
567
|
+
### Immediate Assessment
|
|
568
|
+
1. Verify the issue:
|
|
569
|
+
- `get_pods(namespace="{{namespace}}")` - Check pod status
|
|
570
|
+
- `kubectl_describe("deployment", "{{affected_service}}", "{{namespace}}")` - Get details
|
|
571
|
+
|
|
572
|
+
2. Determine blast radius:
|
|
573
|
+
- Is this isolated to {{affected_service}}?
|
|
574
|
+
- Are other services in {{namespace}} affected?
|
|
575
|
+
- Are other namespaces affected?
|
|
576
|
+
|
|
577
|
+
3. User impact assessment:
|
|
578
|
+
- Is the service completely down?
|
|
579
|
+
- Is it degraded?
|
|
580
|
+
- Are errors being returned?
|
|
581
|
+
|
|
582
|
+
## Phase 2: Investigation (Next 15 minutes)
|
|
583
|
+
|
|
584
|
+
### For {{incident_type}} incident:
|
|
585
|
+
|
|
586
|
+
{{#incident_type}}
|
|
587
|
+
**Pod Crash Investigation:**
|
|
588
|
+
- `get_pod_events("{{affected_service}}", "{{namespace}}")` - Recent events
|
|
589
|
+
- `get_logs(pod_name, "{{namespace}}", tail=200)` - Current logs
|
|
590
|
+
- `get_logs(pod_name, "{{namespace}}", previous=true)` - Previous container logs
|
|
591
|
+
- Check for OOMKilled, CrashLoopBackOff patterns
|
|
592
|
+
|
|
593
|
+
**High Latency Investigation:**
|
|
594
|
+
- Check pod resource usage
|
|
595
|
+
- Review HPA scaling events
|
|
596
|
+
- Check dependent services
|
|
597
|
+
- Review ingress/load balancer metrics
|
|
598
|
+
|
|
599
|
+
**OOM Investigation:**
|
|
600
|
+
- `kubectl_describe("pod", pod_name, "{{namespace}}")` - Check memory limits
|
|
601
|
+
- Review container memory usage patterns
|
|
602
|
+
- Check for memory leaks in logs
|
|
603
|
+
- Compare to historical usage
|
|
604
|
+
|
|
605
|
+
**Network Investigation:**
|
|
606
|
+
- Check service endpoints exist
|
|
607
|
+
- Verify NetworkPolicies aren't blocking
|
|
608
|
+
- Test DNS resolution
|
|
609
|
+
- Check for connection errors in logs
|
|
610
|
+
|
|
611
|
+
**Storage Investigation:**
|
|
612
|
+
- `kubectl_get("pvc", namespace="{{namespace}}")` - PVC status
|
|
613
|
+
- Check if PV is bound
|
|
614
|
+
- Verify storage class availability
|
|
615
|
+
- Check node storage capacity
|
|
616
|
+
{{/incident_type}}
|
|
617
|
+
|
|
618
|
+
## Phase 3: Mitigation
|
|
619
|
+
|
|
620
|
+
### Quick Mitigation Options:
|
|
621
|
+
1. **Scale up**: Increase replicas if resource-related
|
|
622
|
+
2. **Rollback**: Revert to previous working version
|
|
623
|
+
3. **Restart**: Delete problematic pods
|
|
624
|
+
4. **Cordon**: Remove problematic node from scheduling
|
|
625
|
+
|
|
626
|
+
### Execute mitigation:
|
|
627
|
+
```
|
|
628
|
+
# Option 1: Scale up
|
|
629
|
+
kubectl_scale("deployment", "{{affected_service}}", replicas+2, "{{namespace}}")
|
|
630
|
+
|
|
631
|
+
# Option 2: Rollback
|
|
632
|
+
kubectl_rollout("undo", "deployment", "{{affected_service}}", "{{namespace}}")
|
|
633
|
+
|
|
634
|
+
# Option 3: Restart
|
|
635
|
+
kubectl_delete_pod(pod_name, "{{namespace}}", force=false)
|
|
636
|
+
```
|
|
637
|
+
|
|
638
|
+
## Phase 4: Verification
|
|
639
|
+
|
|
640
|
+
1. Confirm mitigation worked:
|
|
641
|
+
- `get_pods(namespace="{{namespace}}")` - All pods healthy
|
|
642
|
+
- Service is responding normally
|
|
643
|
+
- Error rates decreased
|
|
644
|
+
|
|
645
|
+
2. Monitor for recurrence:
|
|
646
|
+
- Watch pod events
|
|
647
|
+
- Monitor resource usage
|
|
648
|
+
- Track error rates
|
|
649
|
+
|
|
650
|
+
## Phase 5: Post-Incident
|
|
651
|
+
|
|
652
|
+
After incident is resolved:
|
|
653
|
+
1. Document timeline
|
|
654
|
+
2. Capture root cause
|
|
655
|
+
3. Identify preventive measures
|
|
656
|
+
4. Update runbooks
|
|
657
|
+
|
|
658
|
+
## Incident Log Template
|
|
659
|
+
|
|
660
|
+
| Time | Action | Result |
|
|
661
|
+
|------|--------|--------|
|
|
662
|
+
| | Issue reported | |
|
|
663
|
+
| | Investigation started | |
|
|
664
|
+
| | Root cause identified | |
|
|
665
|
+
| | Mitigation applied | |
|
|
666
|
+
| | Service restored | |
|
|
667
|
+
|
|
668
|
+
Start incident response now."""
|
|
669
|
+
),
|
|
670
|
+
]
|
|
671
|
+
)
|
|
672
|
+
|
|
673
|
+
|
|
674
|
+
# All built-in prompts
|
|
675
|
+
# Registry of every built-in prompt defined in this module, listed in the
# order the prompts are defined above. get_builtin_prompts() returns a copy of
# this list and get_builtin_prompt_by_name() searches it front to back, so the
# first entry with a given name is the one that lookup returns.
BUILTIN_PROMPTS = [
    CLUSTER_HEALTH_CHECK,
    DEBUG_WORKLOAD,
    RESOURCE_USAGE,
    SECURITY_POSTURE,
    DEPLOYMENT_CHECKLIST,
    INCIDENT_RESPONSE,
]
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
def get_builtin_prompts() -> list:
    """Return a new list holding every built-in prompt.

    The returned list is a shallow copy of ``BUILTIN_PROMPTS``: callers may
    add, remove, or reorder its entries without touching the module-level
    registry, but the prompt objects inside it are shared, not duplicated.
    """
    return list(BUILTIN_PROMPTS)
|
|
688
|
+
|
|
689
|
+
|
|
690
|
+
def get_builtin_prompt_by_name(name: str) -> "CustomPrompt | None":
    """Look up a built-in prompt by its exact name.

    Args:
        name: Prompt name to compare (with ``==``) against the ``name``
            attribute of each entry in ``BUILTIN_PROMPTS``.

    Returns:
        The first prompt in ``BUILTIN_PROMPTS`` whose name equals *name*, or
        ``None`` when no built-in prompt has that name.
    """
    # The return annotation is written as a string so it is not evaluated when
    # the function is defined: an unquoted ``CustomPrompt | None`` relies on
    # PEP 604 (Python 3.10+) and raises TypeError at import time on older
    # interpreters, because this module does not enable postponed annotations.
    return next(
        (prompt for prompt in BUILTIN_PROMPTS if prompt.name == name),
        None,
    )
|