kubectl-mcp-server 1.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kubectl_mcp_server-1.12.0.dist-info/METADATA +711 -0
- kubectl_mcp_server-1.12.0.dist-info/RECORD +45 -0
- kubectl_mcp_server-1.12.0.dist-info/WHEEL +5 -0
- kubectl_mcp_server-1.12.0.dist-info/entry_points.txt +3 -0
- kubectl_mcp_server-1.12.0.dist-info/licenses/LICENSE +21 -0
- kubectl_mcp_server-1.12.0.dist-info/top_level.txt +2 -0
- kubectl_mcp_tool/__init__.py +21 -0
- kubectl_mcp_tool/__main__.py +46 -0
- kubectl_mcp_tool/auth/__init__.py +13 -0
- kubectl_mcp_tool/auth/config.py +71 -0
- kubectl_mcp_tool/auth/scopes.py +148 -0
- kubectl_mcp_tool/auth/verifier.py +82 -0
- kubectl_mcp_tool/cli/__init__.py +9 -0
- kubectl_mcp_tool/cli/__main__.py +10 -0
- kubectl_mcp_tool/cli/cli.py +111 -0
- kubectl_mcp_tool/diagnostics.py +355 -0
- kubectl_mcp_tool/k8s_config.py +289 -0
- kubectl_mcp_tool/mcp_server.py +530 -0
- kubectl_mcp_tool/prompts/__init__.py +5 -0
- kubectl_mcp_tool/prompts/prompts.py +823 -0
- kubectl_mcp_tool/resources/__init__.py +5 -0
- kubectl_mcp_tool/resources/resources.py +305 -0
- kubectl_mcp_tool/tools/__init__.py +28 -0
- kubectl_mcp_tool/tools/browser.py +371 -0
- kubectl_mcp_tool/tools/cluster.py +315 -0
- kubectl_mcp_tool/tools/core.py +421 -0
- kubectl_mcp_tool/tools/cost.py +680 -0
- kubectl_mcp_tool/tools/deployments.py +381 -0
- kubectl_mcp_tool/tools/diagnostics.py +174 -0
- kubectl_mcp_tool/tools/helm.py +1561 -0
- kubectl_mcp_tool/tools/networking.py +296 -0
- kubectl_mcp_tool/tools/operations.py +501 -0
- kubectl_mcp_tool/tools/pods.py +582 -0
- kubectl_mcp_tool/tools/security.py +333 -0
- kubectl_mcp_tool/tools/storage.py +133 -0
- kubectl_mcp_tool/utils/__init__.py +17 -0
- kubectl_mcp_tool/utils/helpers.py +80 -0
- tests/__init__.py +9 -0
- tests/conftest.py +379 -0
- tests/test_auth.py +256 -0
- tests/test_browser.py +349 -0
- tests/test_prompts.py +536 -0
- tests/test_resources.py +343 -0
- tests/test_server.py +384 -0
- tests/test_tools.py +659 -0
|
@@ -0,0 +1,823 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
# Module-level logger; shares the "mcp-server" namespace used by the rest of the server.
logger = logging.getLogger("mcp-server")
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def register_prompts(server):
    """Register all MCP prompts for Kubernetes workflows.

    Each nested function below is registered on the server via the
    ``@server.prompt()`` decorator and returns a markdown-formatted
    instruction template with the caller's arguments interpolated.
    The templates themselves contain no logic; they guide an LLM
    through a multi-step operational workflow.

    Args:
        server: FastMCP server instance (assumed to expose a
            ``prompt()`` decorator factory — TODO confirm against the
            server implementation).
    """

    # Prompt 1: workload troubleshooting runbook (pods by default).
    @server.prompt()
    def troubleshoot_workload(workload: str, namespace: Optional[str] = None, resource_type: str = "pod") -> str:
        """Comprehensive troubleshooting guide for Kubernetes workloads."""
        # None namespace means "search everywhere"; wording reflects that.
        ns_text = f"in namespace '{namespace}'" if namespace else "across all namespaces"
        return f"""# Kubernetes Troubleshooting: {workload}

Target: {resource_type}s matching '{workload}' {ns_text}

## Step 1: Discovery
First, identify all relevant resources:
- Use `get_pods` with namespace={namespace or 'None'} to list pods
- Filter results for pods containing '{workload}' in the name
- Note the status of each pod (Running, Pending, CrashLoopBackOff, etc.)

## Step 2: Status Analysis
For each pod found, check:
- **Phase**: Is it Running, Pending, Failed, or Unknown?
- **Ready**: Are all containers ready? (e.g., 1/1, 2/2)
- **Restarts**: High restart count indicates crashes
- **Age**: Recently created pods may still be starting

### Common Status Issues:
| Status | Likely Cause | First Check |
|--------|--------------|-------------|
| Pending | Scheduling issues | get_pod_events |
| CrashLoopBackOff | App crash on start | get_logs |
| ImagePullBackOff | Image not found | kubectl_describe |
| OOMKilled | Memory limit exceeded | kubectl_describe |
| CreateContainerError | Config issue | get_pod_events |

## Step 3: Deep Inspection
Use these tools in order:

1. **Events** - `get_pod_events(pod_name, namespace)`
   - Look for: FailedScheduling, FailedMount, Unhealthy
   - Check timestamps for recent issues

2. **Logs** - `get_logs(pod_name, namespace, tail=100)`
   - Look for: exceptions, errors, stack traces
   - If container crashed: use previous=true

3. **Describe** - `kubectl_describe("pod", pod_name, namespace)`
   - Check: resource requests/limits, node assignment
   - Look at: conditions, volumes, container states

## Step 4: Related Resources
Check parent resources:
- For Deployments: `kubectl_describe("deployment", name, namespace)`
- For StatefulSets: `kubectl_describe("statefulset", name, namespace)`
- For DaemonSets: `kubectl_describe("daemonset", name, namespace)`

Check dependencies:
- Services: `get_services(namespace)`
- ConfigMaps/Secrets: referenced in pod spec
- PVCs: `kubectl_describe("pvc", name, namespace)` if storage issues

## Step 5: Resolution Checklist
For each issue, provide:

1. **Root Cause**: What is actually wrong
2. **Evidence**: Specific log line or event message
3. **Fix Command**: Exact kubectl command or manifest change
4. **Verification**: How to confirm the fix worked
5. **Prevention**: Configuration to prevent recurrence

## Common Fixes Reference:
- **OOMKilled**: Increase memory limits in deployment spec
- **CrashLoopBackOff**: Fix application error, check logs
- **Pending (no nodes)**: Check node capacity, add nodes
- **ImagePullBackOff**: Verify image name, check imagePullSecrets
- **Mount failures**: Check PVC status, storage class

Start the investigation now."""

    # Prompt 2: guided deployment with a fill-in-the-blanks manifest
    # (<IMAGE_NAME>, <TAG>, <PORT> are deliberate placeholders for the caller).
    @server.prompt()
    def deploy_application(app_name: str, namespace: str = "default", replicas: int = 1) -> str:
        """Step-by-step guide for deploying applications to Kubernetes."""
        return f"""# Kubernetes Deployment Guide: {app_name}

Target Namespace: {namespace}
Desired Replicas: {replicas}

## Pre-Deployment Checklist

### Step 1: Verify Cluster Access
- Use `get_namespaces` to confirm cluster connectivity
- Check if namespace '{namespace}' exists
- If not, create it: `kubectl_create_namespace("{namespace}")`

### Step 2: Review Existing Resources
Check for conflicts or existing deployments:
- `get_deployments(namespace="{namespace}")` - List current deployments
- `get_services(namespace="{namespace}")` - List current services
- `get_configmaps(namespace="{namespace}")` - List ConfigMaps

### Step 3: Prepare Deployment Manifest
Required components for '{app_name}':

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {app_name}
  namespace: {namespace}
  labels:
    app: {app_name}
spec:
  replicas: {replicas}
  selector:
    matchLabels:
      app: {app_name}
  template:
    metadata:
      labels:
        app: {app_name}
    spec:
      containers:
      - name: {app_name}
        image: <IMAGE_NAME>:<TAG>
        ports:
        - containerPort: <PORT>
        resources:
          requests:
            memory: "64Mi"
            cpu: "100m"
          limits:
            memory: "128Mi"
            cpu: "200m"
        livenessProbe:
          httpGet:
            path: /health
            port: <PORT>
          initialDelaySeconds: 30
          periodSeconds: 10
        readinessProbe:
          httpGet:
            path: /ready
            port: <PORT>
          initialDelaySeconds: 5
          periodSeconds: 5
```

### Step 4: Apply Configuration
Use `kubectl_apply` with the manifest YAML

### Step 5: Verify Deployment
1. `kubectl_rollout_status("deployment", "{app_name}", "{namespace}")` - Watch rollout
2. `get_pods(namespace="{namespace}")` - Check pod status
3. `get_logs(pod_name, "{namespace}")` - Check application logs

### Step 6: Expose Service (if needed)
```yaml
apiVersion: v1
kind: Service
metadata:
  name: {app_name}
  namespace: {namespace}
spec:
  selector:
    app: {app_name}
  ports:
  - port: 80
    targetPort: <PORT>
  type: ClusterIP # or LoadBalancer, NodePort
```

### Step 7: Post-Deployment Verification
- `kubectl_describe("deployment", "{app_name}", "{namespace}")` - Full details
- `kubectl_get_endpoints("{app_name}", "{namespace}")` - Service endpoints
- Test connectivity to the application

## Rollback Plan
If issues occur:
- `kubectl_rollout("undo", "deployment", "{app_name}", "{namespace}")` - Rollback
- `kubectl_rollout_history("deployment", "{app_name}", "{namespace}")` - View history

Start the deployment process now."""

    # Prompt 3: RBAC / pod-security / secrets audit checklist.
    @server.prompt()
    def security_audit(namespace: Optional[str] = None, scope: str = "full") -> str:
        """Security audit workflow for Kubernetes clusters."""
        ns_text = f"namespace '{namespace}'" if namespace else "all namespaces"
        return f"""# Kubernetes Security Audit

Scope: {scope}
Target: {ns_text}

## Phase 1: RBAC Analysis

### Step 1: Review ClusterRoles
- `kubectl_get("clusterroles")` - List all cluster roles
- Look for overly permissive roles (e.g., `*` on verbs or resources)
- Check for `cluster-admin` bindings

### Step 2: Review RoleBindings
- `kubectl_get("clusterrolebindings")` - Cluster-wide bindings
- `kubectl_get("rolebindings", namespace="{namespace or 'all'}")` - Namespace bindings
- Identify which users/serviceaccounts have elevated privileges

### Step 3: ServiceAccount Analysis
- `kubectl_get("serviceaccounts", namespace="{namespace or 'all'}")` - List SAs
- Check for default SA usage in pods
- Verify SA token automounting is disabled where not needed

## Phase 2: Pod Security

### Step 4: Security Context Review
Use `kubectl_get_security_contexts(namespace="{namespace}")` to check:
- [ ] Pods running as non-root
- [ ] Read-only root filesystem
- [ ] Dropped capabilities
- [ ] No privilege escalation

### Step 5: Image Security
- `kubectl_list_images(namespace="{namespace}")` - List all images
- Check for:
  - [ ] Specific image tags (not `latest`)
  - [ ] Trusted registries
  - [ ] Image pull policies

### Step 6: Network Policies
- `kubectl_get("networkpolicies", namespace="{namespace}")` - List policies
- Verify default deny policies exist
- Check ingress/egress rules

## Phase 3: Secrets Management

### Step 7: Secrets Audit
- `kubectl_get("secrets", namespace="{namespace}")` - List secrets
- Check for:
  - [ ] Unused secrets
  - [ ] Secrets mounted as environment variables (less secure)
  - [ ] External secrets management integration

### Step 8: ConfigMap Review
- `kubectl_get("configmaps", namespace="{namespace}")` - List ConfigMaps
- Ensure no sensitive data in ConfigMaps

## Phase 4: Resource Security

### Step 9: Resource Quotas
- `kubectl_get("resourcequotas", namespace="{namespace}")` - Check quotas
- Verify limits are set appropriately

### Step 10: Pod Disruption Budgets
- `kubectl_get("poddisruptionbudgets", namespace="{namespace}")` - Check PDBs
- Ensure critical workloads have PDBs

## Security Report Template

For each finding, document:
1. **Severity**: Critical/High/Medium/Low
2. **Resource**: Specific resource affected
3. **Issue**: Description of the security concern
4. **Risk**: Potential impact if exploited
5. **Remediation**: Steps to fix
6. **Verification**: How to confirm the fix

## Common Fixes:
- Add `securityContext.runAsNonRoot: true`
- Set `automountServiceAccountToken: false`
- Add NetworkPolicy with default deny
- Use specific image tags
- Enable Pod Security Standards

Begin the security audit now."""

    # Prompt 4: cost/right-sizing analysis workflow.
    @server.prompt()
    def cost_optimization(namespace: Optional[str] = None) -> str:
        """Cost optimization analysis workflow."""
        ns_text = f"namespace '{namespace}'" if namespace else "cluster-wide"
        return f"""# Kubernetes Cost Optimization Analysis

Scope: {ns_text}

## Phase 1: Resource Usage Analysis

### Step 1: Current Resource Consumption
Use `kubectl_get_resource_usage(namespace="{namespace}")` to analyze:
- CPU requests vs actual usage
- Memory requests vs actual usage
- Identify over-provisioned resources

### Step 2: Idle Resource Detection
Use `kubectl_get_idle_resources(namespace="{namespace}")` to find:
- Pods with < 10% CPU utilization
- Pods with < 20% memory utilization
- Unused PersistentVolumes
- Idle LoadBalancer services

### Step 3: Resource Recommendations
Use `kubectl_get_resource_recommendations(namespace="{namespace}")`:
- Right-sizing suggestions based on usage
- HPA recommendations
- VPA recommendations

## Phase 2: Workload Optimization

### Step 4: Deployment Analysis
For each deployment:
- Check replica count vs actual load
- Review resource requests/limits
- Identify candidates for autoscaling

### Step 5: Node Utilization
- `kubectl_top("nodes")` - Node resource usage
- Identify underutilized nodes
- Consider node consolidation

### Step 6: Storage Optimization
- Review PVC sizes vs actual usage
- Identify unused PVCs
- Consider storage class optimization

## Phase 3: Scheduling Optimization

### Step 7: Pod Scheduling
- Review pod affinity/anti-affinity rules
- Check for bin-packing opportunities
- Evaluate spot/preemptible node usage

### Step 8: Priority Classes
- Review PriorityClasses
- Ensure critical workloads have appropriate priority

## Phase 4: Cost Estimation

### Step 9: Current Cost Analysis
Use `kubectl_get_cost_analysis(namespace="{namespace}")`:
- Estimated monthly costs
- Cost breakdown by resource type
- Cost per namespace/workload

### Step 10: Optimization Savings
Estimate savings from:
- Right-sizing resources
- Implementing autoscaling
- Using spot instances
- Consolidating workloads

## Optimization Actions

### Quick Wins (Immediate Impact):
1. Remove idle resources
2. Right-size over-provisioned pods
3. Delete unused PVCs

### Medium-Term:
1. Implement HPA for variable workloads
2. Use VPA for stable workloads
3. Consolidate underutilized nodes

### Long-Term:
1. Implement cluster autoscaler
2. Use spot/preemptible nodes
3. Multi-tenancy optimization

Begin the cost optimization analysis now."""

    # Prompt 5: backup/restore (DR) planning workflow.
    @server.prompt()
    def disaster_recovery(namespace: Optional[str] = None, dr_type: str = "full") -> str:
        """Disaster recovery planning and execution workflow."""
        ns_text = f"namespace '{namespace}'" if namespace else "entire cluster"
        return f"""# Kubernetes Disaster Recovery Plan

Scope: {ns_text}
DR Type: {dr_type}

## Phase 1: Pre-Disaster Preparation

### Step 1: Inventory Current State
Document all resources:
- `get_deployments(namespace="{namespace}")` - All deployments
- `get_services(namespace="{namespace}")` - All services
- `kubectl_get("configmaps", namespace="{namespace}")` - ConfigMaps
- `kubectl_get("secrets", namespace="{namespace}")` - Secrets
- `kubectl_get("pvc", namespace="{namespace}")` - Persistent volumes

### Step 2: Backup Strategy
For each resource type:

**Deployments/StatefulSets:**
- Export YAML manifests using `kubectl_export`
- Store in version control
- Document image versions

**ConfigMaps/Secrets:**
- Export with `kubectl_export`
- Encrypt secrets before storage
- Use external secret management

**Persistent Data:**
- Volume snapshots (if supported)
- Application-level backups
- Document backup frequency

### Step 3: Document Dependencies
Create dependency map:
- External services (databases, APIs)
- DNS configurations
- Load balancer settings
- SSL certificates

## Phase 2: Backup Execution

### Step 4: Export Resources
For namespace '{namespace or "all"}':
```
kubectl_export("deployment", namespace="{namespace}")
kubectl_export("service", namespace="{namespace}")
kubectl_export("configmap", namespace="{namespace}")
kubectl_export("secret", namespace="{namespace}")
kubectl_export("ingress", namespace="{namespace}")
```

### Step 5: Verify Backups
- Validate YAML syntax
- Check completeness
- Test restore in staging

## Phase 3: Recovery Procedures

### Step 6: Cluster Recovery
If cluster is lost:
1. Provision new cluster
2. Configure networking
3. Set up storage classes
4. Apply RBAC configurations

### Step 7: Namespace Recovery
For namespace '{namespace or "default"}':
1. `kubectl_create_namespace("{namespace}")` - Create namespace
2. Apply secrets first (dependencies)
3. Apply ConfigMaps
4. Apply PVCs
5. Apply deployments
6. Apply services
7. Apply ingresses

### Step 8: Data Recovery
- Restore from volume snapshots
- Execute application-level restores
- Verify data integrity

### Step 9: Verification
Post-recovery checks:
- `get_pods(namespace="{namespace}")` - All pods running
- `kubectl_get_endpoints` - Services have endpoints
- Application health checks
- Data verification

## Quick Reference Commands
```
# Full namespace backup
kubectl get all -n {namespace or 'default'} -o yaml > backup.yaml

# Restore from backup
kubectl apply -f backup.yaml

# Check restore status
kubectl get pods -n {namespace or 'default'} -w
```

Begin disaster recovery planning now."""

    # Prompt 6: service/DNS/connectivity debugging runbook.
    @server.prompt()
    def debug_networking(service_name: str, namespace: str = "default") -> str:
        """Network debugging workflow for Kubernetes services."""
        # NOTE: {{service_ip}} below is an escaped brace pair — the template
        # intentionally emits the literal placeholder "{{service_ip}}"
        # rendered as "{service_ip}" for the reader to substitute.
        return f"""# Kubernetes Network Debugging: {service_name}

Target: Service '{service_name}' in namespace '{namespace}'

## Phase 1: Service Discovery

### Step 1: Verify Service Exists
- `kubectl_describe("service", "{service_name}", "{namespace}")` - Service details
- Check service type (ClusterIP, NodePort, LoadBalancer)
- Note selector labels and ports

### Step 2: Check Endpoints
- `kubectl_get_endpoints("{service_name}", "{namespace}")` - Endpoint addresses
- If empty: No pods match the service selector
- Verify endpoint IPs match pod IPs

### Step 3: Verify Backend Pods
- `get_pods(namespace="{namespace}")` - List pods
- Check pods have matching labels
- Verify pods are in Running state

## Phase 2: DNS Resolution

### Step 4: Test DNS Resolution
From within a pod:
```
nslookup {service_name}.{namespace}.svc.cluster.local
```

Expected format: `<service>.<namespace>.svc.cluster.local`

### Step 5: CoreDNS Health
- `get_pods(namespace="kube-system")` - Check CoreDNS pods
- `get_logs(coredns_pod, "kube-system")` - Check DNS logs

## Phase 3: Connectivity Testing

### Step 6: Pod-to-Pod Connectivity
Test from a debug pod:
```
kubectl run debug --rm -it --image=busybox -- wget -qO- http://{service_name}.{namespace}:PORT
```

### Step 7: Service Port Verification
- Verify `port` (service port) and `targetPort` (container port) match
- Check if container is listening on targetPort
- Use `kubectl_port_forward` to test directly

### Step 8: Network Policies
- `kubectl_get("networkpolicies", namespace="{namespace}")` - List policies
- Check if policies block ingress/egress
- Verify policy selectors

## Phase 4: Common Issues

### Issue: No Endpoints
Causes:
- Pod selector mismatch
- Pods not running
- Readiness probe failing

Fix:
1. Check service selector labels
2. Verify pod labels match
3. Fix readiness probe

### Issue: Connection Refused
Causes:
- Wrong targetPort
- App not listening
- App crashed

Fix:
1. Verify container port
2. Check pod logs
3. Test with port-forward

### Issue: Connection Timeout
Causes:
- Network policy blocking
- Wrong service IP/port
- CNI issues

Fix:
1. Review network policies
2. Verify kube-proxy running
3. Check CNI plugin status

## Debugging Commands Reference
```
# Test from debug pod
kubectl run debug --rm -it --image=nicolaka/netshoot -- /bin/bash

# Inside debug pod:
curl -v http://{service_name}.{namespace}:PORT
dig {service_name}.{namespace}.svc.cluster.local
traceroute {{service_ip}}
netstat -tlnp
```

## Network Flow Verification
1. Client -> Service IP (kube-proxy/iptables)
2. Service -> Endpoint (pod IP)
3. Pod IP -> Container port

Check each hop for failures.

Begin network debugging now."""

    # Prompt 7: manual and HPA-based scaling guidance.
    @server.prompt()
    def scale_application(app_name: str, namespace: str = "default", target_replicas: int = 3) -> str:
        """Application scaling guide with best practices."""
        # {target_replicas * 2} below computes the HPA maxReplicas at render time.
        return f"""# Kubernetes Scaling Guide: {app_name}

Target: Deployment '{app_name}' in namespace '{namespace}'
Target Replicas: {target_replicas}

## Pre-Scaling Checklist

### Step 1: Current State Assessment
- `kubectl_describe("deployment", "{app_name}", "{namespace}")` - Current config
- `kubectl_top("pods", namespace="{namespace}")` - Resource usage
- Note current replica count and resource limits

### Step 2: Capacity Planning
Calculate required resources:
- Current pod resources × {target_replicas} = Total needed
- Check node capacity: `kubectl_top("nodes")`
- Verify cluster can accommodate new pods

### Step 3: Check Dependencies
- Database connection pools
- External API rate limits
- Shared resources (ConfigMaps, Secrets)
- Service mesh sidecar limits

## Scaling Methods

### Method 1: Manual Scaling
Immediate scale operation:
```
kubectl_scale("deployment", "{app_name}", {target_replicas}, "{namespace}")
```

Monitor rollout:
```
kubectl_rollout_status("deployment", "{app_name}", "{namespace}")
```

### Method 2: Horizontal Pod Autoscaler (HPA)
For automatic scaling based on metrics:

```yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {app_name}-hpa
  namespace: {namespace}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {app_name}
  minReplicas: 2
  maxReplicas: {target_replicas * 2}
  metrics:
  - type: Resource
    resource:
      name: cpu
      target:
        type: Utilization
        averageUtilization: 70
  - type: Resource
    resource:
      name: memory
      target:
        type: Utilization
        averageUtilization: 80
```

## Post-Scaling Verification

### Step 4: Verify Scaling Success
1. `get_pods(namespace="{namespace}")` - Check pod count
2. Verify all pods are Running and Ready
3. Check pod distribution across nodes

### Step 5: Monitor Application
- Check application metrics
- Verify response times
- Monitor error rates
- Check resource utilization

## Rollback Plan
If issues occur after scaling:
```
kubectl_scale("deployment", "{app_name}", original_count, "{namespace}")
```

Begin scaling operation now."""

    # Prompt 8: control-plane + node upgrade planning checklist.
    @server.prompt()
    def upgrade_cluster(current_version: str = "1.28", target_version: str = "1.29") -> str:
        """Kubernetes cluster upgrade planning guide."""
        return f"""# Kubernetes Cluster Upgrade Plan

Current Version: {current_version}
Target Version: {target_version}

## Pre-Upgrade Phase

### Step 1: Compatibility Check
Review upgrade path:
- Kubernetes supports N-2 version skew
- Upgrade one minor version at a time
- Check: {current_version} -> {target_version} is valid

### Step 2: Deprecation Review
Check for deprecated APIs:
- `kubectl_get_deprecated_resources` - Find deprecated resources
- Review release notes for {target_version}
- Update manifests before upgrade

### Step 3: Addon Compatibility
Verify addon versions support {target_version}:
- CNI plugin (Calico, Cilium, etc.)
- Ingress controller
- Metrics server
- Storage drivers

### Step 4: Backup Everything
Create full backups:
- etcd snapshot
- All resource manifests
- PersistentVolume data
- External configurations

## Control Plane Upgrade

### Step 5: Upgrade Control Plane Components
Order of operations:
1. kube-apiserver
2. kube-controller-manager
3. kube-scheduler
4. cloud-controller-manager (if applicable)

For managed clusters (EKS, GKE, AKS):
- Use provider's upgrade mechanism
- Monitor upgrade progress

### Step 6: Verify Control Plane
After control plane upgrade:
- `kubectl_cluster_info` - Verify API server version
- Check component health
- Test API connectivity

## Node Upgrade

### Step 7: Upgrade Strategy Selection
Choose one:

**Rolling Upgrade (Recommended):**
- Upgrade nodes one at a time
- Minimal disruption
- Slower but safer

**Blue-Green:**
- Create new node pool
- Migrate workloads
- Delete old nodes

### Step 8: Node Upgrade Process
For each node:

1. **Cordon the node:**
   `kubectl_cordon(node_name)`

2. **Drain workloads:**
   `kubectl_drain(node_name, ignore_daemonsets=True)`

3. **Upgrade kubelet & kubectl:**
   - Update packages
   - Restart kubelet

4. **Uncordon the node:**
   `kubectl_uncordon(node_name)`

5. **Verify node health:**
   `kubectl_describe("node", node_name)`

### Step 9: Verify Node Versions
- `kubectl_get("nodes")` - Check all node versions
- Ensure all nodes show {target_version}

## Post-Upgrade Verification

### Step 10: Cluster Health Check
Run comprehensive checks:
- `kubectl_cluster_info` - Cluster status
- `get_pods(namespace="kube-system")` - System pods
- All nodes Ready
- All system pods Running

### Step 11: Application Verification
For each namespace:
- Check pod health
- Verify service endpoints
- Test application functionality
- Monitor for errors

### Step 12: Update Tooling
After successful upgrade:
- Update kubectl client
- Update CI/CD pipelines
- Update documentation
- Update monitoring dashboards

## Upgrade Checklist

Pre-Upgrade:
- [ ] Backup etcd
- [ ] Backup all manifests
- [ ] Check API deprecations
- [ ] Verify addon compatibility
- [ ] Test upgrade in staging
- [ ] Schedule maintenance window
- [ ] Notify stakeholders

During Upgrade:
- [ ] Upgrade control plane
- [ ] Verify control plane health
- [ ] Upgrade nodes (rolling)
- [ ] Monitor for issues

Post-Upgrade:
- [ ] Verify all nodes upgraded
- [ ] Check application health
- [ ] Update client tools
- [ ] Document any issues
- [ ] Update runbooks

Begin upgrade planning now."""
|