@techwavedev/agi-agent-kit 1.1.7 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of @techwavedev/agi-agent-kit might be problematic. Click here for more details.
- package/CHANGELOG.md +82 -1
- package/README.md +190 -12
- package/bin/init.js +30 -2
- package/package.json +6 -3
- package/templates/base/AGENTS.md +54 -23
- package/templates/base/README.md +325 -0
- package/templates/base/directives/memory_integration.md +95 -0
- package/templates/base/execution/memory_manager.py +309 -0
- package/templates/base/execution/session_boot.py +218 -0
- package/templates/base/execution/session_init.py +320 -0
- package/templates/base/skill-creator/SKILL_skillcreator.md +23 -36
- package/templates/base/skill-creator/scripts/init_skill.py +18 -135
- package/templates/skills/ec/README.md +31 -0
- package/templates/skills/ec/aws/SKILL.md +1020 -0
- package/templates/skills/ec/aws/defaults.yaml +13 -0
- package/templates/skills/ec/aws/references/common_patterns.md +80 -0
- package/templates/skills/ec/aws/references/mcp_servers.md +98 -0
- package/templates/skills/ec/aws-terraform/SKILL.md +349 -0
- package/templates/skills/ec/aws-terraform/references/best_practices.md +394 -0
- package/templates/skills/ec/aws-terraform/references/checkov_reference.md +337 -0
- package/templates/skills/ec/aws-terraform/scripts/configure_mcp.py +150 -0
- package/templates/skills/ec/confluent-kafka/SKILL.md +655 -0
- package/templates/skills/ec/confluent-kafka/references/ansible_playbooks.md +792 -0
- package/templates/skills/ec/confluent-kafka/references/ec_deployment.md +579 -0
- package/templates/skills/ec/confluent-kafka/references/kraft_migration.md +490 -0
- package/templates/skills/ec/confluent-kafka/references/troubleshooting.md +778 -0
- package/templates/skills/ec/confluent-kafka/references/upgrade_7x_to_8x.md +488 -0
- package/templates/skills/ec/confluent-kafka/scripts/kafka_health_check.py +435 -0
- package/templates/skills/ec/confluent-kafka/scripts/upgrade_preflight.py +568 -0
- package/templates/skills/ec/confluent-kafka/scripts/validate_config.py +455 -0
- package/templates/skills/ec/consul/SKILL.md +427 -0
- package/templates/skills/ec/consul/references/acl_setup.md +168 -0
- package/templates/skills/ec/consul/references/ha_config.md +196 -0
- package/templates/skills/ec/consul/references/troubleshooting.md +267 -0
- package/templates/skills/ec/consul/references/upgrades.md +213 -0
- package/templates/skills/ec/consul/scripts/consul_health_report.py +530 -0
- package/templates/skills/ec/consul/scripts/consul_status.py +264 -0
- package/templates/skills/ec/consul/scripts/generate_values.py +170 -0
- package/templates/skills/ec/documentation/SKILL.md +351 -0
- package/templates/skills/ec/documentation/references/best_practices.md +201 -0
- package/templates/skills/ec/documentation/scripts/analyze_code.py +307 -0
- package/templates/skills/ec/documentation/scripts/detect_changes.py +460 -0
- package/templates/skills/ec/documentation/scripts/generate_changelog.py +312 -0
- package/templates/skills/ec/documentation/scripts/sync_docs.py +272 -0
- package/templates/skills/ec/documentation/scripts/update_skill_docs.py +366 -0
- package/templates/skills/ec/gitlab/SKILL.md +529 -0
- package/templates/skills/ec/gitlab/references/agent_installation.md +416 -0
- package/templates/skills/ec/gitlab/references/api_reference.md +508 -0
- package/templates/skills/ec/gitlab/references/gitops_flux.md +465 -0
- package/templates/skills/ec/gitlab/references/troubleshooting.md +518 -0
- package/templates/skills/ec/gitlab/scripts/generate_agent_values.py +329 -0
- package/templates/skills/ec/gitlab/scripts/gitlab_agent_status.py +414 -0
- package/templates/skills/ec/jira/SKILL.md +484 -0
- package/templates/skills/ec/jira/references/jql_reference.md +148 -0
- package/templates/skills/ec/jira/scripts/add_comment.py +91 -0
- package/templates/skills/ec/jira/scripts/bulk_log_work.py +124 -0
- package/templates/skills/ec/jira/scripts/create_ticket.py +162 -0
- package/templates/skills/ec/jira/scripts/get_ticket.py +191 -0
- package/templates/skills/ec/jira/scripts/jira_client.py +383 -0
- package/templates/skills/ec/jira/scripts/log_work.py +154 -0
- package/templates/skills/ec/jira/scripts/search_tickets.py +104 -0
- package/templates/skills/ec/jira/scripts/update_comment.py +67 -0
- package/templates/skills/ec/jira/scripts/update_ticket.py +161 -0
- package/templates/skills/ec/karpenter/SKILL.md +301 -0
- package/templates/skills/ec/karpenter/references/ec2nodeclasses.md +421 -0
- package/templates/skills/ec/karpenter/references/migration.md +396 -0
- package/templates/skills/ec/karpenter/references/nodepools.md +400 -0
- package/templates/skills/ec/karpenter/references/troubleshooting.md +359 -0
- package/templates/skills/ec/karpenter/scripts/generate_ec2nodeclass.py +187 -0
- package/templates/skills/ec/karpenter/scripts/generate_nodepool.py +245 -0
- package/templates/skills/ec/karpenter/scripts/karpenter_status.py +359 -0
- package/templates/skills/ec/opensearch/SKILL.md +720 -0
- package/templates/skills/ec/opensearch/references/ml_neural_search.md +576 -0
- package/templates/skills/ec/opensearch/references/operator.md +532 -0
- package/templates/skills/ec/opensearch/references/query_dsl.md +532 -0
- package/templates/skills/ec/opensearch/scripts/configure_mcp.py +148 -0
- package/templates/skills/ec/victoriametrics/SKILL.md +598 -0
- package/templates/skills/ec/victoriametrics/references/kubernetes.md +531 -0
- package/templates/skills/ec/victoriametrics/references/prometheus_migration.md +333 -0
- package/templates/skills/ec/victoriametrics/references/troubleshooting.md +442 -0
- package/templates/skills/knowledge/SKILLS_CATALOG.md +274 -4
- package/templates/skills/knowledge/intelligent-routing/SKILL.md +237 -164
- package/templates/skills/knowledge/parallel-agents/SKILL.md +345 -73
- package/templates/skills/knowledge/plugin-discovery/SKILL.md +582 -0
- package/templates/skills/knowledge/plugin-discovery/scripts/platform_setup.py +1083 -0
- package/templates/skills/knowledge/design-md/README.md +0 -34
- package/templates/skills/knowledge/design-md/SKILL.md +0 -193
- package/templates/skills/knowledge/design-md/examples/DESIGN.md +0 -154
- package/templates/skills/knowledge/notebooklm-mcp/SKILL.md +0 -71
- package/templates/skills/knowledge/notebooklm-mcp/assets/example_asset.txt +0 -24
- package/templates/skills/knowledge/notebooklm-mcp/references/api_reference.md +0 -34
- package/templates/skills/knowledge/notebooklm-mcp/scripts/example.py +0 -19
- package/templates/skills/knowledge/react-components/README.md +0 -36
- package/templates/skills/knowledge/react-components/SKILL.md +0 -53
- package/templates/skills/knowledge/react-components/examples/gold-standard-card.tsx +0 -80
- package/templates/skills/knowledge/react-components/package-lock.json +0 -231
- package/templates/skills/knowledge/react-components/package.json +0 -16
- package/templates/skills/knowledge/react-components/resources/architecture-checklist.md +0 -15
- package/templates/skills/knowledge/react-components/resources/component-template.tsx +0 -37
- package/templates/skills/knowledge/react-components/resources/stitch-api-reference.md +0 -14
- package/templates/skills/knowledge/react-components/resources/style-guide.json +0 -27
- package/templates/skills/knowledge/react-components/scripts/fetch-stitch.sh +0 -30
- package/templates/skills/knowledge/react-components/scripts/validate.js +0 -68
- package/templates/skills/knowledge/self-update/SKILL.md +0 -60
- package/templates/skills/knowledge/self-update/scripts/update_kit.py +0 -103
- package/templates/skills/knowledge/stitch-loop/README.md +0 -54
- package/templates/skills/knowledge/stitch-loop/SKILL.md +0 -235
- package/templates/skills/knowledge/stitch-loop/examples/SITE.md +0 -73
- package/templates/skills/knowledge/stitch-loop/examples/next-prompt.md +0 -25
- package/templates/skills/knowledge/stitch-loop/resources/baton-schema.md +0 -61
- package/templates/skills/knowledge/stitch-loop/resources/site-template.md +0 -104
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
# Karpenter Troubleshooting Reference
|
|
2
|
+
|
|
3
|
+
Detailed troubleshooting guide for common Karpenter issues.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Controller Issues
|
|
8
|
+
|
|
9
|
+
### Karpenter Not Running
|
|
10
|
+
|
|
11
|
+
**Symptoms:** No pods in karpenter namespace
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
kubectl get pods -n karpenter
|
|
15
|
+
# No resources found
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
**Diagnosis:**
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
# Check Helm release
|
|
22
|
+
helm list -n karpenter
|
|
23
|
+
|
|
24
|
+
# Check events
|
|
25
|
+
kubectl get events -n karpenter --sort-by='.lastTimestamp'
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
**Solutions:**
|
|
29
|
+
|
|
30
|
+
1. Verify Helm installation completed
|
|
31
|
+
2. Check IRSA configuration (ServiceAccount annotation)
|
|
32
|
+
3. Verify that a node exists for the controller to run on
|
|
33
|
+
|
|
34
|
+
### Controller CrashLoopBackOff
|
|
35
|
+
|
|
36
|
+
**Symptoms:**
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
kubectl get pods -n karpenter
|
|
40
|
+
# NAME READY STATUS RESTARTS
|
|
41
|
+
# karpenter-xyz 0/1 CrashLoopBackOff 5
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
**Diagnosis:**
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter -c controller --previous
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
**Common Causes:**
|
|
51
|
+
|
|
52
|
+
| Error | Cause | Fix |
|
|
53
|
+
| ------------------------------------------------ | ----------------------- | --------------------------------- |
|
|
54
|
+
| `WebIdentityErr: failed to retrieve credentials` | IRSA not configured | Verify OIDC provider and IAM role |
|
|
55
|
+
| `sts.amazonaws.com: i/o timeout` | DNS not resolving | Set `dnsPolicy: Default` in Helm |
|
|
56
|
+
| `unauthorized` | IAM permissions missing | Check IAM role policies |
|
|
57
|
+
|
|
58
|
+
### Enable Debug Logging
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
# Via Helm
|
|
62
|
+
helm upgrade karpenter oci://public.ecr.aws/karpenter/karpenter \
|
|
63
|
+
--namespace karpenter \
|
|
64
|
+
--reuse-values \
|
|
65
|
+
--set logLevel=debug
|
|
66
|
+
|
|
67
|
+
# Via env var
|
|
68
|
+
kubectl set env deployment/karpenter -n karpenter LOG_LEVEL=debug
|
|
69
|
+
|
|
70
|
+
# View debug logs
|
|
71
|
+
kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter -c controller -f
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## Provisioning Issues
|
|
77
|
+
|
|
78
|
+
### Pods Stuck Pending
|
|
79
|
+
|
|
80
|
+
**Symptoms:** Pods remain in `Pending` state and Karpenter does not launch a node for them, even though they cannot be scheduled on existing nodes
|
|
81
|
+
|
|
82
|
+
**Diagnosis:**
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
# Check pod events
|
|
86
|
+
kubectl describe pod <pod-name> -n <namespace>
|
|
87
|
+
|
|
88
|
+
# Look for Karpenter logs
|
|
89
|
+
kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter -c controller | grep <pod-name>
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
**Common Causes:**
|
|
93
|
+
|
|
94
|
+
1. **No matching NodePool:**
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
# Verify NodePools exist
|
|
98
|
+
kubectl get nodepools
|
|
99
|
+
|
|
100
|
+
# Check requirements match pod spec
|
|
101
|
+
kubectl get nodepool <name> -o yaml
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
2. **NodePool limits exceeded:**
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
kubectl describe nodepool <name>
|
|
108
|
+
# Look for: Status.Resources showing at or near limits
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
3. **No valid instance types:**
|
|
112
|
+
- Requirements too restrictive
|
|
113
|
+
- Instance types unavailable in region/AZ
|
|
114
|
+
- Check: `kubectl logs -n karpenter ... | grep "no instance type"`
|
|
115
|
+
|
|
116
|
+
### Node Launched But Not Ready
|
|
117
|
+
|
|
118
|
+
**Symptoms:** Node appears in cluster but shows `NotReady`
|
|
119
|
+
|
|
120
|
+
**Diagnosis:**
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
kubectl describe node <node-name>
|
|
124
|
+
# Check Conditions section
|
|
125
|
+
|
|
126
|
+
kubectl get events --field-selector involvedObject.name=<node-name>
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
**Common Causes:**
|
|
130
|
+
|
|
131
|
+
| Issue | Diagnosis | Solution |
|
|
132
|
+
| --------------------- | -------------------- | -------------------------------------- |
|
|
133
|
+
| CNI not running | Check aws-node pods | Verify VPC CNI DaemonSet |
|
|
134
|
+
| kubelet failing | Check kubelet logs | SSH to node, check `/var/log/messages` |
|
|
135
|
+
| ENI attachment failed | EC2 console | Check subnet capacity, security groups |
|
|
136
|
+
| IP exhaustion | Check pod-eni errors | Expand CIDR or use prefix delegation |
|
|
137
|
+
|
|
138
|
+
### Insufficient Capacity
|
|
139
|
+
|
|
140
|
+
**Log Message:**
|
|
141
|
+
|
|
142
|
+
```
|
|
143
|
+
could not schedule pod, incompatible with nodepool "<name>", no instance type satisfied resources
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
**Solutions:**
|
|
147
|
+
|
|
148
|
+
1. Expand allowed instance types in NodePool
|
|
149
|
+
2. Add more instance categories/generations
|
|
150
|
+
3. Check pod resource requests aren't too large
|
|
151
|
+
4. Verify instances available in all target AZs
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
## Disruption Issues
|
|
156
|
+
|
|
157
|
+
### Nodes Not Consolidating
|
|
158
|
+
|
|
159
|
+
**Symptoms:** Underutilized nodes persist
|
|
160
|
+
|
|
161
|
+
**Diagnosis:**
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
# Check consolidation policy
|
|
165
|
+
kubectl get nodepool <name> -o jsonpath='{.spec.disruption}'
|
|
166
|
+
|
|
167
|
+
# Check node conditions
|
|
168
|
+
kubectl get nodeclaim <name> -o jsonpath='{.status.conditions}'
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
**Common Blockers:**
|
|
172
|
+
|
|
173
|
+
1. **PodDisruptionBudgets:**
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
kubectl get pdb --all-namespaces
|
|
177
|
+
# PDBs blocking eviction prevent consolidation
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
2. **Pod annotations:**
|
|
181
|
+
|
|
182
|
+
```yaml
|
|
183
|
+
# Pods with this annotation block node deletion
|
|
184
|
+
karpenter.sh/do-not-disrupt: "true"
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
3. **NodePool annotation:**
|
|
188
|
+
|
|
189
|
+
```yaml
|
|
190
|
+
# NodePool-level disruption blocking
|
|
191
|
+
karpenter.sh/do-not-disrupt: "true"
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
4. **System pods:**
|
|
195
|
+
- kube-system pods without PDBs
|
|
196
|
+
- Check DaemonSets tolerating Karpenter nodes
|
|
197
|
+
|
|
198
|
+
### Drift Not Detected
|
|
199
|
+
|
|
200
|
+
**Symptoms:** AMI updated but nodes not replaced
|
|
201
|
+
|
|
202
|
+
**Diagnosis:**
|
|
203
|
+
|
|
204
|
+
```bash
|
|
205
|
+
# Check drift conditions
|
|
206
|
+
kubectl get nodeclaim -o custom-columns=NAME:.metadata.name,DRIFTED:.status.conditions[?(@.type=="Drifted")].status
|
|
207
|
+
|
|
208
|
+
# Verify EC2NodeClass AMI
|
|
209
|
+
kubectl get ec2nodeclass <name> -o jsonpath='{.status.amis}'
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
**Force Drift:**
|
|
213
|
+
|
|
214
|
+
```bash
|
|
215
|
+
kubectl annotate ec2nodeclass <name> karpenter.k8s.aws/forced-drift=$(date +%s) --overwrite
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
### Nodes Expiring Unexpectedly
|
|
219
|
+
|
|
220
|
+
**Symptoms:** Nodes terminated before expected lifetime
|
|
221
|
+
|
|
222
|
+
**Diagnosis:**
|
|
223
|
+
|
|
224
|
+
```bash
|
|
225
|
+
kubectl get nodeclaim <name> -o jsonpath='{.status.conditions[?(@.type=="Expired")]}'
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
**Configuration:**
|
|
229
|
+
|
|
230
|
+
```yaml
|
|
231
|
+
spec:
|
|
232
|
+
disruption:
|
|
233
|
+
expireAfter: 720h # 30 days
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
---
|
|
237
|
+
|
|
238
|
+
## Installation Issues
|
|
239
|
+
|
|
240
|
+
### Missing Service Linked Role
|
|
241
|
+
|
|
242
|
+
**Error:**
|
|
243
|
+
|
|
244
|
+
```
|
|
245
|
+
ServiceLinkedRoleCreationNotPermitted: The provided credentials do not have permission to create the service-linked role for EC2 Spot Instances
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
**Solution:**
|
|
249
|
+
|
|
250
|
+
```bash
|
|
251
|
+
aws iam create-service-linked-role --aws-service-name spot.amazonaws.com
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
### STS Timeout
|
|
255
|
+
|
|
256
|
+
**Error:**
|
|
257
|
+
|
|
258
|
+
```
|
|
259
|
+
Post "https://sts.eu-west-1.amazonaws.com/": dial tcp: lookup sts.eu-west-1.amazonaws.com: i/o timeout
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
**Solutions:**
|
|
263
|
+
|
|
264
|
+
1. Set `dnsPolicy: Default` in Karpenter deployment
|
|
265
|
+
2. Ensure CoreDNS is running before Karpenter
|
|
266
|
+
3. Use a Fargate profile or a managed node group (MNG) for system pods
|
|
267
|
+
|
|
268
|
+
### Role Name Too Long
|
|
269
|
+
|
|
270
|
+
**Error:** IAM role name exceeds 64 characters
|
|
271
|
+
|
|
272
|
+
**Solution:** Use shorter role name:
|
|
273
|
+
|
|
274
|
+
```bash
|
|
275
|
+
# Instead of
|
|
276
|
+
KarpenterNodeRole-my-very-long-cluster-name-with-many-characters
|
|
277
|
+
|
|
278
|
+
# Use
|
|
279
|
+
KarpenterNodeRole-prod-cluster
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
---
|
|
283
|
+
|
|
284
|
+
## Spot Instance Issues
|
|
285
|
+
|
|
286
|
+
### Spot Interruption Handling
|
|
287
|
+
|
|
288
|
+
Karpenter automatically:
|
|
289
|
+
|
|
290
|
+
1. Receives interruption notice (2 min warning)
|
|
291
|
+
2. Cordons the node
|
|
292
|
+
3. Drains pods gracefully
|
|
293
|
+
4. Terminates the instance
|
|
294
|
+
|
|
295
|
+
**Verify SQS queue configured:**
|
|
296
|
+
|
|
297
|
+
```bash
|
|
298
|
+
kubectl get deployment -n karpenter karpenter -o yaml | grep -A5 interruptionQueue
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
### Spot Capacity Unavailable
|
|
302
|
+
|
|
303
|
+
**Log Message:**
|
|
304
|
+
|
|
305
|
+
```
|
|
306
|
+
no offering matched the requirements, none of the instance types appear to be available
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
**Solutions:**
|
|
310
|
+
|
|
311
|
+
1. Add more instance types to requirements
|
|
312
|
+
2. Enable multiple availability zones
|
|
313
|
+
3. Mix Spot with On-Demand via capacity-type weights
|
|
314
|
+
|
|
315
|
+
---
|
|
316
|
+
|
|
317
|
+
## Useful Commands
|
|
318
|
+
|
|
319
|
+
### View All Karpenter Resources
|
|
320
|
+
|
|
321
|
+
```bash
|
|
322
|
+
# All Karpenter CRDs
|
|
323
|
+
kubectl api-resources | grep karpenter
|
|
324
|
+
|
|
325
|
+
# NodePools with status
|
|
326
|
+
kubectl get nodepools -o wide
|
|
327
|
+
|
|
328
|
+
# EC2NodeClasses with status
|
|
329
|
+
kubectl get ec2nodeclasses -o wide
|
|
330
|
+
|
|
331
|
+
# NodeClaims (current nodes)
|
|
332
|
+
kubectl get nodeclaims -o wide
|
|
333
|
+
|
|
334
|
+
# Karpenter-managed nodes
|
|
335
|
+
kubectl get nodes -l karpenter.sh/nodepool -o wide
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
### Export Current Configuration
|
|
339
|
+
|
|
340
|
+
```bash
|
|
341
|
+
# Export all NodePools
|
|
342
|
+
kubectl get nodepools -o yaml > nodepools-backup.yaml
|
|
343
|
+
|
|
344
|
+
# Export all EC2NodeClasses
|
|
345
|
+
kubectl get ec2nodeclasses -o yaml > ec2nodeclasses-backup.yaml
|
|
346
|
+
```
|
|
347
|
+
|
|
348
|
+
### Monitor Karpenter Events
|
|
349
|
+
|
|
350
|
+
```bash
|
|
351
|
+
# Watch Karpenter controller logs
|
|
352
|
+
kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter -c controller -f
|
|
353
|
+
|
|
354
|
+
# Watch node creation/deletion
|
|
355
|
+
kubectl get nodes -l karpenter.sh/nodepool -w
|
|
356
|
+
|
|
357
|
+
# Watch NodeClaims
|
|
358
|
+
kubectl get nodeclaims -w
|
|
359
|
+
```
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Script: generate_ec2nodeclass.py
|
|
4
|
+
Purpose: Generate Karpenter EC2NodeClass YAML configuration
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
python generate_ec2nodeclass.py --name <name> --cluster <cluster-name> [options]
|
|
8
|
+
|
|
9
|
+
Arguments:
|
|
10
|
+
--name EC2NodeClass name (required)
|
|
11
|
+
--cluster EKS cluster name (required, used for discovery tags)
|
|
12
|
+
--role IAM role name (default: KarpenterNodeRole-<cluster>)
|
|
13
|
+
--ami-family AMI family: AL2, AL2023, Bottlerocket, Ubuntu, Windows2019, Windows2022
|
|
14
|
+
--ami-selector AMI selector type: alias, tag, id (default: alias)
|
|
15
|
+
--volume-size Root volume size in Gi (default: 100)
|
|
16
|
+
--volume-type Volume type: gp2, gp3, io1, io2 (default: gp3)
|
|
17
|
+
--encrypted Encrypt volumes (default: true)
|
|
18
|
+
--imdsv2 Require IMDSv2 (default: true)
|
|
19
|
+
--user-data Path to user data script file
|
|
20
|
+
--tags Comma-separated tags: key=value
|
|
21
|
+
--output Output file (default: stdout)
--format Output format: yaml, json (default: yaml)
|
|
22
|
+
|
|
23
|
+
Exit Codes:
|
|
24
|
+
0 - Success
|
|
25
|
+
1 - Invalid arguments
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
import argparse
|
|
29
|
+
import json
|
|
30
|
+
import sys
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
from typing import Any
|
|
33
|
+
|
|
34
|
+
import yaml
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def generate_ec2nodeclass(args: argparse.Namespace) -> dict:
    """Build a Karpenter EC2NodeClass manifest from parsed CLI arguments.

    Args:
        args: Namespace carrying ``name``, ``cluster``, ``role``,
            ``ami_family``, ``ami_selector``, ``volume_size``,
            ``volume_type``, ``encrypted``, ``imdsv2``, ``user_data`` and
            ``tags``. An optional ``ami_id`` attribute supplies the AMI used
            by the ``id`` selector.

    Returns:
        The manifest as a plain dict, ready for YAML or JSON serialization.

    Raises:
        ValueError: If the AMI selector cannot be turned into a selector
            term (no alias for the family, ``id`` without an AMI ID, unknown
            selector), if the volume size is not positive, or if a tag entry
            is not of the form ``key=value``.
        FileNotFoundError: If ``user_data`` is set but is not an existing
            file.
    """
    # Fall back to the conventional role name when none is given.
    role = args.role or f"KarpenterNodeRole-{args.cluster}"

    # Build the AMI selector. Every branch either yields exactly one term or
    # raises, so the manifest never ends up with an empty selector list.
    if args.ami_selector == "alias":
        ami_family_aliases = {
            "AL2": "al2@latest",
            "AL2023": "al2023@latest",
            "Bottlerocket": "bottlerocket@latest",
            "Windows2019": "windows2019@latest",
            "Windows2022": "windows2022@latest",
        }
        family = args.ami_family or "AL2023"
        alias = ami_family_aliases.get(family)
        if alias is None:
            # Silently substituting another OS would contradict the request.
            raise ValueError(
                f"AMI family {family!r} has no alias; "
                "use the 'tag' selector instead"
            )
        ami_selector_terms = [{"alias": alias}]
    elif args.ami_selector == "tag":
        ami_selector_terms = [
            {"tags": {"karpenter.sh/discovery": args.cluster}}
        ]
    elif args.ami_selector == "id":
        ami_id = getattr(args, "ami_id", None)
        if not ami_id:
            raise ValueError(
                "AMI selector 'id' requires an AMI ID, but none was provided; "
                "use the 'alias' or 'tag' selector instead"
            )
        ami_selector_terms = [{"id": ami_id}]
    else:
        raise ValueError(f"Unsupported AMI selector: {args.ami_selector!r}")

    # Subnets and security groups are discovered through the cluster tag.
    # Each selector gets its own dict: sharing one object would make the
    # YAML dumper emit anchors/aliases instead of plain mappings.
    subnet_selector_terms = [
        {"tags": {"karpenter.sh/discovery": args.cluster}}
    ]
    security_group_selector_terms = [
        {"tags": {"karpenter.sh/discovery": args.cluster}}
    ]

    # Build block device mappings.
    if args.volume_size <= 0:
        raise ValueError(
            f"Volume size must be a positive number of Gi, got {args.volume_size}"
        )

    device_name = "/dev/xvda"
    if args.ami_family in ("Windows2019", "Windows2022"):
        device_name = "/dev/sda1"

    ebs: dict[str, Any] = {
        "volumeSize": f"{args.volume_size}Gi",
        "volumeType": args.volume_type,
        "encrypted": args.encrypted,
        "deleteOnTermination": True,
    }
    # Add IOPS and throughput for gp3.
    if args.volume_type == "gp3":
        ebs["iops"] = 3000
        ebs["throughput"] = 125
    block_device_mappings = [{"deviceName": device_name, "ebs": ebs}]

    # Build metadata options; httpTokens "required" enforces IMDSv2.
    metadata_options = {
        "httpEndpoint": "enabled",
        "httpProtocolIPv6": "disabled",
        "httpPutResponseHopLimit": 1,
        "httpTokens": "required" if args.imdsv2 else "optional",
    }

    # Build tags: fixed defaults first so user-supplied tags can override.
    tags = {
        "Environment": "production",
        "ManagedBy": "karpenter",
        f"kubernetes.io/cluster/{args.cluster}": "owned",
    }
    if args.tags:
        for entry in args.tags.split(","):
            entry = entry.strip()
            if not entry:
                continue  # tolerate trailing or doubled commas
            key, separator, value = entry.partition("=")
            key = key.strip()
            if not separator or not key:
                raise ValueError(f"Invalid tag {entry!r}: expected key=value")
            tags[key] = value.strip()

    # Assemble EC2NodeClass.
    ec2nodeclass: dict[str, Any] = {
        "apiVersion": "karpenter.k8s.aws/v1",
        "kind": "EC2NodeClass",
        "metadata": {
            "name": args.name,
        },
        "spec": {
            "role": role,
            "amiSelectorTerms": ami_selector_terms,
            "subnetSelectorTerms": subnet_selector_terms,
            "securityGroupSelectorTerms": security_group_selector_terms,
            "blockDeviceMappings": block_device_mappings,
            "metadataOptions": metadata_options,
            "tags": tags,
        },
    }

    # An alias already implies the family, so only spell it out otherwise.
    if args.ami_selector != "alias" and args.ami_family:
        ec2nodeclass["spec"]["amiFamily"] = args.ami_family

    # Embed user data; a wrong path is an error rather than a silent no-op.
    if args.user_data:
        user_data_path = Path(args.user_data)
        if not user_data_path.is_file():
            raise FileNotFoundError(
                f"User data file not found: {user_data_path}"
            )
        ec2nodeclass["spec"]["userData"] = user_data_path.read_text()

    return ec2nodeclass
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _parse_bool_flag(value: str) -> bool:
    """Convert a command-line string such as ``true`` or ``0`` to a bool.

    ``type=bool`` must not be used for this: ``bool("false")`` is ``True``
    because every non-empty string is truthy.
    """
    normalized = value.strip().lower()
    if normalized in {"true", "t", "yes", "y", "1", "on"}:
        return True
    if normalized in {"false", "f", "no", "n", "0", "off", ""}:
        return False
    raise argparse.ArgumentTypeError(
        f"expected a boolean value (true/false), got {value!r}"
    )


def main():
    """Parse CLI arguments and emit the EC2NodeClass manifest.

    The manifest is written to ``--output`` when given, otherwise printed to
    stdout, as YAML or JSON depending on ``--format``. The process exits with
    status 0 on success and 1 when generation or writing fails.
    """
    parser = argparse.ArgumentParser(description="Generate Karpenter EC2NodeClass YAML")
    parser.add_argument("--name", required=True, help="EC2NodeClass name")
    parser.add_argument("--cluster", required=True, help="EKS cluster name")
    parser.add_argument("--role", help="IAM role name")
    parser.add_argument("--ami-family", default="AL2023",
                        choices=["AL2", "AL2023", "Bottlerocket", "Ubuntu", "Windows2019", "Windows2022"])
    parser.add_argument("--ami-selector", default="alias", choices=["alias", "tag", "id"])
    parser.add_argument("--volume-size", type=int, default=100, help="Volume size in Gi")
    parser.add_argument("--volume-type", default="gp3", choices=["gp2", "gp3", "io1", "io2"])
    # nargs="?" keeps "--encrypted true|false" working and also accepts the
    # bare flag, which then means True.
    parser.add_argument("--encrypted", type=_parse_bool_flag, nargs="?", const=True,
                        default=True, help="Encrypt volumes (true/false, default: true)")
    parser.add_argument("--imdsv2", type=_parse_bool_flag, nargs="?", const=True,
                        default=True, help="Require IMDSv2 (true/false, default: true)")
    parser.add_argument("--user-data", help="Path to user data script")
    parser.add_argument("--tags", help="Comma-separated tags: key=value")
    parser.add_argument("--output", help="Output file (default: stdout)")
    parser.add_argument("--format", choices=["yaml", "json"], default="yaml", help="Output format")

    args = parser.parse_args()

    try:
        ec2nodeclass = generate_ec2nodeclass(args)

        if args.format == "json":
            output = json.dumps(ec2nodeclass, indent=2)
        else:
            output = yaml.dump(ec2nodeclass, default_flow_style=False, sort_keys=False)

        if args.output:
            # Explicit encoding so the file does not depend on the locale.
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(output)
            print(f"EC2NodeClass configuration written to: {args.output}")
        else:
            print(output)
    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    sys.exit(0)


if __name__ == "__main__":
    main()
|