agentic-team-templates 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/README.md +280 -0
  2. package/bin/cli.js +5 -0
  3. package/package.json +47 -0
  4. package/src/index.js +521 -0
  5. package/templates/_shared/code-quality.md +162 -0
  6. package/templates/_shared/communication.md +114 -0
  7. package/templates/_shared/core-principles.md +62 -0
  8. package/templates/_shared/git-workflow.md +165 -0
  9. package/templates/_shared/security-fundamentals.md +173 -0
  10. package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
  11. package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
  12. package/templates/blockchain/.cursorrules/overview.md +130 -0
  13. package/templates/blockchain/.cursorrules/security.md +318 -0
  14. package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
  15. package/templates/blockchain/.cursorrules/testing.md +415 -0
  16. package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
  17. package/templates/blockchain/CLAUDE.md +389 -0
  18. package/templates/cli-tools/.cursorrules/architecture.md +412 -0
  19. package/templates/cli-tools/.cursorrules/arguments.md +406 -0
  20. package/templates/cli-tools/.cursorrules/distribution.md +546 -0
  21. package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
  22. package/templates/cli-tools/.cursorrules/overview.md +136 -0
  23. package/templates/cli-tools/.cursorrules/testing.md +537 -0
  24. package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
  25. package/templates/cli-tools/CLAUDE.md +356 -0
  26. package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
  27. package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
  28. package/templates/data-engineering/.cursorrules/overview.md +85 -0
  29. package/templates/data-engineering/.cursorrules/performance.md +339 -0
  30. package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
  31. package/templates/data-engineering/.cursorrules/security.md +460 -0
  32. package/templates/data-engineering/.cursorrules/testing.md +452 -0
  33. package/templates/data-engineering/CLAUDE.md +974 -0
  34. package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
  35. package/templates/devops-sre/.cursorrules/change-management.md +584 -0
  36. package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
  37. package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
  38. package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
  39. package/templates/devops-sre/.cursorrules/observability.md +714 -0
  40. package/templates/devops-sre/.cursorrules/overview.md +230 -0
  41. package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
  42. package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
  43. package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
  44. package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
  45. package/templates/devops-sre/CLAUDE.md +1007 -0
  46. package/templates/documentation/.cursorrules/adr.md +277 -0
  47. package/templates/documentation/.cursorrules/api-documentation.md +411 -0
  48. package/templates/documentation/.cursorrules/code-comments.md +253 -0
  49. package/templates/documentation/.cursorrules/maintenance.md +260 -0
  50. package/templates/documentation/.cursorrules/overview.md +82 -0
  51. package/templates/documentation/.cursorrules/readme-standards.md +306 -0
  52. package/templates/documentation/CLAUDE.md +120 -0
  53. package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
  54. package/templates/fullstack/.cursorrules/architecture.md +298 -0
  55. package/templates/fullstack/.cursorrules/overview.md +109 -0
  56. package/templates/fullstack/.cursorrules/shared-types.md +348 -0
  57. package/templates/fullstack/.cursorrules/testing.md +386 -0
  58. package/templates/fullstack/CLAUDE.md +349 -0
  59. package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
  60. package/templates/ml-ai/.cursorrules/deployment.md +601 -0
  61. package/templates/ml-ai/.cursorrules/model-development.md +538 -0
  62. package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
  63. package/templates/ml-ai/.cursorrules/overview.md +131 -0
  64. package/templates/ml-ai/.cursorrules/security.md +637 -0
  65. package/templates/ml-ai/.cursorrules/testing.md +678 -0
  66. package/templates/ml-ai/CLAUDE.md +1136 -0
  67. package/templates/mobile/.cursorrules/navigation.md +246 -0
  68. package/templates/mobile/.cursorrules/offline-first.md +302 -0
  69. package/templates/mobile/.cursorrules/overview.md +71 -0
  70. package/templates/mobile/.cursorrules/performance.md +345 -0
  71. package/templates/mobile/.cursorrules/testing.md +339 -0
  72. package/templates/mobile/CLAUDE.md +233 -0
  73. package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
  74. package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
  75. package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
  76. package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
  77. package/templates/platform-engineering/.cursorrules/observability.md +747 -0
  78. package/templates/platform-engineering/.cursorrules/overview.md +215 -0
  79. package/templates/platform-engineering/.cursorrules/security.md +855 -0
  80. package/templates/platform-engineering/.cursorrules/testing.md +878 -0
  81. package/templates/platform-engineering/CLAUDE.md +850 -0
  82. package/templates/utility-agent/.cursorrules/action-control.md +284 -0
  83. package/templates/utility-agent/.cursorrules/context-management.md +186 -0
  84. package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
  85. package/templates/utility-agent/.cursorrules/overview.md +78 -0
  86. package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
  87. package/templates/utility-agent/CLAUDE.md +513 -0
  88. package/templates/web-backend/.cursorrules/api-design.md +255 -0
  89. package/templates/web-backend/.cursorrules/authentication.md +309 -0
  90. package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
  91. package/templates/web-backend/.cursorrules/error-handling.md +366 -0
  92. package/templates/web-backend/.cursorrules/overview.md +69 -0
  93. package/templates/web-backend/.cursorrules/security.md +358 -0
  94. package/templates/web-backend/.cursorrules/testing.md +395 -0
  95. package/templates/web-backend/CLAUDE.md +366 -0
  96. package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
  97. package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
  98. package/templates/web-frontend/.cursorrules/overview.md +72 -0
  99. package/templates/web-frontend/.cursorrules/performance.md +325 -0
  100. package/templates/web-frontend/.cursorrules/state-management.md +227 -0
  101. package/templates/web-frontend/.cursorrules/styling.md +271 -0
  102. package/templates/web-frontend/.cursorrules/testing.md +311 -0
  103. package/templates/web-frontend/CLAUDE.md +399 -0
@@ -0,0 +1,653 @@
1
+ # Capacity Planning
2
+
3
+ Comprehensive guidelines for planning, testing, and scaling system capacity.
4
+
5
+ ## Core Principles
6
+
7
+ 1. **Measure First** - Base capacity decisions on data, not guesses
8
+ 2. **Plan Ahead** - Provision for growth before you need it
9
+ 3. **Test Limits** - Know your breaking points before users find them
10
+ 4. **Right-Size** - Neither over-provision (waste) nor under-provision (outage)
11
+
12
+ ## Capacity Dimensions
13
+
14
+ ### Resource Types
15
+
16
+ ```yaml
17
+ compute:
18
+ metrics:
19
+ - "cpu_utilization_percent"
20
+ - "memory_utilization_percent"
21
+ - "pod_count"
22
+ - "node_count"
23
+ scaling:
24
+ vertical: "Larger instances"
25
+ horizontal: "More instances"
26
+ planning_factors:
27
+ - "Request processing requirements"
28
+ - "Background job load"
29
+ - "Peak vs average usage"
30
+
31
+ storage:
32
+ metrics:
33
+ - "disk_usage_percent"
34
+ - "iops_utilization"
35
+ - "throughput_mbps"
36
+ - "latency_ms"
37
+ scaling:
38
+ vertical: "Faster/larger disks"
39
+ horizontal: "Sharding, distribution"
40
+ planning_factors:
41
+ - "Data growth rate"
42
+ - "Retention requirements"
43
+ - "Backup storage"
44
+
45
+ network:
46
+ metrics:
47
+ - "bandwidth_utilization"
48
+ - "packet_rate"
49
+ - "connection_count"
50
+ - "latency_ms"
51
+ scaling:
52
+ vertical: "Faster network"
53
+ horizontal: "Multiple paths"
54
+ planning_factors:
55
+ - "Traffic patterns"
56
+ - "Geographic distribution"
57
+ - "External API calls"
58
+
59
+ database:
60
+ metrics:
61
+ - "connection_pool_usage"
62
+ - "query_latency_p99"
63
+ - "transactions_per_second"
64
+ - "replication_lag"
65
+ scaling:
66
+ vertical: "Larger instance"
67
+ horizontal: "Read replicas, sharding"
68
+ planning_factors:
69
+ - "Query complexity"
70
+ - "Data volume"
71
+ - "Read/write ratio"
72
+ ```
73
+
74
+ ### Capacity Thresholds
75
+
76
+ ```yaml
77
+ threshold_definitions:
78
+ nominal:
79
+ range: "0-60%"
80
+ status: "Healthy operation"
81
+ action: "Monitor"
82
+
83
+ elevated:
84
+ range: "60-75%"
85
+ status: "Above normal"
86
+ action: "Plan scaling"
87
+
88
+ warning:
89
+ range: "75-85%"
90
+ status: "Approaching limits"
91
+ action: "Scale soon"
92
+ alert: "Warning severity"
93
+
94
+ critical:
95
+ range: "85-95%"
96
+ status: "Near capacity"
97
+ action: "Scale immediately"
98
+ alert: "Critical severity"
99
+
100
+ saturated:
101
+ range: "95-100%"
102
+ status: "At capacity"
103
+ action: "Emergency scaling"
104
+ impact: "Performance degradation likely"
105
+
106
+ resource_specific_thresholds:
107
+ cpu:
108
+ warning: 75%
109
+ critical: 85%
110
+ note: "Sustained high CPU causes latency"
111
+
112
+ memory:
113
+ warning: 80%
114
+ critical: 90%
115
+ note: "Little headroom above 90%; OOM kills become likely"
116
+
117
+ disk:
118
+ warning: 80%
119
+ critical: 90%
120
+ note: "Leave space for operations, logs"
121
+
122
+ connections:
123
+ warning: 70%
124
+ critical: 85%
125
+ note: "Connection storms can spike quickly"
126
+ ```
127
+
128
+ ## Load Testing
129
+
130
+ ### Testing Types
131
+
132
+ ```yaml
133
+ smoke_test:
134
+ purpose: "Verify system handles minimal load"
135
+ duration: "5-10 minutes"
136
+ load: "10-20 concurrent users"
137
+ when: "Every deployment"
138
+ success_criteria:
139
+ - "No errors"
140
+ - "Response times normal"
141
+ - "All endpoints accessible"
142
+
143
+ load_test:
144
+ purpose: "Verify system handles expected load"
145
+ duration: "30-60 minutes"
146
+ load: "Expected peak * 1.5"
147
+ when: "Weekly, before releases"
148
+ success_criteria:
149
+ - "Error rate < 1%"
150
+ - "P99 latency within SLO"
151
+ - "No resource saturation"
152
+
153
+ stress_test:
154
+ purpose: "Find breaking point"
155
+ duration: "Until failure"
156
+ load: "Ramp up continuously"
157
+ when: "Monthly, architecture changes"
158
+ success_criteria:
159
+ - "Identify failure point"
160
+ - "Graceful degradation"
161
+ - "Recovery after load removed"
162
+
163
+ soak_test:
164
+ purpose: "Find issues over time"
165
+ duration: "24-72 hours"
166
+ load: "Expected average"
167
+ when: "Before major releases"
168
+ success_criteria:
169
+ - "No memory leaks"
170
+ - "No connection leaks"
171
+ - "Performance stable over time"
172
+
173
+ spike_test:
174
+ purpose: "Verify handling of sudden load"
175
+ duration: "30 minutes"
176
+ load: "Sudden 10x spike"
177
+ when: "Before events, campaigns"
178
+ success_criteria:
179
+ - "Autoscaling responds"
180
+ - "No cascading failures"
181
+ - "Recovery after spike"
182
+ ```
183
+
184
+ ### k6 Load Testing
185
+
186
+ ```javascript
187
// load-test.js - k6 load test example
//
// Simulates a read-heavy user flow (health check -> list items -> fetch one
// item) while ramping virtual users up to a peak, through a 2x spike, and
// back down again.
//
// Environment variables read by this script:
//   BASE_URL  - API root; falls back to https://api.example.com
//   API_TOKEN - bearer token sent on the /api/v1 requests
import http from 'k6/http';
import { check, sleep } from 'k6';
import { Rate, Trend } from 'k6/metrics';
// textSummary is not a k6 built-in global; without this import
// handleSummary() fails with a ReferenceError when the test finishes.
import { textSummary } from 'https://jslib.k6.io/k6-summary/0.0.2/index.js';

// Custom metrics
const errorRate = new Rate('errors'); // share of API calls answered with a non-200 status
const apiDuration = new Trend('api_duration'); // duration of the list-items request

// Test configuration
export const options = {
  stages: [
    { duration: '2m', target: 100 }, // Ramp up
    { duration: '10m', target: 100 }, // Stay at peak
    { duration: '2m', target: 200 }, // Spike
    { duration: '5m', target: 200 }, // Sustained spike
    { duration: '2m', target: 100 }, // Return to normal
    { duration: '5m', target: 100 }, // Sustained normal
    { duration: '2m', target: 0 }, // Ramp down
  ],
  thresholds: {
    http_req_duration: ['p(95)<500', 'p(99)<1000'],
    http_req_failed: ['rate<0.01'],
    errors: ['rate<0.01'],
  },
};

const BASE_URL = __ENV.BASE_URL || 'https://api.example.com';

// Request params shared by every authenticated call.
const AUTH_PARAMS = {
  headers: { 'Authorization': `Bearer ${__ENV.API_TOKEN}` },
};

/**
 * Extracts the `data` array from a JSON response body.
 *
 * Returns an empty array when the body is missing, is not valid JSON, or has
 * no array under `data`, so that an error page or empty reply is recorded as
 * a failed check instead of throwing and aborting the whole iteration.
 */
function parseItems(res) {
  try {
    const data = JSON.parse(res.body).data;
    return Array.isArray(data) ? data : [];
  } catch (_) {
    return [];
  }
}

/**
 * One iteration of the simulated user flow, executed repeatedly by each
 * virtual user.
 */
export default function () {
  // 1. Health check
  const healthRes = http.get(`${BASE_URL}/health`);
  check(healthRes, {
    'health check passed': (r) => r.status === 200,
  });

  // 2. List items (most common operation)
  const listRes = http.get(`${BASE_URL}/api/v1/items`, AUTH_PARAMS);
  // Parse once and reuse for both the check and the item pick below.
  const items = parseItems(listRes);

  check(listRes, {
    'list items succeeded': (r) => r.status === 200,
    'list returned data': () => items.length > 0,
  });

  errorRate.add(listRes.status !== 200);
  apiDuration.add(listRes.timings.duration);

  // 3. Get single item (simulate user clicking)
  // Skipped when the list is empty: there is nothing to pick, and indexing
  // an empty array would throw.
  if (listRes.status === 200 && items.length > 0) {
    const itemId = items[Math.floor(Math.random() * items.length)].id;

    const itemRes = http.get(`${BASE_URL}/api/v1/items/${itemId}`, AUTH_PARAMS);

    check(itemRes, {
      'get item succeeded': (r) => r.status === 200,
    });

    errorRate.add(itemRes.status !== 200);
  }

  // Think time between requests: random pause of 1-4 seconds
  sleep(Math.random() * 3 + 1);
}

/**
 * Summary output: writes the raw end-of-test data to summary.json and prints
 * a text summary to stdout.
 */
export function handleSummary(data) {
  return {
    'summary.json': JSON.stringify(data),
    stdout: textSummary(data, { indent: ' ', enableColors: true }),
  };
}
265
+ ```
266
+
267
+ ### Running Load Tests
268
+
269
+ ```yaml
270
+ load_test_process:
271
+ preparation:
272
+ - "Notify stakeholders"
273
+ - "Ensure monitoring is active"
274
+ - "Verify test environment matches production"
275
+ - "Prepare rollback plan if testing production"
276
+
277
+ execution:
278
+ - "Start with smoke test"
279
+ - "Gradually increase load"
280
+ - "Monitor dashboards during test"
281
+ - "Collect metrics and screenshots"
282
+
283
+ analysis:
284
+ - "Compare results to baseline"
285
+ - "Identify bottlenecks"
286
+ - "Document findings"
287
+ - "Create action items"
288
+
289
+ k6_commands:
290
+ smoke_test: |
291
+ k6 run --vus 10 --duration 5m load-test.js
292
+
293
+ load_test: |
294
+ k6 run load-test.js
295
+
296
+ stress_test: |
297
+ k6 run --vus 500 --duration 30m load-test.js
298
+
299
+ with_output: |
300
+ k6 run --out json=results.json --out influxdb=http://localhost:8086/k6 load-test.js
301
+ ```
302
+
303
+ ## Scaling Strategies
304
+
305
+ ### Horizontal vs Vertical
306
+
307
+ ```yaml
308
+ horizontal_scaling:
309
+ description: "Add more instances"
310
+ pros:
311
+ - "No downtime for scaling"
312
+ - "Better fault tolerance"
313
+ - "Theoretically unlimited"
314
+ cons:
315
+ - "Application must be stateless"
316
+ - "More complex architecture"
317
+ - "Coordination overhead"
318
+ best_for:
319
+ - "Stateless web servers"
320
+ - "API services"
321
+ - "Workers/processors"
322
+
323
+ vertical_scaling:
324
+ description: "Make instances bigger"
325
+ pros:
326
+ - "Simple implementation"
327
+ - "Works with stateful apps"
328
+ - "No architecture changes"
329
+ cons:
330
+ - "Hard limits on instance size"
331
+ - "Usually requires downtime"
332
+ - "Single point of failure"
333
+ best_for:
334
+ - "Databases"
335
+ - "Legacy applications"
336
+ - "Quick fixes"
337
+ ```
338
+
339
+ ### Kubernetes Autoscaling
340
+
341
+ ```yaml
342
# Horizontal Pod Autoscaler for the api-server Deployment.
# Keeps the replica count between 3 and 50, driven by CPU, memory and a
# per-pod request-rate metric.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: api-server-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: api-server
  minReplicas: 3
  maxReplicas: 50

  metrics:
    # CPU: target 70% average utilization
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70

    # Memory: target 80% average utilization
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80

    # Custom metric: target an average of 1000 requests per second per pod
    - type: Pods
      pods:
        metric:
          name: http_requests_per_second
        target:
          type: AverageValue
          averageValue: "1000"

  behavior:
    scaleDown:
      # Hold off for 5 minutes before shrinking
      stabilizationWindowSeconds: 300
      policies:
        # Remove at most 10% of pods per minute
        - type: Percent
          value: 10
          periodSeconds: 60
    scaleUp:
      # No stabilization delay when growing
      stabilizationWindowSeconds: 0
      policies:
        # Up to double the pod count every 15 seconds ...
        - type: Percent
          value: 100
          periodSeconds: 15
        # ... or add 4 pods every 15 seconds
        - type: Pods
          value: 4
          periodSeconds: 15
      # Of the two scale-up policies, apply whichever allows the larger change
      selectPolicy: Max
398
+ ```
399
+
400
+ ### Database Scaling
401
+
402
+ ```yaml
403
+ read_replicas:
404
+ when: "Read-heavy workload"
405
+ approach:
406
+ - "Primary handles writes"
407
+ - "Replicas handle reads"
408
+ - "Application routes queries"
409
+ considerations:
410
+ - "Replication lag"
411
+ - "Consistency requirements"
412
+ - "Connection pooling"
413
+
414
+ connection_pooling:
415
+ when: "Connection limits reached"
416
+ tools:
417
+ - "PgBouncer (PostgreSQL)"
418
+ - "ProxySQL (MySQL)"
419
+ benefits:
420
+ - "Multiplexes connections"
421
+ - "Reduces database load"
422
+ - "Handles connection storms"
423
+
424
+ sharding:
425
+ when: "Data too large for single instance"
426
+ strategies:
427
+ - "Hash-based: Distribute by user_id % N"
428
+ - "Range-based: Data by date ranges"
429
+ - "Geography-based: Data by region"
430
+ considerations:
431
+ - "Cross-shard queries are complex"
432
+ - "Rebalancing is difficult"
433
+ - "Application complexity increases"
434
+ ```
435
+
436
+ ## Capacity Forecasting
437
+
438
+ ### Data Collection
439
+
440
+ ```yaml
441
+ metrics_to_track:
442
+ traffic:
443
+ - "requests_per_second"
444
+ - "active_users"
445
+ - "daily_active_users"
446
+ - "monthly_active_users"
447
+
448
+ resources:
449
+ - "cpu_utilization"
450
+ - "memory_utilization"
451
+ - "disk_usage"
452
+ - "network_throughput"
453
+
454
+ business:
455
+ - "new_signups"
456
+ - "transactions"
457
+ - "data_ingestion_rate"
458
+
459
+ historical_analysis:
460
+ timeframes:
461
+ - "Daily patterns (peak hours)"
462
+ - "Weekly patterns (weekday vs weekend)"
463
+ - "Monthly patterns (billing cycles)"
464
+ - "Yearly patterns (seasonal)"
465
+
466
+ identify:
467
+ - "Growth trends"
468
+ - "Cyclical patterns"
469
+ - "Anomalies"
470
+ - "Correlation with business metrics"
471
+ ```
472
+
473
+ ### Forecasting Methods
474
+
475
+ ```yaml
476
+ linear_projection:
477
+ description: "Simple trend extrapolation"
478
+ formula: "future_usage = current + (growth_rate * time)"
479
+ good_for: "Steady, predictable growth"
480
+ example: |
481
+ Current: 1000 requests/sec
482
+ Growth: 100 requests/sec per month (10% of current, not compounded)
483
+ In 6 months: 1000 + (100 * 6) = 1600 requests/sec
484
+
485
+ exponential_growth:
486
+ description: "Compound growth projection"
487
+ formula: "future = current * (1 + rate)^periods"
488
+ good_for: "Rapidly growing services"
489
+ warning: "Can overestimate"
490
+
491
+ event_based:
492
+ description: "Specific events that drive load"
493
+ examples:
494
+ - "Marketing campaign: +50% traffic"
495
+ - "Product launch: +100% traffic"
496
+ - "Holiday season: +200% traffic"
497
+ approach: "Add event-based capacity on top of baseline"
498
+ ```
499
+
500
+ ### Capacity Planning Process
501
+
502
+ ```yaml
503
+ quarterly_planning:
504
+ step_1_review:
505
+ - "Analyze last quarter's usage"
506
+ - "Compare forecast vs actual"
507
+ - "Identify forecast errors"
508
+
509
+ step_2_forecast:
510
+ - "Project next quarter growth"
511
+ - "Account for known events"
512
+ - "Add safety margin (20-50%)"
513
+
514
+ step_3_capacity:
515
+ - "Map usage to resource requirements"
516
+ - "Identify bottlenecks"
517
+ - "Plan scaling actions"
518
+
519
+ step_4_budget:
520
+ - "Calculate infrastructure costs"
521
+ - "Compare with reserved instances"
522
+ - "Get budget approval"
523
+
524
+ step_5_execute:
525
+ - "Schedule capacity additions"
526
+ - "Test new capacity"
527
+ - "Monitor after scaling"
528
+
529
+ capacity_buffer:
530
+ purpose: "Headroom for unexpected growth"
531
+ typical_values:
532
+ normal: "20% above projected peak"
533
+ critical_services: "50% above projected peak"
534
+ before_events: "100% above normal capacity"
535
+ ```
536
+
537
+ ## Cost Optimization
538
+
539
+ ### Right-Sizing
540
+
541
+ ```yaml
542
+ right_sizing_process:
543
+ identify_waste:
544
+ - "Instances with < 20% CPU utilization"
545
+ - "Over-provisioned memory"
546
+ - "Unused reserved capacity"
547
+
548
+ analyze:
549
+ - "Peak vs average usage"
550
+ - "Scaling patterns"
551
+ - "Cost per request"
552
+
553
+ optimize:
554
+ - "Downsize underutilized instances"
555
+ - "Use autoscaling instead of static"
556
+ - "Consider spot/preemptible instances"
557
+
558
+ instance_selection:
559
+ compute_optimized:
560
+ when: "CPU-bound workloads"
561
+ examples: "c5, c6i instances"
562
+
563
+ memory_optimized:
564
+ when: "Memory-bound workloads"
565
+ examples: "r5, r6i instances"
566
+
567
+ general_purpose:
568
+ when: "Balanced workloads"
569
+ examples: "m5, m6i instances"
570
+
571
+ burstable:
572
+ when: "Low baseline, occasional spikes"
573
+ examples: "t3, t4g instances"
574
+ ```
575
+
576
+ ### Cost Efficiency Metrics
577
+
578
+ ```yaml
579
+ cost_metrics:
580
+ cost_per_request:
581
+ formula: "monthly_cost / monthly_requests"
582
+ target: "Should decrease over time (efficiency)"
583
+
584
+ cost_per_user:
585
+ formula: "monthly_cost / monthly_active_users"
586
+ target: "Should be stable or decreasing"
587
+
588
+ utilization_efficiency:
589
+ formula: "actual_usage / provisioned_capacity"
590
+ target: "60-80% (leaves headroom)"
591
+
592
+ reserved_coverage:
593
+ formula: "reserved_capacity / baseline_usage"
594
+ target: "70-80% of baseline on reserved"
595
+ ```
596
+
597
+ ### Reserved vs On-Demand
598
+
599
+ ```yaml
600
+ capacity_mix:
601
+ reserved:
602
+ coverage: "70-80% of baseline"
603
+ discount: "30-60% vs on-demand"
604
+ commitment: "1-3 years"
605
+ use_for: "Steady-state workloads"
606
+
607
+ on_demand:
608
+ coverage: "Peak above baseline"
609
+ discount: "None"
610
+ commitment: "None"
611
+ use_for: "Variable/spiky workloads"
612
+
613
+ spot_preemptible:
614
+ coverage: "Fault-tolerant workloads"
615
+ discount: "60-90% vs on-demand"
616
+ commitment: "None (can be terminated)"
617
+ use_for: "Batch jobs, stateless workers"
618
+
619
+ strategy: |
620
+ Base load: Reserved instances (predictable cost)
621
+ Normal peaks: On-demand (flexibility)
622
+ Batch/background: Spot instances (cost savings)
623
+ Emergency: On-demand ready to scale
624
+ ```
625
+
626
+ ## Common Pitfalls
627
+
628
+ ```yaml
629
+ pitfall_planning_for_average:
630
+ problem: "Provisioning for average load"
631
+ impact: "Outages during peaks"
632
+ solution: "Plan for peak + headroom"
633
+
634
+ pitfall_ignoring_dependencies:
635
+ problem: "Scaling service but not database"
636
+ impact: "Database becomes bottleneck"
637
+ solution: "Consider all dependencies when scaling"
638
+
639
+ pitfall_no_testing:
640
+ problem: "Assuming capacity without testing"
641
+ impact: "Surprises under real load"
642
+ solution: "Regular load testing to validate"
643
+
644
+ pitfall_sudden_scaling:
645
+ problem: "Scaling from 10 to 100 instances instantly"
646
+ impact: "Thundering herd, cold cache"
647
+ solution: "Scale gradually, warm caches"
648
+
649
+ pitfall_cost_blindness:
650
+ problem: "Scaling without considering cost"
651
+ impact: "Surprise cloud bills"
652
+ solution: "Monitor cost metrics alongside performance"
653
+ ```