agentic-team-templates 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +280 -0
- package/bin/cli.js +5 -0
- package/package.json +47 -0
- package/src/index.js +521 -0
- package/templates/_shared/code-quality.md +162 -0
- package/templates/_shared/communication.md +114 -0
- package/templates/_shared/core-principles.md +62 -0
- package/templates/_shared/git-workflow.md +165 -0
- package/templates/_shared/security-fundamentals.md +173 -0
- package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
- package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
- package/templates/blockchain/.cursorrules/overview.md +130 -0
- package/templates/blockchain/.cursorrules/security.md +318 -0
- package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
- package/templates/blockchain/.cursorrules/testing.md +415 -0
- package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
- package/templates/blockchain/CLAUDE.md +389 -0
- package/templates/cli-tools/.cursorrules/architecture.md +412 -0
- package/templates/cli-tools/.cursorrules/arguments.md +406 -0
- package/templates/cli-tools/.cursorrules/distribution.md +546 -0
- package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
- package/templates/cli-tools/.cursorrules/overview.md +136 -0
- package/templates/cli-tools/.cursorrules/testing.md +537 -0
- package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
- package/templates/cli-tools/CLAUDE.md +356 -0
- package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
- package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
- package/templates/data-engineering/.cursorrules/overview.md +85 -0
- package/templates/data-engineering/.cursorrules/performance.md +339 -0
- package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
- package/templates/data-engineering/.cursorrules/security.md +460 -0
- package/templates/data-engineering/.cursorrules/testing.md +452 -0
- package/templates/data-engineering/CLAUDE.md +974 -0
- package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
- package/templates/devops-sre/.cursorrules/change-management.md +584 -0
- package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
- package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
- package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
- package/templates/devops-sre/.cursorrules/observability.md +714 -0
- package/templates/devops-sre/.cursorrules/overview.md +230 -0
- package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
- package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
- package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
- package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
- package/templates/devops-sre/CLAUDE.md +1007 -0
- package/templates/documentation/.cursorrules/adr.md +277 -0
- package/templates/documentation/.cursorrules/api-documentation.md +411 -0
- package/templates/documentation/.cursorrules/code-comments.md +253 -0
- package/templates/documentation/.cursorrules/maintenance.md +260 -0
- package/templates/documentation/.cursorrules/overview.md +82 -0
- package/templates/documentation/.cursorrules/readme-standards.md +306 -0
- package/templates/documentation/CLAUDE.md +120 -0
- package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
- package/templates/fullstack/.cursorrules/architecture.md +298 -0
- package/templates/fullstack/.cursorrules/overview.md +109 -0
- package/templates/fullstack/.cursorrules/shared-types.md +348 -0
- package/templates/fullstack/.cursorrules/testing.md +386 -0
- package/templates/fullstack/CLAUDE.md +349 -0
- package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
- package/templates/ml-ai/.cursorrules/deployment.md +601 -0
- package/templates/ml-ai/.cursorrules/model-development.md +538 -0
- package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
- package/templates/ml-ai/.cursorrules/overview.md +131 -0
- package/templates/ml-ai/.cursorrules/security.md +637 -0
- package/templates/ml-ai/.cursorrules/testing.md +678 -0
- package/templates/ml-ai/CLAUDE.md +1136 -0
- package/templates/mobile/.cursorrules/navigation.md +246 -0
- package/templates/mobile/.cursorrules/offline-first.md +302 -0
- package/templates/mobile/.cursorrules/overview.md +71 -0
- package/templates/mobile/.cursorrules/performance.md +345 -0
- package/templates/mobile/.cursorrules/testing.md +339 -0
- package/templates/mobile/CLAUDE.md +233 -0
- package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
- package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
- package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
- package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
- package/templates/platform-engineering/.cursorrules/observability.md +747 -0
- package/templates/platform-engineering/.cursorrules/overview.md +215 -0
- package/templates/platform-engineering/.cursorrules/security.md +855 -0
- package/templates/platform-engineering/.cursorrules/testing.md +878 -0
- package/templates/platform-engineering/CLAUDE.md +850 -0
- package/templates/utility-agent/.cursorrules/action-control.md +284 -0
- package/templates/utility-agent/.cursorrules/context-management.md +186 -0
- package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
- package/templates/utility-agent/.cursorrules/overview.md +78 -0
- package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
- package/templates/utility-agent/CLAUDE.md +513 -0
- package/templates/web-backend/.cursorrules/api-design.md +255 -0
- package/templates/web-backend/.cursorrules/authentication.md +309 -0
- package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
- package/templates/web-backend/.cursorrules/error-handling.md +366 -0
- package/templates/web-backend/.cursorrules/overview.md +69 -0
- package/templates/web-backend/.cursorrules/security.md +358 -0
- package/templates/web-backend/.cursorrules/testing.md +395 -0
- package/templates/web-backend/CLAUDE.md +366 -0
- package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
- package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
- package/templates/web-frontend/.cursorrules/overview.md +72 -0
- package/templates/web-frontend/.cursorrules/performance.md +325 -0
- package/templates/web-frontend/.cursorrules/state-management.md +227 -0
- package/templates/web-frontend/.cursorrules/styling.md +271 -0
- package/templates/web-frontend/.cursorrules/testing.md +311 -0
- package/templates/web-frontend/CLAUDE.md +399 -0
|
@@ -0,0 +1,653 @@
|
|
|
1
|
+
# Capacity Planning
|
|
2
|
+
|
|
3
|
+
Comprehensive guidelines for planning, testing, and scaling system capacity.
|
|
4
|
+
|
|
5
|
+
## Core Principles
|
|
6
|
+
|
|
7
|
+
1. **Measure First** - Base capacity decisions on data, not guesses
|
|
8
|
+
2. **Plan Ahead** - Provision for growth before you need it
|
|
9
|
+
3. **Test Limits** - Know your breaking points before users find them
|
|
10
|
+
4. **Right-Size** - Neither over-provision (waste) nor under-provision (outage)
|
|
11
|
+
|
|
12
|
+
## Capacity Dimensions
|
|
13
|
+
|
|
14
|
+
### Resource Types
|
|
15
|
+
|
|
16
|
+
```yaml
|
|
17
|
+
compute:
|
|
18
|
+
metrics:
|
|
19
|
+
- "cpu_utilization_percent"
|
|
20
|
+
- "memory_utilization_percent"
|
|
21
|
+
- "pod_count"
|
|
22
|
+
- "node_count"
|
|
23
|
+
scaling:
|
|
24
|
+
vertical: "Larger instances"
|
|
25
|
+
horizontal: "More instances"
|
|
26
|
+
planning_factors:
|
|
27
|
+
- "Request processing requirements"
|
|
28
|
+
- "Background job load"
|
|
29
|
+
- "Peak vs average usage"
|
|
30
|
+
|
|
31
|
+
storage:
|
|
32
|
+
metrics:
|
|
33
|
+
- "disk_usage_percent"
|
|
34
|
+
- "iops_utilization"
|
|
35
|
+
- "throughput_mbps"
|
|
36
|
+
- "latency_ms"
|
|
37
|
+
scaling:
|
|
38
|
+
vertical: "Faster/larger disks"
|
|
39
|
+
horizontal: "Sharding, distribution"
|
|
40
|
+
planning_factors:
|
|
41
|
+
- "Data growth rate"
|
|
42
|
+
- "Retention requirements"
|
|
43
|
+
- "Backup storage"
|
|
44
|
+
|
|
45
|
+
network:
|
|
46
|
+
metrics:
|
|
47
|
+
- "bandwidth_utilization"
|
|
48
|
+
- "packet_rate"
|
|
49
|
+
- "connection_count"
|
|
50
|
+
- "latency_ms"
|
|
51
|
+
scaling:
|
|
52
|
+
vertical: "Faster network"
|
|
53
|
+
horizontal: "Multiple paths"
|
|
54
|
+
planning_factors:
|
|
55
|
+
- "Traffic patterns"
|
|
56
|
+
- "Geographic distribution"
|
|
57
|
+
- "External API calls"
|
|
58
|
+
|
|
59
|
+
database:
|
|
60
|
+
metrics:
|
|
61
|
+
- "connection_pool_usage"
|
|
62
|
+
- "query_latency_p99"
|
|
63
|
+
- "transactions_per_second"
|
|
64
|
+
- "replication_lag"
|
|
65
|
+
scaling:
|
|
66
|
+
vertical: "Larger instance"
|
|
67
|
+
horizontal: "Read replicas, sharding"
|
|
68
|
+
planning_factors:
|
|
69
|
+
- "Query complexity"
|
|
70
|
+
- "Data volume"
|
|
71
|
+
- "Read/write ratio"
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Capacity Thresholds
|
|
75
|
+
|
|
76
|
+
```yaml
|
|
77
|
+
threshold_definitions:
|
|
78
|
+
nominal:
|
|
79
|
+
range: "0-60%"
|
|
80
|
+
status: "Healthy operation"
|
|
81
|
+
action: "Monitor"
|
|
82
|
+
|
|
83
|
+
elevated:
|
|
84
|
+
range: "60-75%"
|
|
85
|
+
status: "Above normal"
|
|
86
|
+
action: "Plan scaling"
|
|
87
|
+
|
|
88
|
+
warning:
|
|
89
|
+
range: "75-85%"
|
|
90
|
+
status: "Approaching limits"
|
|
91
|
+
action: "Scale soon"
|
|
92
|
+
alert: "Warning severity"
|
|
93
|
+
|
|
94
|
+
critical:
|
|
95
|
+
range: "85-95%"
|
|
96
|
+
status: "Near capacity"
|
|
97
|
+
action: "Scale immediately"
|
|
98
|
+
alert: "Critical severity"
|
|
99
|
+
|
|
100
|
+
saturated:
|
|
101
|
+
range: "95-100%"
|
|
102
|
+
status: "At capacity"
|
|
103
|
+
action: "Emergency scaling"
|
|
104
|
+
impact: "Performance degradation likely"
|
|
105
|
+
|
|
106
|
+
resource_specific_thresholds:
|
|
107
|
+
cpu:
|
|
108
|
+
warning: 75%
|
|
109
|
+
critical: 85%
|
|
110
|
+
note: "Sustained high CPU causes latency"
|
|
111
|
+
|
|
112
|
+
memory:
|
|
113
|
+
warning: 80%
|
|
114
|
+
critical: 90%
|
|
115
|
+
note: "OOM kills become likely above 90%"
|
|
116
|
+
|
|
117
|
+
disk:
|
|
118
|
+
warning: 80%
|
|
119
|
+
critical: 90%
|
|
120
|
+
note: "Leave space for operations, logs"
|
|
121
|
+
|
|
122
|
+
connections:
|
|
123
|
+
warning: 70%
|
|
124
|
+
critical: 85%
|
|
125
|
+
note: "Connection storms can spike quickly"
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Load Testing
|
|
129
|
+
|
|
130
|
+
### Testing Types
|
|
131
|
+
|
|
132
|
+
```yaml
|
|
133
|
+
smoke_test:
|
|
134
|
+
purpose: "Verify system handles minimal load"
|
|
135
|
+
duration: "5-10 minutes"
|
|
136
|
+
load: "10-20 concurrent users"
|
|
137
|
+
when: "Every deployment"
|
|
138
|
+
success_criteria:
|
|
139
|
+
- "No errors"
|
|
140
|
+
- "Response times normal"
|
|
141
|
+
- "All endpoints accessible"
|
|
142
|
+
|
|
143
|
+
load_test:
|
|
144
|
+
purpose: "Verify system handles expected load"
|
|
145
|
+
duration: "30-60 minutes"
|
|
146
|
+
load: "Expected peak * 1.5"
|
|
147
|
+
when: "Weekly, before releases"
|
|
148
|
+
success_criteria:
|
|
149
|
+
- "Error rate < 1%"
|
|
150
|
+
- "P99 latency within SLO"
|
|
151
|
+
- "No resource saturation"
|
|
152
|
+
|
|
153
|
+
stress_test:
|
|
154
|
+
purpose: "Find breaking point"
|
|
155
|
+
duration: "Until failure"
|
|
156
|
+
load: "Ramp up continuously"
|
|
157
|
+
when: "Monthly, architecture changes"
|
|
158
|
+
success_criteria:
|
|
159
|
+
- "Identify failure point"
|
|
160
|
+
- "Graceful degradation"
|
|
161
|
+
- "Recovery after load removed"
|
|
162
|
+
|
|
163
|
+
soak_test:
|
|
164
|
+
purpose: "Find issues over time"
|
|
165
|
+
duration: "24-72 hours"
|
|
166
|
+
load: "Expected average"
|
|
167
|
+
when: "Before major releases"
|
|
168
|
+
success_criteria:
|
|
169
|
+
- "No memory leaks"
|
|
170
|
+
- "No connection leaks"
|
|
171
|
+
- "Performance stable over time"
|
|
172
|
+
|
|
173
|
+
spike_test:
|
|
174
|
+
purpose: "Verify handling of sudden load"
|
|
175
|
+
duration: "30 minutes"
|
|
176
|
+
load: "Sudden 10x spike"
|
|
177
|
+
when: "Before events, campaigns"
|
|
178
|
+
success_criteria:
|
|
179
|
+
- "Autoscaling responds"
|
|
180
|
+
- "No cascading failures"
|
|
181
|
+
- "Recovery after spike"
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
### k6 Load Testing
|
|
185
|
+
|
|
186
|
+
```javascript
|
|
187
|
+
// load-test.js - k6 load test example
|
|
188
|
+
import http from 'k6/http';
import { check, sleep } from 'k6';
import { Rate, Trend } from 'k6/metrics';
import { textSummary } from 'https://jslib.k6.io/k6-summary/0.0.1/index.js';
|
|
191
|
+
|
|
192
|
+
// Custom metrics, reported alongside k6's built-in ones.
const errorRate = new Rate('errors');
const apiDuration = new Trend('api_duration');

// Virtual-user levels used by the load profile.
const NORMAL_VUS = 100;
const SPIKE_VUS = 200;

// Load profile as [duration, target VUs] pairs, expanded into k6 stage objects.
const loadProfile = [
  ['2m', NORMAL_VUS],   // Ramp up
  ['10m', NORMAL_VUS],  // Stay at peak
  ['2m', SPIKE_VUS],    // Spike
  ['5m', SPIKE_VUS],    // Sustained spike
  ['2m', NORMAL_VUS],   // Return to normal
  ['5m', NORMAL_VUS],   // Sustained normal
  ['2m', 0],            // Ramp down
].map(([duration, target]) => ({ duration, target }));

// Pass/fail criteria evaluated by k6 at the end of the run.
const passCriteria = {
  http_req_duration: ['p(95)<500', 'p(99)<1000'],
  http_req_failed: ['rate<0.01'],
  errors: ['rate<0.01'],
};

// Test configuration
export const options = {
  stages: loadProfile,
  thresholds: passCriteria,
};

// Target host; override with the BASE_URL environment variable.
const BASE_URL = __ENV.BASE_URL ? __ENV.BASE_URL : 'https://api.example.com';
|
|
215
|
+
|
|
216
|
+
// Extract the `data` array from a JSON response body.
// Returns [] when the body is not valid JSON or `data` is not an array, so a
// failed request (error page, empty body) never throws inside the iteration.
function parseItems(res) {
  try {
    const payload = JSON.parse(res.body);
    return payload && Array.isArray(payload.data) ? payload.data : [];
  } catch (err) {
    return [];
  }
}

// One iteration of the simulated user flow: health check, list items,
// open one random item, then pause for a random think time.
export default function () {
  const authParams = {
    headers: { 'Authorization': `Bearer ${__ENV.API_TOKEN}` },
  };

  // 1. Health check
  const healthRes = http.get(`${BASE_URL}/health`);
  check(healthRes, {
    'health check passed': (r) => r.status === 200,
  });

  // 2. List items (most common operation)
  const listRes = http.get(`${BASE_URL}/api/v1/items`, authParams);

  // Parse once, safely. A throwing JSON.parse inside a check would abort the
  // iteration before the error metric below is recorded.
  const items = parseItems(listRes);

  check(listRes, {
    'list items succeeded': (r) => r.status === 200,
    'list returned data': () => items.length > 0,
  });

  errorRate.add(listRes.status !== 200);
  apiDuration.add(listRes.timings.duration);

  // 3. Get single item (simulate user clicking)
  // Skipped when the list is empty: there is no item to pick an id from.
  if (listRes.status === 200 && items.length > 0) {
    const itemId = items[Math.floor(Math.random() * items.length)].id;

    const itemRes = http.get(`${BASE_URL}/api/v1/items/${itemId}`, authParams);

    check(itemRes, {
      'get item succeeded': (r) => r.status === 200,
    });

    errorRate.add(itemRes.status !== 200);
  }

  // Think time between requests: 1 to 4 seconds
  sleep(Math.random() * 3 + 1);
}
|
|
257
|
+
|
|
258
|
+
// Summary output
|
|
259
|
+
// End-of-test hook. Receives the aggregated results object and returns a map
// of destination -> content:
//   'summary.json' - the full results object serialized as JSON
//   stdout         - the text report produced by textSummary()
// textSummary is not a k6 built-in: it comes from the k6-summary jslib and
// must be imported at the top of the script. Without that import this
// function throws a ReferenceError and no summary is produced.
export function handleSummary(data) {
  return {
    'summary.json': JSON.stringify(data),
    stdout: textSummary(data, { indent: ' ', enableColors: true }),
  };
}
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
### Running Load Tests
|
|
268
|
+
|
|
269
|
+
```yaml
|
|
270
|
+
load_test_process:
|
|
271
|
+
preparation:
|
|
272
|
+
- "Notify stakeholders"
|
|
273
|
+
- "Ensure monitoring is active"
|
|
274
|
+
- "Verify test environment matches production"
|
|
275
|
+
- "Prepare rollback plan if testing production"
|
|
276
|
+
|
|
277
|
+
execution:
|
|
278
|
+
- "Start with smoke test"
|
|
279
|
+
- "Gradually increase load"
|
|
280
|
+
- "Monitor dashboards during test"
|
|
281
|
+
- "Collect metrics and screenshots"
|
|
282
|
+
|
|
283
|
+
analysis:
|
|
284
|
+
- "Compare results to baseline"
|
|
285
|
+
- "Identify bottlenecks"
|
|
286
|
+
- "Document findings"
|
|
287
|
+
- "Create action items"
|
|
288
|
+
|
|
289
|
+
k6_commands:
|
|
290
|
+
smoke_test: |
|
|
291
|
+
k6 run --vus 10 --duration 5m load-test.js
|
|
292
|
+
|
|
293
|
+
load_test: |
|
|
294
|
+
k6 run load-test.js
|
|
295
|
+
|
|
296
|
+
stress_test: |
|
|
297
|
+
k6 run --vus 500 --duration 30m load-test.js
|
|
298
|
+
|
|
299
|
+
with_output: |
|
|
300
|
+
k6 run --out json=results.json --out influxdb=http://localhost:8086/k6 load-test.js
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
## Scaling Strategies
|
|
304
|
+
|
|
305
|
+
### Horizontal vs Vertical
|
|
306
|
+
|
|
307
|
+
```yaml
|
|
308
|
+
horizontal_scaling:
|
|
309
|
+
description: "Add more instances"
|
|
310
|
+
pros:
|
|
311
|
+
- "No downtime for scaling"
|
|
312
|
+
- "Better fault tolerance"
|
|
313
|
+
- "Theoretically unlimited"
|
|
314
|
+
cons:
|
|
315
|
+
- "Application must be stateless"
|
|
316
|
+
- "More complex architecture"
|
|
317
|
+
- "Coordination overhead"
|
|
318
|
+
best_for:
|
|
319
|
+
- "Stateless web servers"
|
|
320
|
+
- "API services"
|
|
321
|
+
- "Workers/processors"
|
|
322
|
+
|
|
323
|
+
vertical_scaling:
|
|
324
|
+
description: "Make instances bigger"
|
|
325
|
+
pros:
|
|
326
|
+
- "Simple implementation"
|
|
327
|
+
- "Works with stateful apps"
|
|
328
|
+
- "No architecture changes"
|
|
329
|
+
cons:
|
|
330
|
+
- "Hard limits on instance size"
|
|
331
|
+
- "Usually requires downtime"
|
|
332
|
+
- "Single point of failure"
|
|
333
|
+
best_for:
|
|
334
|
+
- "Databases"
|
|
335
|
+
- "Legacy applications"
|
|
336
|
+
- "Quick fixes"
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
### Kubernetes Autoscaling
|
|
340
|
+
|
|
341
|
+
```yaml
|
|
342
|
+
# Horizontal Pod Autoscaler
|
|
343
|
+
apiVersion: autoscaling/v2
|
|
344
|
+
kind: HorizontalPodAutoscaler
|
|
345
|
+
metadata:
|
|
346
|
+
name: api-server-hpa
|
|
347
|
+
spec:
|
|
348
|
+
scaleTargetRef:
|
|
349
|
+
apiVersion: apps/v1
|
|
350
|
+
kind: Deployment
|
|
351
|
+
name: api-server
|
|
352
|
+
minReplicas: 3
|
|
353
|
+
maxReplicas: 50
|
|
354
|
+
|
|
355
|
+
metrics:
|
|
356
|
+
# CPU-based scaling
|
|
357
|
+
- type: Resource
|
|
358
|
+
resource:
|
|
359
|
+
name: cpu
|
|
360
|
+
target:
|
|
361
|
+
type: Utilization
|
|
362
|
+
averageUtilization: 70
|
|
363
|
+
|
|
364
|
+
# Memory-based scaling
|
|
365
|
+
- type: Resource
|
|
366
|
+
resource:
|
|
367
|
+
name: memory
|
|
368
|
+
target:
|
|
369
|
+
type: Utilization
|
|
370
|
+
averageUtilization: 80
|
|
371
|
+
|
|
372
|
+
# Custom metrics (requests per second)
|
|
373
|
+
- type: Pods
|
|
374
|
+
pods:
|
|
375
|
+
metric:
|
|
376
|
+
name: http_requests_per_second
|
|
377
|
+
target:
|
|
378
|
+
type: AverageValue
|
|
379
|
+
averageValue: "1000"
|
|
380
|
+
|
|
381
|
+
behavior:
|
|
382
|
+
scaleDown:
|
|
383
|
+
stabilizationWindowSeconds: 300 # Wait 5 min before scaling down
|
|
384
|
+
policies:
|
|
385
|
+
- type: Percent
|
|
386
|
+
value: 10 # Scale down 10% at a time
|
|
387
|
+
periodSeconds: 60
|
|
388
|
+
scaleUp:
|
|
389
|
+
stabilizationWindowSeconds: 0 # Scale up immediately
|
|
390
|
+
policies:
|
|
391
|
+
- type: Percent
|
|
392
|
+
value: 100 # Can double
|
|
393
|
+
periodSeconds: 15
|
|
394
|
+
- type: Pods
|
|
395
|
+
value: 4 # Or add 4 pods
|
|
396
|
+
periodSeconds: 15
|
|
397
|
+
selectPolicy: Max
|
|
398
|
+
```
|
|
399
|
+
|
|
400
|
+
### Database Scaling
|
|
401
|
+
|
|
402
|
+
```yaml
|
|
403
|
+
read_replicas:
|
|
404
|
+
when: "Read-heavy workload"
|
|
405
|
+
approach:
|
|
406
|
+
- "Primary handles writes"
|
|
407
|
+
- "Replicas handle reads"
|
|
408
|
+
- "Application routes queries"
|
|
409
|
+
considerations:
|
|
410
|
+
- "Replication lag"
|
|
411
|
+
- "Consistency requirements"
|
|
412
|
+
- "Connection pooling"
|
|
413
|
+
|
|
414
|
+
connection_pooling:
|
|
415
|
+
when: "Connection limits reached"
|
|
416
|
+
tools:
|
|
417
|
+
- "PgBouncer (PostgreSQL)"
|
|
418
|
+
- "ProxySQL (MySQL)"
|
|
419
|
+
benefits:
|
|
420
|
+
- "Multiplexes connections"
|
|
421
|
+
- "Reduces database load"
|
|
422
|
+
- "Handles connection storms"
|
|
423
|
+
|
|
424
|
+
sharding:
|
|
425
|
+
when: "Data too large for single instance"
|
|
426
|
+
strategies:
|
|
427
|
+
- "Hash-based: Distribute by user_id % N"
|
|
428
|
+
- "Range-based: Data by date ranges"
|
|
429
|
+
- "Geography-based: Data by region"
|
|
430
|
+
considerations:
|
|
431
|
+
- "Cross-shard queries are complex"
|
|
432
|
+
- "Rebalancing is difficult"
|
|
433
|
+
- "Application complexity increases"
|
|
434
|
+
```
|
|
435
|
+
|
|
436
|
+
## Capacity Forecasting
|
|
437
|
+
|
|
438
|
+
### Data Collection
|
|
439
|
+
|
|
440
|
+
```yaml
|
|
441
|
+
metrics_to_track:
|
|
442
|
+
traffic:
|
|
443
|
+
- "requests_per_second"
|
|
444
|
+
- "active_users"
|
|
445
|
+
- "daily_active_users"
|
|
446
|
+
- "monthly_active_users"
|
|
447
|
+
|
|
448
|
+
resources:
|
|
449
|
+
- "cpu_utilization"
|
|
450
|
+
- "memory_utilization"
|
|
451
|
+
- "disk_usage"
|
|
452
|
+
- "network_throughput"
|
|
453
|
+
|
|
454
|
+
business:
|
|
455
|
+
- "new_signups"
|
|
456
|
+
- "transactions"
|
|
457
|
+
- "data_ingestion_rate"
|
|
458
|
+
|
|
459
|
+
historical_analysis:
|
|
460
|
+
timeframes:
|
|
461
|
+
- "Daily patterns (peak hours)"
|
|
462
|
+
- "Weekly patterns (weekday vs weekend)"
|
|
463
|
+
- "Monthly patterns (billing cycles)"
|
|
464
|
+
- "Yearly patterns (seasonal)"
|
|
465
|
+
|
|
466
|
+
identify:
|
|
467
|
+
- "Growth trends"
|
|
468
|
+
- "Cyclical patterns"
|
|
469
|
+
- "Anomalies"
|
|
470
|
+
- "Correlation with business metrics"
|
|
471
|
+
```
|
|
472
|
+
|
|
473
|
+
### Forecasting Methods
|
|
474
|
+
|
|
475
|
+
```yaml
|
|
476
|
+
linear_projection:
|
|
477
|
+
description: "Simple trend extrapolation"
|
|
478
|
+
formula: "future_usage = current + (growth_rate * time)"
|
|
479
|
+
good_for: "Steady, predictable growth"
|
|
480
|
+
example: |
|
|
481
|
+
Current: 1000 requests/sec
|
|
482
|
+
Growth: 10% per month
|
|
483
|
+
In 6 months: 1000 + (100 * 6) = 1600 requests/sec (10% of current = 100 requests/sec added per month)
|
|
484
|
+
|
|
485
|
+
exponential_growth:
|
|
486
|
+
description: "Compound growth projection"
|
|
487
|
+
formula: "future = current * (1 + rate)^periods"
|
|
488
|
+
good_for: "Rapidly growing services"
|
|
489
|
+
warning: "Can overestimate"
|
|
490
|
+
|
|
491
|
+
event_based:
|
|
492
|
+
description: "Specific events that drive load"
|
|
493
|
+
examples:
|
|
494
|
+
- "Marketing campaign: +50% traffic"
|
|
495
|
+
- "Product launch: +100% traffic"
|
|
496
|
+
- "Holiday season: +200% traffic"
|
|
497
|
+
approach: "Add event-based capacity on top of baseline"
|
|
498
|
+
```
|
|
499
|
+
|
|
500
|
+
### Capacity Planning Process
|
|
501
|
+
|
|
502
|
+
```yaml
|
|
503
|
+
quarterly_planning:
|
|
504
|
+
step_1_review:
|
|
505
|
+
- "Analyze last quarter's usage"
|
|
506
|
+
- "Compare forecast vs actual"
|
|
507
|
+
- "Identify forecast errors"
|
|
508
|
+
|
|
509
|
+
step_2_forecast:
|
|
510
|
+
- "Project next quarter growth"
|
|
511
|
+
- "Account for known events"
|
|
512
|
+
- "Add safety margin (20-50%)"
|
|
513
|
+
|
|
514
|
+
step_3_capacity:
|
|
515
|
+
- "Map usage to resource requirements"
|
|
516
|
+
- "Identify bottlenecks"
|
|
517
|
+
- "Plan scaling actions"
|
|
518
|
+
|
|
519
|
+
step_4_budget:
|
|
520
|
+
- "Calculate infrastructure costs"
|
|
521
|
+
- "Compare with reserved instances"
|
|
522
|
+
- "Get budget approval"
|
|
523
|
+
|
|
524
|
+
step_5_execute:
|
|
525
|
+
- "Schedule capacity additions"
|
|
526
|
+
- "Test new capacity"
|
|
527
|
+
- "Monitor after scaling"
|
|
528
|
+
|
|
529
|
+
capacity_buffer:
|
|
530
|
+
purpose: "Headroom for unexpected growth"
|
|
531
|
+
typical_values:
|
|
532
|
+
normal: "20% above projected peak"
|
|
533
|
+
critical_services: "50% above projected peak"
|
|
534
|
+
before_events: "100% above normal capacity"
|
|
535
|
+
```
|
|
536
|
+
|
|
537
|
+
## Cost Optimization
|
|
538
|
+
|
|
539
|
+
### Right-Sizing
|
|
540
|
+
|
|
541
|
+
```yaml
|
|
542
|
+
right_sizing_process:
|
|
543
|
+
identify_waste:
|
|
544
|
+
- "Instances with < 20% CPU utilization"
|
|
545
|
+
- "Over-provisioned memory"
|
|
546
|
+
- "Unused reserved capacity"
|
|
547
|
+
|
|
548
|
+
analyze:
|
|
549
|
+
- "Peak vs average usage"
|
|
550
|
+
- "Scaling patterns"
|
|
551
|
+
- "Cost per request"
|
|
552
|
+
|
|
553
|
+
optimize:
|
|
554
|
+
- "Downsize underutilized instances"
|
|
555
|
+
- "Use autoscaling instead of static"
|
|
556
|
+
- "Consider spot/preemptible instances"
|
|
557
|
+
|
|
558
|
+
instance_selection:
|
|
559
|
+
compute_optimized:
|
|
560
|
+
when: "CPU-bound workloads"
|
|
561
|
+
examples: "c5, c6i instances"
|
|
562
|
+
|
|
563
|
+
memory_optimized:
|
|
564
|
+
when: "Memory-bound workloads"
|
|
565
|
+
examples: "r5, r6i instances"
|
|
566
|
+
|
|
567
|
+
general_purpose:
|
|
568
|
+
when: "Balanced workloads"
|
|
569
|
+
examples: "m5, m6i instances"
|
|
570
|
+
|
|
571
|
+
burstable:
|
|
572
|
+
when: "Low baseline, occasional spikes"
|
|
573
|
+
examples: "t3, t4g instances"
|
|
574
|
+
```
|
|
575
|
+
|
|
576
|
+
### Cost Efficiency Metrics
|
|
577
|
+
|
|
578
|
+
```yaml
|
|
579
|
+
cost_metrics:
|
|
580
|
+
cost_per_request:
|
|
581
|
+
formula: "monthly_cost / monthly_requests"
|
|
582
|
+
target: "Should decrease over time (efficiency)"
|
|
583
|
+
|
|
584
|
+
cost_per_user:
|
|
585
|
+
formula: "monthly_cost / monthly_active_users"
|
|
586
|
+
target: "Should be stable or decreasing"
|
|
587
|
+
|
|
588
|
+
utilization_efficiency:
|
|
589
|
+
formula: "actual_usage / provisioned_capacity"
|
|
590
|
+
target: "60-80% (leaves headroom)"
|
|
591
|
+
|
|
592
|
+
reserved_coverage:
|
|
593
|
+
formula: "reserved_capacity / baseline_usage"
|
|
594
|
+
target: "70-80% of baseline on reserved"
|
|
595
|
+
```
|
|
596
|
+
|
|
597
|
+
### Reserved vs On-Demand
|
|
598
|
+
|
|
599
|
+
```yaml
|
|
600
|
+
capacity_mix:
|
|
601
|
+
reserved:
|
|
602
|
+
coverage: "70-80% of baseline"
|
|
603
|
+
discount: "30-60% vs on-demand"
|
|
604
|
+
commitment: "1-3 years"
|
|
605
|
+
use_for: "Steady-state workloads"
|
|
606
|
+
|
|
607
|
+
on_demand:
|
|
608
|
+
coverage: "Peak above baseline"
|
|
609
|
+
discount: "None"
|
|
610
|
+
commitment: "None"
|
|
611
|
+
use_for: "Variable/spiky workloads"
|
|
612
|
+
|
|
613
|
+
spot_preemptible:
|
|
614
|
+
coverage: "Fault-tolerant workloads"
|
|
615
|
+
discount: "60-90% vs on-demand"
|
|
616
|
+
commitment: "None (can be terminated)"
|
|
617
|
+
use_for: "Batch jobs, stateless workers"
|
|
618
|
+
|
|
619
|
+
strategy: |
|
|
620
|
+
Base load: Reserved instances (predictable cost)
|
|
621
|
+
Normal peaks: On-demand (flexibility)
|
|
622
|
+
Batch/background: Spot instances (cost savings)
|
|
623
|
+
Emergency: On-demand ready to scale
|
|
624
|
+
```
|
|
625
|
+
|
|
626
|
+
## Common Pitfalls
|
|
627
|
+
|
|
628
|
+
```yaml
|
|
629
|
+
pitfall_planning_for_average:
|
|
630
|
+
problem: "Provisioning for average load"
|
|
631
|
+
impact: "Outages during peaks"
|
|
632
|
+
solution: "Plan for peak + headroom"
|
|
633
|
+
|
|
634
|
+
pitfall_ignoring_dependencies:
|
|
635
|
+
problem: "Scaling service but not database"
|
|
636
|
+
impact: "Database becomes bottleneck"
|
|
637
|
+
solution: "Consider all dependencies when scaling"
|
|
638
|
+
|
|
639
|
+
pitfall_no_testing:
|
|
640
|
+
problem: "Assuming capacity without testing"
|
|
641
|
+
impact: "Surprises under real load"
|
|
642
|
+
solution: "Regular load testing to validate"
|
|
643
|
+
|
|
644
|
+
pitfall_sudden_scaling:
|
|
645
|
+
problem: "Scaling from 10 to 100 instances instantly"
|
|
646
|
+
impact: "Thundering herd, cold cache"
|
|
647
|
+
solution: "Scale gradually, warm caches"
|
|
648
|
+
|
|
649
|
+
pitfall_cost_blindness:
|
|
650
|
+
problem: "Scaling without considering cost"
|
|
651
|
+
impact: "Surprise cloud bills"
|
|
652
|
+
solution: "Monitor cost metrics alongside performance"
|
|
653
|
+
```
|