@patricio0312rev/skillset 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +29 -0
- package/LICENSE +21 -0
- package/README.md +176 -0
- package/bin/cli.js +37 -0
- package/package.json +55 -0
- package/src/commands/init.js +301 -0
- package/src/index.js +168 -0
- package/src/lib/config.js +200 -0
- package/src/lib/generator.js +166 -0
- package/src/utils/display.js +95 -0
- package/src/utils/readme.js +196 -0
- package/src/utils/tool-specific.js +233 -0
- package/templates/ai-engineering/agent-orchestration-planner/ SKILL.md +266 -0
- package/templates/ai-engineering/cost-latency-optimizer/ SKILL.md +270 -0
- package/templates/ai-engineering/doc-to-vector-dataset-generator/ SKILL.md +239 -0
- package/templates/ai-engineering/evaluation-harness/ SKILL.md +219 -0
- package/templates/ai-engineering/guardrails-safety-filter-builder/ SKILL.md +226 -0
- package/templates/ai-engineering/llm-debugger/ SKILL.md +283 -0
- package/templates/ai-engineering/prompt-regression-tester/ SKILL.md +216 -0
- package/templates/ai-engineering/prompt-template-builder/ SKILL.md +393 -0
- package/templates/ai-engineering/rag-pipeline-builder/ SKILL.md +244 -0
- package/templates/ai-engineering/tool-function-schema-designer/ SKILL.md +219 -0
- package/templates/architecture/adr-writer/ SKILL.md +250 -0
- package/templates/architecture/api-versioning-deprecation-planner/ SKILL.md +331 -0
- package/templates/architecture/domain-model-boundaries-mapper/ SKILL.md +300 -0
- package/templates/architecture/migration-planner/ SKILL.md +376 -0
- package/templates/architecture/performance-budget-setter/ SKILL.md +318 -0
- package/templates/architecture/reliability-strategy-builder/ SKILL.md +286 -0
- package/templates/architecture/rfc-generator/ SKILL.md +362 -0
- package/templates/architecture/scalability-playbook/ SKILL.md +279 -0
- package/templates/architecture/system-design-generator/ SKILL.md +339 -0
- package/templates/architecture/tech-debt-prioritizer/ SKILL.md +329 -0
- package/templates/backend/api-contract-normalizer/ SKILL.md +487 -0
- package/templates/backend/api-endpoint-generator/ SKILL.md +415 -0
- package/templates/backend/auth-module-builder/ SKILL.md +99 -0
- package/templates/backend/background-jobs-designer/ SKILL.md +166 -0
- package/templates/backend/caching-strategist/ SKILL.md +190 -0
- package/templates/backend/error-handling-standardizer/ SKILL.md +174 -0
- package/templates/backend/rate-limiting-abuse-protection/ SKILL.md +147 -0
- package/templates/backend/rbac-permissions-builder/ SKILL.md +158 -0
- package/templates/backend/service-layer-extractor/ SKILL.md +269 -0
- package/templates/backend/webhook-receiver-hardener/ SKILL.md +211 -0
- package/templates/ci-cd/artifact-sbom-publisher/ SKILL.md +236 -0
- package/templates/ci-cd/caching-strategy-optimizer/ SKILL.md +195 -0
- package/templates/ci-cd/deployment-checklist-generator/ SKILL.md +381 -0
- package/templates/ci-cd/github-actions-pipeline-creator/ SKILL.md +348 -0
- package/templates/ci-cd/monorepo-ci-optimizer/ SKILL.md +298 -0
- package/templates/ci-cd/preview-environments-builder/ SKILL.md +187 -0
- package/templates/ci-cd/quality-gates-enforcer/ SKILL.md +342 -0
- package/templates/ci-cd/release-automation-builder/ SKILL.md +281 -0
- package/templates/ci-cd/rollback-workflow-builder/ SKILL.md +372 -0
- package/templates/ci-cd/secrets-env-manager/ SKILL.md +242 -0
- package/templates/db-management/backup-restore-runbook-generator/ SKILL.md +505 -0
- package/templates/db-management/data-integrity-auditor/ SKILL.md +505 -0
- package/templates/db-management/data-retention-archiving-planner/ SKILL.md +430 -0
- package/templates/db-management/data-seeding-fixtures-builder/ SKILL.md +375 -0
- package/templates/db-management/db-performance-watchlist/ SKILL.md +425 -0
- package/templates/db-management/etl-sync-job-builder/ SKILL.md +457 -0
- package/templates/db-management/multi-tenant-safety-checker/ SKILL.md +398 -0
- package/templates/db-management/prisma-migration-assistant/ SKILL.md +379 -0
- package/templates/db-management/schema-consistency-checker/ SKILL.md +440 -0
- package/templates/db-management/sql-query-optimizer/ SKILL.md +324 -0
- package/templates/foundation/changelog-writer/ SKILL.md +431 -0
- package/templates/foundation/code-formatter-installer/ SKILL.md +320 -0
- package/templates/foundation/codebase-summarizer/ SKILL.md +360 -0
- package/templates/foundation/dependency-doctor/ SKILL.md +163 -0
- package/templates/foundation/dev-environment-bootstrapper/ SKILL.md +259 -0
- package/templates/foundation/dev-onboarding-builder/ SKILL.md +556 -0
- package/templates/foundation/docs-starter-kit/ SKILL.md +574 -0
- package/templates/foundation/explaining-code/SKILL.md +13 -0
- package/templates/foundation/git-hygiene-enforcer/ SKILL.md +455 -0
- package/templates/foundation/project-scaffolder/ SKILL.md +65 -0
- package/templates/foundation/project-scaffolder/references/templates.md +126 -0
- package/templates/foundation/repo-structure-linter/ SKILL.md +0 -0
- package/templates/foundation/repo-structure-linter/references/conventions.md +98 -0
- package/templates/frontend/animation-micro-interaction-pack/ SKILL.md +41 -0
- package/templates/frontend/component-scaffold-generator/ SKILL.md +562 -0
- package/templates/frontend/design-to-component-translator/ SKILL.md +547 -0
- package/templates/frontend/form-wizard-builder/ SKILL.md +553 -0
- package/templates/frontend/frontend-refactor-planner/ SKILL.md +37 -0
- package/templates/frontend/i18n-frontend-implementer/ SKILL.md +44 -0
- package/templates/frontend/modal-drawer-system/ SKILL.md +377 -0
- package/templates/frontend/page-layout-builder/ SKILL.md +630 -0
- package/templates/frontend/state-ux-flow-builder/ SKILL.md +23 -0
- package/templates/frontend/table-builder/ SKILL.md +350 -0
- package/templates/performance/alerting-dashboard-builder/ SKILL.md +162 -0
- package/templates/performance/backend-latency-profiler-helper/ SKILL.md +108 -0
- package/templates/performance/caching-cdn-strategy-planner/ SKILL.md +150 -0
- package/templates/performance/capacity-planning-helper/ SKILL.md +242 -0
- package/templates/performance/core-web-vitals-tuner/ SKILL.md +126 -0
- package/templates/performance/incident-runbook-generator/ SKILL.md +162 -0
- package/templates/performance/load-test-scenario-builder/ SKILL.md +256 -0
- package/templates/performance/observability-setup/ SKILL.md +232 -0
- package/templates/performance/postmortem-writer/ SKILL.md +203 -0
- package/templates/performance/structured-logging-standardizer/ SKILL.md +122 -0
- package/templates/security/auth-security-reviewer/ SKILL.md +428 -0
- package/templates/security/dependency-vulnerability-triage/ SKILL.md +495 -0
- package/templates/security/input-validation-sanitization-auditor/ SKILL.md +76 -0
- package/templates/security/pii-redaction-logging-policy-builder/ SKILL.md +65 -0
- package/templates/security/rbac-policy-tester/ SKILL.md +80 -0
- package/templates/security/secrets-scanner/ SKILL.md +462 -0
- package/templates/security/secure-headers-csp-builder/ SKILL.md +404 -0
- package/templates/security/security-incident-playbook-generator/ SKILL.md +76 -0
- package/templates/security/security-pr-checklist-skill/ SKILL.md +62 -0
- package/templates/security/threat-model-generator/ SKILL.md +394 -0
- package/templates/testing/contract-testing-builder/ SKILL.md +492 -0
- package/templates/testing/coverage-strategist/ SKILL.md +436 -0
- package/templates/testing/e2e-test-builder/ SKILL.md +382 -0
- package/templates/testing/flaky-test-detective/ SKILL.md +416 -0
- package/templates/testing/integration-test-builder/ SKILL.md +525 -0
- package/templates/testing/mocking-assistant/ SKILL.md +383 -0
- package/templates/testing/snapshot-test-refactorer/ SKILL.md +375 -0
- package/templates/testing/test-data-factory-builder/ SKILL.md +449 -0
- package/templates/testing/test-reporting-triage-skill/ SKILL.md +469 -0
- package/templates/testing/unit-test-generator/ SKILL.md +548 -0
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: load-test-scenario-builder
|
|
3
|
+
description: Creates comprehensive load test plans with realistic scenarios, traffic models, k6 scripts, and success criteria. Use for "load testing", "performance testing", "capacity validation", or "stress testing".
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Load Test Scenario Builder
|
|
7
|
+
|
|
8
|
+
Validate system capacity with realistic load tests.
|
|
9
|
+
|
|
10
|
+
## Load Test Scenarios
|
|
11
|
+
|
|
12
|
+
```typescript
|
|
13
|
+
interface LoadTestScenario {
|
|
14
|
+
name: string;
|
|
15
|
+
description: string;
|
|
16
|
+
virtualUsers: number;
|
|
17
|
+
duration: string;
|
|
18
|
+
rampUp: string;
|
|
19
|
+
successCriteria: {
|
|
20
|
+
p95Latency: number;
|
|
21
|
+
errorRate: number;
|
|
22
|
+
throughput: number;
|
|
23
|
+
};
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
const scenarios: LoadTestScenario[] = [
|
|
27
|
+
{
|
|
28
|
+
name: "Baseline Load",
|
|
29
|
+
description: "Normal traffic pattern",
|
|
30
|
+
virtualUsers: 100,
|
|
31
|
+
duration: "10m",
|
|
32
|
+
rampUp: "2m",
|
|
33
|
+
successCriteria: {
|
|
34
|
+
p95Latency: 500, // ms
|
|
35
|
+
errorRate: 0.01, // 1%
|
|
36
|
+
throughput: 1000, // req/s
|
|
37
|
+
},
|
|
38
|
+
},
|
|
39
|
+
{
|
|
40
|
+
name: "Peak Load",
|
|
41
|
+
description: "Black Friday traffic",
|
|
42
|
+
virtualUsers: 1000,
|
|
43
|
+
duration: "30m",
|
|
44
|
+
rampUp: "5m",
|
|
45
|
+
successCriteria: {
|
|
46
|
+
p95Latency: 2000,
|
|
47
|
+
errorRate: 0.05,
|
|
48
|
+
throughput: 5000,
|
|
49
|
+
},
|
|
50
|
+
},
|
|
51
|
+
{
|
|
52
|
+
name: "Stress Test",
|
|
53
|
+
description: "Find breaking point",
|
|
54
|
+
virtualUsers: 5000,
|
|
55
|
+
duration: "20m",
|
|
56
|
+
rampUp: "10m",
|
|
57
|
+
successCriteria: {
|
|
58
|
+
p95Latency: 5000,
|
|
59
|
+
errorRate: 0.1,
|
|
60
|
+
throughput: 10000,
|
|
61
|
+
},
|
|
62
|
+
},
|
|
63
|
+
];
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## K6 Load Test Script
|
|
67
|
+
|
|
68
|
+
```javascript
|
|
69
|
+
// load-tests/checkout-flow.js
|
|
70
|
+
import http from "k6/http";
|
|
71
|
+
import { check, sleep } from "k6";
|
|
72
|
+
import { Rate } from "k6/metrics";
|
|
73
|
+
|
|
74
|
+
const errorRate = new Rate("errors");
|
|
75
|
+
|
|
76
|
+
export let options = {
|
|
77
|
+
stages: [
|
|
78
|
+
{ duration: "2m", target: 100 }, // Ramp up
|
|
79
|
+
{ duration: "10m", target: 100 }, // Stay at 100
|
|
80
|
+
{ duration: "2m", target: 0 }, // Ramp down
|
|
81
|
+
],
|
|
82
|
+
thresholds: {
|
|
83
|
+
http_req_duration: ["p(95)<500"], // 95% under 500ms
|
|
84
|
+
errors: ["rate<0.01"], // Error rate <1%
|
|
85
|
+
},
|
|
86
|
+
};
|
|
87
|
+
|
|
88
|
+
export default function () {
|
|
89
|
+
// 1. Browse products
|
|
90
|
+
let browseRes = http.get("https://api.example.com/products");
|
|
91
|
+
check(browseRes, {
|
|
92
|
+
"browse status 200": (r) => r.status === 200,
|
|
93
|
+
}) ? errorRate.add(0) : errorRate.add(1); // record successes too, so the rate is failures / total
|
|
94
|
+
sleep(1);
|
|
95
|
+
|
|
96
|
+
// 2. Add to cart
|
|
97
|
+
let addCartRes = http.post(
|
|
98
|
+
"https://api.example.com/cart",
|
|
99
|
+
JSON.stringify({
|
|
100
|
+
productId: "123",
|
|
101
|
+
quantity: 1,
|
|
102
|
+
}),
|
|
103
|
+
{
|
|
104
|
+
headers: { "Content-Type": "application/json" },
|
|
105
|
+
}
|
|
106
|
+
);
|
|
107
|
+
check(addCartRes, {
|
|
108
|
+
"add cart status 201": (r) => r.status === 201,
|
|
109
|
+
}) ? errorRate.add(0) : errorRate.add(1); // record successes too, so the rate is failures / total
|
|
110
|
+
sleep(2);
|
|
111
|
+
|
|
112
|
+
// 3. Checkout
|
|
113
|
+
let checkoutRes = http.post(
|
|
114
|
+
"https://api.example.com/checkout",
|
|
115
|
+
JSON.stringify({
|
|
116
|
+
paymentMethod: "card",
|
|
117
|
+
}),
|
|
118
|
+
{
|
|
119
|
+
headers: { "Content-Type": "application/json" },
|
|
120
|
+
}
|
|
121
|
+
);
|
|
122
|
+
check(checkoutRes, {
|
|
123
|
+
"checkout status 200": (r) => r.status === 200,
|
|
124
|
+
"checkout success": (r) => r.json("status") === "success",
|
|
125
|
+
}) ? errorRate.add(0) : errorRate.add(1); // record successes too, so the rate is failures / total
|
|
126
|
+
sleep(3);
|
|
127
|
+
}
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Traffic Models
|
|
131
|
+
|
|
132
|
+
```javascript
|
|
133
|
+
// Realistic traffic patterns
|
|
134
|
+
export const trafficModels = {
|
|
135
|
+
// Steady state
|
|
136
|
+
steadyState: {
|
|
137
|
+
stages: [{ duration: "30m", target: 500 }],
|
|
138
|
+
},
|
|
139
|
+
|
|
140
|
+
// Gradual ramp
|
|
141
|
+
gradualRamp: {
|
|
142
|
+
stages: [
|
|
143
|
+
{ duration: "5m", target: 100 },
|
|
144
|
+
{ duration: "5m", target: 300 },
|
|
145
|
+
{ duration: "5m", target: 500 },
|
|
146
|
+
{ duration: "10m", target: 500 },
|
|
147
|
+
{ duration: "5m", target: 0 },
|
|
148
|
+
],
|
|
149
|
+
},
|
|
150
|
+
|
|
151
|
+
// Spike test
|
|
152
|
+
spikeTest: {
|
|
153
|
+
stages: [
|
|
154
|
+
{ duration: "2m", target: 100 },
|
|
155
|
+
{ duration: "1m", target: 2000 }, // Sudden spike
|
|
156
|
+
{ duration: "2m", target: 100 },
|
|
157
|
+
],
|
|
158
|
+
},
|
|
159
|
+
|
|
160
|
+
// Soak test (endurance)
|
|
161
|
+
soakTest: {
|
|
162
|
+
stages: [
|
|
163
|
+
{ duration: "5m", target: 500 },
|
|
164
|
+
{ duration: "4h", target: 500 }, // Long duration
|
|
165
|
+
{ duration: "5m", target: 0 },
|
|
166
|
+
],
|
|
167
|
+
},
|
|
168
|
+
};
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## Success Thresholds
|
|
172
|
+
|
|
173
|
+
```javascript
|
|
174
|
+
export const thresholds = {
|
|
175
|
+
// Latency
|
|
176
|
+
http_req_duration: [
|
|
177
|
+
"p(50)<200", // 50% under 200ms
|
|
178
|
+
"p(95)<500", // 95% under 500ms
|
|
179
|
+
"p(99)<1000", // 99% under 1s
|
|
180
|
+
],
|
|
181
|
+
|
|
182
|
+
// Error rate
|
|
183
|
+
http_req_failed: ["rate<0.01"], // <1% errors
|
|
184
|
+
|
|
185
|
+
// Throughput
|
|
186
|
+
http_reqs: ["rate>1000"], // >1000 req/s
|
|
187
|
+
|
|
188
|
+
// Custom metrics
|
|
189
|
+
checkout_duration: ["p(95)<2000"],
|
|
190
|
+
checkout_success_rate: ["rate>0.95"],
|
|
191
|
+
};
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Running Load Tests
|
|
195
|
+
|
|
196
|
+
```bash
|
|
197
|
+
#!/bin/bash
|
|
198
|
+
# scripts/run-load-tests.sh
|
|
199
|
+
|
|
200
|
+
echo "Running load tests..."
|
|
201
|
+
|
|
202
|
+
# Baseline test
|
|
203
|
+
k6 run --vus 100 --duration 10m load-tests/checkout-flow.js
|
|
204
|
+
|
|
205
|
+
# Peak load test
|
|
206
|
+
k6 run --vus 1000 --duration 30m load-tests/checkout-flow.js
|
|
207
|
+
|
|
208
|
+
# Stress test (find breaking point)
|
|
209
|
+
k6 run --vus 5000 --duration 20m load-tests/stress-test.js
|
|
210
|
+
|
|
211
|
+
# Generate report
|
|
212
|
+
k6 run --out json=results.json load-tests/checkout-flow.js
|
|
213
|
+
k6 run --out influxdb=http://localhost:8086 load-tests/checkout-flow.js
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
## Result Analysis
|
|
217
|
+
|
|
218
|
+
```typescript
|
|
219
|
+
interface LoadTestResult {
|
|
220
|
+
scenario: string;
|
|
221
|
+
timestamp: Date;
|
|
222
|
+
metrics: {
|
|
223
|
+
p50Latency: number;
|
|
224
|
+
p95Latency: number;
|
|
225
|
+
p99Latency: number;
|
|
226
|
+
errorRate: number;
|
|
227
|
+
throughput: number;
|
|
228
|
+
maxVUs: number;
|
|
229
|
+
};
|
|
230
|
+
passed: boolean;
|
|
231
|
+
notes: string[];
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
function analyzeResults(results: LoadTestResult) {
|
|
235
|
+
console.log(`Load Test: ${results.scenario}`);
|
|
236
|
+
console.log(`Status: ${results.passed ? '✅ PASS' : '❌ FAIL'}`);
|
|
237
|
+
console.log(`p95 Latency: ${results.metrics.p95Latency}ms`);
|
|
238
|
+
console.log(`Error Rate: ${(results.metrics.errorRate * 100).toFixed(2)}%`);
|
|
239
|
+
console.log(`Throughput: ${results.metrics.throughput} req/s`);
|
|
240
|
+
|
|
241
|
+
if (!results.passed) {
|
|
242
|
+
console.log('Failed criteria:');
|
|
243
|
+
results.notes.forEach(note => console.log(` - ${note}`));
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
## Output Checklist
|
|
249
|
+
|
|
250
|
+
- [ ] Scenarios defined
|
|
251
|
+
- [ ] k6 scripts created
|
|
252
|
+
- [ ] Traffic models configured
|
|
253
|
+
- [ ] Success criteria set
|
|
254
|
+
- [ ] CI integration
|
|
255
|
+
- [ ] Results analysis
|
|
256
|
+
ENDFILE
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: observability-setup
|
|
3
|
+
description: Implements comprehensive observability with OpenTelemetry tracing, Prometheus metrics, and structured logging. Includes instrumentation plans, sample dashboards, and alert candidates. Use for "observability", "monitoring", "tracing", or "metrics".
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Observability Setup
|
|
7
|
+
|
|
8
|
+
Implement the three pillars: Traces, Metrics, and Logs.
|
|
9
|
+
|
|
10
|
+
## OpenTelemetry Tracing
|
|
11
|
+
|
|
12
|
+
```typescript
|
|
13
|
+
// tracing.ts
|
|
14
|
+
import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
|
|
15
|
+
import { Resource } from "@opentelemetry/resources";
|
|
16
|
+
import { SemanticResourceAttributes } from "@opentelemetry/semantic-conventions";
|
|
17
|
+
import { registerInstrumentations } from "@opentelemetry/instrumentation";
|
|
18
|
+
import { HttpInstrumentation } from "@opentelemetry/instrumentation-http";
|
|
19
|
+
import { ExpressInstrumentation } from "@opentelemetry/instrumentation-express";
|
|
20
|
+
import { PrismaInstrumentation } from "@prisma/instrumentation";
|
|
21
|
+
|
|
22
|
+
const provider = new NodeTracerProvider({
|
|
23
|
+
resource: new Resource({
|
|
24
|
+
[SemanticResourceAttributes.SERVICE_NAME]: "my-api",
|
|
25
|
+
[SemanticResourceAttributes.SERVICE_VERSION]: "1.0.0",
|
|
26
|
+
}),
|
|
27
|
+
});
|
|
28
|
+
|
|
29
|
+
registerInstrumentations({
|
|
30
|
+
instrumentations: [
|
|
31
|
+
new HttpInstrumentation(),
|
|
32
|
+
new ExpressInstrumentation(),
|
|
33
|
+
new PrismaInstrumentation(),
|
|
34
|
+
],
|
|
35
|
+
});
|
|
36
|
+
|
|
37
|
+
provider.register();
|
|
38
|
+
|
|
39
|
+
// Custom spans
|
|
40
|
+
import { trace } from "@opentelemetry/api";
|
|
41
|
+
|
|
42
|
+
const tracer = trace.getTracer("my-app");
|
|
43
|
+
|
|
44
|
+
async function processOrder(orderId: string) {
|
|
45
|
+
const span = tracer.startSpan("processOrder");
|
|
46
|
+
span.setAttribute("order.id", orderId);
|
|
47
|
+
|
|
48
|
+
try {
|
|
49
|
+
await validateOrder(orderId);
|
|
50
|
+
await chargePayment(orderId);
|
|
51
|
+
await fulfillOrder(orderId);
|
|
52
|
+
span.setStatus({ code: 1 }); // OK (SpanStatusCode.OK = 1; 0 is UNSET)
|
|
53
|
+
} catch (error) {
|
|
54
|
+
span.setStatus({ code: 2, message: error.message }); // ERROR
|
|
55
|
+
throw error;
|
|
56
|
+
} finally {
|
|
57
|
+
span.end();
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Prometheus Metrics
|
|
63
|
+
|
|
64
|
+
```typescript
|
|
65
|
+
// metrics.ts
|
|
66
|
+
import { Registry, Counter, Histogram, Gauge } from "prom-client";
|
|
67
|
+
|
|
68
|
+
const register = new Registry();
|
|
69
|
+
|
|
70
|
+
// HTTP request counter
|
|
71
|
+
export const httpRequestCounter = new Counter({
|
|
72
|
+
name: "http_requests_total",
|
|
73
|
+
help: "Total HTTP requests",
|
|
74
|
+
labelNames: ["method", "route", "status_code"],
|
|
75
|
+
registers: [register],
|
|
76
|
+
});
|
|
77
|
+
|
|
78
|
+
// HTTP request duration
|
|
79
|
+
export const httpRequestDuration = new Histogram({
|
|
80
|
+
name: "http_request_duration_seconds",
|
|
81
|
+
help: "HTTP request duration in seconds",
|
|
82
|
+
labelNames: ["method", "route", "status_code"],
|
|
83
|
+
buckets: [0.1, 0.5, 1, 2, 5, 10],
|
|
84
|
+
registers: [register],
|
|
85
|
+
});
|
|
86
|
+
|
|
87
|
+
// Active connections
|
|
88
|
+
export const activeConnections = new Gauge({
|
|
89
|
+
name: "active_connections",
|
|
90
|
+
help: "Number of active connections",
|
|
91
|
+
registers: [register],
|
|
92
|
+
});
|
|
93
|
+
|
|
94
|
+
// Business metrics
|
|
95
|
+
export const ordersProcessed = new Counter({
|
|
96
|
+
name: "orders_processed_total",
|
|
97
|
+
help: "Total orders processed",
|
|
98
|
+
labelNames: ["status"],
|
|
99
|
+
registers: [register],
|
|
100
|
+
});
|
|
101
|
+
|
|
102
|
+
// Middleware
|
|
103
|
+
app.use((req, res, next) => {
|
|
104
|
+
const start = Date.now();
|
|
105
|
+
|
|
106
|
+
res.on("finish", () => {
|
|
107
|
+
const duration = (Date.now() - start) / 1000;
|
|
108
|
+
const route = req.route?.path || "unknown";
|
|
109
|
+
|
|
110
|
+
httpRequestCounter.inc({
|
|
111
|
+
method: req.method,
|
|
112
|
+
route,
|
|
113
|
+
status_code: res.statusCode,
|
|
114
|
+
});
|
|
115
|
+
|
|
116
|
+
httpRequestDuration.observe(
|
|
117
|
+
{ method: req.method, route, status_code: res.statusCode },
|
|
118
|
+
duration
|
|
119
|
+
);
|
|
120
|
+
});
|
|
121
|
+
|
|
122
|
+
next();
|
|
123
|
+
});
|
|
124
|
+
|
|
125
|
+
// Metrics endpoint
|
|
126
|
+
app.get("/metrics", async (req, res) => {
|
|
127
|
+
res.set("Content-Type", register.contentType);
|
|
128
|
+
res.end(await register.metrics());
|
|
129
|
+
});
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Structured Logging
|
|
133
|
+
|
|
134
|
+
```typescript
|
|
135
|
+
// logger.ts
|
|
136
|
+
import pino from "pino";
|
|
137
|
+
|
|
138
|
+
export const logger = pino({
|
|
139
|
+
level: process.env.LOG_LEVEL || "info",
|
|
140
|
+
formatters: {
|
|
141
|
+
level: (label) => ({ level: label }),
|
|
142
|
+
},
|
|
143
|
+
base: {
|
|
144
|
+
service: "my-api",
|
|
145
|
+
environment: process.env.NODE_ENV,
|
|
146
|
+
},
|
|
147
|
+
});
|
|
148
|
+
|
|
149
|
+
// Usage
|
|
150
|
+
logger.info({ userId: "123", action: "login" }, "User logged in");
|
|
151
|
+
logger.error({ err: error, orderId: "456" }, "Order processing failed");
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## Sample Dashboard (Grafana)
|
|
155
|
+
|
|
156
|
+
```json
|
|
157
|
+
{
|
|
158
|
+
"dashboard": {
|
|
159
|
+
"title": "API Overview",
|
|
160
|
+
"panels": [
|
|
161
|
+
{
|
|
162
|
+
"title": "Request Rate",
|
|
163
|
+
"targets": [{
|
|
164
|
+
"expr": "rate(http_requests_total[5m])"
|
|
165
|
+
}]
|
|
166
|
+
},
|
|
167
|
+
{
|
|
168
|
+
"title": "Error Rate",
|
|
169
|
+
"targets": [{
|
|
170
|
+
"expr": "rate(http_requests_total{status_code=~\"5..\"}[5m])"
|
|
171
|
+
}]
|
|
172
|
+
},
|
|
173
|
+
{
|
|
174
|
+
"title": "p95 Latency",
|
|
175
|
+
"targets": [{
|
|
176
|
+
"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))"
|
|
177
|
+
}]
|
|
178
|
+
},
|
|
179
|
+
{
|
|
180
|
+
"title": "Active Connections",
|
|
181
|
+
"targets": [{
|
|
182
|
+
"expr": "active_connections"
|
|
183
|
+
}]
|
|
184
|
+
}
|
|
185
|
+
]
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## Alert Candidates
|
|
191
|
+
|
|
192
|
+
```yaml
|
|
193
|
+
# alerts.yml
|
|
194
|
+
groups:
|
|
195
|
+
- name: api_alerts
|
|
196
|
+
interval: 30s
|
|
197
|
+
rules:
|
|
198
|
+
- alert: HighErrorRate
|
|
199
|
+
expr: rate(http_requests_total{status_code=~"5.."}[5m]) > 0.05
|
|
200
|
+
for: 5m
|
|
201
|
+
labels:
|
|
202
|
+
severity: critical
|
|
203
|
+
annotations:
|
|
204
|
+
summary: "High error rate detected"
|
|
205
|
+
|
|
206
|
+
- alert: HighLatency
|
|
207
|
+
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
|
|
208
|
+
for: 10m
|
|
209
|
+
labels:
|
|
210
|
+
severity: warning
|
|
211
|
+
annotations:
|
|
212
|
+
summary: "p95 latency above 2s"
|
|
213
|
+
|
|
214
|
+
- alert: LowAvailability
|
|
215
|
+
expr: sum(rate(http_requests_total{status_code="200"}[5m])) / sum(rate(http_requests_total[5m])) < 0.95
|
|
216
|
+
for: 5m
|
|
217
|
+
labels:
|
|
218
|
+
severity: critical
|
|
219
|
+
annotations:
|
|
220
|
+
summary: "Availability below 95%"
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
## Output Checklist
|
|
224
|
+
|
|
225
|
+
- [ ] OpenTelemetry tracing configured
|
|
226
|
+
- [ ] Prometheus metrics instrumented
|
|
227
|
+
- [ ] Structured logging implemented
|
|
228
|
+
- [ ] Sample dashboards created
|
|
229
|
+
- [ ] Alert rules defined
|
|
230
|
+
- [ ] Metrics endpoint exposed
|
|
231
|
+
- [ ] Instrumentation tested
|
|
232
|
+
ENDFILE
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: postmortem-writer
|
|
3
|
+
description: Creates comprehensive post-incident documents with timeline, root cause analysis, contributing factors, action items, and ownership. Follows SRE best practices for blameless postmortems. Use for "postmortem", "incident review", "RCA", or "post-incident".
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Postmortem Writer
|
|
7
|
+
|
|
8
|
+
Document incidents for learning and improvement.
|
|
9
|
+
|
|
10
|
+
## Postmortem Template
|
|
11
|
+
|
|
12
|
+
```markdown
|
|
13
|
+
# Postmortem: API Outage - Database Connection Pool Exhausted
|
|
14
|
+
|
|
15
|
+
**Date:** 2024-01-15
|
|
16
|
+
**Authors:** Jane Doe (On-call), John Smith (DBA)
|
|
17
|
+
**Status:** Complete
|
|
18
|
+
**Severity:** P1 (Critical)
|
|
19
|
+
|
|
20
|
+
## Summary
|
|
21
|
+
|
|
22
|
+
On January 15, 2024, our API experienced a complete outage for 25 minutes (14:32 - 14:57 UTC) affecting 100% of users. The root cause was database connection pool exhaustion triggered by a connection leak introduced in deployment v2.3.4.
|
|
23
|
+
|
|
24
|
+
**Impact:**
|
|
25
|
+
|
|
26
|
+
- Duration: 25 minutes
|
|
27
|
+
- Users affected: ~50,000
|
|
28
|
+
- Requests failed: ~125,000
|
|
29
|
+
- Revenue impact: ~$15,000
|
|
30
|
+
|
|
31
|
+
## Timeline (All times UTC)
|
|
32
|
+
|
|
33
|
+
| Time | Event |
|
|
34
|
+
| ----- | ------------------------------------------------ |
|
|
35
|
+
| 14:15 | v2.3.4 deployed to production |
|
|
36
|
+
| 14:32 | First CloudWatch alarm: HighErrorRate |
|
|
37
|
+
| 14:33 | PagerDuty alert sent to on-call (Jane) |
|
|
38
|
+
| 14:35 | Jane acknowledges, begins investigation |
|
|
39
|
+
| 14:38 | Identified: Database connection pool at 100% |
|
|
40
|
+
| 14:40 | Attempted: Kill long-running queries (no effect) |
|
|
41
|
+
| 14:43 | Decision: Rollback to v2.3.3 |
|
|
42
|
+
| 14:45 | Rollback initiated |
|
|
43
|
+
| 14:47 | Rollback complete, connections dropping |
|
|
44
|
+
| 14:50 | Error rate returning to normal |
|
|
45
|
+
| 14:57 | All systems recovered, incident closed |
|
|
46
|
+
| 15:30 | Postmortem meeting scheduled |
|
|
47
|
+
|
|
48
|
+
## Root Cause
|
|
49
|
+
|
|
50
|
+
A code change in v2.3.4 introduced a connection leak in the user profile endpoint. The new caching layer was not properly releasing database connections after queries completed.
|
|
51
|
+
|
|
52
|
+
**Code diff:**
|
|
53
|
+
\`\`\`diff
|
|
54
|
+
|
|
55
|
+
- await prisma.user.findUnique({ where: { id } });
|
|
56
|
+
|
|
57
|
+
+ const client = await pool.connect();
|
|
58
|
+
+ const user = await client.query('SELECT * FROM users WHERE id = $1', [id]);
|
|
59
|
+
+ // Missing: client.release() ❌
|
|
60
|
+
\`\`\`
|
|
61
|
+
|
|
62
|
+
## Contributing Factors
|
|
63
|
+
|
|
64
|
+
1. **Insufficient testing:** Load tests didn't catch the leak
|
|
65
|
+
|
|
66
|
+
- Tests only ran for 5 minutes
|
|
67
|
+
- Not enough concurrent connections to exhaust pool
|
|
68
|
+
|
|
69
|
+
2. **Missing monitoring:** No alerts on connection pool metrics
|
|
70
|
+
|
|
71
|
+
- Had alarms for query latency
|
|
72
|
+
- No alarms for active connections count
|
|
73
|
+
|
|
74
|
+
3. **Inadequate code review:** Reviewer didn't spot missing release()
|
|
75
|
+
|
|
76
|
+
- PR approved without running locally
|
|
77
|
+
- No checklist for connection management
|
|
78
|
+
|
|
79
|
+
4. **Deployment process:** No gradual rollout
|
|
80
|
+
- Deployed to 100% of production immediately
|
|
81
|
+
- No canary deployment
|
|
82
|
+
|
|
83
|
+
## What Went Well
|
|
84
|
+
|
|
85
|
+
1. ✅ **Fast response:** On-call acknowledged within 3 minutes of the first alarm
|
|
86
|
+
2. ✅ **Clear runbook:** DBA runbook had exact steps to follow
|
|
87
|
+
3. ✅ **Quick decision:** Made rollback decision in 8 minutes
|
|
88
|
+
4. ✅ **Communication:** Status page updated every 5 minutes
|
|
89
|
+
5. ✅ **Rollback capability:** Automated rollback took <2 minutes
|
|
90
|
+
|
|
91
|
+
## What Went Wrong
|
|
92
|
+
|
|
93
|
+
1. ❌ **Code review missed bug:** Connection leak not caught
|
|
94
|
+
2. ❌ **Testing gaps:** Load tests insufficient duration
|
|
95
|
+
3. ❌ **No canary:** Deployed to all instances at once
|
|
96
|
+
4. ❌ **Late detection:** 17 minutes between deploy and alert
|
|
97
|
+
|
|
98
|
+
## Action Items
|
|
99
|
+
|
|
100
|
+
| Action | Owner | Due Date | Priority | Status |
|
|
101
|
+
| --------------------------------------------- | ------- | ---------- | -------- | -------------- |
|
|
102
|
+
| Add connection pool metrics to dashboards | Jane | 2024-01-20 | P0 | ✅ Done |
|
|
103
|
+
| Create PR checklist for connection management | John | 2024-01-22 | P0 | ✅ Done |
|
|
104
|
+
| Extend load tests to 30 minutes minimum | QA Team | 2024-01-25 | P1 | 🔄 In Progress |
|
|
105
|
+
| Implement canary deployment (10% → 100%) | DevOps | 2024-02-01 | P1 | 📋 Planned |
|
|
106
|
+
| Add connection leak detection to tests | Jane | 2024-01-27 | P1 | 🔄 In Progress |
|
|
107
|
+
| Review all DB connection usage patterns | John | 2024-02-05 | P2 | 📋 Planned |
|
|
108
|
+
| Improve alert routing (faster escalation) | DevOps | 2024-02-10 | P2 | 📋 Planned |
|
|
109
|
+
|
|
110
|
+
## Lessons Learned
|
|
111
|
+
|
|
112
|
+
1. **Code review checklists work:** Need specific items for common issues
|
|
113
|
+
2. **Load tests need realistic duration:** 5min insufficient for leaks
|
|
114
|
+
3. **Gradual rollouts catch issues:** 10% canary would have limited impact
|
|
115
|
+
4. **Monitoring gaps are dangerous:** Add metrics before you need them
|
|
116
|
+
5. **Runbooks save time:** Clear procedures enabled fast response
|
|
117
|
+
|
|
118
|
+
## Related Incidents
|
|
119
|
+
|
|
120
|
+
- [2023-11-20] Database CPU spike (similar connection pool issue)
|
|
121
|
+
- [2023-08-15] Memory leak in cache layer
|
|
122
|
+
|
|
123
|
+
## Prevention
|
|
124
|
+
|
|
125
|
+
To prevent similar incidents:
|
|
126
|
+
|
|
127
|
+
1. ✅ Add connection management to code review checklist
|
|
128
|
+
2. ✅ Monitor connection pool utilization
|
|
129
|
+
3. ✅ Extend load test duration
|
|
130
|
+
4. ✅ Implement canary deployments
|
|
131
|
+
5. ✅ Add automated connection leak detection
|
|
132
|
+
|
|
133
|
+
## Appendix
|
|
134
|
+
|
|
135
|
+
### Monitoring Graphs
|
|
136
|
+
|
|
137
|
+
[Insert graphs of connection pool, error rates, latency during incident]
|
|
138
|
+
|
|
139
|
+
### Communication Log
|
|
140
|
+
|
|
141
|
+
[Insert status page updates and customer communication]
|
|
142
|
+
|
|
143
|
+
### Code Fix
|
|
144
|
+
|
|
145
|
+
PR #1235: Fix connection leak in user profile endpoint
|
|
146
|
+
\`\`\`typescript
|
|
147
|
+
const client = await pool.connect();
|
|
148
|
+
try {
|
|
149
|
+
const user = await client.query('SELECT * FROM users WHERE id = $1', [id]);
|
|
150
|
+
return user;
|
|
151
|
+
} finally {
|
|
152
|
+
client.release(); // ✅ Always release
|
|
153
|
+
}
|
|
154
|
+
\`\`\`
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## Postmortem Best Practices
|
|
158
|
+
|
|
159
|
+
```markdown
|
|
160
|
+
# Blameless Postmortem Guidelines
|
|
161
|
+
|
|
162
|
+
## Do ✅
|
|
163
|
+
|
|
164
|
+
- Focus on systems and processes, not people
|
|
165
|
+
- Use timeline with exact timestamps
|
|
166
|
+
- Identify contributing factors, not just root cause
|
|
167
|
+
- Create actionable items with owners and dates
|
|
168
|
+
- Document what went well (positive reinforcement)
|
|
169
|
+
- Share widely for organizational learning
|
|
170
|
+
|
|
171
|
+
## Don't ❌
|
|
172
|
+
|
|
173
|
+
- Blame individuals or teams
|
|
174
|
+
- Hide or minimize the incident
|
|
175
|
+
- Skip the postmortem (even for small incidents)
|
|
176
|
+
- Create action items without owners
|
|
177
|
+
- Forget to follow up on action items
|
|
178
|
+
- Make it a blame session
|
|
179
|
+
|
|
180
|
+
## Template Sections
|
|
181
|
+
|
|
182
|
+
1. **Summary** (2-3 sentences)
|
|
183
|
+
2. **Impact** (numbers: users, revenue, duration)
|
|
184
|
+
3. **Timeline** (chronological events)
|
|
185
|
+
4. **Root Cause** (technical explanation)
|
|
186
|
+
5. **Contributing Factors** (broader context)
|
|
187
|
+
6. **What Went Well** (positive reinforcement)
|
|
188
|
+
7. **What Went Wrong** (improvement areas)
|
|
189
|
+
8. **Action Items** (concrete, owned, dated)
|
|
190
|
+
9. **Lessons Learned** (key takeaways)
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
## Output Checklist
|
|
194
|
+
|
|
195
|
+
- [ ] Timeline created
|
|
196
|
+
- [ ] Root cause identified
|
|
197
|
+
- [ ] Contributing factors documented
|
|
198
|
+
- [ ] Action items with owners
|
|
199
|
+
- [ ] Lessons learned captured
|
|
200
|
+
- [ ] Postmortem meeting held
|
|
201
|
+
- [ ] Document shared widely
|
|
202
|
+
- [ ] Follow-up scheduled
|
|
203
|
+
ENDFILE
|