@patricio0312rev/skillset 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/CHANGELOG.md +29 -0
  2. package/LICENSE +21 -0
  3. package/README.md +176 -0
  4. package/bin/cli.js +37 -0
  5. package/package.json +55 -0
  6. package/src/commands/init.js +301 -0
  7. package/src/index.js +168 -0
  8. package/src/lib/config.js +200 -0
  9. package/src/lib/generator.js +166 -0
  10. package/src/utils/display.js +95 -0
  11. package/src/utils/readme.js +196 -0
  12. package/src/utils/tool-specific.js +233 -0
  13. package/templates/ai-engineering/agent-orchestration-planner/ SKILL.md +266 -0
  14. package/templates/ai-engineering/cost-latency-optimizer/ SKILL.md +270 -0
  15. package/templates/ai-engineering/doc-to-vector-dataset-generator/ SKILL.md +239 -0
  16. package/templates/ai-engineering/evaluation-harness/ SKILL.md +219 -0
  17. package/templates/ai-engineering/guardrails-safety-filter-builder/ SKILL.md +226 -0
  18. package/templates/ai-engineering/llm-debugger/ SKILL.md +283 -0
  19. package/templates/ai-engineering/prompt-regression-tester/ SKILL.md +216 -0
  20. package/templates/ai-engineering/prompt-template-builder/ SKILL.md +393 -0
  21. package/templates/ai-engineering/rag-pipeline-builder/ SKILL.md +244 -0
  22. package/templates/ai-engineering/tool-function-schema-designer/ SKILL.md +219 -0
  23. package/templates/architecture/adr-writer/ SKILL.md +250 -0
  24. package/templates/architecture/api-versioning-deprecation-planner/ SKILL.md +331 -0
  25. package/templates/architecture/domain-model-boundaries-mapper/ SKILL.md +300 -0
  26. package/templates/architecture/migration-planner/ SKILL.md +376 -0
  27. package/templates/architecture/performance-budget-setter/ SKILL.md +318 -0
  28. package/templates/architecture/reliability-strategy-builder/ SKILL.md +286 -0
  29. package/templates/architecture/rfc-generator/ SKILL.md +362 -0
  30. package/templates/architecture/scalability-playbook/ SKILL.md +279 -0
  31. package/templates/architecture/system-design-generator/ SKILL.md +339 -0
  32. package/templates/architecture/tech-debt-prioritizer/ SKILL.md +329 -0
  33. package/templates/backend/api-contract-normalizer/ SKILL.md +487 -0
  34. package/templates/backend/api-endpoint-generator/ SKILL.md +415 -0
  35. package/templates/backend/auth-module-builder/ SKILL.md +99 -0
  36. package/templates/backend/background-jobs-designer/ SKILL.md +166 -0
  37. package/templates/backend/caching-strategist/ SKILL.md +190 -0
  38. package/templates/backend/error-handling-standardizer/ SKILL.md +174 -0
  39. package/templates/backend/rate-limiting-abuse-protection/ SKILL.md +147 -0
  40. package/templates/backend/rbac-permissions-builder/ SKILL.md +158 -0
  41. package/templates/backend/service-layer-extractor/ SKILL.md +269 -0
  42. package/templates/backend/webhook-receiver-hardener/ SKILL.md +211 -0
  43. package/templates/ci-cd/artifact-sbom-publisher/ SKILL.md +236 -0
  44. package/templates/ci-cd/caching-strategy-optimizer/ SKILL.md +195 -0
  45. package/templates/ci-cd/deployment-checklist-generator/ SKILL.md +381 -0
  46. package/templates/ci-cd/github-actions-pipeline-creator/ SKILL.md +348 -0
  47. package/templates/ci-cd/monorepo-ci-optimizer/ SKILL.md +298 -0
  48. package/templates/ci-cd/preview-environments-builder/ SKILL.md +187 -0
  49. package/templates/ci-cd/quality-gates-enforcer/ SKILL.md +342 -0
  50. package/templates/ci-cd/release-automation-builder/ SKILL.md +281 -0
  51. package/templates/ci-cd/rollback-workflow-builder/ SKILL.md +372 -0
  52. package/templates/ci-cd/secrets-env-manager/ SKILL.md +242 -0
  53. package/templates/db-management/backup-restore-runbook-generator/ SKILL.md +505 -0
  54. package/templates/db-management/data-integrity-auditor/ SKILL.md +505 -0
  55. package/templates/db-management/data-retention-archiving-planner/ SKILL.md +430 -0
  56. package/templates/db-management/data-seeding-fixtures-builder/ SKILL.md +375 -0
  57. package/templates/db-management/db-performance-watchlist/ SKILL.md +425 -0
  58. package/templates/db-management/etl-sync-job-builder/ SKILL.md +457 -0
  59. package/templates/db-management/multi-tenant-safety-checker/ SKILL.md +398 -0
  60. package/templates/db-management/prisma-migration-assistant/ SKILL.md +379 -0
  61. package/templates/db-management/schema-consistency-checker/ SKILL.md +440 -0
  62. package/templates/db-management/sql-query-optimizer/ SKILL.md +324 -0
  63. package/templates/foundation/changelog-writer/ SKILL.md +431 -0
  64. package/templates/foundation/code-formatter-installer/ SKILL.md +320 -0
  65. package/templates/foundation/codebase-summarizer/ SKILL.md +360 -0
  66. package/templates/foundation/dependency-doctor/ SKILL.md +163 -0
  67. package/templates/foundation/dev-environment-bootstrapper/ SKILL.md +259 -0
  68. package/templates/foundation/dev-onboarding-builder/ SKILL.md +556 -0
  69. package/templates/foundation/docs-starter-kit/ SKILL.md +574 -0
  70. package/templates/foundation/explaining-code/SKILL.md +13 -0
  71. package/templates/foundation/git-hygiene-enforcer/ SKILL.md +455 -0
  72. package/templates/foundation/project-scaffolder/ SKILL.md +65 -0
  73. package/templates/foundation/project-scaffolder/references/templates.md +126 -0
  74. package/templates/foundation/repo-structure-linter/ SKILL.md +0 -0
  75. package/templates/foundation/repo-structure-linter/references/conventions.md +98 -0
  76. package/templates/frontend/animation-micro-interaction-pack/ SKILL.md +41 -0
  77. package/templates/frontend/component-scaffold-generator/ SKILL.md +562 -0
  78. package/templates/frontend/design-to-component-translator/ SKILL.md +547 -0
  79. package/templates/frontend/form-wizard-builder/ SKILL.md +553 -0
  80. package/templates/frontend/frontend-refactor-planner/ SKILL.md +37 -0
  81. package/templates/frontend/i18n-frontend-implementer/ SKILL.md +44 -0
  82. package/templates/frontend/modal-drawer-system/ SKILL.md +377 -0
  83. package/templates/frontend/page-layout-builder/ SKILL.md +630 -0
  84. package/templates/frontend/state-ux-flow-builder/ SKILL.md +23 -0
  85. package/templates/frontend/table-builder/ SKILL.md +350 -0
  86. package/templates/performance/alerting-dashboard-builder/ SKILL.md +162 -0
  87. package/templates/performance/backend-latency-profiler-helper/ SKILL.md +108 -0
  88. package/templates/performance/caching-cdn-strategy-planner/ SKILL.md +150 -0
  89. package/templates/performance/capacity-planning-helper/ SKILL.md +242 -0
  90. package/templates/performance/core-web-vitals-tuner/ SKILL.md +126 -0
  91. package/templates/performance/incident-runbook-generator/ SKILL.md +162 -0
  92. package/templates/performance/load-test-scenario-builder/ SKILL.md +256 -0
  93. package/templates/performance/observability-setup/ SKILL.md +232 -0
  94. package/templates/performance/postmortem-writer/ SKILL.md +203 -0
  95. package/templates/performance/structured-logging-standardizer/ SKILL.md +122 -0
  96. package/templates/security/auth-security-reviewer/ SKILL.md +428 -0
  97. package/templates/security/dependency-vulnerability-triage/ SKILL.md +495 -0
  98. package/templates/security/input-validation-sanitization-auditor/ SKILL.md +76 -0
  99. package/templates/security/pii-redaction-logging-policy-builder/ SKILL.md +65 -0
  100. package/templates/security/rbac-policy-tester/ SKILL.md +80 -0
  101. package/templates/security/secrets-scanner/ SKILL.md +462 -0
  102. package/templates/security/secure-headers-csp-builder/ SKILL.md +404 -0
  103. package/templates/security/security-incident-playbook-generator/ SKILL.md +76 -0
  104. package/templates/security/security-pr-checklist-skill/ SKILL.md +62 -0
  105. package/templates/security/threat-model-generator/ SKILL.md +394 -0
  106. package/templates/testing/contract-testing-builder/ SKILL.md +492 -0
  107. package/templates/testing/coverage-strategist/ SKILL.md +436 -0
  108. package/templates/testing/e2e-test-builder/ SKILL.md +382 -0
  109. package/templates/testing/flaky-test-detective/ SKILL.md +416 -0
  110. package/templates/testing/integration-test-builder/ SKILL.md +525 -0
  111. package/templates/testing/mocking-assistant/ SKILL.md +383 -0
  112. package/templates/testing/snapshot-test-refactorer/ SKILL.md +375 -0
  113. package/templates/testing/test-data-factory-builder/ SKILL.md +449 -0
  114. package/templates/testing/test-reporting-triage-skill/ SKILL.md +469 -0
  115. package/templates/testing/unit-test-generator/ SKILL.md +548 -0
@@ -0,0 +1,256 @@
1
+ ---
2
+ name: load-test-scenario-builder
3
+ description: Creates comprehensive load test plans with realistic scenarios, traffic models, k6 scripts, and success criteria. Use for "load testing", "performance testing", "capacity validation", or "stress testing".
4
+ ---
5
+
6
+ # Load Test Scenario Builder
7
+
8
+ Validate system capacity with realistic load tests.
9
+
10
+ ## Load Test Scenarios
11
+
12
+ ```typescript
13
+ interface LoadTestScenario {
14
+ name: string;
15
+ description: string;
16
+ virtualUsers: number;
17
+ duration: string;
18
+ rampUp: string;
19
+ successCriteria: {
20
+ p95Latency: number;
21
+ errorRate: number;
22
+ throughput: number;
23
+ };
24
+ }
25
+
26
+ const scenarios: LoadTestScenario[] = [
27
+ {
28
+ name: "Baseline Load",
29
+ description: "Normal traffic pattern",
30
+ virtualUsers: 100,
31
+ duration: "10m",
32
+ rampUp: "2m",
33
+ successCriteria: {
34
+ p95Latency: 500, // ms
35
+ errorRate: 0.01, // 1%
36
+ throughput: 1000, // req/s
37
+ },
38
+ },
39
+ {
40
+ name: "Peak Load",
41
+ description: "Black Friday traffic",
42
+ virtualUsers: 1000,
43
+ duration: "30m",
44
+ rampUp: "5m",
45
+ successCriteria: {
46
+ p95Latency: 2000,
47
+ errorRate: 0.05,
48
+ throughput: 5000,
49
+ },
50
+ },
51
+ {
52
+ name: "Stress Test",
53
+ description: "Find breaking point",
54
+ virtualUsers: 5000,
55
+ duration: "20m",
56
+ rampUp: "10m",
57
+ successCriteria: {
58
+ p95Latency: 5000,
59
+ errorRate: 0.1,
60
+ throughput: 10000,
61
+ },
62
+ },
63
+ ];
64
+ ```
65
+
66
+ ## K6 Load Test Script
67
+
68
+ ```javascript
69
+ // load-tests/checkout-flow.js
70
+ import http from "k6/http";
71
+ import { check, sleep } from "k6";
72
+ import { Rate } from "k6/metrics";
73
+
74
+ const errorRate = new Rate("errors");
75
+
76
+ export let options = {
77
+ stages: [
78
+ { duration: "2m", target: 100 }, // Ramp up
79
+ { duration: "10m", target: 100 }, // Stay at 100
80
+ { duration: "2m", target: 0 }, // Ramp down
81
+ ],
82
+ thresholds: {
83
+ http_req_duration: ["p(95)<500"], // 95% under 500ms
84
+ errors: ["rate<0.01"], // Error rate <1%
85
+ },
86
+ };
87
+
88
+ export default function () {
89
+ // 1. Browse products
90
+ let browseRes = http.get("https://api.example.com/products");
91
+ check(browseRes, {
92
+ "browse status 200": (r) => r.status === 200,
93
+ }) || errorRate.add(1);
94
+ sleep(1);
95
+
96
+ // 2. Add to cart
97
+ let addCartRes = http.post(
98
+ "https://api.example.com/cart",
99
+ JSON.stringify({
100
+ productId: "123",
101
+ quantity: 1,
102
+ }),
103
+ {
104
+ headers: { "Content-Type": "application/json" },
105
+ }
106
+ );
107
+ check(addCartRes, {
108
+ "add cart status 201": (r) => r.status === 201,
109
+ }) || errorRate.add(1);
110
+ sleep(2);
111
+
112
+ // 3. Checkout
113
+ let checkoutRes = http.post(
114
+ "https://api.example.com/checkout",
115
+ JSON.stringify({
116
+ paymentMethod: "card",
117
+ }),
118
+ {
119
+ headers: { "Content-Type": "application/json" },
120
+ }
121
+ );
122
+ check(checkoutRes, {
123
+ "checkout status 200": (r) => r.status === 200,
124
+ "checkout success": (r) => r.json("status") === "success",
125
+ }) || errorRate.add(1);
126
+ sleep(3);
127
+ }
128
+ ```
129
+
130
+ ## Traffic Models
131
+
132
+ ```javascript
133
+ // Realistic traffic patterns
134
+ export const trafficModels = {
135
+ // Steady state
136
+ steadyState: {
137
+ stages: [{ duration: "30m", target: 500 }],
138
+ },
139
+
140
+ // Gradual ramp
141
+ gradualRamp: {
142
+ stages: [
143
+ { duration: "5m", target: 100 },
144
+ { duration: "5m", target: 300 },
145
+ { duration: "5m", target: 500 },
146
+ { duration: "10m", target: 500 },
147
+ { duration: "5m", target: 0 },
148
+ ],
149
+ },
150
+
151
+ // Spike test
152
+ spikeTest: {
153
+ stages: [
154
+ { duration: "2m", target: 100 },
155
+ { duration: "1m", target: 2000 }, // Sudden spike
156
+ { duration: "2m", target: 100 },
157
+ ],
158
+ },
159
+
160
+ // Soak test (endurance)
161
+ soakTest: {
162
+ stages: [
163
+ { duration: "5m", target: 500 },
164
+ { duration: "4h", target: 500 }, // Long duration
165
+ { duration: "5m", target: 0 },
166
+ ],
167
+ },
168
+ };
169
+ ```
170
+
171
+ ## Success Thresholds
172
+
173
+ ```javascript
174
+ export const thresholds = {
175
+ // Latency
176
+ http_req_duration: [
177
+ "p(50)<200", // 50% under 200ms
178
+ "p(95)<500", // 95% under 500ms
179
+ "p(99)<1000", // 99% under 1s
180
+ ],
181
+
182
+ // Error rate
183
+ http_req_failed: ["rate<0.01"], // <1% errors
184
+
185
+ // Throughput
186
+ http_reqs: ["rate>1000"], // >1000 req/s
187
+
188
+ // Custom metrics
189
+ checkout_duration: ["p(95)<2000"],
190
+ checkout_success_rate: ["rate>0.95"],
191
+ };
192
+ ```
193
+
194
+ ## Running Load Tests
195
+
196
+ ```bash
197
+ #!/bin/bash
198
+ # scripts/run-load-tests.sh
199
+
200
+ echo "Running load tests..."
201
+
202
+ # Baseline test
203
+ k6 run --vus 100 --duration 10m load-tests/checkout-flow.js
204
+
205
+ # Peak load test
206
+ k6 run --vus 1000 --duration 30m load-tests/checkout-flow.js
207
+
208
+ # Stress test (find breaking point)
209
+ k6 run --vus 5000 --duration 20m load-tests/stress-test.js
210
+
211
+ # Generate report
212
+ k6 run --out json=results.json load-tests/checkout-flow.js
213
+ k6 run --out influxdb=http://localhost:8086 load-tests/checkout-flow.js
214
+ ```
215
+
216
+ ## Result Analysis
217
+
218
+ ```typescript
219
+ interface LoadTestResult {
220
+ scenario: string;
221
+ timestamp: Date;
222
+ metrics: {
223
+ p50Latency: number;
224
+ p95Latency: number;
225
+ p99Latency: number;
226
+ errorRate: number;
227
+ throughput: number;
228
+ maxVUs: number;
229
+ };
230
+ passed: boolean;
231
+ notes: string[];
232
+ }
233
+
234
+ function analyzeResults(results: LoadTestResult) {
235
+ console.log(`Load Test: ${results.scenario}`);
236
+ console.log(`Status: ${results.passed ? '✅ PASS' : '❌ FAIL'}`);
237
+ console.log(`p95 Latency: ${results.metrics.p95Latency}ms`);
238
+ console.log(`Error Rate: ${(results.metrics.errorRate * 100).toFixed(2)}%`);
239
+ console.log(`Throughput: ${results.metrics.throughput} req/s`);
240
+
241
+ if (!results.passed) {
242
+ console.log('Failed criteria:');
243
+ results.notes.forEach(note => console.log(` - ${note}`));
244
+ }
245
+ }
246
+ ```
247
+
248
+ ## Output Checklist
249
+
250
+ - [ ] Scenarios defined
251
+ - [ ] k6 scripts created
252
+ - [ ] Traffic models configured
253
+ - [ ] Success criteria set
254
+ - [ ] CI integration
255
+ - [ ] Results analysis
256
+
@@ -0,0 +1,232 @@
1
+ ---
2
+ name: observability-setup
3
+ description: Implements comprehensive observability with OpenTelemetry tracing, Prometheus metrics, and structured logging. Includes instrumentation plans, sample dashboards, and alert candidates. Use for "observability", "monitoring", "tracing", or "metrics".
4
+ ---
5
+
6
+ # Observability Setup
7
+
8
+ Implement the three pillars: Traces, Metrics, and Logs.
9
+
10
+ ## OpenTelemetry Tracing
11
+
12
+ ```typescript
13
+ // tracing.ts
14
+ import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
15
+ import { Resource } from "@opentelemetry/resources";
16
+ import { SemanticResourceAttributes } from "@opentelemetry/semantic-conventions";
17
+ import { registerInstrumentations } from "@opentelemetry/instrumentation";
18
+ import { HttpInstrumentation } from "@opentelemetry/instrumentation-http";
19
+ import { ExpressInstrumentation } from "@opentelemetry/instrumentation-express";
20
+ import { PrismaInstrumentation } from "@prisma/instrumentation";
21
+
22
+ const provider = new NodeTracerProvider({
23
+ resource: new Resource({
24
+ [SemanticResourceAttributes.SERVICE_NAME]: "my-api",
25
+ [SemanticResourceAttributes.SERVICE_VERSION]: "1.0.0",
26
+ }),
27
+ });
28
+
29
+ registerInstrumentations({
30
+ instrumentations: [
31
+ new HttpInstrumentation(),
32
+ new ExpressInstrumentation(),
33
+ new PrismaInstrumentation(),
34
+ ],
35
+ });
36
+
37
+ provider.register();
38
+
39
+ // Custom spans
40
+ import { trace } from "@opentelemetry/api";
41
+
42
+ const tracer = trace.getTracer("my-app");
43
+
44
+ async function processOrder(orderId: string) {
45
+ const span = tracer.startSpan("processOrder");
46
+ span.setAttribute("order.id", orderId);
47
+
48
+ try {
49
+ await validateOrder(orderId);
50
+ await chargePayment(orderId);
51
+ await fulfillOrder(orderId);
52
+ span.setStatus({ code: 1 }); // OK (SpanStatusCode.OK is 1; 0 means UNSET)
53
+ } catch (error) {
54
+ span.setStatus({ code: 2, message: error.message }); // ERROR
55
+ throw error;
56
+ } finally {
57
+ span.end();
58
+ }
59
+ }
60
+ ```
61
+
62
+ ## Prometheus Metrics
63
+
64
+ ```typescript
65
+ // metrics.ts
66
+ import { Registry, Counter, Histogram, Gauge } from "prom-client";
67
+
68
+ const register = new Registry();
69
+
70
+ // HTTP request counter
71
+ export const httpRequestCounter = new Counter({
72
+ name: "http_requests_total",
73
+ help: "Total HTTP requests",
74
+ labelNames: ["method", "route", "status_code"],
75
+ registers: [register],
76
+ });
77
+
78
+ // HTTP request duration
79
+ export const httpRequestDuration = new Histogram({
80
+ name: "http_request_duration_seconds",
81
+ help: "HTTP request duration in seconds",
82
+ labelNames: ["method", "route", "status_code"],
83
+ buckets: [0.1, 0.5, 1, 2, 5, 10],
84
+ registers: [register],
85
+ });
86
+
87
+ // Active connections
88
+ export const activeConnections = new Gauge({
89
+ name: "active_connections",
90
+ help: "Number of active connections",
91
+ registers: [register],
92
+ });
93
+
94
+ // Business metrics
95
+ export const ordersProcessed = new Counter({
96
+ name: "orders_processed_total",
97
+ help: "Total orders processed",
98
+ labelNames: ["status"],
99
+ registers: [register],
100
+ });
101
+
102
+ // Middleware
103
+ app.use((req, res, next) => {
104
+ const start = Date.now();
105
+
106
+ res.on("finish", () => {
107
+ const duration = (Date.now() - start) / 1000;
108
+ const route = req.route?.path || "unknown";
109
+
110
+ httpRequestCounter.inc({
111
+ method: req.method,
112
+ route,
113
+ status_code: res.statusCode,
114
+ });
115
+
116
+ httpRequestDuration.observe(
117
+ { method: req.method, route, status_code: res.statusCode },
118
+ duration
119
+ );
120
+ });
121
+
122
+ next();
123
+ });
124
+
125
+ // Metrics endpoint
126
+ app.get("/metrics", async (req, res) => {
127
+ res.set("Content-Type", register.contentType);
128
+ res.end(await register.metrics());
129
+ });
130
+ ```
131
+
132
+ ## Structured Logging
133
+
134
+ ```typescript
135
+ // logger.ts
136
+ import pino from "pino";
137
+
138
+ export const logger = pino({
139
+ level: process.env.LOG_LEVEL || "info",
140
+ formatters: {
141
+ level: (label) => ({ level: label }),
142
+ },
143
+ base: {
144
+ service: "my-api",
145
+ environment: process.env.NODE_ENV,
146
+ },
147
+ });
148
+
149
+ // Usage
150
+ logger.info({ userId: "123", action: "login" }, "User logged in");
151
+ logger.error({ err: error, orderId: "456" }, "Order processing failed");
152
+ ```
153
+
154
+ ## Sample Dashboard (Grafana)
155
+
156
+ ```json
157
+ {
158
+ "dashboard": {
159
+ "title": "API Overview",
160
+ "panels": [
161
+ {
162
+ "title": "Request Rate",
163
+ "targets": [{
164
+ "expr": "rate(http_requests_total[5m])"
165
+ }]
166
+ },
167
+ {
168
+ "title": "Error Rate",
169
+ "targets": [{
170
+ "expr": "rate(http_requests_total{status_code=~\"5..\"}[5m])"
171
+ }]
172
+ },
173
+ {
174
+ "title": "p95 Latency",
175
+ "targets": [{
176
+ "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))"
177
+ }]
178
+ },
179
+ {
180
+ "title": "Active Connections",
181
+ "targets": [{
182
+ "expr": "active_connections"
183
+ }]
184
+ }
185
+ ]
186
+ }
187
+ }
188
+ ```
189
+
190
+ ## Alert Candidates
191
+
192
+ ```yaml
193
+ # alerts.yml
194
+ groups:
195
+ - name: api_alerts
196
+ interval: 30s
197
+ rules:
198
+ - alert: HighErrorRate
199
+ expr: rate(http_requests_total{status_code=~"5.."}[5m]) > 0.05
200
+ for: 5m
201
+ labels:
202
+ severity: critical
203
+ annotations:
204
+ summary: "High error rate detected"
205
+
206
+ - alert: HighLatency
207
+ expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
208
+ for: 10m
209
+ labels:
210
+ severity: warning
211
+ annotations:
212
+ summary: "p95 latency above 2s"
213
+
214
+ - alert: LowAvailability
215
+ expr: rate(http_requests_total{status_code="200"}[5m]) / rate(http_requests_total[5m]) < 0.95
216
+ for: 5m
217
+ labels:
218
+ severity: critical
219
+ annotations:
220
+ summary: "Availability below 95%"
221
+ ```
222
+
223
+ ## Output Checklist
224
+
225
+ - [ ] OpenTelemetry tracing configured
226
+ - [ ] Prometheus metrics instrumented
227
+ - [ ] Structured logging implemented
228
+ - [ ] Sample dashboards created
229
+ - [ ] Alert rules defined
230
+ - [ ] Metrics endpoint exposed
231
+ - [ ] Instrumentation tested
232
+
@@ -0,0 +1,203 @@
1
+ ---
2
+ name: postmortem-writer
3
+ description: Creates comprehensive post-incident documents with timeline, root cause analysis, contributing factors, action items, and ownership. Follows SRE best practices for blameless postmortems. Use for "postmortem", "incident review", "RCA", or "post-incident".
4
+ ---
5
+
6
+ # Postmortem Writer
7
+
8
+ Document incidents for learning and improvement.
9
+
10
+ ## Postmortem Template
11
+
12
+ ```markdown
13
+ # Postmortem: API Outage - Database Connection Pool Exhausted
14
+
15
+ **Date:** 2024-01-15
16
+ **Authors:** Jane Doe (On-call), John Smith (DBA)
17
+ **Status:** Complete
18
+ **Severity:** P1 (Critical)
19
+
20
+ ## Summary
21
+
22
+ On January 15, 2024, our API experienced a complete outage for 25 minutes (14:32 - 14:57 UTC) affecting 100% of users. The root cause was database connection pool exhaustion triggered by a connection leak introduced in deployment v2.3.4.
23
+
24
+ **Impact:**
25
+
26
+ - Duration: 25 minutes
27
+ - Users affected: ~50,000
28
+ - Requests failed: ~125,000
29
+ - Revenue impact: ~$15,000
30
+
31
+ ## Timeline (All times UTC)
32
+
33
+ | Time | Event |
34
+ | ----- | ------------------------------------------------ |
35
+ | 14:15 | v2.3.4 deployed to production |
36
+ | 14:32 | First CloudWatch alarm: HighErrorRate |
37
+ | 14:33 | PagerDuty alert sent to on-call (Jane) |
38
+ | 14:35 | Jane acknowledges, begins investigation |
39
+ | 14:38 | Identified: Database connection pool at 100% |
40
+ | 14:40 | Attempted: Kill long-running queries (no effect) |
41
+ | 14:43 | Decision: Rollback to v2.3.3 |
42
+ | 14:45 | Rollback initiated |
43
+ | 14:47 | Rollback complete, connections dropping |
44
+ | 14:50 | Error rate returning to normal |
45
+ | 14:57 | All systems recovered, incident closed |
46
+ | 15:30 | Postmortem meeting scheduled |
47
+
48
+ ## Root Cause
49
+
50
+ A code change in v2.3.4 introduced a connection leak in the user profile endpoint. The new caching layer was not properly releasing database connections after queries completed.
51
+
52
+ **Code diff:**
53
+ \`\`\`diff
54
+
55
+ - await prisma.user.findUnique({ where: { id } });
56
+
57
+ + const client = await pool.connect();
58
+ + const user = await client.query('SELECT * FROM users WHERE id = $1', [id]);
59
+ + // Missing: client.release() ❌
60
+ \`\`\`
61
+
62
+ ## Contributing Factors
63
+
64
+ 1. **Insufficient testing:** Load tests didn't catch the leak
65
+
66
+ - Tests only ran for 5 minutes
67
+ - Not enough concurrent connections to exhaust pool
68
+
69
+ 2. **Missing monitoring:** No alerts on connection pool metrics
70
+
71
+ - Had alarms for query latency
72
+ - No alarms for active connections count
73
+
74
+ 3. **Inadequate code review:** Reviewer didn't spot missing release()
75
+
76
+ - PR approved without running locally
77
+ - No checklist for connection management
78
+
79
+ 4. **Deployment process:** No gradual rollout
80
+ - Deployed to 100% of production immediately
81
+ - No canary deployment
82
+
83
+ ## What Went Well
84
+
85
+ 1. ✅ **Fast response:** On-call acknowledged within 3 minutes of the first alarm
86
+ 2. ✅ **Clear runbook:** DBA runbook had exact steps to follow
87
+ 3. ✅ **Quick decision:** Made rollback decision in 8 minutes
88
+ 4. ✅ **Communication:** Status page updated every 5 minutes
89
+ 5. ✅ **Rollback capability:** Automated rollback took <2 minutes
90
+
91
+ ## What Went Wrong
92
+
93
+ 1. ❌ **Code review missed bug:** Connection leak not caught
94
+ 2. ❌ **Testing gaps:** Load tests insufficient duration
95
+ 3. ❌ **No canary:** Deployed to all instances at once
96
+ 4. ❌ **Late detection:** 17 minutes between deploy and alert
97
+
98
+ ## Action Items
99
+
100
+ | Action | Owner | Due Date | Priority | Status |
101
+ | --------------------------------------------- | ------- | ---------- | -------- | -------------- |
102
+ | Add connection pool metrics to dashboards | Jane | 2024-01-20 | P0 | ✅ Done |
103
+ | Create PR checklist for connection management | John | 2024-01-22 | P0 | ✅ Done |
104
+ | Extend load tests to 30 minutes minimum | QA Team | 2024-01-25 | P1 | 🔄 In Progress |
105
+ | Implement canary deployment (10% → 100%) | DevOps | 2024-02-01 | P1 | 📋 Planned |
106
+ | Add connection leak detection to tests | Jane | 2024-01-27 | P1 | 🔄 In Progress |
107
+ | Review all DB connection usage patterns | John | 2024-02-05 | P2 | 📋 Planned |
108
+ | Improve alert routing (faster escalation) | DevOps | 2024-02-10 | P2 | 📋 Planned |
109
+
110
+ ## Lessons Learned
111
+
112
+ 1. **Code review checklists work:** Need specific items for common issues
113
+ 2. **Load tests need realistic duration:** 5min insufficient for leaks
114
+ 3. **Gradual rollouts catch issues:** 10% canary would have limited impact
115
+ 4. **Monitoring gaps are dangerous:** Add metrics before you need them
116
+ 5. **Runbooks save time:** Clear procedures enabled fast response
117
+
118
+ ## Related Incidents
119
+
120
+ - [2023-11-20] Database CPU spike (similar connection pool issue)
121
+ - [2023-08-15] Memory leak in cache layer
122
+
123
+ ## Prevention
124
+
125
+ To prevent similar incidents:
126
+
127
+ 1. ✅ Add connection management to code review checklist
128
+ 2. ✅ Monitor connection pool utilization
129
+ 3. ✅ Extend load test duration
130
+ 4. ✅ Implement canary deployments
131
+ 5. ✅ Add automated connection leak detection
132
+
133
+ ## Appendix
134
+
135
+ ### Monitoring Graphs
136
+
137
+ [Insert graphs of connection pool, error rates, latency during incident]
138
+
139
+ ### Communication Log
140
+
141
+ [Insert status page updates and customer communication]
142
+
143
+ ### Code Fix
144
+
145
+ PR #1235: Fix connection leak in user profile endpoint
146
+ \`\`\`typescript
147
+ const client = await pool.connect();
148
+ try {
149
+ const user = await client.query('SELECT * FROM users WHERE id = $1', [id]);
150
+ return user;
151
+ } finally {
152
+ client.release(); // ✅ Always release
153
+ }
154
+ \`\`\`
155
+ ```
156
+
157
+ ## Postmortem Best Practices
158
+
159
+ ```markdown
160
+ # Blameless Postmortem Guidelines
161
+
162
+ ## Do ✅
163
+
164
+ - Focus on systems and processes, not people
165
+ - Use timeline with exact timestamps
166
+ - Identify contributing factors, not just root cause
167
+ - Create actionable items with owners and dates
168
+ - Document what went well (positive reinforcement)
169
+ - Share widely for organizational learning
170
+
171
+ ## Don't ❌
172
+
173
+ - Blame individuals or teams
174
+ - Hide or minimize the incident
175
+ - Skip the postmortem (even for small incidents)
176
+ - Create action items without owners
177
+ - Forget to follow up on action items
178
+ - Make it a blame session
179
+
180
+ ## Template Sections
181
+
182
+ 1. **Summary** (2-3 sentences)
183
+ 2. **Impact** (numbers: users, revenue, duration)
184
+ 3. **Timeline** (chronological events)
185
+ 4. **Root Cause** (technical explanation)
186
+ 5. **Contributing Factors** (broader context)
187
+ 6. **What Went Well** (positive reinforcement)
188
+ 7. **What Went Wrong** (improvement areas)
189
+ 8. **Action Items** (concrete, owned, dated)
190
+ 9. **Lessons Learned** (key takeaways)
191
+ ```
192
+
193
+ ## Output Checklist
194
+
195
+ - [ ] Timeline created
196
+ - [ ] Root cause identified
197
+ - [ ] Contributing factors documented
198
+ - [ ] Action items with owners
199
+ - [ ] Lessons learned captured
200
+ - [ ] Postmortem meeting held
201
+ - [ ] Document shared widely
202
+ - [ ] Follow-up scheduled
203
+