omgkit 2.0.7 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/plugin/skills/backend/api-architecture/SKILL.md +857 -0
- package/plugin/skills/backend/caching-strategies/SKILL.md +755 -0
- package/plugin/skills/backend/event-driven-architecture/SKILL.md +753 -0
- package/plugin/skills/backend/real-time-systems/SKILL.md +635 -0
- package/plugin/skills/databases/database-optimization/SKILL.md +571 -0
- package/plugin/skills/databases/postgresql/SKILL.md +494 -18
- package/plugin/skills/devops/docker/SKILL.md +466 -18
- package/plugin/skills/devops/monorepo-management/SKILL.md +595 -0
- package/plugin/skills/devops/observability/SKILL.md +622 -0
- package/plugin/skills/devops/performance-profiling/SKILL.md +905 -0
- package/plugin/skills/frameworks/nextjs/SKILL.md +407 -44
- package/plugin/skills/frameworks/react/SKILL.md +1006 -32
- package/plugin/skills/frontend/advanced-ui-design/SKILL.md +426 -0
- package/plugin/skills/integrations/ai-integration/SKILL.md +730 -0
- package/plugin/skills/integrations/payment-integration/SKILL.md +735 -0
- package/plugin/skills/languages/python/SKILL.md +489 -25
- package/plugin/skills/languages/typescript/SKILL.md +379 -30
- package/plugin/skills/methodology/problem-solving/SKILL.md +355 -0
- package/plugin/skills/methodology/research-validation/SKILL.md +668 -0
- package/plugin/skills/methodology/sequential-thinking/SKILL.md +260 -0
- package/plugin/skills/mobile/mobile-development/SKILL.md +756 -0
- package/plugin/skills/security/security-hardening/SKILL.md +633 -0
- package/plugin/skills/tools/document-processing/SKILL.md +916 -0
- package/plugin/skills/tools/image-processing/SKILL.md +748 -0
- package/plugin/skills/tools/mcp-development/SKILL.md +883 -0
- package/plugin/skills/tools/media-processing/SKILL.md +831 -0
|
@@ -0,0 +1,622 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: observability
|
|
3
|
+
description: Production observability with structured logging, metrics collection, distributed tracing, and alerting
|
|
4
|
+
category: devops
|
|
5
|
+
triggers:
|
|
6
|
+
- observability
|
|
7
|
+
- logging
|
|
8
|
+
- monitoring
|
|
9
|
+
- distributed tracing
|
|
10
|
+
- metrics
|
|
11
|
+
- prometheus
|
|
12
|
+
- opentelemetry
|
|
13
|
+
- alerting
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
# Observability
|
|
17
|
+
|
|
18
|
+
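Implement **production observability** with structured logging, metrics, distributed tracing, and alerting. This skill covers the three pillars of observability (logs, metrics, and traces), plus the error tracking, alerting, and health checks built on top of them for production systems.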
|
|
19
|
+
|
|
20
|
+
## Purpose
|
|
21
|
+
|
|
22
|
+
Understand and debug production systems:
|
|
23
|
+
|
|
24
|
+
- Implement structured, searchable logging
|
|
25
|
+
- Collect and visualize application metrics
|
|
26
|
+
- Trace requests across distributed services
|
|
27
|
+
- Set up meaningful alerts
|
|
28
|
+
- Create actionable dashboards
|
|
29
|
+
- Debug production issues efficiently
|
|
30
|
+
|
|
31
|
+
## Features
|
|
32
|
+
|
|
33
|
+
### 1. Structured Logging
|
|
34
|
+
|
|
35
|
+
```typescript
|
|
36
|
+
import pino from 'pino';
|
|
37
|
+
import { v4 as uuid } from 'uuid';
|
|
38
|
+
|
|
39
|
+
// Logger configuration
|
|
40
|
+
/**
 * Root application logger.
 *
 * Emits JSON lines carrying a string `level`, an ISO-8601 `timestamp`, and the
 * service/environment/version fields from `base`. Values found at the listed
 * `redact` paths are censored before the line is written.
 */
const logger = pino({
  level: process.env.LOG_LEVEL || 'info',
  formatters: {
    // Emit the level label (e.g. "info") instead of pino's numeric level.
    level: (label) => ({ level: label }),
    // Deliberately no `bindings` formatter: pino passes the `base` object
    // through it, so a formatter returning `{}` would strip
    // service/environment/version from every line. Supplying `base` below
    // already replaces the default pid/hostname bindings.
  },
  // pino appends this raw fragment to the JSON line, hence the leading comma.
  timestamp: () => `,"timestamp":"${new Date().toISOString()}"`,
  base: {
    service: process.env.SERVICE_NAME,
    environment: process.env.NODE_ENV,
    version: process.env.APP_VERSION,
  },
  redact: ['password', 'token', 'authorization', 'cookie', '*.password'],
});
|
|
54
|
+
|
|
55
|
+
// Child logger with context
|
|
56
|
+
/**
 * Builds a child of the root logger bound to a single HTTP request.
 *
 * @param req Incoming request; its `x-request-id` header is reused when present,
 *            otherwise a fresh UUID is generated.
 * @returns A child logger carrying requestId, userId, path and method.
 */
function createRequestLogger(req: Request) {
  const requestId = req.headers['x-request-id'] || uuid();
  const requestBindings = {
    requestId,
    userId: req.user?.id,
    path: req.path,
    method: req.method,
  };

  return logger.child(requestBindings);
}
|
|
64
|
+
|
|
65
|
+
// Logging middleware
|
|
66
|
+
/**
 * Middleware that attaches a request-scoped logger to `req.log`, logs the
 * incoming request, and logs the outcome once the response has finished.
 *
 * The outcome is logged at `error` for status >= 500, `warn` for status >= 400,
 * and `info` otherwise.
 */
function loggingMiddleware(req: Request, res: Response, next: NextFunction) {
  const requestLog = createRequestLogger(req);
  req.log = requestLog;

  const startedAt = Date.now();

  requestLog.info({ type: 'request' }, 'Incoming request');

  res.on('finish', () => {
    const elapsedMs = Date.now() - startedAt;
    const summary = {
      type: 'response',
      statusCode: res.statusCode,
      duration: elapsedMs,
      contentLength: res.get('content-length'),
    };

    if (res.statusCode >= 500) {
      requestLog.error(summary, 'Request failed');
      return;
    }
    if (res.statusCode >= 400) {
      requestLog.warn(summary, 'Request error');
      return;
    }
    requestLog.info(summary, 'Request completed');
  });

  next();
}
|
|
96
|
+
|
|
97
|
+
// Structured error logging
|
|
98
|
+
/**
 * Logs an error as a structured object at `error` level.
 *
 * @param error   The error to log; message, name and stack are copied, and any
 *                `details` object attached to it is merged into the same record.
 * @param context Optional extra fields merged into the top level of the log entry.
 */
function logError(error: Error, context?: Record<string, any>) {
  const serializedError = {
    message: error.message,
    name: error.name,
    stack: error.stack,
    ...(error as any).details,
  };

  logger.error({ error: serializedError, ...context }, 'Error occurred');
}
|
|
109
|
+
|
|
110
|
+
// Business event logging
|
|
111
|
+
/** Shape of a domain event written to the log by `logBusinessEvent`. */
interface BusinessEvent {
  /** Event name, e.g. `'order.completed'`. */
  event: string;
  /** Identifier of the user the event relates to, when there is one. */
  userId?: string;
  /** Event-specific payload. */
  data: Record<string, any>;
  /** Optional free-form tags attached to the event. */
  tags?: string[];
}
|
|
117
|
+
|
|
118
|
+
/**
 * Writes a business event to the log at `info` level, tagged with
 * `type: 'business_event'`.
 *
 * @param event The event to record.
 */
function logBusinessEvent(event: BusinessEvent) {
  const { event: eventName, userId, data, tags } = event;
  const entry = {
    type: 'business_event',
    event: eventName,
    userId,
    data,
    tags,
  };

  logger.info(entry, `Business event: ${eventName}`);
}
|
|
127
|
+
|
|
128
|
+
// Usage
|
|
129
|
+
// Example: record a completed order as a business event.
const orderCompletedEvent: BusinessEvent = {
  event: 'order.completed',
  userId: 'user_123',
  data: { orderId: 'order_456', total: 99.99, items: 3 },
  tags: ['checkout', 'revenue'],
};

logBusinessEvent(orderCompletedEvent);
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### 2. Metrics with Prometheus
|
|
142
|
+
|
|
143
|
+
```typescript
|
|
144
|
+
import { Registry, Counter, Histogram, Gauge, collectDefaultMetrics } from 'prom-client';
|
|
145
|
+
|
|
146
|
+
// Create registry
|
|
147
|
+
/** Registry that every metric below is registered with. */
const register = new Registry();

// Collect Node.js metrics
collectDefaultMetrics({ register });

// Custom metrics

/** Request latency histogram, labelled by method, route and status code. */
const httpRequestDuration = new Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5],
  registers: [register],
});

/** Request counter, labelled the same way as the latency histogram. */
const httpRequestTotal = new Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code'],
  registers: [register],
});

/** Gauge for the number of currently active connections. */
const activeConnections = new Gauge({
  name: 'active_connections',
  help: 'Number of active connections',
  registers: [register],
});

/** Business-level metrics, grouped so call sites can import a single object. */
const businessMetrics = {
  // Orders counted by outcome and payment method.
  ordersTotal: new Counter({
    name: 'orders_total',
    help: 'Total number of orders',
    labelNames: ['status', 'payment_method'],
    registers: [register],
  }),
  // Distribution of order values, bucketed in dollars.
  orderValue: new Histogram({
    name: 'order_value_dollars',
    help: 'Value of orders in dollars',
    buckets: [10, 50, 100, 250, 500, 1000],
    registers: [register],
  }),
  // Current number of active users.
  activeUsers: new Gauge({
    name: 'active_users',
    help: 'Number of active users',
    registers: [register],
  }),
};
|
|
193
|
+
|
|
194
|
+
// Metrics middleware
|
|
195
|
+
/**
 * Middleware that records the latency and count of every finished response in
 * `httpRequestDuration` and `httpRequestTotal`.
 *
 * Requests that matched no route are recorded under the constant route label
 * `'unmatched'` instead of their raw path.
 */
function metricsMiddleware(req: Request, res: Response, next: NextFunction) {
  const start = Date.now();

  res.on('finish', () => {
    const durationSeconds = (Date.now() - start) / 1000;
    // Label with the matched route pattern only. Falling back to the raw URL
    // path would create one time series per distinct path (404 probes, IDs in
    // URLs), which makes label cardinality unbounded.
    const route = req.route?.path || 'unmatched';
    const statusCode = res.statusCode.toString();

    httpRequestDuration.labels(req.method, route, statusCode).observe(durationSeconds);
    httpRequestTotal.labels(req.method, route, statusCode).inc();
  });

  next();
}
|
|
213
|
+
|
|
214
|
+
// Expose metrics endpoint
|
|
215
|
+
// Scrape endpoint: serializes every metric held by `register`.
app.get('/metrics', async (req, res) => {
  try {
    // Serialize first so a failure cannot leave a half-configured response.
    const body = await register.metrics();
    res.set('Content-Type', register.contentType);
    res.end(body);
  } catch (error) {
    // A rejected promise in an async handler is not routed to an error
    // handler automatically, so answer explicitly instead of hanging.
    res.status(500).end(error instanceof Error ? error.message : String(error));
  }
});
|
|
219
|
+
|
|
220
|
+
// Usage in business logic
|
|
221
|
+
/**
 * Persists an order, then records it in the business metrics: one increment of
 * the orders counter (status `completed`, labelled by payment method) and one
 * observation of the order total.
 *
 * @param order The order to save and record.
 */
async function completeOrder(order: Order) {
  await saveOrder(order);

  const { ordersTotal, orderValue } = businessMetrics;

  ordersTotal.labels('completed', order.paymentMethod).inc();
  orderValue.observe(order.total);
}
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
### 3. Distributed Tracing with OpenTelemetry
|
|
233
|
+
|
|
234
|
+
```typescript
|
|
235
|
+
import { NodeSDK } from '@opentelemetry/sdk-node';
|
|
236
|
+
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
|
|
237
|
+
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
|
|
238
|
+
import { Resource } from '@opentelemetry/resources';
|
|
239
|
+
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';
|
|
240
|
+
import { context, propagation, SpanStatusCode, trace } from '@opentelemetry/api';
|
|
241
|
+
|
|
242
|
+
// Initialize OpenTelemetry
|
|
243
|
+
// Resource attributes identifying this service in exported telemetry.
const otelResource = new Resource({
  [SemanticResourceAttributes.SERVICE_NAME]: process.env.SERVICE_NAME,
  [SemanticResourceAttributes.SERVICE_VERSION]: process.env.APP_VERSION,
  [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: process.env.NODE_ENV,
});

// Spans are exported over OTLP/HTTP to the configured endpoint.
const otlpTraceExporter = new OTLPTraceExporter({
  url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT,
});

const sdk = new NodeSDK({
  resource: otelResource,
  traceExporter: otlpTraceExporter,
  instrumentations: [
    getNodeAutoInstrumentations({
      '@opentelemetry/instrumentation-http': {
        // Requests to exactly '/health' are not traced.
        ignoreIncomingRequestHook: (req) => req.url === '/health',
      },
      '@opentelemetry/instrumentation-express': {},
      '@opentelemetry/instrumentation-pg': {},
      '@opentelemetry/instrumentation-redis': {},
    }),
  ],
});

sdk.start();

// Custom spans
const tracer = trace.getTracer('my-service');
|
|
268
|
+
|
|
269
|
+
/**
 * Validates and charges an order inside a `processOrder` span with one child
 * span per step.
 *
 * Every span is ended in a `finally` block so it is closed even when the step
 * it wraps throws. Failures are recorded on the parent span and rethrown.
 *
 * @param orderId Identifier of the order to process.
 * @throws Whatever `validateOrder` or `chargeCard` throws.
 */
async function processOrder(orderId: string): Promise<void> {
  return tracer.startActiveSpan('processOrder', async (span) => {
    try {
      span.setAttributes({ 'order.id': orderId });

      // Child span for validation
      await tracer.startActiveSpan('validateOrder', async (validationSpan) => {
        try {
          const isValid = await validateOrder(orderId);
          validationSpan.setAttributes({ 'order.valid': isValid });
        } finally {
          // End the child span even if validation throws.
          validationSpan.end();
        }
      });

      // Child span for payment
      await tracer.startActiveSpan('processPayment', async (paymentSpan) => {
        try {
          const payment = await chargeCard(orderId);
          paymentSpan.setAttributes({
            'payment.id': payment.id,
            'payment.amount': payment.amount,
          });
        } finally {
          // End the child span even if the charge throws.
          paymentSpan.end();
        }
      });

      span.setStatus({ code: SpanStatusCode.OK });
    } catch (error) {
      // Catch variables are `unknown` under strict settings; normalize before use.
      const failure = error instanceof Error ? error : new Error(String(error));
      span.setStatus({
        code: SpanStatusCode.ERROR,
        message: failure.message,
      });
      span.recordException(failure);
      throw error;
    } finally {
      span.end();
    }
  });
}
|
|
306
|
+
|
|
307
|
+
// Propagate context across services
|
|
308
|
+
/**
 * POSTs `data` as JSON to `endpoint` inside an `externalServiceCall` span and
 * propagates the active trace context to the callee through request headers.
 *
 * @param endpoint URL of the downstream service.
 * @param data     Payload, serialized with `JSON.stringify`.
 * @returns The `fetch` response.
 * @throws Whatever `fetch` throws; the failure is recorded on the span first.
 */
async function callExternalService(endpoint: string, data: any) {
  return tracer.startActiveSpan('externalServiceCall', async (span) => {
    try {
      span.setAttributes({
        'http.url': endpoint,
        'http.method': 'POST',
      });

      // Inject trace context into headers. The active context already contains
      // this span, so the callee can continue the same trace.
      const headers: Record<string, string> = {};
      propagation.inject(context.active(), headers);

      const response = await fetch(endpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          ...headers,
        },
        body: JSON.stringify(data),
      });

      span.setAttributes({
        'http.status_code': response.status,
      });

      return response;
    } catch (error) {
      // Catch variables are `unknown` under strict settings; normalize before use.
      const failure = error instanceof Error ? error : new Error(String(error));
      span.setStatus({ code: SpanStatusCode.ERROR, message: failure.message });
      span.recordException(failure);
      throw error;
    } finally {
      // End the span on every path, including a rejected fetch.
      span.end();
    }
  });
}
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
### 4. Error Tracking
|
|
339
|
+
|
|
340
|
+
```typescript
|
|
341
|
+
import * as Sentry from '@sentry/node';
|
|
342
|
+
import { ProfilingIntegration } from '@sentry/profiling-node';
|
|
343
|
+
|
|
344
|
+
// Initialize Sentry
|
|
345
|
+
Sentry.init({
  dsn: process.env.SENTRY_DSN,
  environment: process.env.NODE_ENV,
  release: process.env.APP_VERSION,
  integrations: [new ProfilingIntegration()],
  // Sample 10% of traces in production, all of them elsewhere.
  tracesSampleRate: process.env.NODE_ENV === 'production' ? 0.1 : 1.0,
  profilesSampleRate: 0.1,
  // Drop events whose original exception message mentions ECONNRESET;
  // everything else is sent unchanged.
  beforeSend(event, hint) {
    const original = hint.originalException as Error;
    const isConnectionReset = original?.message?.includes('ECONNRESET');
    return isConnectionReset ? null : event;
  },
});
|
|
363
|
+
|
|
364
|
+
// Error handler middleware
|
|
365
|
+
// Error handler middleware: report server-side failures to Sentry.
app.use(Sentry.Handlers.errorHandler({
  shouldHandleError(error) {
    // Errors with no HTTP status are unexpected exceptions and must be
    // reported. Comparing `undefined >= 500` is false and would drop them.
    const status = error.status ?? error.statusCode;
    if (status == null) {
      return true;
    }
    // Only report 5xx errors
    return Number(status) >= 500;
  },
}));
|
|
371
|
+
|
|
372
|
+
// Custom error reporting
|
|
373
|
+
/**
 * Sends an error to Sentry within an isolated scope and also writes it to the
 * structured log.
 *
 * @param error   The error to report.
 * @param context Optional extra fields; attached to the Sentry event as extras,
 *                and `context.component` (default `'unknown'`) becomes a tag.
 */
function reportError(error: Error, context?: Record<string, any>) {
  Sentry.withScope((scope) => {
    if (context) {
      scope.setExtras(context);
    }

    const component = context?.component || 'unknown';
    scope.setTags({ component });

    Sentry.captureException(error);
  });

  // Also log
  logError(error, context);
}
|
|
387
|
+
|
|
388
|
+
// Capture user context
|
|
389
|
+
// Attach the authenticated user's id and email to Sentry for this request.
app.use((req, res, next) => {
  const currentUser = req.user;

  if (currentUser) {
    const { id, email } = currentUser;
    Sentry.setUser({ id, email });
  }

  next();
});
|
|
398
|
+
```
|
|
399
|
+
|
|
400
|
+
### 5. Alerting Configuration
|
|
401
|
+
|
|
402
|
+
```yaml
|
|
403
|
+
# prometheus/alerts.yml
|
|
404
|
+
groups:
|
|
405
|
+
- name: application
|
|
406
|
+
rules:
|
|
407
|
+
# High error rate
|
|
408
|
+
- alert: HighErrorRate
|
|
409
|
+
expr: |
|
|
410
|
+
sum(rate(http_requests_total{status_code=~"5.."}[5m]))
|
|
411
|
+
/
|
|
412
|
+
sum(rate(http_requests_total[5m]))
|
|
413
|
+
> 0.05
|
|
414
|
+
for: 5m
|
|
415
|
+
labels:
|
|
416
|
+
severity: critical
|
|
417
|
+
annotations:
|
|
418
|
+
summary: High error rate detected
|
|
419
|
+
description: Error rate is {{ $value | humanizePercentage }}
|
|
420
|
+
|
|
421
|
+
# Slow response time
|
|
422
|
+
- alert: SlowResponseTime
|
|
423
|
+
expr: |
|
|
424
|
+
histogram_quantile(0.95,
|
|
425
|
+
sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
|
|
426
|
+
) > 2
|
|
427
|
+
for: 5m
|
|
428
|
+
labels:
|
|
429
|
+
severity: warning
|
|
430
|
+
annotations:
|
|
431
|
+
summary: Slow response times detected
|
|
432
|
+
description: 95th percentile latency is {{ $value }}s
|
|
433
|
+
|
|
434
|
+
# High memory usage
|
|
435
|
+
- alert: HighMemoryUsage
|
|
436
|
+
expr: |
|
|
437
|
+
process_resident_memory_bytes / 1024 / 1024 / 1024 > 2
|
|
438
|
+
for: 10m
|
|
439
|
+
labels:
|
|
440
|
+
severity: warning
|
|
441
|
+
annotations:
|
|
442
|
+
summary: High memory usage
|
|
443
|
+
description: Memory usage is {{ $value | humanize }}GB
|
|
444
|
+
|
|
445
|
+
# Service down
|
|
446
|
+
- alert: ServiceDown
|
|
447
|
+
expr: up == 0
|
|
448
|
+
for: 1m
|
|
449
|
+
labels:
|
|
450
|
+
severity: critical
|
|
451
|
+
annotations:
|
|
452
|
+
summary: Service is down
|
|
453
|
+
description: "{{ $labels.instance }} has been down for more than 1 minute"
|
|
454
|
+
|
|
455
|
+
- name: business
|
|
456
|
+
rules:
|
|
457
|
+
# Low order volume
|
|
458
|
+
- alert: LowOrderVolume
  # increase() gives the number of completed orders over the last hour.
  # rate() is a per-second average, which matches neither the threshold of 10
  # nor the "orders/hour" wording in the description.
  expr: |
    sum(increase(orders_total{status="completed"}[1h])) < 10
  for: 30m
  labels:
    severity: warning
  annotations:
    summary: Low order volume
    description: Order rate is {{ $value }} orders/hour
|
|
467
|
+
|
|
468
|
+
# Payment failures
|
|
469
|
+
- alert: HighPaymentFailureRate
|
|
470
|
+
expr: |
|
|
471
|
+
sum(rate(orders_total{status="payment_failed"}[15m]))
|
|
472
|
+
/
|
|
473
|
+
sum(rate(orders_total[15m]))
|
|
474
|
+
> 0.1
|
|
475
|
+
for: 5m
|
|
476
|
+
labels:
|
|
477
|
+
severity: critical
|
|
478
|
+
annotations:
|
|
479
|
+
summary: High payment failure rate
|
|
480
|
+
description: Payment failure rate is {{ $value | humanizePercentage }}
|
|
481
|
+
```
|
|
482
|
+
|
|
483
|
+
### 6. Health Checks
|
|
484
|
+
|
|
485
|
+
```typescript
|
|
486
|
+
import { HealthCheck, HealthCheckResult, HttpHealthIndicator, DiskHealthIndicator, MemoryHealthIndicator } from '@nestjs/terminus';
|
|
487
|
+
|
|
488
|
+
/** Outcome of a single dependency check. */
interface CheckResult {
  /** Whether the check passed, failed, or passed with a warning. */
  status: 'pass' | 'fail' | 'warn';
  /** How long the check took. */
  duration: number;
  /** Optional detail, e.g. the error message of a failed check. */
  message?: string;
}

/** Aggregated health report returned by the health endpoints. */
interface HealthStatus {
  /** Overall state derived from the individual checks. */
  status: 'healthy' | 'degraded' | 'unhealthy';
  /** Individual check results, keyed by check name. */
  checks: Record<string, CheckResult>;
  /** ISO-8601 time at which the report was produced. */
  timestamp: string;
  /** Application version. */
  version: string;
}
|
|
500
|
+
|
|
501
|
+
/**
 * Runs all dependency checks and folds them into one health report.
 *
 * - A `fail` from the database or Redis check makes the service `unhealthy`.
 * - Anything other than `pass` from the external API or memory check makes it
 *   `degraded`. This includes `fail`, which would otherwise be ignored and
 *   reported as `healthy`.
 *
 * @returns The overall status, per-check results, a timestamp and the version.
 */
async function healthCheck(): Promise<HealthStatus> {
  // Checks run one after another, in this order.
  const database = await checkDatabase();
  const redis = await checkRedis();
  const externalApi = await checkExternalApi();
  const memory = checkMemory();

  let overallStatus: HealthStatus['status'] = 'healthy';

  // Non-critical checks: a warning or a failure degrades the service.
  if ([externalApi, memory].some((check) => check.status !== 'pass')) {
    overallStatus = 'degraded';
  }

  // Critical dependencies: a failure overrides any degraded state.
  if ([database, redis].some((check) => check.status === 'fail')) {
    overallStatus = 'unhealthy';
  }

  return {
    status: overallStatus,
    checks: { database, redis, externalApi, memory },
    timestamp: new Date().toISOString(),
    version: process.env.APP_VERSION || 'unknown',
  };
}
|
|
532
|
+
|
|
533
|
+
/**
 * Probes the database with `SELECT 1`.
 *
 * @returns `pass` with the elapsed milliseconds on success, or `fail` with the
 *          elapsed milliseconds and the error message. Never rejects.
 */
async function checkDatabase(): Promise<CheckResult> {
  const start = Date.now();
  try {
    await db.$queryRaw`SELECT 1`;
    return {
      status: 'pass',
      duration: Date.now() - start,
    };
  } catch (error) {
    return {
      status: 'fail',
      duration: Date.now() - start,
      // Catch variables are `unknown` under strict settings, and a thrown
      // value is not guaranteed to be an Error.
      message: error instanceof Error ? error.message : String(error),
    };
  }
}
|
|
549
|
+
|
|
550
|
+
// Endpoints
|
|
551
|
+
// Full health report. Only 'unhealthy' answers 503; 'healthy' and 'degraded'
// both answer 200.
app.get('/health', async (req, res) => {
  const report = await healthCheck();
  const httpStatus = report.status === 'unhealthy' ? 503 : 200;
  res.status(httpStatus).json(report);
});

// Liveness: always 200, runs no dependency checks.
app.get('/health/live', (req, res) => {
  res.status(200).json({ status: 'alive' });
});

// Readiness: 503 when the report is 'unhealthy', otherwise 200.
app.get('/health/ready', async (req, res) => {
  const report = await healthCheck();
  const httpStatus = report.status === 'unhealthy' ? 503 : 200;
  res.status(httpStatus).json(report);
});
|
|
566
|
+
```
|
|
567
|
+
|
|
568
|
+
## Use Cases
|
|
569
|
+
|
|
570
|
+
### 1. Debugging Production Issues
|
|
571
|
+
|
|
572
|
+
```typescript
|
|
573
|
+
// Correlation ID for request tracing
|
|
574
|
+
const correlationId = req.headers['x-correlation-id'] || uuid();
|
|
575
|
+
req.log = logger.child({ correlationId });
|
|
576
|
+
|
|
577
|
+
// Log at decision points
|
|
578
|
+
req.log.info({ userId, action: 'checkout.started' });
|
|
579
|
+
// ... process ...
|
|
580
|
+
req.log.info({ orderId, action: 'order.created' });
|
|
581
|
+
```
|
|
582
|
+
|
|
583
|
+
### 2. Performance Monitoring
|
|
584
|
+
|
|
585
|
+
```typescript
|
|
586
|
+
// Track key business metrics
|
|
587
|
+
businessMetrics.apiLatency.observe(duration);
|
|
588
|
+
businessMetrics.cacheHitRate.set(hits / total);
|
|
589
|
+
```
|
|
590
|
+
|
|
591
|
+
## Best Practices
|
|
592
|
+
|
|
593
|
+
### Do's
|
|
594
|
+
|
|
595
|
+
- **Use correlation IDs** - Trace requests across services
|
|
596
|
+
- **Log at appropriate levels** - Don't log everything as error
|
|
597
|
+
- **Set meaningful alerts** - Alert on symptoms, not causes
|
|
598
|
+
- **Create actionable dashboards** - Show what matters
|
|
599
|
+
- **Implement log rotation** - Prevent disk exhaustion
|
|
600
|
+
- **Sample high-volume traces** - Balance detail vs cost
|
|
601
|
+
|
|
602
|
+
### Don'ts
|
|
603
|
+
|
|
604
|
+
- Don't log sensitive data
|
|
605
|
+
- Don't ignore alert fatigue
|
|
606
|
+
- Don't skip structured logging
|
|
607
|
+
- Don't forget log levels
|
|
608
|
+
- Don't alert on every error
|
|
609
|
+
- Don't neglect log retention policies
|
|
610
|
+
|
|
611
|
+
## Related Skills
|
|
612
|
+
|
|
613
|
+
- **kubernetes** - Container orchestration
|
|
614
|
+
- **backend-development** - Application code
|
|
615
|
+
- **performance-profiling** - Performance analysis
|
|
616
|
+
|
|
617
|
+
## Reference Resources
|
|
618
|
+
|
|
619
|
+
- [OpenTelemetry Documentation](https://opentelemetry.io/docs/)
|
|
620
|
+
- [Prometheus Documentation](https://prometheus.io/docs/)
|
|
621
|
+
- [Grafana Documentation](https://grafana.com/docs/)
|
|
622
|
+
- [Pino Logger](https://getpino.io/)
|