agentic-team-templates 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/README.md +280 -0
  2. package/bin/cli.js +5 -0
  3. package/package.json +47 -0
  4. package/src/index.js +521 -0
  5. package/templates/_shared/code-quality.md +162 -0
  6. package/templates/_shared/communication.md +114 -0
  7. package/templates/_shared/core-principles.md +62 -0
  8. package/templates/_shared/git-workflow.md +165 -0
  9. package/templates/_shared/security-fundamentals.md +173 -0
  10. package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
  11. package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
  12. package/templates/blockchain/.cursorrules/overview.md +130 -0
  13. package/templates/blockchain/.cursorrules/security.md +318 -0
  14. package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
  15. package/templates/blockchain/.cursorrules/testing.md +415 -0
  16. package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
  17. package/templates/blockchain/CLAUDE.md +389 -0
  18. package/templates/cli-tools/.cursorrules/architecture.md +412 -0
  19. package/templates/cli-tools/.cursorrules/arguments.md +406 -0
  20. package/templates/cli-tools/.cursorrules/distribution.md +546 -0
  21. package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
  22. package/templates/cli-tools/.cursorrules/overview.md +136 -0
  23. package/templates/cli-tools/.cursorrules/testing.md +537 -0
  24. package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
  25. package/templates/cli-tools/CLAUDE.md +356 -0
  26. package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
  27. package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
  28. package/templates/data-engineering/.cursorrules/overview.md +85 -0
  29. package/templates/data-engineering/.cursorrules/performance.md +339 -0
  30. package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
  31. package/templates/data-engineering/.cursorrules/security.md +460 -0
  32. package/templates/data-engineering/.cursorrules/testing.md +452 -0
  33. package/templates/data-engineering/CLAUDE.md +974 -0
  34. package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
  35. package/templates/devops-sre/.cursorrules/change-management.md +584 -0
  36. package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
  37. package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
  38. package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
  39. package/templates/devops-sre/.cursorrules/observability.md +714 -0
  40. package/templates/devops-sre/.cursorrules/overview.md +230 -0
  41. package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
  42. package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
  43. package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
  44. package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
  45. package/templates/devops-sre/CLAUDE.md +1007 -0
  46. package/templates/documentation/.cursorrules/adr.md +277 -0
  47. package/templates/documentation/.cursorrules/api-documentation.md +411 -0
  48. package/templates/documentation/.cursorrules/code-comments.md +253 -0
  49. package/templates/documentation/.cursorrules/maintenance.md +260 -0
  50. package/templates/documentation/.cursorrules/overview.md +82 -0
  51. package/templates/documentation/.cursorrules/readme-standards.md +306 -0
  52. package/templates/documentation/CLAUDE.md +120 -0
  53. package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
  54. package/templates/fullstack/.cursorrules/architecture.md +298 -0
  55. package/templates/fullstack/.cursorrules/overview.md +109 -0
  56. package/templates/fullstack/.cursorrules/shared-types.md +348 -0
  57. package/templates/fullstack/.cursorrules/testing.md +386 -0
  58. package/templates/fullstack/CLAUDE.md +349 -0
  59. package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
  60. package/templates/ml-ai/.cursorrules/deployment.md +601 -0
  61. package/templates/ml-ai/.cursorrules/model-development.md +538 -0
  62. package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
  63. package/templates/ml-ai/.cursorrules/overview.md +131 -0
  64. package/templates/ml-ai/.cursorrules/security.md +637 -0
  65. package/templates/ml-ai/.cursorrules/testing.md +678 -0
  66. package/templates/ml-ai/CLAUDE.md +1136 -0
  67. package/templates/mobile/.cursorrules/navigation.md +246 -0
  68. package/templates/mobile/.cursorrules/offline-first.md +302 -0
  69. package/templates/mobile/.cursorrules/overview.md +71 -0
  70. package/templates/mobile/.cursorrules/performance.md +345 -0
  71. package/templates/mobile/.cursorrules/testing.md +339 -0
  72. package/templates/mobile/CLAUDE.md +233 -0
  73. package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
  74. package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
  75. package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
  76. package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
  77. package/templates/platform-engineering/.cursorrules/observability.md +747 -0
  78. package/templates/platform-engineering/.cursorrules/overview.md +215 -0
  79. package/templates/platform-engineering/.cursorrules/security.md +855 -0
  80. package/templates/platform-engineering/.cursorrules/testing.md +878 -0
  81. package/templates/platform-engineering/CLAUDE.md +850 -0
  82. package/templates/utility-agent/.cursorrules/action-control.md +284 -0
  83. package/templates/utility-agent/.cursorrules/context-management.md +186 -0
  84. package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
  85. package/templates/utility-agent/.cursorrules/overview.md +78 -0
  86. package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
  87. package/templates/utility-agent/CLAUDE.md +513 -0
  88. package/templates/web-backend/.cursorrules/api-design.md +255 -0
  89. package/templates/web-backend/.cursorrules/authentication.md +309 -0
  90. package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
  91. package/templates/web-backend/.cursorrules/error-handling.md +366 -0
  92. package/templates/web-backend/.cursorrules/overview.md +69 -0
  93. package/templates/web-backend/.cursorrules/security.md +358 -0
  94. package/templates/web-backend/.cursorrules/testing.md +395 -0
  95. package/templates/web-backend/CLAUDE.md +366 -0
  96. package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
  97. package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
  98. package/templates/web-frontend/.cursorrules/overview.md +72 -0
  99. package/templates/web-frontend/.cursorrules/performance.md +325 -0
  100. package/templates/web-frontend/.cursorrules/state-management.md +227 -0
  101. package/templates/web-frontend/.cursorrules/styling.md +271 -0
  102. package/templates/web-frontend/.cursorrules/testing.md +311 -0
  103. package/templates/web-frontend/CLAUDE.md +399 -0
@@ -0,0 +1,714 @@
1
+ # Observability
2
+
3
+ Comprehensive guidelines for implementing metrics, logs, traces, and alerting.
4
+
5
+ ## Core Principles
6
+
7
+ 1. **Three Pillars** - Metrics, logs, and traces together provide complete visibility
8
+ 2. **User-Centric** - Measure what matters to users, not just infrastructure
9
+ 3. **Actionable Data** - Every dashboard and alert should drive action
10
+ 4. **Correlation** - Connect signals across the stack for faster debugging
11
+
12
+ ## The Three Pillars
13
+
14
+ ### Metrics
15
+
16
+ ```yaml
17
+ metrics_overview:
18
+ purpose: "Numerical measurements over time"
19
+ good_for:
20
+ - "Trends and patterns"
21
+ - "Alerting on thresholds"
22
+ - "Capacity planning"
23
+ - "SLO tracking"
24
+ not_good_for:
25
+ - "Understanding why something failed"
26
+ - "Request-level debugging"
27
+
28
+ metric_types:
29
+ counter:
30
+ description: "Cumulative value that only increases"
31
+ examples:
32
+ - "http_requests_total"
33
+ - "errors_total"
34
+ - "bytes_sent_total"
35
+ usage: "Rate of change, totals over time"
36
+
37
+ gauge:
38
+ description: "Value that can go up or down"
39
+ examples:
40
+ - "temperature_celsius"
41
+ - "queue_depth"
42
+ - "active_connections"
43
+ usage: "Current state, point-in-time values"
44
+
45
+ histogram:
46
+ description: "Distribution of values in buckets"
47
+ examples:
48
+ - "http_request_duration_seconds"
49
+ - "response_size_bytes"
50
+ usage: "Percentiles, distribution analysis"
51
+
52
+ summary:
53
+ description: "Pre-calculated percentiles"
54
+ examples:
55
+ - "request_latency_quantiles"
56
+ usage: "When you need client-side quantiles for a single instance (summary quantiles cannot be aggregated across instances; prefer histograms otherwise)"
57
+ ```
58
+
59
+ ### Logs
60
+
61
+ ```yaml
62
+ logs_overview:
63
+ purpose: "Discrete events with context"
64
+ good_for:
65
+ - "Debugging specific issues"
66
+ - "Audit trails"
67
+ - "Understanding what happened"
68
+ - "Request tracing"
69
+ not_good_for:
70
+ - "Aggregate trends"
71
+ - "Real-time alerting at scale"
72
+
73
+ structured_logging:
74
+ format: "JSON"
75
+ benefits:
76
+ - "Machine parseable"
77
+ - "Consistent fields"
78
+ - "Easy to query"
79
+ - "Extensible"
80
+
81
+ required_fields:
82
+ - timestamp: "ISO 8601 format"
83
+ - level: "debug, info, warn, error"
84
+ - message: "Human-readable description"
85
+ - service: "Service name"
86
+ - trace_id: "Distributed trace ID"
87
+
88
+ example: |
89
+ {
90
+ "timestamp": "2025-01-15T14:30:45.123Z",
91
+ "level": "error",
92
+ "message": "Failed to process payment",
93
+ "service": "payment-service",
94
+ "trace_id": "abc123def456",
95
+ "span_id": "789xyz",
96
+ "user_id": "user_12345",
97
+ "payment_id": "pay_67890",
98
+ "error": {
99
+ "type": "PaymentDeclinedException",
100
+ "message": "Insufficient funds",
101
+ "code": "INSUFFICIENT_FUNDS"
102
+ },
103
+ "duration_ms": 234
104
+ }
105
+ ```
106
+
107
+ ### Traces
108
+
109
+ ```yaml
110
+ traces_overview:
111
+ purpose: "Request flow across services"
112
+ good_for:
113
+ - "Understanding distributed systems"
114
+ - "Finding bottlenecks"
115
+ - "Debugging latency issues"
116
+ - "Service dependency mapping"
117
+ not_good_for:
118
+ - "Aggregate metrics"
119
+ - "Long-term trend analysis"
120
+
121
+ trace_concepts:
122
+ trace:
123
+ description: "End-to-end request journey"
124
+ contains: "Multiple spans"
125
+
126
+ span:
127
+ description: "Single operation within a trace"
128
+ attributes:
129
+ - "Operation name"
130
+ - "Start/end time"
131
+ - "Status"
132
+ - "Tags/attributes"
133
+
134
+ context_propagation:
135
+ description: "Passing trace context between services"
136
+ headers:
137
+ - "traceparent (W3C)"
138
+ - "X-B3-TraceId (Zipkin)"
139
+
140
+ opentelemetry_example: |
141
+ import { trace, SpanStatusCode } from '@opentelemetry/api';
142
+
143
+ const tracer = trace.getTracer('payment-service');
144
+
145
+ async function processPayment(request) {
146
+ return tracer.startActiveSpan('processPayment', async (span) => {
147
+ try {
148
+ span.setAttribute('payment.amount', request.amount);
149
+ span.setAttribute('payment.currency', request.currency);
150
+
151
+ // Downstream calls automatically get trace context
152
+ const result = await paymentGateway.charge(request);
153
+
154
+ span.setAttribute('payment.status', 'success');
155
+ return result;
156
+ } catch (error) {
157
+ span.recordException(error);
158
+ span.setStatus({ code: SpanStatusCode.ERROR });
159
+ throw error;
160
+ } finally {
161
+ span.end();
162
+ }
163
+ });
164
+ }
165
+ ```
166
+
167
+ ## The Four Golden Signals
168
+
169
+ ```yaml
170
+ golden_signals:
171
+ latency:
172
+ description: "Time to service a request"
173
+ why_important: "Users notice slow responses"
174
+ what_to_measure:
175
+ - "Successful request latency"
176
+ - "Failed request latency (often faster!)"
177
+ - "Percentiles (p50, p90, p95, p99)"
178
+ metrics:
179
+ - "http_request_duration_seconds (histogram)"
180
+ alerts:
181
+ - "p50 > 200ms"
182
+ - "p99 > 1s"
183
+
184
+ traffic:
185
+ description: "Demand on the system"
186
+ why_important: "Context for other metrics"
187
+ what_to_measure:
188
+ - "Requests per second"
189
+ - "Concurrent users"
190
+ - "Transactions per second"
191
+ metrics:
192
+ - "http_requests_total (counter)"
193
+ alerts:
194
+ - "Traffic drop > 50% (possible outage)"
195
+ - "Traffic spike > 200% (possible attack/event)"
196
+
197
+ errors:
198
+ description: "Rate of failed requests"
199
+ why_important: "Direct user impact"
200
+ what_to_measure:
201
+ - "HTTP 5xx rate"
202
+ - "Application errors"
203
+ - "Timeout rate"
204
+ metrics:
205
+ - "http_requests_total{status=~'5..'}"
206
+ alerts:
207
+ - "Error rate > 1%"
208
+ - "Error rate > 5% (critical)"
209
+
210
+ saturation:
211
+ description: "How full the system is"
212
+ why_important: "Predicts future problems"
213
+ what_to_measure:
214
+ - "CPU utilization"
215
+ - "Memory utilization"
216
+ - "Disk I/O"
217
+ - "Network bandwidth"
218
+ - "Connection pools"
219
+ metrics:
220
+ - "process_cpu_seconds_total"
221
+ - "process_resident_memory_bytes"
222
+ alerts:
223
+ - "CPU > 80%"
224
+ - "Memory > 85%"
225
+ - "Disk > 90%"
226
+ ```
227
+
228
+ ## Metrics Best Practices
229
+
230
+ ### Naming Conventions
231
+
232
+ ```yaml
233
+ prometheus_naming:
234
+ format: "{namespace}_{subsystem}_{name}_{unit}"
235
+
236
+ examples:
237
+ good:
238
+ - "http_requests_total"
239
+ - "http_request_duration_seconds"
240
+ - "process_cpu_seconds_total"
241
+ - "node_memory_bytes"
242
+
243
+ bad:
244
+ - "requests" # Too vague
245
+ - "httpRequestsTotal" # Wrong case
246
+ - "request_time_ms" # Use base units
247
+
248
+ units:
249
+ - "seconds (not milliseconds)"
250
+ - "bytes (not kilobytes)"
251
+ - "Use _total suffix for counters"
252
+
253
+ label_best_practices:
254
+ do:
255
+ - "Use labels for dimensions (status, method, path)"
256
+ - "Keep cardinality bounded"
257
+ - "Use consistent label names"
258
+
259
+ dont:
260
+ - "User IDs as labels (unbounded cardinality)"
261
+ - "Request IDs as labels"
262
+ - "High-cardinality values (URLs with IDs)"
263
+
264
+ cardinality_guidelines:
265
+ low: "< 10 values (method: GET, POST, etc.)"
266
+ medium: "< 100 values (endpoint paths)"
267
+ high: "> 100 values (AVOID - causes performance issues)"
268
+ ```
269
+
270
+ ### RED Method (Request-Driven)
271
+
272
+ ```yaml
273
+ red_method:
274
+ description: "For request-driven services"
275
+
276
+ rate:
277
+ what: "Requests per second"
278
+ metric: "rate(http_requests_total[5m])"
279
+
280
+ errors:
281
+ what: "Failed requests per second"
282
+ metric: "rate(http_requests_total{status=~'5..'}[5m])"
283
+
284
+ duration:
285
+ what: "Distribution of request latency"
286
+ metric: "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))"
287
+ ```
288
+
289
+ ### USE Method (Resource-Driven)
290
+
291
+ ```yaml
292
+ use_method:
293
+ description: "For infrastructure resources"
294
+
295
+ utilization:
296
+ what: "Percentage of resource used"
297
+ examples:
298
+ - "CPU: 100 - (avg(irate(node_cpu_seconds_total{mode='idle'}[5m])) * 100)"
299
+ - "Memory: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes"
300
+ - "Disk: 1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)"
301
+
302
+ saturation:
303
+ what: "Amount of work queued"
304
+ examples:
305
+ - "CPU: node_load1 / count(node_cpu_seconds_total{mode='idle'})"
306
+ - "Disk: rate(node_disk_io_time_weighted_seconds_total[5m])"
307
+
308
+ errors:
309
+ what: "Error events"
310
+ examples:
311
+ - "Disk: rate(node_disk_read_errors_total[5m])"
312
+ - "Network: rate(node_network_receive_errs_total[5m])"
313
+ ```
314
+
315
+ ## Logging Best Practices
316
+
317
+ ### Log Levels
318
+
319
+ ```yaml
320
+ log_levels:
321
+ debug:
322
+ when: "Detailed diagnostic information"
323
+ examples:
324
+ - "Function entry/exit"
325
+ - "Variable values"
326
+ - "SQL queries"
327
+ production: "Usually disabled"
328
+
329
+ info:
330
+ when: "Normal operations worth noting"
331
+ examples:
332
+ - "Service started"
333
+ - "Request processed"
334
+ - "Configuration loaded"
335
+ production: "Enabled"
336
+
337
+ warn:
338
+ when: "Something unexpected but recoverable"
339
+ examples:
340
+ - "Retry attempted"
341
+ - "Deprecation warning"
342
+ - "Resource approaching limit"
343
+ production: "Enabled, may alert"
344
+
345
+ error:
346
+ when: "Something failed that shouldn't have"
347
+ examples:
348
+ - "Request failed"
349
+ - "External service error"
350
+ - "Unhandled exception"
351
+ production: "Enabled, often alerts"
352
+ ```
353
+
354
+ ### What to Log
355
+
356
+ ```yaml
357
+ always_log:
358
+ - "Service startup/shutdown"
359
+ - "Configuration changes"
360
+ - "Authentication events"
361
+ - "Authorization failures"
362
+ - "External service calls (start, end, errors)"
363
+ - "Business transactions"
364
+ - "Errors and exceptions"
365
+
366
+ never_log:
367
+ - "Passwords or secrets"
368
+ - "Full credit card numbers"
369
+ - "Personal health information"
370
+ - "Social security numbers"
371
+ - "Session tokens or API keys"
372
+ - "PII without consent"
373
+
374
+ log_correlation:
375
+ required_fields:
376
+ - "trace_id: Link to distributed trace"
377
+ - "span_id: Current operation"
378
+ - "request_id: Unique request identifier"
379
+ - "user_id: Who made the request (if applicable)"
380
+ ```
381
+
382
+ ### Log Aggregation Patterns
383
+
384
+ ```yaml
385
+ log_pipeline:
386
+ collection:
387
+ agents:
388
+ - "Fluentd/Fluent Bit"
389
+ - "Vector"
390
+ - "Filebeat"
391
+ patterns:
392
+ - "Sidecar container"
393
+ - "DaemonSet"
394
+ - "Direct shipping"
395
+
396
+ processing:
397
+ tasks:
398
+ - "Parse structured logs"
399
+ - "Enrich with metadata"
400
+ - "Filter noise"
401
+ - "Sample high-volume logs"
402
+
403
+ storage:
404
+ options:
405
+ - "Loki (log aggregation)"
406
+ - "Elasticsearch (full-text search)"
407
+ - "S3 (archive)"
408
+ retention:
409
+ hot: "7 days (fast query)"
410
+ warm: "30 days (slower query)"
411
+ cold: "1 year (archive only)"
412
+ ```
413
+
414
+ ## Alerting Best Practices
415
+
416
+ ### Alert Quality
417
+
418
+ ```yaml
419
+ good_alert_characteristics:
420
+ actionable:
421
+ description: "Someone needs to do something"
422
+ test: "If the on-call can't act, don't alert"
423
+
424
+ urgent:
425
+ description: "Needs attention now"
426
+ test: "If it can wait until morning, don't page"
427
+
428
+ relevant:
429
+ description: "Indicates real user impact"
430
+ test: "Does this affect users or business?"
431
+
432
+ clear:
433
+ description: "Alert tells you what's wrong"
434
+ test: "Can on-call understand without context?"
435
+
436
+ alert_anti_patterns:
437
+ flapping_alerts:
438
+ problem: "Alert fires and resolves repeatedly"
439
+ solution: "Require the condition to persist before firing (for: 5m in Prometheus); add hysteresis with keep_firing_for if it still flaps"
440
+
441
+ noisy_alerts:
442
+ problem: "Too many alerts, alert fatigue"
443
+ solution: "Tune thresholds, aggregate related alerts"
444
+
445
+ ambiguous_alerts:
446
+ problem: "Alert doesn't explain what's wrong"
447
+ solution: "Include summary, runbook link, relevant values"
448
+
449
+ orphan_alerts:
450
+ problem: "No runbook or documentation"
451
+ solution: "Every alert must link to a runbook"
452
+ ```
453
+
454
+ ### Alert Structure
455
+
456
+ ```yaml
457
+ prometheus_alert_template:
458
+ example: |
459
+ groups:
460
+ - name: api-server
461
+ rules:
462
+ - alert: APIHighErrorRate
463
+ expr: |
464
+ sum(rate(http_requests_total{job="api-server",status=~"5.."}[5m]))
465
+ / sum(rate(http_requests_total{job="api-server"}[5m]))
466
+ > 0.01
467
+ for: 5m
468
+ labels:
469
+ severity: warning
470
+ team: backend
471
+ service: api-server
472
+ annotations:
473
+ summary: "API error rate above 1%"
474
+ description: "Error rate is {{ $value | humanizePercentage }} over the last 5 minutes"
475
+ runbook_url: "https://wiki.example.com/runbooks/api-high-error-rate"
476
+ dashboard_url: "https://grafana.example.com/d/api-server"
477
+
478
+ required_annotations:
479
+ summary: "One-line description of the problem"
480
+ description: "Detailed description with current values"
481
+ runbook_url: "Link to troubleshooting guide"
482
+ dashboard_url: "Link to relevant dashboard"
483
+ ```
484
+
485
+ ### Alert Routing
486
+
487
+ ```yaml
488
+ alertmanager_routing:
489
+ example: |
490
+ route:
491
+ receiver: 'default-slack'
492
+ group_by: ['alertname', 'service']
493
+ group_wait: 30s
494
+ group_interval: 5m
495
+ repeat_interval: 4h
496
+
497
+ routes:
498
+ # Critical alerts page immediately
499
+ - match:
500
+ severity: critical
501
+ receiver: 'pagerduty-critical'
502
+ continue: true
503
+
504
+ # Warning alerts go to Slack
505
+ - match:
506
+ severity: warning
507
+ receiver: 'slack-warnings'
508
+
509
+ # Team-specific routing
510
+ - match:
511
+ team: backend
512
+ receiver: 'backend-slack'
513
+ routes:
514
+ - match:
515
+ severity: critical
516
+ receiver: 'backend-pagerduty'
517
+
518
+ receivers:
519
+ - name: 'pagerduty-critical'
520
+ pagerduty_configs:
521
+ - service_key: '<key>'
522
+ severity: critical
523
+
524
+ - name: 'slack-warnings'
525
+ slack_configs:
526
+ - channel: '#alerts-warnings'
527
+ title: '{{ .GroupLabels.alertname }}'
528
+ text: '{{ .CommonAnnotations.summary }}'
529
+ ```
530
+
531
+ ## Dashboard Design
532
+
533
+ ### Dashboard Hierarchy
534
+
535
+ ```yaml
536
+ dashboard_levels:
537
+ executive:
538
+ audience: "Leadership, non-technical"
539
+ content:
540
+ - "Overall service health (green/yellow/red)"
541
+ - "SLO status"
542
+ - "Business metrics"
543
+ refresh: "5 minutes"
544
+
545
+ service:
546
+ audience: "On-call engineers, service owners"
547
+ content:
548
+ - "Golden signals for the service"
549
+ - "Error budget status"
550
+ - "Recent deployments"
551
+ - "Dependency health"
552
+ refresh: "30 seconds"
553
+
554
+ debug:
555
+ audience: "Engineers debugging issues"
556
+ content:
557
+ - "Detailed metrics"
558
+ - "Per-instance breakdowns"
559
+ - "Resource utilization"
560
+ - "Log links"
561
+ refresh: "10 seconds"
562
+ ```
563
+
564
+ ### Dashboard Best Practices
565
+
566
+ ```yaml
567
+ layout_guidelines:
568
+ top_row: "Key indicators and health status"
569
+ middle: "Detailed metrics and graphs"
570
+ bottom: "Context (deployments, changes, dependencies)"
571
+
572
+ visualization_choices:
573
+ stat_panel: "Current value, health indicator"
574
+ time_series: "Trends over time"
575
+ heatmap: "Distribution (latency buckets)"
576
+ table: "Detailed breakdown"
577
+ gauge: "Percentage of capacity"
578
+
579
+ common_mistakes:
580
+ - "Too many panels (information overload)"
581
+ - "No clear hierarchy"
582
+ - "Missing time range selector"
583
+ - "No links between dashboards"
584
+ - "No documentation/descriptions"
585
+
586
+ template_variables:
587
+ recommended:
588
+ - "environment: dev, staging, production"
589
+ - "service: service name"
590
+ - "instance: specific instance"
591
+ usage: "Allow filtering without editing dashboard"
592
+ ```
593
+
594
+ ## Correlation and Context
595
+
596
+ ### Connecting the Pillars
597
+
598
+ ```yaml
599
+ correlation_patterns:
600
+ trace_to_logs:
601
+ method: "Include trace_id in all log entries"
602
+ query: "Search logs where trace_id = X"
603
+
604
+ logs_to_metrics:
605
+ method: "Extract metrics from logs"
606
+ example: "Count ERROR logs as error_count metric"
607
+
608
+ metrics_to_traces:
609
+ method: "Exemplars link metrics to traces"
610
+ example: "High latency metric point links to slow trace"
611
+
612
+ alerting_to_dashboards:
613
+ method: "Include dashboard links in alerts"
614
+ example: "Alert annotation includes Grafana link"
615
+
616
+ exemplars:
617
+ description: "Link between metrics and traces"
618
+ usage: |
619
+ # Prometheus histogram with exemplar
620
+ http_request_duration_seconds_bucket{le="0.5"} 1000 # {trace_id="abc123"} 0.43
621
+ grafana: "Click on data point to see trace"
622
+ ```
623
+
624
+ ### Context Propagation
625
+
626
+ ```yaml
627
+ propagation_headers:
628
+ w3c_trace_context:
629
+ header: "traceparent"
630
+ format: "00-{trace_id}-{span_id}-{flags}"
631
+
632
+ baggage:
633
+ header: "baggage"
634
+ usage: "Propagate application-specific context"
635
+ example: "user_id=123,feature_flag=new_checkout"
636
+
637
+ implementation: |
638
+ // Middleware to propagate context
639
+ function tracingMiddleware(req, res, next) {
640
+ const traceId = req.headers['traceparent']
641
+ ? extractTraceId(req.headers['traceparent'])
642
+ : generateTraceId();
643
+
644
+ // Add to request context
645
+ req.traceId = traceId;
646
+
647
+ // Add to response headers
648
+ res.setHeader('traceparent', formatTraceParent(traceId));
649
+
650
+ // Add to logger context
651
+ req.logger = logger.child({ traceId });
652
+
653
+ next();
654
+ }
655
+ ```
656
+
657
+ ## Common Pitfalls
658
+
659
+ ### Metrics Pitfalls
660
+
661
+ ```yaml
662
+ pitfall_cardinality_explosion:
663
+ problem: "Labels with unbounded values (user_id, url)"
664
+ impact: "Memory exhaustion, slow queries"
665
+ solution: "Keep labels bounded, use logs for high-cardinality"
666
+
667
+ pitfall_not_using_histograms:
668
+ problem: "Calculating averages instead of percentiles"
669
+ impact: "Missing tail latency issues"
670
+ solution: "Use histograms, look at p95/p99"
671
+
672
+ pitfall_missing_labels:
673
+ problem: "Can't break down by status code, method"
674
+ impact: "Can't diagnose issues"
675
+ solution: "Include useful dimensions as labels"
676
+ ```
677
+
678
+ ### Logging Pitfalls
679
+
680
+ ```yaml
681
+ pitfall_log_and_throw:
682
+ problem: "Logging error then throwing (double logging)"
683
+ impact: "Duplicate log entries, confusion"
684
+ solution: "Log at the handling point only"
685
+
686
+ pitfall_no_context:
687
+ problem: "Logs don't include trace_id or request context"
688
+ impact: "Can't correlate logs"
689
+ solution: "Always include trace_id, request_id"
690
+
691
+ pitfall_logging_pii:
692
+ problem: "Personal data in logs"
693
+ impact: "Compliance violations"
694
+ solution: "Sanitize logs, use allowlist for fields"
695
+ ```
696
+
697
+ ### Alerting Pitfalls
698
+
699
+ ```yaml
700
+ pitfall_alert_on_causes:
701
+ problem: "Alerting on causes (high CPU) instead of symptoms (slow requests)"
702
+ impact: "Alerts without user impact"
703
+ solution: "Alert on user-facing metrics first"
704
+
705
+ pitfall_no_runbook:
706
+ problem: "Alert fires, on-call doesn't know what to do"
707
+ impact: "Longer MTTR"
708
+ solution: "Every alert must have a runbook link"
709
+
710
+ pitfall_too_sensitive:
711
+ problem: "Alert thresholds too tight"
712
+ impact: "Alert fatigue, ignored alerts"
713
+ solution: "Tune thresholds based on actual impact"
714
+ ```