agentic-team-templates 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/README.md +280 -0
  2. package/bin/cli.js +5 -0
  3. package/package.json +47 -0
  4. package/src/index.js +521 -0
  5. package/templates/_shared/code-quality.md +162 -0
  6. package/templates/_shared/communication.md +114 -0
  7. package/templates/_shared/core-principles.md +62 -0
  8. package/templates/_shared/git-workflow.md +165 -0
  9. package/templates/_shared/security-fundamentals.md +173 -0
  10. package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
  11. package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
  12. package/templates/blockchain/.cursorrules/overview.md +130 -0
  13. package/templates/blockchain/.cursorrules/security.md +318 -0
  14. package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
  15. package/templates/blockchain/.cursorrules/testing.md +415 -0
  16. package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
  17. package/templates/blockchain/CLAUDE.md +389 -0
  18. package/templates/cli-tools/.cursorrules/architecture.md +412 -0
  19. package/templates/cli-tools/.cursorrules/arguments.md +406 -0
  20. package/templates/cli-tools/.cursorrules/distribution.md +546 -0
  21. package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
  22. package/templates/cli-tools/.cursorrules/overview.md +136 -0
  23. package/templates/cli-tools/.cursorrules/testing.md +537 -0
  24. package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
  25. package/templates/cli-tools/CLAUDE.md +356 -0
  26. package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
  27. package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
  28. package/templates/data-engineering/.cursorrules/overview.md +85 -0
  29. package/templates/data-engineering/.cursorrules/performance.md +339 -0
  30. package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
  31. package/templates/data-engineering/.cursorrules/security.md +460 -0
  32. package/templates/data-engineering/.cursorrules/testing.md +452 -0
  33. package/templates/data-engineering/CLAUDE.md +974 -0
  34. package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
  35. package/templates/devops-sre/.cursorrules/change-management.md +584 -0
  36. package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
  37. package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
  38. package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
  39. package/templates/devops-sre/.cursorrules/observability.md +714 -0
  40. package/templates/devops-sre/.cursorrules/overview.md +230 -0
  41. package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
  42. package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
  43. package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
  44. package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
  45. package/templates/devops-sre/CLAUDE.md +1007 -0
  46. package/templates/documentation/.cursorrules/adr.md +277 -0
  47. package/templates/documentation/.cursorrules/api-documentation.md +411 -0
  48. package/templates/documentation/.cursorrules/code-comments.md +253 -0
  49. package/templates/documentation/.cursorrules/maintenance.md +260 -0
  50. package/templates/documentation/.cursorrules/overview.md +82 -0
  51. package/templates/documentation/.cursorrules/readme-standards.md +306 -0
  52. package/templates/documentation/CLAUDE.md +120 -0
  53. package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
  54. package/templates/fullstack/.cursorrules/architecture.md +298 -0
  55. package/templates/fullstack/.cursorrules/overview.md +109 -0
  56. package/templates/fullstack/.cursorrules/shared-types.md +348 -0
  57. package/templates/fullstack/.cursorrules/testing.md +386 -0
  58. package/templates/fullstack/CLAUDE.md +349 -0
  59. package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
  60. package/templates/ml-ai/.cursorrules/deployment.md +601 -0
  61. package/templates/ml-ai/.cursorrules/model-development.md +538 -0
  62. package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
  63. package/templates/ml-ai/.cursorrules/overview.md +131 -0
  64. package/templates/ml-ai/.cursorrules/security.md +637 -0
  65. package/templates/ml-ai/.cursorrules/testing.md +678 -0
  66. package/templates/ml-ai/CLAUDE.md +1136 -0
  67. package/templates/mobile/.cursorrules/navigation.md +246 -0
  68. package/templates/mobile/.cursorrules/offline-first.md +302 -0
  69. package/templates/mobile/.cursorrules/overview.md +71 -0
  70. package/templates/mobile/.cursorrules/performance.md +345 -0
  71. package/templates/mobile/.cursorrules/testing.md +339 -0
  72. package/templates/mobile/CLAUDE.md +233 -0
  73. package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
  74. package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
  75. package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
  76. package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
  77. package/templates/platform-engineering/.cursorrules/observability.md +747 -0
  78. package/templates/platform-engineering/.cursorrules/overview.md +215 -0
  79. package/templates/platform-engineering/.cursorrules/security.md +855 -0
  80. package/templates/platform-engineering/.cursorrules/testing.md +878 -0
  81. package/templates/platform-engineering/CLAUDE.md +850 -0
  82. package/templates/utility-agent/.cursorrules/action-control.md +284 -0
  83. package/templates/utility-agent/.cursorrules/context-management.md +186 -0
  84. package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
  85. package/templates/utility-agent/.cursorrules/overview.md +78 -0
  86. package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
  87. package/templates/utility-agent/CLAUDE.md +513 -0
  88. package/templates/web-backend/.cursorrules/api-design.md +255 -0
  89. package/templates/web-backend/.cursorrules/authentication.md +309 -0
  90. package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
  91. package/templates/web-backend/.cursorrules/error-handling.md +366 -0
  92. package/templates/web-backend/.cursorrules/overview.md +69 -0
  93. package/templates/web-backend/.cursorrules/security.md +358 -0
  94. package/templates/web-backend/.cursorrules/testing.md +395 -0
  95. package/templates/web-backend/CLAUDE.md +366 -0
  96. package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
  97. package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
  98. package/templates/web-frontend/.cursorrules/overview.md +72 -0
  99. package/templates/web-frontend/.cursorrules/performance.md +325 -0
  100. package/templates/web-frontend/.cursorrules/state-management.md +227 -0
  101. package/templates/web-frontend/.cursorrules/styling.md +271 -0
  102. package/templates/web-frontend/.cursorrules/testing.md +311 -0
  103. package/templates/web-frontend/CLAUDE.md +399 -0
@@ -0,0 +1,747 @@
1
+ # Observability
2
+
3
+ Guidelines for implementing comprehensive observability across the platform.
4
+
5
+ ## Core Principles
6
+
7
+ 1. **Three Pillars** - Metrics, logs, and traces working together
8
+ 2. **SLO-Driven** - Define what matters before instrumenting everything
9
+ 3. **Context Propagation** - Trace requests across service boundaries
10
+ 4. **Actionable Alerts** - Every alert should have a clear response
11
+
12
+ ## The Three Pillars
13
+
14
+ ### Metrics (Prometheus)
15
+
16
+ ```yaml
17
+ # ServiceMonitor for automatic discovery
18
+ apiVersion: monitoring.coreos.com/v1
19
+ kind: ServiceMonitor
20
+ metadata:
21
+ name: api-server
22
+ labels:
23
+ release: prometheus
24
+ spec:
25
+ selector:
26
+ matchLabels:
27
+ app.kubernetes.io/name: api-server
28
+ namespaceSelector:
29
+ matchNames:
30
+ - production
31
+ endpoints:
32
+ - port: metrics
33
+ interval: 30s
34
+ path: /metrics
35
+ scrapeTimeout: 10s
36
+
37
+ # Relabeling
38
+ relabelings:
39
+ - sourceLabels: [__meta_kubernetes_pod_label_app_kubernetes_io_version]
40
+ targetLabel: version
41
+
42
+ # Metric relabeling (drop high-cardinality)
43
+ metricRelabelings:
44
+ - sourceLabels: [__name__]
45
+ regex: 'go_gc_.*'
46
+ action: drop
47
+ ```
48
+
49
+ ### RED Method (Request-oriented)
50
+
51
+ ```go
52
+ // Instrument HTTP handlers with RED metrics
53
+ var (
54
+ requestsTotal = prometheus.NewCounterVec(
55
+ prometheus.CounterOpts{
56
+ Name: "http_requests_total",
57
+ Help: "Total HTTP requests",
58
+ },
59
+ []string{"method", "path", "status"},
60
+ )
61
+
62
+ requestDuration = prometheus.NewHistogramVec(
63
+ prometheus.HistogramOpts{
64
+ Name: "http_request_duration_seconds",
65
+ Help: "HTTP request duration",
66
+ Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10},
67
+ },
68
+ []string{"method", "path"},
69
+ )
70
+
71
+ requestsInFlight = prometheus.NewGauge(
72
+ prometheus.GaugeOpts{
73
+ Name: "http_requests_in_flight",
74
+ Help: "Current number of HTTP requests being processed",
75
+ },
76
+ )
77
+ )
78
+
79
+ func instrumentHandler(next http.Handler) http.Handler {
80
+ return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
81
+ requestsInFlight.Inc()
82
+ defer requestsInFlight.Dec()
83
+
84
+ start := time.Now()
85
+ wrapped := &responseWriter{ResponseWriter: w, statusCode: 200}
86
+
87
+ next.ServeHTTP(wrapped, r)
88
+
89
+ duration := time.Since(start).Seconds()
90
+ path := normalizePath(r.URL.Path) // Avoid high cardinality
91
+
92
+ requestsTotal.WithLabelValues(r.Method, path, strconv.Itoa(wrapped.statusCode)).Inc()
93
+ requestDuration.WithLabelValues(r.Method, path).Observe(duration)
94
+ })
95
+ }
96
+ ```
97
+
98
+ ### USE Method (Resource-oriented)
99
+
100
+ ```yaml
101
+ # Resource utilization metrics
102
+ groups:
103
+ - name: resource-metrics
104
+ rules:
105
+ # CPU Utilization
106
+ - record: instance:node_cpu_utilization:ratio
107
+ expr: |
108
+ 1 - avg by (instance) (
109
+ rate(node_cpu_seconds_total{mode="idle"}[5m])
110
+ )
111
+
112
+ # Memory Utilization
113
+ - record: instance:node_memory_utilization:ratio
114
+ expr: |
115
+ 1 - (
116
+ node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes
117
+ )
118
+
119
+ # Disk Utilization
120
+ - record: instance:node_disk_utilization:ratio
121
+ expr: |
122
+ 1 - (
123
+ node_filesystem_avail_bytes{mountpoint="/"} /
124
+ node_filesystem_size_bytes{mountpoint="/"}
125
+ )
126
+ ```
127
+
128
+ ### Logs (Loki)
129
+
130
+ ```yaml
131
+ # Promtail configuration for log collection
132
+ apiVersion: v1
133
+ kind: ConfigMap
134
+ metadata:
135
+ name: promtail-config
136
+ data:
137
+ promtail.yaml: |
138
+ server:
139
+ http_listen_port: 9080
140
+
141
+ positions:
142
+ filename: /tmp/positions.yaml
143
+
144
+ clients:
145
+ - url: http://loki:3100/loki/api/v1/push
146
+
147
+ scrape_configs:
148
+ - job_name: kubernetes-pods
149
+ kubernetes_sd_configs:
150
+ - role: pod
151
+
152
+ relabel_configs:
153
+ # Keep only pods with logging enabled
154
+ - source_labels: [__meta_kubernetes_pod_annotation_logging_enabled]
155
+ action: keep
156
+ regex: true
157
+
158
+ # Add namespace label
159
+ - source_labels: [__meta_kubernetes_namespace]
160
+ target_label: namespace
161
+
162
+ # Add pod name label
163
+ - source_labels: [__meta_kubernetes_pod_name]
164
+ target_label: pod
165
+
166
+ # Add container name label
167
+ - source_labels: [__meta_kubernetes_pod_container_name]
168
+ target_label: container
169
+
170
+ # Add app label
171
+ - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
172
+ target_label: app
173
+
174
+ pipeline_stages:
175
+ # Parse JSON logs
176
+ - json:
177
+ expressions:
178
+ level: level
179
+ message: msg
180
+ trace_id: trace_id
181
+ span_id: span_id
182
+
183
+ # Add labels from parsed JSON
184
+ - labels:
185
+ level:
186
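+ # trace_id is intentionally NOT promoted to a label: per-request values create unbounded stream cardinality in Loki; filter on it at query time instead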
187
+
188
+ # Parse timestamp
189
+ - timestamp:
190
+ source: time
191
+ format: RFC3339Nano
192
+ ```
193
+
194
+ ### Structured Logging
195
+
196
+ ```go
197
+ // Always use structured logging
198
+ import "go.uber.org/zap"
199
+
200
+ logger, _ := zap.NewProduction()
201
+ defer logger.Sync()
202
+
203
+ // Good - structured with context
204
+ logger.Info("request processed",
205
+ zap.String("method", r.Method),
206
+ zap.String("path", r.URL.Path),
207
+ zap.Int("status", status),
208
+ zap.Duration("duration", duration),
209
+ zap.String("trace_id", traceID),
210
+ zap.String("user_id", userID),
211
+ )
212
+
213
+ // Bad - unstructured
214
+ logger.Info(fmt.Sprintf("processed %s %s in %v", r.Method, r.URL.Path, duration))
215
+ ```
216
+
217
+ ### Log Levels
218
+
219
+ ```go
220
+ // Use appropriate log levels
221
+ logger.Debug("detailed debugging info") // Development/troubleshooting
222
+ logger.Info("normal operation events") // Request processed, job completed
223
+ logger.Warn("recoverable issues") // Retry succeeded, deprecated API used
224
+ logger.Error("errors requiring attention", // Request failed, connection lost
225
+ zap.Error(err))
226
+ // Fatal/Panic - avoid in production; let orchestrator handle restarts
227
+ ```
228
+
229
+ ### Traces (OpenTelemetry)
230
+
231
+ ```go
232
+ // OpenTelemetry setup
233
+ import (
234
+ "go.opentelemetry.io/otel"
235
+ "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
236
+ "go.opentelemetry.io/otel/sdk/trace"
237
+ )
238
+
239
+ func initTracer(ctx context.Context) (*trace.TracerProvider, error) {
240
+ exporter, err := otlptracegrpc.New(ctx,
241
+ otlptracegrpc.WithEndpoint("tempo:4317"),
242
+ otlptracegrpc.WithInsecure(),
243
+ )
244
+ if err != nil {
245
+ return nil, err
246
+ }
247
+
248
+ tp := trace.NewTracerProvider(
249
+ trace.WithBatcher(exporter),
250
+ trace.WithResource(resource.NewWithAttributes(
251
+ semconv.SchemaURL,
252
+ semconv.ServiceNameKey.String("api-server"),
253
+ semconv.ServiceVersionKey.String(version),
254
+ semconv.DeploymentEnvironmentKey.String(env),
255
+ )),
256
+ trace.WithSampler(trace.ParentBased(
257
+ trace.TraceIDRatioBased(0.1), // 10% sampling
258
+ )),
259
+ )
260
+
261
+ otel.SetTracerProvider(tp)
262
+ otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
263
+ propagation.TraceContext{},
264
+ propagation.Baggage{},
265
+ ))
266
+
267
+ return tp, nil
268
+ }
269
+
270
+ // Create spans for operations
271
+ func handleRequest(ctx context.Context, req *Request) (*Response, error) {
272
+ ctx, span := tracer.Start(ctx, "handleRequest",
273
+ trace.WithAttributes(
274
+ attribute.String("request.id", req.ID),
275
+ attribute.String("request.type", req.Type),
276
+ ),
277
+ )
278
+ defer span.End()
279
+
280
+ // Database call with child span
281
+ dbCtx, dbSpan := tracer.Start(ctx, "database.query")
282
+ result, err := db.QueryContext(dbCtx, query)
283
+ if err != nil {
284
+ dbSpan.RecordError(err)
285
+ dbSpan.SetStatus(codes.Error, err.Error())
286
+ }
287
+ dbSpan.End()
288
+
289
+ // HTTP call to another service
290
+ outReq, _ := http.NewRequestWithContext(ctx, "GET", url, nil)
291
+ otel.GetTextMapPropagator().Inject(ctx, propagation.HeaderCarrier(outReq.Header))
292
+
293
+ return result, err
294
+ }
295
+ ```
296
+
297
+ ## SLOs and SLIs
298
+
299
+ ### SLO Definition (Sloth)
300
+
301
+ ```yaml
302
+ apiVersion: sloth.slok.dev/v1
303
+ kind: PrometheusServiceLevel
304
+ metadata:
305
+ name: api-server
306
+ namespace: monitoring
307
+ spec:
308
+ service: "api-server"
309
+ labels:
310
+ team: platform
311
+ tier: "1"
312
+
313
+ slos:
314
+ # Availability SLO
315
+ - name: "requests-availability"
316
+ objective: 99.9 # 99.9% success rate
317
+ description: "99.9% of requests should succeed"
318
+ sli:
319
+ events:
320
+ errorQuery: |
321
+ sum(rate(http_requests_total{
322
+ job="api-server",
323
+ status=~"5.."
324
+ }[{{.window}}]))
325
+ totalQuery: |
326
+ sum(rate(http_requests_total{
327
+ job="api-server"
328
+ }[{{.window}}]))
329
+ alerting:
330
+ name: APIServerHighErrorRate
331
+ labels:
332
+ category: availability
333
+ annotations:
334
+ summary: "API Server error rate is too high"
335
+ runbook: "https://runbooks.example.com/api-server/high-error-rate"
336
+ pageAlert:
337
+ labels:
338
+ severity: critical
339
+ notify: pagerduty
340
+ ticketAlert:
341
+ labels:
342
+ severity: warning
343
+ notify: slack
344
+
345
+ # Latency SLO
346
+ - name: "requests-latency"
347
+ objective: 99.0 # 99% under 500ms
348
+ description: "99% of requests should complete within 500ms"
349
+ sli:
350
+ events:
351
+ errorQuery: |
352
+ (
353
+ sum(rate(http_request_duration_seconds_count{job="api-server"}[{{.window}}]))
354
+ - sum(rate(http_request_duration_seconds_bucket{job="api-server",le="0.5"}[{{.window}}]))
355
+ )
356
+ totalQuery: |
357
+ sum(rate(http_request_duration_seconds_count{
358
+ job="api-server"
359
+ }[{{.window}}]))
360
+ alerting:
361
+ name: APIServerHighLatency
362
+ labels:
363
+ category: latency
364
+ pageAlert:
365
+ labels:
366
+ severity: critical
367
+ ticketAlert:
368
+ labels:
369
+ severity: warning
370
+ ```
371
+
372
+ ### Error Budget Dashboard
373
+
374
+ ```yaml
375
+ # Grafana dashboard for error budget
376
+ panels:
377
+ - title: "Error Budget Remaining"
378
+ type: gauge
379
+ targets:
380
+ - expr: |
381
+ 1 - (
382
+ sum(rate(http_requests_total{status=~"5.."}[30d]))
383
+ /
384
+ sum(rate(http_requests_total[30d]))
385
+ ) / (1 - 0.999)
386
+ thresholds:
387
+ - value: 0
388
+ color: red
389
+ - value: 0.25
390
+ color: orange
391
+ - value: 0.5
392
+ color: yellow
393
+ - value: 0.75
394
+ color: green
395
+
396
+ - title: "Error Budget Burn Rate"
397
+ type: graph
398
+ targets:
399
+ - expr: |
400
+ (
401
+ sum(rate(http_requests_total{status=~"5.."}[1h]))
402
+ /
403
+ sum(rate(http_requests_total[1h]))
404
+ ) / (1 - 0.999)
405
+ legendFormat: "1h burn rate"
406
+ - expr: |
407
+ (
408
+ sum(rate(http_requests_total{status=~"5.."}[6h]))
409
+ /
410
+ sum(rate(http_requests_total[6h]))
411
+ ) / (1 - 0.999)
412
+ legendFormat: "6h burn rate"
413
+ ```
414
+
415
+ ## Alerting
416
+
417
+ ### Alert Rules
418
+
419
+ ```yaml
420
+ apiVersion: monitoring.coreos.com/v1
421
+ kind: PrometheusRule
422
+ metadata:
423
+ name: api-server-alerts
424
+ spec:
425
+ groups:
426
+ - name: api-server.rules
427
+ rules:
428
+ # High error rate (immediate)
429
+ - alert: APIServerHighErrorRate
430
+ expr: |
431
+ sum(rate(http_requests_total{job="api-server",status=~"5.."}[5m]))
432
+ /
433
+ sum(rate(http_requests_total{job="api-server"}[5m]))
434
+ > 0.01
435
+ for: 5m
436
+ labels:
437
+ severity: critical
438
+ team: platform
439
+ annotations:
440
+ summary: "API Server error rate > 1%"
441
+ description: "Error rate is {{ $value | humanizePercentage }}"
442
+ runbook_url: "https://runbooks.example.com/api-server/high-error-rate"
443
+ dashboard_url: "https://grafana.example.com/d/api-server"
444
+
445
+ # High latency
446
+ - alert: APIServerHighLatency
447
+ expr: |
448
+ histogram_quantile(0.99,
449
+ sum(rate(http_request_duration_seconds_bucket{job="api-server"}[5m])) by (le)
450
+ ) > 1
451
+ for: 10m
452
+ labels:
453
+ severity: warning
454
+ team: platform
455
+ annotations:
456
+ summary: "API Server P99 latency > 1s"
457
+ description: "P99 latency is {{ $value | humanizeDuration }}"
458
+
459
+ # Pod crash looping
460
+ - alert: APIServerPodCrashLooping
461
+ expr: |
462
+ increase(kube_pod_container_status_restarts_total{
463
+ namespace="production",
464
+ pod=~"api-server.*"
465
+ }[15m]) > 0
466
+ for: 5m
467
+ labels:
468
+ severity: critical
469
+ annotations:
470
+ summary: "API Server pod is crash looping"
471
+ description: "Pod {{ $labels.pod }} has restarted {{ $value }} times"
472
+
473
+ # High memory usage
474
+ - alert: APIServerHighMemory
475
+ expr: |
476
+ container_memory_usage_bytes{
477
+ namespace="production",
478
+ pod=~"api-server.*"
479
+ }
480
+ /
481
+ container_spec_memory_limit_bytes{
482
+ namespace="production",
483
+ pod=~"api-server.*"
484
+ }
485
+ > 0.9
486
+ for: 5m
487
+ labels:
488
+ severity: warning
489
+ annotations:
490
+ summary: "API Server memory usage > 90%"
491
+ ```
492
+
493
+ ### Alert Routing (Alertmanager)
494
+
495
+ ```yaml
496
+ apiVersion: v1
497
+ kind: Secret
498
+ metadata:
499
+ name: alertmanager-config
500
+ stringData:
501
+ alertmanager.yaml: |
502
+ global:
503
+ resolve_timeout: 5m
504
+ slack_api_url: 'https://hooks.slack.com/services/xxx'
505
+ pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'
506
+
507
+ route:
508
+ receiver: 'default'
509
+ group_by: ['alertname', 'namespace', 'service']
510
+ group_wait: 30s
511
+ group_interval: 5m
512
+ repeat_interval: 4h
513
+
514
+ routes:
515
+ # Critical alerts -> PagerDuty
516
+ - match:
517
+ severity: critical
518
+ receiver: 'pagerduty-critical'
519
+ continue: true
520
+
521
+ # Warning alerts -> Slack
522
+ - match:
523
+ severity: warning
524
+ receiver: 'slack-warnings'
525
+
526
+ # Team-specific routing
527
+ - match:
528
+ team: platform
529
+ receiver: 'platform-team'
530
+
531
+ receivers:
532
+ - name: 'default'
533
+ slack_configs:
534
+ - channel: '#alerts'
535
+
536
+ - name: 'pagerduty-critical'
537
+ pagerduty_configs:
538
+ - service_key: '<pagerduty-service-key>'
539
+ severity: critical
540
+ description: '{{ .GroupLabels.alertname }}'
541
+ details:
542
+ firing: '{{ template "pagerduty.default.instances" .Alerts.Firing }}'
543
+
544
+ - name: 'slack-warnings'
545
+ slack_configs:
546
+ - channel: '#alerts-warnings'
547
+ send_resolved: true
548
+ title: '{{ .GroupLabels.alertname }}'
549
+ text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
550
+
551
+ - name: 'platform-team'
552
+ slack_configs:
553
+ - channel: '#platform-alerts'
554
+ ```
555
+
556
+ ## Dashboards
557
+
558
+ ### Service Dashboard Template
559
+
560
+ ```yaml
561
+ # Grafana dashboard as code
562
+ apiVersion: v1
563
+ kind: ConfigMap
564
+ metadata:
565
+ name: api-server-dashboard
566
+ labels:
567
+ grafana_dashboard: "1"
568
+ data:
569
+ api-server.json: |
570
+ {
571
+ "title": "API Server",
572
+ "uid": "api-server",
573
+ "tags": ["production", "api"],
574
+ "templating": {
575
+ "list": [
576
+ {
577
+ "name": "namespace",
578
+ "type": "query",
579
+ "query": "label_values(http_requests_total, namespace)"
580
+ }
581
+ ]
582
+ },
583
+ "panels": [
584
+ {
585
+ "title": "Request Rate",
586
+ "type": "graph",
587
+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
588
+ "targets": [
589
+ {
590
+ "expr": "sum(rate(http_requests_total{namespace=\"$namespace\"}[5m])) by (status)",
591
+ "legendFormat": "{{status}}"
592
+ }
593
+ ]
594
+ },
595
+ {
596
+ "title": "Latency (P50, P95, P99)",
597
+ "type": "graph",
598
+ "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
599
+ "targets": [
600
+ {
601
+ "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\"}[5m])) by (le))",
602
+ "legendFormat": "P50"
603
+ },
604
+ {
605
+ "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\"}[5m])) by (le))",
606
+ "legendFormat": "P95"
607
+ },
608
+ {
609
+ "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\"}[5m])) by (le))",
610
+ "legendFormat": "P99"
611
+ }
612
+ ]
613
+ },
614
+ {
615
+ "title": "Error Rate",
616
+ "type": "stat",
617
+ "gridPos": {"h": 4, "w": 6, "x": 0, "y": 8},
618
+ "targets": [
619
+ {
620
+ "expr": "sum(rate(http_requests_total{namespace=\"$namespace\",status=~\"5..\"}[5m])) / sum(rate(http_requests_total{namespace=\"$namespace\"}[5m]))"
621
+ }
622
+ ],
623
+ "fieldConfig": {
624
+ "defaults": {
625
+ "unit": "percentunit",
626
+ "thresholds": {
627
+ "steps": [
628
+ {"value": 0, "color": "green"},
629
+ {"value": 0.01, "color": "yellow"},
630
+ {"value": 0.05, "color": "red"}
631
+ ]
632
+ }
633
+ }
634
+ }
635
+ }
636
+ ]
637
+ }
638
+ ```
639
+
640
+ ## Correlation
641
+
642
+ ### Linking Metrics, Logs, and Traces
643
+
644
+ ```yaml
645
+ # Grafana data source configuration for correlation
646
+ apiVersion: 1
647
+ datasources:
648
+ - name: Prometheus
649
+ type: prometheus
650
+ url: http://prometheus:9090
651
+ jsonData:
652
+ exemplarTraceIdDestinations:
653
+ - name: trace_id
654
+ datasourceUid: tempo
655
+
656
+ - name: Loki
657
+ type: loki
658
+ url: http://loki:3100
659
+ jsonData:
660
+ derivedFields:
661
+ - name: TraceID
662
+ matcherRegex: '"trace_id":"(\w+)"'
663
+ url: '$${__value.raw}'
664
+ datasourceUid: tempo
665
+
666
+ - name: Tempo
667
+ type: tempo
668
+ url: http://tempo:3200
669
+ jsonData:
670
+ tracesToLogs:
671
+ datasourceUid: loki
672
+ tags: ['app', 'namespace']
673
+ mappedTags: [{ key: 'service.name', value: 'app' }]
674
+ mapTagNamesEnabled: true
675
+ spanStartTimeShift: '-1h'
676
+ spanEndTimeShift: '1h'
677
+ filterByTraceID: true
678
+ filterBySpanID: false
679
+ tracesToMetrics:
680
+ datasourceUid: prometheus
681
+ tags: [{ key: 'service.name', value: 'job' }]
682
+ queries:
683
+ - name: 'Request rate'
684
+ query: 'sum(rate(http_requests_total{$$__tags}[5m]))'
685
+ ```
686
+
687
+ ## Common Pitfalls
688
+
689
+ ### 1. Alert Fatigue
690
+
691
+ ```yaml
692
+ # Bad - too sensitive, will fire constantly
693
+ - alert: HighErrorRate
694
+ expr: http_errors_total > 0
695
+ for: 1m
696
+
697
+ # Good - meaningful thresholds with context
698
+ - alert: HighErrorRate
699
+ expr: |
700
+ sum(rate(http_requests_total{status=~"5.."}[5m]))
701
+ /
702
+ sum(rate(http_requests_total[5m]))
703
+ > 0.01
704
+ for: 5m
705
+ labels:
706
+ severity: warning
707
+ ```
708
+
709
+ ### 2. High Cardinality Metrics
710
+
711
+ ```go
712
+ // Bad - unbounded cardinality
713
+ requestsTotal.WithLabelValues(userID, requestPath, queryString)
714
+
715
+ // Good - bounded, normalized labels
716
+ requestsTotal.WithLabelValues(method, normalizePath(requestPath), statusCode)
717
+ ```
718
+
719
+ ### 3. Missing Context in Logs
720
+
721
+ ```go
722
+ // Bad - no context
723
+ log.Error("request failed")
724
+
725
+ // Good - full context
726
+ logger.Error("request failed",
727
+ zap.String("trace_id", traceID),
728
+ zap.String("user_id", userID),
729
+ zap.String("path", path),
730
+ zap.Error(err),
731
+ )
732
+ ```
733
+
734
+ ### 4. Sampling Without Thought
735
+
736
+ ```go
737
+ // Bad - random sampling misses important traces
738
+ sampler := trace.TraceIDRatioBased(0.01) // 1%
739
+
740
+ // Good - sample based on importance
741
+ sampler := trace.ParentBased(
742
+ trace.TraceIDRatioBased(0.1), // 10% base rate
743
+ trace.WithLocalParentSampled(trace.AlwaysSample()), // Always sample if parent sampled
744
+ trace.WithRemoteParentSampled(trace.AlwaysSample()),
745
+ )
746
+ // Plus: always sample errors, slow requests, specific user IDs
747
+ ```