agentic-team-templates 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +280 -0
- package/bin/cli.js +5 -0
- package/package.json +47 -0
- package/src/index.js +521 -0
- package/templates/_shared/code-quality.md +162 -0
- package/templates/_shared/communication.md +114 -0
- package/templates/_shared/core-principles.md +62 -0
- package/templates/_shared/git-workflow.md +165 -0
- package/templates/_shared/security-fundamentals.md +173 -0
- package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
- package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
- package/templates/blockchain/.cursorrules/overview.md +130 -0
- package/templates/blockchain/.cursorrules/security.md +318 -0
- package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
- package/templates/blockchain/.cursorrules/testing.md +415 -0
- package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
- package/templates/blockchain/CLAUDE.md +389 -0
- package/templates/cli-tools/.cursorrules/architecture.md +412 -0
- package/templates/cli-tools/.cursorrules/arguments.md +406 -0
- package/templates/cli-tools/.cursorrules/distribution.md +546 -0
- package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
- package/templates/cli-tools/.cursorrules/overview.md +136 -0
- package/templates/cli-tools/.cursorrules/testing.md +537 -0
- package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
- package/templates/cli-tools/CLAUDE.md +356 -0
- package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
- package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
- package/templates/data-engineering/.cursorrules/overview.md +85 -0
- package/templates/data-engineering/.cursorrules/performance.md +339 -0
- package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
- package/templates/data-engineering/.cursorrules/security.md +460 -0
- package/templates/data-engineering/.cursorrules/testing.md +452 -0
- package/templates/data-engineering/CLAUDE.md +974 -0
- package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
- package/templates/devops-sre/.cursorrules/change-management.md +584 -0
- package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
- package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
- package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
- package/templates/devops-sre/.cursorrules/observability.md +714 -0
- package/templates/devops-sre/.cursorrules/overview.md +230 -0
- package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
- package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
- package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
- package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
- package/templates/devops-sre/CLAUDE.md +1007 -0
- package/templates/documentation/.cursorrules/adr.md +277 -0
- package/templates/documentation/.cursorrules/api-documentation.md +411 -0
- package/templates/documentation/.cursorrules/code-comments.md +253 -0
- package/templates/documentation/.cursorrules/maintenance.md +260 -0
- package/templates/documentation/.cursorrules/overview.md +82 -0
- package/templates/documentation/.cursorrules/readme-standards.md +306 -0
- package/templates/documentation/CLAUDE.md +120 -0
- package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
- package/templates/fullstack/.cursorrules/architecture.md +298 -0
- package/templates/fullstack/.cursorrules/overview.md +109 -0
- package/templates/fullstack/.cursorrules/shared-types.md +348 -0
- package/templates/fullstack/.cursorrules/testing.md +386 -0
- package/templates/fullstack/CLAUDE.md +349 -0
- package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
- package/templates/ml-ai/.cursorrules/deployment.md +601 -0
- package/templates/ml-ai/.cursorrules/model-development.md +538 -0
- package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
- package/templates/ml-ai/.cursorrules/overview.md +131 -0
- package/templates/ml-ai/.cursorrules/security.md +637 -0
- package/templates/ml-ai/.cursorrules/testing.md +678 -0
- package/templates/ml-ai/CLAUDE.md +1136 -0
- package/templates/mobile/.cursorrules/navigation.md +246 -0
- package/templates/mobile/.cursorrules/offline-first.md +302 -0
- package/templates/mobile/.cursorrules/overview.md +71 -0
- package/templates/mobile/.cursorrules/performance.md +345 -0
- package/templates/mobile/.cursorrules/testing.md +339 -0
- package/templates/mobile/CLAUDE.md +233 -0
- package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
- package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
- package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
- package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
- package/templates/platform-engineering/.cursorrules/observability.md +747 -0
- package/templates/platform-engineering/.cursorrules/overview.md +215 -0
- package/templates/platform-engineering/.cursorrules/security.md +855 -0
- package/templates/platform-engineering/.cursorrules/testing.md +878 -0
- package/templates/platform-engineering/CLAUDE.md +850 -0
- package/templates/utility-agent/.cursorrules/action-control.md +284 -0
- package/templates/utility-agent/.cursorrules/context-management.md +186 -0
- package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
- package/templates/utility-agent/.cursorrules/overview.md +78 -0
- package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
- package/templates/utility-agent/CLAUDE.md +513 -0
- package/templates/web-backend/.cursorrules/api-design.md +255 -0
- package/templates/web-backend/.cursorrules/authentication.md +309 -0
- package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
- package/templates/web-backend/.cursorrules/error-handling.md +366 -0
- package/templates/web-backend/.cursorrules/overview.md +69 -0
- package/templates/web-backend/.cursorrules/security.md +358 -0
- package/templates/web-backend/.cursorrules/testing.md +395 -0
- package/templates/web-backend/CLAUDE.md +366 -0
- package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
- package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
- package/templates/web-frontend/.cursorrules/overview.md +72 -0
- package/templates/web-frontend/.cursorrules/performance.md +325 -0
- package/templates/web-frontend/.cursorrules/state-management.md +227 -0
- package/templates/web-frontend/.cursorrules/styling.md +271 -0
- package/templates/web-frontend/.cursorrules/testing.md +311 -0
- package/templates/web-frontend/CLAUDE.md +399 -0
@@ -0,0 +1,747 @@
# Observability

Guidelines for implementing comprehensive observability across the platform.

## Core Principles

1. **Three Pillars** - Metrics, logs, and traces working together
2. **SLO-Driven** - Define what matters before instrumenting everything
3. **Context Propagation** - Trace requests across service boundaries
4. **Actionable Alerts** - Every alert should have a clear response

## The Three Pillars

### Metrics (Prometheus)

```yaml
# ServiceMonitor for automatic discovery
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: api-server
  labels:
    release: prometheus
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: api-server
  namespaceSelector:
    matchNames:
      - production
  endpoints:
    - port: metrics
      interval: 30s
      path: /metrics
      scrapeTimeout: 10s

      # Relabeling
      relabelings:
        - sourceLabels: [__meta_kubernetes_pod_label_app_kubernetes_io_version]
          targetLabel: version

      # Metric relabeling (drop high-cardinality)
      metricRelabelings:
        - sourceLabels: [__name__]
          regex: 'go_gc_.*'
          action: drop
```
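
For the ServiceMonitor above to find anything, the workload has to expose a Prometheus endpoint on the named `metrics` port. A minimal sketch of the serving side, assuming the default registry and `:9090` as the container port (both illustrative):

```go
package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// Serve the default registry on the path/port the ServiceMonitor scrapes.
	http.Handle("/metrics", promhttp.Handler())
	http.ListenAndServe(":9090", nil)
}
```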

### RED Method (Request-oriented)

```go
// Instrument HTTP handlers with RED metrics
import (
	"net/http"
	"strconv"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

var (
	requestsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "http_requests_total",
			Help: "Total HTTP requests",
		},
		[]string{"method", "path", "status"},
	)

	requestDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "http_request_duration_seconds",
			Help:    "HTTP request duration",
			Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10},
		},
		[]string{"method", "path"},
	)

	requestsInFlight = prometheus.NewGauge(
		prometheus.GaugeOpts{
			Name: "http_requests_in_flight",
			Help: "Current number of HTTP requests being processed",
		},
	)
)

// Register the collectors once at startup; unregistered metrics are never exported.
func init() {
	prometheus.MustRegister(requestsTotal, requestDuration, requestsInFlight)
}

func instrumentHandler(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		requestsInFlight.Inc()
		defer requestsInFlight.Dec()

		start := time.Now()
		wrapped := &responseWriter{ResponseWriter: w, statusCode: 200}

		next.ServeHTTP(wrapped, r)

		duration := time.Since(start).Seconds()
		path := normalizePath(r.URL.Path) // Avoid high cardinality

		requestsTotal.WithLabelValues(r.Method, path, strconv.Itoa(wrapped.statusCode)).Inc()
		requestDuration.WithLabelValues(r.Method, path).Observe(duration)
	})
}
```
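
The handler above assumes a `responseWriter` wrapper and a `normalizePath` helper that aren't shown. A sketch of what they might look like; the `/users/:id` route is a hypothetical placeholder, and with a real router you would prefer its matched-route template:

```go
import (
	"net/http"
	"strings"
)

// responseWriter records the status code written by the wrapped handler.
type responseWriter struct {
	http.ResponseWriter
	statusCode int
}

func (w *responseWriter) WriteHeader(code int) {
	w.statusCode = code
	w.ResponseWriter.WriteHeader(code)
}

// normalizePath collapses concrete URLs into route templates so the "path"
// label stays bounded (see Common Pitfalls below).
func normalizePath(path string) string {
	if strings.HasPrefix(path, "/users/") { // hypothetical route
		return "/users/:id"
	}
	return path
}
```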

### USE Method (Resource-oriented)

```yaml
# Resource utilization metrics
groups:
  - name: resource-metrics
    rules:
      # CPU Utilization
      - record: instance:node_cpu_utilization:ratio
        expr: |
          1 - avg by (instance) (
            rate(node_cpu_seconds_total{mode="idle"}[5m])
          )

      # Memory Utilization
      - record: instance:node_memory_utilization:ratio
        expr: |
          1 - (
            node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes
          )

      # Disk Utilization
      - record: instance:node_disk_utilization:ratio
        expr: |
          1 - (
            node_filesystem_avail_bytes{mountpoint="/"} /
            node_filesystem_size_bytes{mountpoint="/"}
          )
```
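
Utilization is only the "U" in USE; saturation and errors complete the method. Typical node-level companions are `node_load1` relative to CPU count for saturation, and `rate(node_network_receive_errs_total[5m])` (plus its transmit twin) for errors.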

### Logs (Loki)

```yaml
# Promtail configuration for log collection
apiVersion: v1
kind: ConfigMap
metadata:
  name: promtail-config
data:
  promtail.yaml: |
    server:
      http_listen_port: 9080

    positions:
      filename: /tmp/positions.yaml

    clients:
      - url: http://loki:3100/loki/api/v1/push

    scrape_configs:
      - job_name: kubernetes-pods
        kubernetes_sd_configs:
          - role: pod

        relabel_configs:
          # Keep only pods with logging enabled
          - source_labels: [__meta_kubernetes_pod_annotation_logging_enabled]
            action: keep
            regex: true

          # Add namespace label
          - source_labels: [__meta_kubernetes_namespace]
            target_label: namespace

          # Add pod name label
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod

          # Add container name label
          - source_labels: [__meta_kubernetes_pod_container_name]
            target_label: container

          # Add app label
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            target_label: app

        pipeline_stages:
          # Parse JSON logs
          - json:
              expressions:
                level: level
                message: msg
                time: time
                trace_id: trace_id
                span_id: span_id

          # Promote only low-cardinality fields to labels; trace_id stays in
          # the log body, where Grafana's derived fields can still extract it
          - labels:
              level:

          # Parse timestamp (reads the "time" field extracted above)
          - timestamp:
              source: time
              format: RFC3339Nano
```

### Structured Logging

```go
// Always use structured logging
import "go.uber.org/zap"

logger, _ := zap.NewProduction()
defer logger.Sync()

// Good - structured with context
logger.Info("request processed",
	zap.String("method", r.Method),
	zap.String("path", r.URL.Path),
	zap.Int("status", status),
	zap.Duration("duration", duration),
	zap.String("trace_id", traceID),
	zap.String("user_id", userID),
)

// Bad - unstructured
logger.Info(fmt.Sprintf("processed %s %s in %v", r.Method, r.URL.Path, duration))
```
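
To make those `trace_id`/`user_id` fields systematic rather than ad hoc, one option (a sketch, assuming the OpenTelemetry setup shown below) is to derive a request-scoped logger from the active span:

```go
import (
	"context"

	"go.opentelemetry.io/otel/trace"
	"go.uber.org/zap"
)

// loggerWithTrace returns a child logger stamped with the current trace and
// span IDs, if the context carries a valid span.
func loggerWithTrace(ctx context.Context, base *zap.Logger) *zap.Logger {
	sc := trace.SpanContextFromContext(ctx)
	if !sc.IsValid() {
		return base
	}
	return base.With(
		zap.String("trace_id", sc.TraceID().String()),
		zap.String("span_id", sc.SpanID().String()),
	)
}
```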

### Log Levels

```go
// Use appropriate log levels
logger.Debug("detailed debugging info") // Development/troubleshooting
logger.Info("normal operation events")  // Request processed, job completed
logger.Warn("recoverable issues")       // Retry succeeded, deprecated API used
logger.Error("errors requiring attention", // Request failed, connection lost
	zap.Error(err))
// Fatal/Panic - avoid in production; let orchestrator handle restarts
```

### Traces (OpenTelemetry)

```go
// OpenTelemetry setup
import (
	"context"
	"net/http"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/codes"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
	"go.opentelemetry.io/otel/propagation"
	"go.opentelemetry.io/otel/sdk/resource"
	"go.opentelemetry.io/otel/sdk/trace"
	semconv "go.opentelemetry.io/otel/semconv/v1.17.0" // match your SDK version
	oteltrace "go.opentelemetry.io/otel/trace"
)

var tracer = otel.Tracer("api-server")

func initTracer(ctx context.Context) (*trace.TracerProvider, error) {
	exporter, err := otlptracegrpc.New(ctx,
		otlptracegrpc.WithEndpoint("tempo:4317"),
		otlptracegrpc.WithInsecure(),
	)
	if err != nil {
		return nil, err
	}

	tp := trace.NewTracerProvider(
		trace.WithBatcher(exporter),
		trace.WithResource(resource.NewWithAttributes(
			semconv.SchemaURL,
			semconv.ServiceNameKey.String("api-server"),
			semconv.ServiceVersionKey.String(version),
			semconv.DeploymentEnvironmentKey.String(env),
		)),
		trace.WithSampler(trace.ParentBased(
			trace.TraceIDRatioBased(0.1), // 10% sampling
		)),
	)

	otel.SetTracerProvider(tp)
	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
		propagation.TraceContext{},
		propagation.Baggage{},
	))

	return tp, nil
}

// Create spans for operations
func handleRequest(ctx context.Context, req *Request) (*Response, error) {
	ctx, span := tracer.Start(ctx, "handleRequest",
		oteltrace.WithAttributes(
			attribute.String("request.id", req.ID),
			attribute.String("request.type", req.Type),
		),
	)
	defer span.End()

	// Database call with child span
	ctx, dbSpan := tracer.Start(ctx, "database.query")
	result, err := db.QueryContext(ctx, query)
	if err != nil {
		dbSpan.RecordError(err)
		dbSpan.SetStatus(codes.Error, err.Error())
	}
	dbSpan.End()

	// HTTP call to another service (httpReq avoids shadowing req above)
	httpReq, _ := http.NewRequestWithContext(ctx, "GET", url, nil)
	otel.GetTextMapPropagator().Inject(ctx, propagation.HeaderCarrier(httpReq.Header))

	return result, nil
}
```
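
The `Inject` call above covers the outgoing side; the receiving service has to run the matching `Extract`, or traces break at every hop. A minimal sketch of the server side (in practice, wrapping handlers with `otelhttp.NewHandler` from `go.opentelemetry.io/contrib` does this and starts the server span for you):

```go
import (
	"net/http"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/propagation"
)

func tracingMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Pull W3C traceparent/baggage headers into the request context so
		// spans started downstream join the caller's trace.
		ctx := otel.GetTextMapPropagator().Extract(r.Context(), propagation.HeaderCarrier(r.Header))
		next.ServeHTTP(w, r.WithContext(ctx))
	})
}
```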

## SLOs and SLIs

### SLO Definition (Sloth)

```yaml
apiVersion: sloth.slok.dev/v1
kind: PrometheusServiceLevel
metadata:
  name: api-server
  namespace: monitoring
spec:
  service: "api-server"
  labels:
    team: platform
    tier: "1"

  slos:
    # Availability SLO
    - name: "requests-availability"
      objective: 99.9 # 99.9% success rate
      description: "99.9% of requests should succeed"
      sli:
        events:
          errorQuery: |
            sum(rate(http_requests_total{
              job="api-server",
              status=~"5.."
            }[{{.window}}]))
          totalQuery: |
            sum(rate(http_requests_total{
              job="api-server"
            }[{{.window}}]))
      alerting:
        name: APIServerHighErrorRate
        labels:
          category: availability
        annotations:
          summary: "API Server error rate is too high"
          runbook: "https://runbooks.example.com/api-server/high-error-rate"
        pageAlert:
          labels:
            severity: critical
            notify: pagerduty
        ticketAlert:
          labels:
            severity: warning
            notify: slack

    # Latency SLO
    - name: "requests-latency"
      objective: 99.0 # 99% under 500ms
      description: "99% of requests should complete within 500ms"
      sli:
        events:
          # Bad events = slow requests: total minus those inside the 500ms bucket
          errorQuery: |
            sum(rate(http_request_duration_seconds_count{
              job="api-server"
            }[{{.window}}]))
            -
            sum(rate(http_request_duration_seconds_bucket{
              job="api-server",
              le="0.5"
            }[{{.window}}]))
          totalQuery: |
            sum(rate(http_request_duration_seconds_count{
              job="api-server"
            }[{{.window}}]))
      alerting:
        name: APIServerHighLatency
        labels:
          category: latency
        pageAlert:
          labels:
            severity: critical
        ticketAlert:
          labels:
            severity: warning
```

### Error Budget Dashboard

```yaml
# Grafana dashboard for error budget
panels:
  - title: "Error Budget Remaining"
    type: gauge
    targets:
      - expr: |
          1 - (
            sum(rate(http_requests_total{status=~"5.."}[30d]))
            /
            sum(rate(http_requests_total[30d]))
          ) / (1 - 0.999)
    thresholds:
      - value: 0
        color: red
      - value: 0.25
        color: orange
      - value: 0.5
        color: yellow
      - value: 0.75
        color: green

  - title: "Error Budget Burn Rate"
    type: graph
    targets:
      - expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[1h]))
            /
            sum(rate(http_requests_total[1h]))
          ) / (1 - 0.999)
        legendFormat: "1h burn rate"
      - expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[6h]))
            /
            sum(rate(http_requests_total[6h]))
          ) / (1 - 0.999)
        legendFormat: "6h burn rate"
```
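
Reading the burn-rate panels: dividing the observed error ratio by the error budget (1 - 0.999 = 0.001) normalizes the result so that a burn rate of 1.0 consumes the 30-day budget exactly on schedule. A sustained 1h burn rate of 14.4, for example, would exhaust the entire monthly budget in about two days (30d / 14.4 ≈ 2.1d), which is why fast-burn windows page while slow-burn windows only open tickets.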

## Alerting

### Alert Rules

```yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: api-server-alerts
spec:
  groups:
    - name: api-server.rules
      rules:
        # High error rate (immediate)
        - alert: APIServerHighErrorRate
          expr: |
            sum(rate(http_requests_total{job="api-server",status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total{job="api-server"}[5m]))
            > 0.01
          for: 5m
          labels:
            severity: critical
            team: platform
          annotations:
            summary: "API Server error rate > 1%"
            description: "Error rate is {{ $value | humanizePercentage }}"
            runbook_url: "https://runbooks.example.com/api-server/high-error-rate"
            dashboard_url: "https://grafana.example.com/d/api-server"

        # High latency
        - alert: APIServerHighLatency
          expr: |
            histogram_quantile(0.99,
              sum(rate(http_request_duration_seconds_bucket{job="api-server"}[5m])) by (le)
            ) > 1
          for: 10m
          labels:
            severity: warning
            team: platform
          annotations:
            summary: "API Server P99 latency > 1s"
            description: "P99 latency is {{ $value | humanizeDuration }}"

        # Pod crash looping
        - alert: APIServerPodCrashLooping
          expr: |
            rate(kube_pod_container_status_restarts_total{
              namespace="production",
              pod=~"api-server.*"
            }[15m]) > 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "API Server pod is crash looping"
            description: "Pod {{ $labels.pod }} is restarting ({{ $value | humanize }} restarts/s)"

        # High memory usage
        - alert: APIServerHighMemory
          expr: |
            container_memory_usage_bytes{
              namespace="production",
              pod=~"api-server.*"
            }
            /
            container_spec_memory_limit_bytes{
              namespace="production",
              pod=~"api-server.*"
            }
            > 0.9
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "API Server memory usage > 90%"
```

### Alert Routing (Alertmanager)

```yaml
apiVersion: v1
kind: Secret
metadata:
  name: alertmanager-config
stringData:
  alertmanager.yaml: |
    global:
      resolve_timeout: 5m
      slack_api_url: 'https://hooks.slack.com/services/xxx'
      pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'

    route:
      receiver: 'default'
      group_by: ['alertname', 'namespace', 'service']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 4h

      routes:
        # Critical alerts -> PagerDuty
        - match:
            severity: critical
          receiver: 'pagerduty-critical'
          continue: true

        # Warning alerts -> Slack
        - match:
            severity: warning
          receiver: 'slack-warnings'

        # Team-specific routing
        - match:
            team: platform
          receiver: 'platform-team'

    receivers:
      - name: 'default'
        slack_configs:
          - channel: '#alerts'

      - name: 'pagerduty-critical'
        pagerduty_configs:
          - service_key: '<pagerduty-service-key>'
            severity: critical
            description: '{{ .GroupLabels.alertname }}'
            details:
              firing: '{{ template "pagerduty.default.instances" .Alerts.Firing }}'

      - name: 'slack-warnings'
        slack_configs:
          - channel: '#alerts-warnings'
            send_resolved: true
            title: '{{ .GroupLabels.alertname }}'
            text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

      - name: 'platform-team'
        slack_configs:
          - channel: '#platform-alerts'
```
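
Note the `continue: true` on the critical route: without it, matching stops at the first route that fires, so a `severity: critical, team: platform` alert would page PagerDuty but never reach `#platform-alerts`. With it, the alert keeps walking the sibling routes and lands on the team receiver as well.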

## Dashboards

### Service Dashboard Template

```yaml
# Grafana dashboard as code
apiVersion: v1
kind: ConfigMap
metadata:
  name: api-server-dashboard
  labels:
    grafana_dashboard: "1"
data:
  api-server.json: |
    {
      "title": "API Server",
      "uid": "api-server",
      "tags": ["production", "api"],
      "templating": {
        "list": [
          {
            "name": "namespace",
            "type": "query",
            "query": "label_values(http_requests_total, namespace)"
          }
        ]
      },
      "panels": [
        {
          "title": "Request Rate",
          "type": "graph",
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
          "targets": [
            {
              "expr": "sum(rate(http_requests_total{namespace=\"$namespace\"}[5m])) by (status)",
              "legendFormat": "{{status}}"
            }
          ]
        },
        {
          "title": "Latency (P50, P95, P99)",
          "type": "graph",
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
          "targets": [
            {
              "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\"}[5m])) by (le))",
              "legendFormat": "P50"
            },
            {
              "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\"}[5m])) by (le))",
              "legendFormat": "P95"
            },
            {
              "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\"}[5m])) by (le))",
              "legendFormat": "P99"
            }
          ]
        },
        {
          "title": "Error Rate",
          "type": "stat",
          "gridPos": {"h": 4, "w": 6, "x": 0, "y": 8},
          "targets": [
            {
              "expr": "sum(rate(http_requests_total{namespace=\"$namespace\",status=~\"5..\"}[5m])) / sum(rate(http_requests_total{namespace=\"$namespace\"}[5m]))"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "percentunit",
              "thresholds": {
                "steps": [
                  {"value": 0, "color": "green"},
                  {"value": 0.01, "color": "yellow"},
                  {"value": 0.05, "color": "red"}
                ]
              }
            }
          }
        }
      ]
    }
```

## Correlation

### Linking Metrics, Logs, and Traces

```yaml
# Grafana data source configuration for correlation
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    url: http://prometheus:9090
    jsonData:
      exemplarTraceIdDestinations:
        - name: trace_id
          datasourceUid: tempo

  - name: Loki
    type: loki
    url: http://loki:3100
    jsonData:
      derivedFields:
        - name: TraceID
          matcherRegex: '"trace_id":"(\w+)"'
          url: '$${__value.raw}'
          datasourceUid: tempo

  - name: Tempo
    type: tempo
    url: http://tempo:3200
    jsonData:
      tracesToLogs:
        datasourceUid: loki
        tags: ['app', 'namespace']
        mappedTags: [{ key: 'service.name', value: 'app' }]
        mapTagNamesEnabled: true
        spanStartTimeShift: '-1h'
        spanEndTimeShift: '1h'
        filterByTraceID: true
        filterBySpanID: false
      tracesToMetrics:
        datasourceUid: prometheus
        tags: [{ key: 'service.name', value: 'job' }]
        queries:
          - name: 'Request rate'
            query: 'sum(rate(http_requests_total{$$__tags}[5m]))'
```
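
The `exemplarTraceIdDestinations` entry above only lights up if the application records exemplars alongside its histograms. A sketch of the recording side, reusing `requestDuration`, `path`, and `traceID` from earlier sections, and assuming metrics are served with OpenMetrics enabled (`promhttp.HandlerOpts{EnableOpenMetrics: true}`):

```go
// Attach the current trace ID as an exemplar so Grafana can jump from a
// latency spike straight to the trace behind it.
if obs, ok := requestDuration.WithLabelValues(r.Method, path).(prometheus.ExemplarObserver); ok {
	obs.ObserveWithExemplar(duration, prometheus.Labels{"trace_id": traceID})
} else {
	requestDuration.WithLabelValues(r.Method, path).Observe(duration)
}
```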

## Common Pitfalls

### 1. Alert Fatigue

```yaml
# Bad - too sensitive, will fire constantly
- alert: HighErrorRate
  expr: http_errors_total > 0
  for: 1m

# Good - meaningful thresholds with context
- alert: HighErrorRate
  expr: |
    sum(rate(http_requests_total{status=~"5.."}[5m]))
    /
    sum(rate(http_requests_total[5m]))
    > 0.01
  for: 5m
  labels:
    severity: warning
```

### 2. High Cardinality Metrics

```go
// Bad - unbounded cardinality
requestsTotal.WithLabelValues(userID, requestPath, queryString)

// Good - bounded, normalized labels
requestsTotal.WithLabelValues(normalizePath(requestPath), method, statusCode)
```

### 3. Missing Context in Logs

```go
// Bad - no context
log.Error("request failed")

// Good - full context
logger.Error("request failed",
	zap.String("trace_id", traceID),
	zap.String("user_id", userID),
	zap.String("path", path),
	zap.Error(err),
)
```

### 4. Sampling Without Thought

```go
// Bad - random sampling misses important traces
sampler := trace.TraceIDRatioBased(0.01) // 1%

// Good - sample based on importance
sampler := trace.ParentBased(
	trace.TraceIDRatioBased(0.1), // 10% base rate
	trace.WithLocalParentSampled(trace.AlwaysSample()), // Always sample if parent sampled
	trace.WithRemoteParentSampled(trace.AlwaysSample()),
)
// Plus: always sample errors, slow requests, specific user IDs
```
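
One caveat on that closing comment: a head sampler decides before the outcome of a request is known, so "always sample errors and slow requests" is normally implemented as tail-based sampling in the OpenTelemetry Collector (its `tail_sampling` processor can keep traces by status code and latency) rather than inside the SDK.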