agentic-team-templates 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +280 -0
- package/bin/cli.js +5 -0
- package/package.json +47 -0
- package/src/index.js +521 -0
- package/templates/_shared/code-quality.md +162 -0
- package/templates/_shared/communication.md +114 -0
- package/templates/_shared/core-principles.md +62 -0
- package/templates/_shared/git-workflow.md +165 -0
- package/templates/_shared/security-fundamentals.md +173 -0
- package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
- package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
- package/templates/blockchain/.cursorrules/overview.md +130 -0
- package/templates/blockchain/.cursorrules/security.md +318 -0
- package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
- package/templates/blockchain/.cursorrules/testing.md +415 -0
- package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
- package/templates/blockchain/CLAUDE.md +389 -0
- package/templates/cli-tools/.cursorrules/architecture.md +412 -0
- package/templates/cli-tools/.cursorrules/arguments.md +406 -0
- package/templates/cli-tools/.cursorrules/distribution.md +546 -0
- package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
- package/templates/cli-tools/.cursorrules/overview.md +136 -0
- package/templates/cli-tools/.cursorrules/testing.md +537 -0
- package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
- package/templates/cli-tools/CLAUDE.md +356 -0
- package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
- package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
- package/templates/data-engineering/.cursorrules/overview.md +85 -0
- package/templates/data-engineering/.cursorrules/performance.md +339 -0
- package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
- package/templates/data-engineering/.cursorrules/security.md +460 -0
- package/templates/data-engineering/.cursorrules/testing.md +452 -0
- package/templates/data-engineering/CLAUDE.md +974 -0
- package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
- package/templates/devops-sre/.cursorrules/change-management.md +584 -0
- package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
- package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
- package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
- package/templates/devops-sre/.cursorrules/observability.md +714 -0
- package/templates/devops-sre/.cursorrules/overview.md +230 -0
- package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
- package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
- package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
- package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
- package/templates/devops-sre/CLAUDE.md +1007 -0
- package/templates/documentation/.cursorrules/adr.md +277 -0
- package/templates/documentation/.cursorrules/api-documentation.md +411 -0
- package/templates/documentation/.cursorrules/code-comments.md +253 -0
- package/templates/documentation/.cursorrules/maintenance.md +260 -0
- package/templates/documentation/.cursorrules/overview.md +82 -0
- package/templates/documentation/.cursorrules/readme-standards.md +306 -0
- package/templates/documentation/CLAUDE.md +120 -0
- package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
- package/templates/fullstack/.cursorrules/architecture.md +298 -0
- package/templates/fullstack/.cursorrules/overview.md +109 -0
- package/templates/fullstack/.cursorrules/shared-types.md +348 -0
- package/templates/fullstack/.cursorrules/testing.md +386 -0
- package/templates/fullstack/CLAUDE.md +349 -0
- package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
- package/templates/ml-ai/.cursorrules/deployment.md +601 -0
- package/templates/ml-ai/.cursorrules/model-development.md +538 -0
- package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
- package/templates/ml-ai/.cursorrules/overview.md +131 -0
- package/templates/ml-ai/.cursorrules/security.md +637 -0
- package/templates/ml-ai/.cursorrules/testing.md +678 -0
- package/templates/ml-ai/CLAUDE.md +1136 -0
- package/templates/mobile/.cursorrules/navigation.md +246 -0
- package/templates/mobile/.cursorrules/offline-first.md +302 -0
- package/templates/mobile/.cursorrules/overview.md +71 -0
- package/templates/mobile/.cursorrules/performance.md +345 -0
- package/templates/mobile/.cursorrules/testing.md +339 -0
- package/templates/mobile/CLAUDE.md +233 -0
- package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
- package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
- package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
- package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
- package/templates/platform-engineering/.cursorrules/observability.md +747 -0
- package/templates/platform-engineering/.cursorrules/overview.md +215 -0
- package/templates/platform-engineering/.cursorrules/security.md +855 -0
- package/templates/platform-engineering/.cursorrules/testing.md +878 -0
- package/templates/platform-engineering/CLAUDE.md +850 -0
- package/templates/utility-agent/.cursorrules/action-control.md +284 -0
- package/templates/utility-agent/.cursorrules/context-management.md +186 -0
- package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
- package/templates/utility-agent/.cursorrules/overview.md +78 -0
- package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
- package/templates/utility-agent/CLAUDE.md +513 -0
- package/templates/web-backend/.cursorrules/api-design.md +255 -0
- package/templates/web-backend/.cursorrules/authentication.md +309 -0
- package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
- package/templates/web-backend/.cursorrules/error-handling.md +366 -0
- package/templates/web-backend/.cursorrules/overview.md +69 -0
- package/templates/web-backend/.cursorrules/security.md +358 -0
- package/templates/web-backend/.cursorrules/testing.md +395 -0
- package/templates/web-backend/CLAUDE.md +366 -0
- package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
- package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
- package/templates/web-frontend/.cursorrules/overview.md +72 -0
- package/templates/web-frontend/.cursorrules/performance.md +325 -0
- package/templates/web-frontend/.cursorrules/state-management.md +227 -0
- package/templates/web-frontend/.cursorrules/styling.md +271 -0
- package/templates/web-frontend/.cursorrules/testing.md +311 -0
- package/templates/web-frontend/CLAUDE.md +399 -0
|
@@ -0,0 +1,617 @@
|
|
|
1
|
+
# SLOs, SLIs, and Error Budgets
|
|
2
|
+
|
|
3
|
+
Comprehensive guidelines for defining, measuring, and using Service Level Objectives.
|
|
4
|
+
|
|
5
|
+
## Core Concepts
|
|
6
|
+
|
|
7
|
+
### Definitions
|
|
8
|
+
|
|
9
|
+
```yaml
|
|
10
|
+
sli:
|
|
11
|
+
name: "Service Level Indicator"
|
|
12
|
+
definition: "A quantitative measure of some aspect of the service"
|
|
13
|
+
characteristics:
|
|
14
|
+
- "Measurable and objective"
|
|
15
|
+
- "Expressed as a ratio or percentage"
|
|
16
|
+
- "Reflects user experience"
|
|
17
|
+
example: "Proportion of requests that complete successfully"
|
|
18
|
+
|
|
19
|
+
slo:
|
|
20
|
+
name: "Service Level Objective"
|
|
21
|
+
definition: "A target value or range for an SLI"
|
|
22
|
+
characteristics:
|
|
23
|
+
- "Sets expectations for reliability"
|
|
24
|
+
- "Defines acceptable performance"
|
|
25
|
+
- "Basis for error budgets"
|
|
26
|
+
example: "99.9% of requests succeed over a 30-day window"
|
|
27
|
+
|
|
28
|
+
sla:
|
|
29
|
+
name: "Service Level Agreement"
|
|
30
|
+
definition: "A contract with consequences for missing the SLO"
|
|
31
|
+
characteristics:
|
|
32
|
+
- "External commitment to customers"
|
|
33
|
+
- "Usually less strict than internal SLO"
|
|
34
|
+
- "Has financial/legal implications"
|
|
35
|
+
example: "99.5% availability, credits if breached"
|
|
36
|
+
|
|
37
|
+
error_budget:
|
|
38
|
+
name: "Error Budget"
|
|
39
|
+
definition: "The allowed amount of unreliability"
|
|
40
|
+
characteristics:
|
|
41
|
+
- "Calculated from SLO (1 - SLO)"
|
|
42
|
+
- "Consumed by incidents and errors"
|
|
43
|
+
- "Balances reliability with velocity"
|
|
44
|
+
example: "0.1% = 43.2 minutes of downtime per month"
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### The SLO Stack
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
51
|
+
│ SLA (External) │
|
|
52
|
+
│ Contractual commitment: 99.5% availability │
|
|
53
|
+
│ Consequence: Service credits if breached │
|
|
54
|
+
├─────────────────────────────────────────────────────────────┤
|
|
55
|
+
│ SLO (Internal) │
|
|
56
|
+
│ Target: 99.9% availability │
|
|
57
|
+
│ Buffer above SLA for safety margin │
|
|
58
|
+
├─────────────────────────────────────────────────────────────┤
|
|
59
|
+
│ SLI (Measurement) │
|
|
60
|
+
│ Metric: successful_requests / total_requests │
|
|
61
|
+
│ Measured at the load balancer │
|
|
62
|
+
└─────────────────────────────────────────────────────────────┘
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Choosing Good SLIs
|
|
66
|
+
|
|
67
|
+
### SLI Categories
|
|
68
|
+
|
|
69
|
+
```yaml
|
|
70
|
+
availability:
|
|
71
|
+
question: "Is the service up and responding?"
|
|
72
|
+
measurement: "Successful requests / Total requests"
|
|
73
|
+
good_for:
|
|
74
|
+
- "APIs and web services"
|
|
75
|
+
- "User-facing applications"
|
|
76
|
+
- "Critical dependencies"
|
|
77
|
+
examples:
|
|
78
|
+
- "HTTP 2xx/3xx responses"
|
|
79
|
+
- "Non-timeout responses"
|
|
80
|
+
- "Valid responses (not error pages)"
|
|
81
|
+
|
|
82
|
+
latency:
|
|
83
|
+
question: "How fast is the service responding?"
|
|
84
|
+
measurement: "Requests faster than threshold / Total requests"
|
|
85
|
+
good_for:
|
|
86
|
+
- "Real-time applications"
|
|
87
|
+
- "Interactive user experiences"
|
|
88
|
+
- "API response times"
|
|
89
|
+
examples:
|
|
90
|
+
- "Requests < 200ms at p50"
|
|
91
|
+
- "Requests < 1s at p99"
|
|
92
|
+
- "Time to first byte < 500ms"
|
|
93
|
+
|
|
94
|
+
throughput:
|
|
95
|
+
question: "How much work is being done?"
|
|
96
|
+
measurement: "Work units completed per time period"
|
|
97
|
+
good_for:
|
|
98
|
+
- "Batch processing systems"
|
|
99
|
+
- "Data pipelines"
|
|
100
|
+
- "Message queues"
|
|
101
|
+
examples:
|
|
102
|
+
- "Messages processed per second"
|
|
103
|
+
- "Records transformed per hour"
|
|
104
|
+
- "Jobs completed per day"
|
|
105
|
+
|
|
106
|
+
correctness:
|
|
107
|
+
question: "Is the service returning correct results?"
|
|
108
|
+
measurement: "Correct responses / Total responses"
|
|
109
|
+
good_for:
|
|
110
|
+
- "Data processing"
|
|
111
|
+
- "Financial calculations"
|
|
112
|
+
- "Search results"
|
|
113
|
+
examples:
|
|
114
|
+
- "Search results with relevant items"
|
|
115
|
+
- "Transactions with correct amounts"
|
|
116
|
+
- "Data exports with valid format"
|
|
117
|
+
|
|
118
|
+
freshness:
|
|
119
|
+
question: "How current is the data?"
|
|
120
|
+
measurement: "Data age at query time"
|
|
121
|
+
good_for:
|
|
122
|
+
- "Real-time dashboards"
|
|
123
|
+
- "Cache-dependent systems"
|
|
124
|
+
- "Event-driven systems"
|
|
125
|
+
examples:
|
|
126
|
+
- "Data < 5 minutes old"
|
|
127
|
+
- "Index updated within 1 hour"
|
|
128
|
+
- "Cache miss rate < 10%"
|
|
129
|
+
|
|
130
|
+
durability:
|
|
131
|
+
question: "Is the data safe?"
|
|
132
|
+
measurement: "Data successfully persisted / Data submitted"
|
|
133
|
+
good_for:
|
|
134
|
+
- "Storage systems"
|
|
135
|
+
- "Databases"
|
|
136
|
+
- "Backup systems"
|
|
137
|
+
examples:
|
|
138
|
+
- "Writes acknowledged and durable"
|
|
139
|
+
- "Backup success rate"
|
|
140
|
+
- "Replication lag < threshold"
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### SLI Specification
|
|
144
|
+
|
|
145
|
+
```yaml
|
|
146
|
+
# Good SLI specification template
|
|
147
|
+
sli_specification:
|
|
148
|
+
name: "api_availability"
|
|
149
|
+
description: "Proportion of API requests that succeed"
|
|
150
|
+
|
|
151
|
+
measurement:
|
|
152
|
+
what: "HTTP requests returning a non-5xx status (2xx, 3xx, or 4xx)"
|
|
153
|
+
where: "Measured at the load balancer"
|
|
154
|
+
excludes:
|
|
155
|
+
- "Health check endpoints (/health, /ready)"
|
|
156
|
+
- "Internal endpoints (/metrics, /debug)"
|
|
157
|
+
- "Requests from automated scanners"
|
|
158
|
+
|
|
159
|
+
calculation: |
|
|
160
|
+
sum(http_requests_total{status!~"5..", endpoint!~"/health|/ready|/metrics|/debug"})
|
|
161
|
+
/
|
|
162
|
+
sum(http_requests_total{endpoint!~"/health|/ready|/metrics|/debug"})
|
|
163
|
+
|
|
164
|
+
notes:
|
|
165
|
+
- "4xx errors are counted as success (client errors)"
|
|
166
|
+
- "5xx errors are counted as failures"
|
|
167
|
+
- "Timeouts (no response) counted as failures"
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## Setting SLO Targets
|
|
171
|
+
|
|
172
|
+
### How to Choose Targets
|
|
173
|
+
|
|
174
|
+
```yaml
|
|
175
|
+
target_setting_approach:
|
|
176
|
+
step_1_baseline:
|
|
177
|
+
action: "Measure current performance"
|
|
178
|
+
duration: "2-4 weeks of data"
|
|
179
|
+
questions:
|
|
180
|
+
- "What is current availability?"
|
|
181
|
+
- "What is current latency distribution?"
|
|
182
|
+
- "What do users experience today?"
|
|
183
|
+
|
|
184
|
+
step_2_user_expectations:
|
|
185
|
+
action: "Understand user needs"
|
|
186
|
+
questions:
|
|
187
|
+
- "What reliability do users expect?"
|
|
188
|
+
- "What do competitors offer?"
|
|
189
|
+
- "What does business require?"
|
|
190
|
+
|
|
191
|
+
step_3_cost_benefit:
|
|
192
|
+
action: "Evaluate reliability vs cost"
|
|
193
|
+
considerations:
|
|
194
|
+
- "Each 9 costs exponentially more"
|
|
195
|
+
- "99.99% requires very different architecture than 99.9%"
|
|
196
|
+
- "Error budget must be usable"
|
|
197
|
+
|
|
198
|
+
step_4_iterate:
|
|
199
|
+
action: "Start conservative, adjust based on data"
|
|
200
|
+
guidance:
|
|
201
|
+
- "Better to exceed SLO than miss it"
|
|
202
|
+
- "Tighten after consistent achievement"
|
|
203
|
+
- "Loosen if budget is routinely exhausted while users remain satisfied"
|
|
204
|
+
|
|
205
|
+
reliability_cost_reality:
|
|
206
|
+
ninety_percent:
|
|
207
|
+
downtime_per_month: "72 hours"
|
|
208
|
+
effort: "Minimal - basic monitoring"
|
|
209
|
+
|
|
210
|
+
ninety_nine_percent:
|
|
211
|
+
downtime_per_month: "7.2 hours"
|
|
212
|
+
effort: "Moderate - good practices"
|
|
213
|
+
|
|
214
|
+
three_nines:
|
|
215
|
+
downtime_per_month: "43 minutes"
|
|
216
|
+
effort: "Significant - redundancy required"
|
|
217
|
+
|
|
218
|
+
four_nines:
|
|
219
|
+
downtime_per_month: "4.3 minutes"
|
|
220
|
+
effort: "Major - multi-region, auto-failover"
|
|
221
|
+
|
|
222
|
+
five_nines:
|
|
223
|
+
downtime_per_month: "26 seconds"
|
|
224
|
+
effort: "Extreme - specialized expertise required"
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### SLO Window Selection
|
|
228
|
+
|
|
229
|
+
```yaml
|
|
230
|
+
window_types:
|
|
231
|
+
rolling:
|
|
232
|
+
description: "Last N days from now"
|
|
233
|
+
example: "99.9% over rolling 30 days"
|
|
234
|
+
pros:
|
|
235
|
+
- "Always current"
|
|
236
|
+
- "Continuous feedback"
|
|
237
|
+
- "No reset-day gaming"
|
|
238
|
+
cons:
|
|
239
|
+
- "Bad day stays in window for full period"
|
|
240
|
+
|
|
241
|
+
calendar:
|
|
242
|
+
description: "Current calendar period"
|
|
243
|
+
example: "99.9% this month"
|
|
244
|
+
pros:
|
|
245
|
+
- "Aligns with business cycles"
|
|
246
|
+
- "Clean reset for planning"
|
|
247
|
+
cons:
|
|
248
|
+
- "End-of-period gaming possible"
|
|
249
|
+
- "Sudden reset loses context"
|
|
250
|
+
|
|
251
|
+
recommended_windows:
|
|
252
|
+
short_term: "7 days - for operational alerting"
|
|
253
|
+
standard: "30 days - for SLO tracking"
|
|
254
|
+
long_term: "90 days - for trend analysis"
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
## Error Budgets
|
|
258
|
+
|
|
259
|
+
### Calculating Error Budgets
|
|
260
|
+
|
|
261
|
+
```python
|
|
262
|
+
# Error budget calculation examples
|
|
263
|
+
|
|
264
|
+
def calculate_error_budget(
    slo_target: float,
    window_days: int,
    daily_requests: int = 1_000_000,
) -> dict:
    """Calculate the error budget implied by an SLO, in several units.

    Args:
        slo_target: SLO expressed as a fraction in [0, 1], e.g. 0.999 for
            a 99.9% objective (NOT 99.9).
        window_days: Length of the SLO window in days; must be positive.
        daily_requests: Request volume per day used to express the budget
            as a request count. Defaults to the 1M/day example figure.

    Returns:
        A dict with three keys:
          "percentage": the budget as a fraction (1 - slo_target); the key
              name is kept for compatibility even though the value is not
              multiplied by 100.
          "minutes": minutes of full downtime the budget allows in the window.
          "requests": number of failed requests the budget allows in the window.

    Raises:
        ValueError: if slo_target is outside [0, 1], window_days is not
            positive, or daily_requests is negative.
    """
    # Guard against the common mistake of passing a percent (99.9) instead of
    # a fraction (0.999), which would otherwise yield a negative budget.
    if not 0 <= slo_target <= 1:
        raise ValueError(
            f"slo_target must be a fraction between 0 and 1, got {slo_target!r}"
        )
    if window_days <= 0:
        raise ValueError(f"window_days must be positive, got {window_days!r}")
    if daily_requests < 0:
        raise ValueError(
            f"daily_requests must be non-negative, got {daily_requests!r}"
        )

    # Error budget as a fraction of the window (0.001 for a 99.9% SLO)
    error_budget_percent = 1 - slo_target

    # Convert to time
    window_minutes = window_days * 24 * 60
    error_budget_minutes = window_minutes * error_budget_percent

    # Convert to requests
    total_requests = daily_requests * window_days
    error_budget_requests = total_requests * error_budget_percent

    return {
        "percentage": error_budget_percent,
        "minutes": error_budget_minutes,
        "requests": error_budget_requests,
    }
|
|
284
|
+
|
|
285
|
+
# Examples:
|
|
286
|
+
# 99.9% SLO over 30 days
|
|
287
|
+
# = 0.1% error budget
|
|
288
|
+
# = 43.2 minutes of downtime
|
|
289
|
+
# = 30,000 failed requests (at 1M/day)
|
|
290
|
+
|
|
291
|
+
# 99% SLO over 30 days
|
|
292
|
+
# = 1% error budget
|
|
293
|
+
# = 432 minutes (7.2 hours) of downtime
|
|
294
|
+
# = 300,000 failed requests
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
### Error Budget Policy
|
|
298
|
+
|
|
299
|
+
```yaml
|
|
300
|
+
error_budget_policy:
|
|
301
|
+
purpose: "Define actions based on error budget consumption"
|
|
302
|
+
|
|
303
|
+
thresholds:
|
|
304
|
+
healthy:
|
|
305
|
+
budget_remaining: ">50%"
|
|
306
|
+
development_velocity: "Full speed"
|
|
307
|
+
deployment_frequency: "Normal (daily+)"
|
|
308
|
+
risk_tolerance: "High - experimentation encouraged"
|
|
309
|
+
actions:
|
|
310
|
+
- "Ship new features"
|
|
311
|
+
- "Run experiments"
|
|
312
|
+
- "Accept reasonable risk"
|
|
313
|
+
- "Focus on velocity"
|
|
314
|
+
|
|
315
|
+
caution:
|
|
316
|
+
budget_remaining: "25-50%"
|
|
317
|
+
development_velocity: "Moderate"
|
|
318
|
+
deployment_frequency: "Normal with extra scrutiny"
|
|
319
|
+
risk_tolerance: "Medium"
|
|
320
|
+
actions:
|
|
321
|
+
- "Review recent reliability trends"
|
|
322
|
+
- "Prioritize reliability improvements"
|
|
323
|
+
- "Extra review for risky changes"
|
|
324
|
+
- "Consider postponing high-risk features"
|
|
325
|
+
|
|
326
|
+
critical:
|
|
327
|
+
budget_remaining: "10-25%"
|
|
328
|
+
development_velocity: "Reduced"
|
|
329
|
+
deployment_frequency: "Limited to fixes and critical features"
|
|
330
|
+
risk_tolerance: "Low"
|
|
331
|
+
actions:
|
|
332
|
+
- "Feature freeze for non-critical work"
|
|
333
|
+
- "Reliability improvements prioritized"
|
|
334
|
+
- "All changes require reliability review"
|
|
335
|
+
- "Mandatory rollback plans"
|
|
336
|
+
- "Increased monitoring during deploys"
|
|
337
|
+
|
|
338
|
+
exhausted:
|
|
339
|
+
budget_remaining: "<10%"
|
|
340
|
+
development_velocity: "Minimal"
|
|
341
|
+
deployment_frequency: "Emergency fixes only"
|
|
342
|
+
risk_tolerance: "None"
|
|
343
|
+
actions:
|
|
344
|
+
- "Full feature freeze"
|
|
345
|
+
- "All engineering on reliability"
|
|
346
|
+
- "Post-incident review before any deploy"
|
|
347
|
+
- "Consider service degradation for stability"
|
|
348
|
+
- "Executive escalation"
|
|
349
|
+
|
|
350
|
+
budget_replenishment:
|
|
351
|
+
note: "Budget replenishes as time passes in rolling window"
|
|
352
|
+
example: "A 30-day rolling window means errors age out of the budget 30 days after they occur"
|
|
353
|
+
```
|
|
354
|
+
|
|
355
|
+
### Burn Rate Alerting
|
|
356
|
+
|
|
357
|
+
```yaml
|
|
358
|
+
burn_rate_concept:
|
|
359
|
+
definition: "Rate at which error budget is being consumed"
|
|
360
|
+
formula: "actual_error_rate / tolerated_error_rate"
|
|
361
|
+
|
|
362
|
+
example:
|
|
363
|
+
slo: "99.9% over 30 days"
|
|
364
|
+
tolerated_error_rate: "0.1%"
|
|
365
|
+
actual_error_rate: "0.3%"
|
|
366
|
+
burn_rate: "3x (budget consumed 3x faster than allowed)"
|
|
367
|
+
|
|
368
|
+
multi_window_burn_rate:
|
|
369
|
+
purpose: "Balance between fast detection and noise reduction"
|
|
370
|
+
|
|
371
|
+
windows:
|
|
372
|
+
fast_burn:
|
|
373
|
+
short_window: "5m"
|
|
374
|
+
long_window: "1h"
|
|
375
|
+
burn_rate: "14.4x"
|
|
376
|
+
budget_consumed: "2% in 1 hour"
|
|
377
|
+
severity: "Page immediately"
|
|
378
|
+
|
|
379
|
+
medium_burn:
|
|
380
|
+
short_window: "30m"
|
|
381
|
+
long_window: "6h"
|
|
382
|
+
burn_rate: "6x"
|
|
383
|
+
budget_consumed: "5% in 6 hours"
|
|
384
|
+
severity: "Page during business hours"
|
|
385
|
+
|
|
386
|
+
slow_burn:
|
|
387
|
+
short_window: "2h"
|
|
388
|
+
long_window: "24h"
|
|
389
|
+
burn_rate: "3x"
|
|
390
|
+
budget_consumed: "10% in 24 hours"
|
|
391
|
+
severity: "Ticket"
|
|
392
|
+
|
|
393
|
+
very_slow_burn:
|
|
394
|
+
short_window: "6h"
|
|
395
|
+
long_window: "3d"
|
|
396
|
+
burn_rate: "1x"
|
|
397
|
+
budget_consumed: "Budget on track to exhaust"
|
|
398
|
+
severity: "Review in standup"
|
|
399
|
+
|
|
400
|
+
prometheus_burn_rate_rules: |
|
|
401
|
+
# Fast burn - 2% of 30-day budget in 1 hour
|
|
402
|
+
groups:
|
|
403
|
+
- name: slo-burn-rate
|
|
404
|
+
rules:
|
|
405
|
+
- alert: SLOFastBurn
|
|
406
|
+
expr: |
|
|
407
|
+
(
|
|
408
|
+
sum(rate(http_requests_total{status=~"5.."}[5m]))
|
|
409
|
+
/ sum(rate(http_requests_total[5m]))
|
|
410
|
+
> 14.4 * 0.001
|
|
411
|
+
)
|
|
412
|
+
and
|
|
413
|
+
(
|
|
414
|
+
sum(rate(http_requests_total{status=~"5.."}[1h]))
|
|
415
|
+
/ sum(rate(http_requests_total[1h]))
|
|
416
|
+
> 14.4 * 0.001
|
|
417
|
+
)
|
|
418
|
+
labels:
|
|
419
|
+
severity: critical
|
|
420
|
+
annotations:
|
|
421
|
+
summary: "SLO burn rate critical - paging"
|
|
422
|
+
|
|
423
|
+
- alert: SLOMediumBurn
|
|
424
|
+
expr: |
|
|
425
|
+
(
|
|
426
|
+
sum(rate(http_requests_total{status=~"5.."}[30m]))
|
|
427
|
+
/ sum(rate(http_requests_total[30m]))
|
|
428
|
+
> 6 * 0.001
|
|
429
|
+
)
|
|
430
|
+
and
|
|
431
|
+
(
|
|
432
|
+
sum(rate(http_requests_total{status=~"5.."}[6h]))
|
|
433
|
+
/ sum(rate(http_requests_total[6h]))
|
|
434
|
+
> 6 * 0.001
|
|
435
|
+
)
|
|
436
|
+
labels:
|
|
437
|
+
severity: warning
|
|
438
|
+
annotations:
|
|
439
|
+
summary: "SLO burn rate elevated"
|
|
440
|
+
```
|
|
441
|
+
|
|
442
|
+
## SLO Examples by Service Type
|
|
443
|
+
|
|
444
|
+
### API Service SLOs
|
|
445
|
+
|
|
446
|
+
```yaml
|
|
447
|
+
api_service_slos:
|
|
448
|
+
availability:
|
|
449
|
+
sli: "Proportion of non-5xx responses"
|
|
450
|
+
target: "99.9%"
|
|
451
|
+
window: "30 days rolling"
|
|
452
|
+
measurement: |
|
|
453
|
+
sum(rate(http_requests_total{status!~"5.."}[5m]))
|
|
454
|
+
/ sum(rate(http_requests_total[5m]))
|
|
455
|
+
|
|
456
|
+
latency_p50:
|
|
457
|
+
sli: "Median response time under threshold"
|
|
458
|
+
target: "99%"
|
|
459
|
+
threshold: "100ms"
|
|
460
|
+
window: "30 days rolling"
|
|
461
|
+
|
|
462
|
+
latency_p99:
|
|
463
|
+
sli: "99th percentile response time under threshold"
|
|
464
|
+
target: "99%"
|
|
465
|
+
threshold: "500ms"
|
|
466
|
+
window: "30 days rolling"
|
|
467
|
+
```
|
|
468
|
+
|
|
469
|
+
### Database SLOs
|
|
470
|
+
|
|
471
|
+
```yaml
|
|
472
|
+
database_slos:
|
|
473
|
+
availability:
|
|
474
|
+
sli: "Proportion of successful queries"
|
|
475
|
+
target: "99.99%"
|
|
476
|
+
window: "30 days rolling"
|
|
477
|
+
|
|
478
|
+
read_latency:
|
|
479
|
+
sli: "Read queries under latency threshold"
|
|
480
|
+
target: "99%"
|
|
481
|
+
threshold: "50ms at p99"
|
|
482
|
+
|
|
483
|
+
write_latency:
|
|
484
|
+
sli: "Write queries under latency threshold"
|
|
485
|
+
target: "99%"
|
|
486
|
+
threshold: "100ms at p99"
|
|
487
|
+
|
|
488
|
+
replication_lag:
|
|
489
|
+
sli: "Replica within lag threshold"
|
|
490
|
+
target: "99.9%"
|
|
491
|
+
threshold: "1 second"
|
|
492
|
+
```
|
|
493
|
+
|
|
494
|
+
### Batch Processing SLOs
|
|
495
|
+
|
|
496
|
+
```yaml
|
|
497
|
+
batch_processing_slos:
|
|
498
|
+
completeness:
|
|
499
|
+
sli: "Jobs completing successfully"
|
|
500
|
+
target: "99%"
|
|
501
|
+
window: "7 days rolling"
|
|
502
|
+
|
|
503
|
+
timeliness:
|
|
504
|
+
sli: "Jobs completing within SLA"
|
|
505
|
+
target: "95%"
|
|
506
|
+
threshold: "Job completes within 2x expected duration"
|
|
507
|
+
|
|
508
|
+
freshness:
|
|
509
|
+
sli: "Data processed within freshness requirement"
|
|
510
|
+
target: "99%"
|
|
511
|
+
threshold: "Data no more than 1 hour old"
|
|
512
|
+
```
|
|
513
|
+
|
|
514
|
+
## SLO Dashboard Design
|
|
515
|
+
|
|
516
|
+
### Essential Dashboard Elements
|
|
517
|
+
|
|
518
|
+
```yaml
|
|
519
|
+
slo_dashboard_sections:
|
|
520
|
+
summary:
|
|
521
|
+
- "Current SLO status (meeting/at risk/breaching)"
|
|
522
|
+
- "Error budget remaining (percentage and time)"
|
|
523
|
+
- "Burn rate trend"
|
|
524
|
+
|
|
525
|
+
detailed_metrics:
|
|
526
|
+
- "SLI value over time"
|
|
527
|
+
- "Error budget consumption over time"
|
|
528
|
+
- "Burn rate over time"
|
|
529
|
+
|
|
530
|
+
context:
|
|
531
|
+
- "Recent incidents affecting SLO"
|
|
532
|
+
- "Recent deployments"
|
|
533
|
+
- "Traffic patterns"
|
|
534
|
+
|
|
535
|
+
historical:
|
|
536
|
+
- "SLO achievement by month"
|
|
537
|
+
- "Error budget consumption by cause"
|
|
538
|
+
- "Trend analysis"
|
|
539
|
+
|
|
540
|
+
grafana_panel_examples:
|
|
541
|
+
current_slo_status: |
|
|
542
|
+
# Stat panel showing current SLI
|
|
543
|
+
100 * (
|
|
544
|
+
sum(rate(http_requests_total{status!~"5.."}[30d]))
|
|
545
|
+
/ sum(rate(http_requests_total[30d]))
|
|
546
|
+
)
|
|
547
|
+
|
|
548
|
+
error_budget_remaining: |
|
|
549
|
+
# Gauge showing budget remaining
|
|
550
|
+
1 - (
|
|
551
|
+
sum(increase(http_requests_total{status=~"5.."}[30d]))
|
|
552
|
+
/ (sum(increase(http_requests_total[30d])) * 0.001)
|
|
553
|
+
)
|
|
554
|
+
|
|
555
|
+
burn_rate: |
|
|
556
|
+
# Graph showing burn rate over time
|
|
557
|
+
(
|
|
558
|
+
sum(rate(http_requests_total{status=~"5.."}[1h]))
|
|
559
|
+
/ sum(rate(http_requests_total[1h]))
|
|
560
|
+
) / 0.001
|
|
561
|
+
```
|
|
562
|
+
|
|
563
|
+
## Common Pitfalls
|
|
564
|
+
|
|
565
|
+
### SLI Pitfalls
|
|
566
|
+
|
|
567
|
+
```yaml
|
|
568
|
+
pitfall_measuring_wrong_thing:
|
|
569
|
+
wrong: "Server thinks request succeeded"
|
|
570
|
+
right: "User actually got the response they needed"
|
|
571
|
+
example: "Measure at the edge, not just the server"
|
|
572
|
+
|
|
573
|
+
pitfall_excluding_too_much:
|
|
574
|
+
wrong: "Exclude retries, certain error codes, specific endpoints"
|
|
575
|
+
right: "Measure what users experience"
|
|
576
|
+
note: "If users see it, it counts"
|
|
577
|
+
|
|
578
|
+
pitfall_internal_metrics:
|
|
579
|
+
wrong: "CPU usage, memory usage, queue depth"
|
|
580
|
+
right: "Request success rate, latency, user-visible outcomes"
|
|
581
|
+
note: "Internal metrics are useful but not SLIs"
|
|
582
|
+
```
|
|
583
|
+
|
|
584
|
+
### SLO Pitfalls
|
|
585
|
+
|
|
586
|
+
```yaml
|
|
587
|
+
pitfall_100_percent:
|
|
588
|
+
wrong: "Our SLO is 100% availability"
|
|
589
|
+
right: "Our SLO is 99.9% availability with clear error budget"
|
|
590
|
+
reason: "100% is unachievable and prevents any change"
|
|
591
|
+
|
|
592
|
+
pitfall_too_many_slos:
|
|
593
|
+
wrong: "50 SLOs covering every metric"
|
|
594
|
+
right: "3-5 SLOs covering critical user journeys"
|
|
595
|
+
reason: "Too many SLOs means none are meaningful"
|
|
596
|
+
|
|
597
|
+
pitfall_slo_without_teeth:
|
|
598
|
+
wrong: "We have SLOs but nothing happens when we miss them"
|
|
599
|
+
right: "Error budget policy drives real decisions"
|
|
600
|
+
reason: "SLOs without consequences are just reports"
|
|
601
|
+
```
|
|
602
|
+
|
|
603
|
+
### Error Budget Pitfalls
|
|
604
|
+
|
|
605
|
+
```yaml
|
|
606
|
+
pitfall_budget_as_target:
|
|
607
|
+
wrong: "We must use all our error budget"
|
|
608
|
+
right: "Error budget is a limit, not a goal"
|
|
609
|
+
|
|
610
|
+
pitfall_hoarding_budget:
|
|
611
|
+
wrong: "Never deploy because we might use error budget"
|
|
612
|
+
right: "Use budget to enable velocity and experimentation"
|
|
613
|
+
|
|
614
|
+
pitfall_ignoring_budget:
|
|
615
|
+
wrong: "Budget is exhausted but we keep shipping"
|
|
616
|
+
right: "Follow error budget policy strictly"
|
|
617
|
+
```
|