specweave 0.3.13 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. package/CLAUDE.md +17 -1
  2. package/README.md +1 -1
  3. package/bin/install-all.sh +9 -2
  4. package/bin/install-hooks.sh +57 -0
  5. package/dist/cli/commands/init.d.ts.map +1 -1
  6. package/dist/cli/commands/init.js +55 -0
  7. package/dist/cli/commands/init.js.map +1 -1
  8. package/dist/core/agent-model-manager.d.ts +52 -0
  9. package/dist/core/agent-model-manager.d.ts.map +1 -0
  10. package/dist/core/agent-model-manager.js +120 -0
  11. package/dist/core/agent-model-manager.js.map +1 -0
  12. package/dist/core/cost-tracker.d.ts +108 -0
  13. package/dist/core/cost-tracker.d.ts.map +1 -0
  14. package/dist/core/cost-tracker.js +281 -0
  15. package/dist/core/cost-tracker.js.map +1 -0
  16. package/dist/core/model-selector.d.ts +57 -0
  17. package/dist/core/model-selector.d.ts.map +1 -0
  18. package/dist/core/model-selector.js +115 -0
  19. package/dist/core/model-selector.js.map +1 -0
  20. package/dist/core/phase-detector.d.ts +62 -0
  21. package/dist/core/phase-detector.d.ts.map +1 -0
  22. package/dist/core/phase-detector.js +229 -0
  23. package/dist/core/phase-detector.js.map +1 -0
  24. package/dist/types/cost-tracking.d.ts +43 -0
  25. package/dist/types/cost-tracking.d.ts.map +1 -0
  26. package/dist/types/cost-tracking.js +8 -0
  27. package/dist/types/cost-tracking.js.map +1 -0
  28. package/dist/types/model-selection.d.ts +53 -0
  29. package/dist/types/model-selection.d.ts.map +1 -0
  30. package/dist/types/model-selection.js +12 -0
  31. package/dist/types/model-selection.js.map +1 -0
  32. package/dist/utils/cost-reporter.d.ts +58 -0
  33. package/dist/utils/cost-reporter.d.ts.map +1 -0
  34. package/dist/utils/cost-reporter.js +224 -0
  35. package/dist/utils/cost-reporter.js.map +1 -0
  36. package/dist/utils/pricing-constants.d.ts +70 -0
  37. package/dist/utils/pricing-constants.d.ts.map +1 -0
  38. package/dist/utils/pricing-constants.js +71 -0
  39. package/dist/utils/pricing-constants.js.map +1 -0
  40. package/package.json +1 -1
  41. package/src/agents/architect/AGENT.md +3 -0
  42. package/src/agents/code-reviewer.md +156 -0
  43. package/src/agents/data-scientist/AGENT.md +181 -0
  44. package/src/agents/database-optimizer/AGENT.md +147 -0
  45. package/src/agents/devops/AGENT.md +3 -0
  46. package/src/agents/diagrams-architect/AGENT.md +3 -0
  47. package/src/agents/docs-writer/AGENT.md +3 -0
  48. package/src/agents/kubernetes-architect/AGENT.md +142 -0
  49. package/src/agents/ml-engineer/AGENT.md +150 -0
  50. package/src/agents/mlops-engineer/AGENT.md +201 -0
  51. package/src/agents/network-engineer/AGENT.md +149 -0
  52. package/src/agents/observability-engineer/AGENT.md +213 -0
  53. package/src/agents/payment-integration/AGENT.md +35 -0
  54. package/src/agents/performance/AGENT.md +3 -0
  55. package/src/agents/performance-engineer/AGENT.md +153 -0
  56. package/src/agents/pm/AGENT.md +3 -0
  57. package/src/agents/qa-lead/AGENT.md +3 -0
  58. package/src/agents/security/AGENT.md +3 -0
  59. package/src/agents/sre/AGENT.md +3 -0
  60. package/src/agents/tdd-orchestrator/AGENT.md +169 -0
  61. package/src/agents/tech-lead/AGENT.md +3 -0
  62. package/src/commands/specweave.costs.md +261 -0
  63. package/src/commands/specweave.ml-pipeline.md +292 -0
  64. package/src/commands/specweave.monitor-setup.md +501 -0
  65. package/src/commands/specweave.slo-implement.md +1055 -0
  66. package/src/commands/specweave.sync-github.md +1 -1
  67. package/src/commands/specweave.tdd-cycle.md +199 -0
  68. package/src/commands/specweave.tdd-green.md +842 -0
  69. package/src/commands/specweave.tdd-red.md +135 -0
  70. package/src/commands/specweave.tdd-refactor.md +165 -0
  71. package/src/skills/SKILLS-INDEX.md +18 -10
  72. package/src/skills/billing-automation/SKILL.md +559 -0
  73. package/src/skills/distributed-tracing/SKILL.md +438 -0
  74. package/src/skills/e2e-playwright/README.md +1 -1
  75. package/src/skills/e2e-playwright/package.json +1 -1
  76. package/src/skills/gitops-workflow/SKILL.md +285 -0
  77. package/src/skills/gitops-workflow/references/argocd-setup.md +134 -0
  78. package/src/skills/gitops-workflow/references/sync-policies.md +131 -0
  79. package/src/skills/grafana-dashboards/SKILL.md +369 -0
  80. package/src/skills/helm-chart-scaffolding/SKILL.md +544 -0
  81. package/src/skills/helm-chart-scaffolding/assets/Chart.yaml.template +42 -0
  82. package/src/skills/helm-chart-scaffolding/assets/values.yaml.template +185 -0
  83. package/src/skills/helm-chart-scaffolding/references/chart-structure.md +500 -0
  84. package/src/skills/helm-chart-scaffolding/scripts/validate-chart.sh +244 -0
  85. package/src/skills/k8s-manifest-generator/SKILL.md +511 -0
  86. package/src/skills/k8s-manifest-generator/assets/configmap-template.yaml +296 -0
  87. package/src/skills/k8s-manifest-generator/assets/deployment-template.yaml +203 -0
  88. package/src/skills/k8s-manifest-generator/assets/service-template.yaml +171 -0
  89. package/src/skills/k8s-manifest-generator/references/deployment-spec.md +753 -0
  90. package/src/skills/k8s-manifest-generator/references/service-spec.md +724 -0
  91. package/src/skills/k8s-security-policies/SKILL.md +334 -0
  92. package/src/skills/k8s-security-policies/assets/network-policy-template.yaml +177 -0
  93. package/src/skills/k8s-security-policies/references/rbac-patterns.md +187 -0
  94. package/src/skills/ml-pipeline-workflow/SKILL.md +245 -0
  95. package/src/skills/paypal-integration/SKILL.md +467 -0
  96. package/src/skills/pci-compliance/SKILL.md +466 -0
  97. package/src/skills/prometheus-configuration/SKILL.md +392 -0
  98. package/src/skills/slo-implementation/SKILL.md +329 -0
  99. package/src/skills/stripe-integration/SKILL.md +442 -0
  100. package/src/skills/tdd-workflow/SKILL.md +378 -0
  101. package/src/templates/README.md.template +1 -1
  102. package/src/skills/bmad-method-expert/SKILL.md +0 -626
  103. package/src/skills/bmad-method-expert/scripts/analyze-project.js +0 -318
  104. package/src/skills/bmad-method-expert/scripts/check-setup.js +0 -208
  105. package/src/skills/bmad-method-expert/scripts/generate-template.js +0 -1149
  106. package/src/skills/bmad-method-expert/scripts/validate-documents.js +0 -340
  107. package/src/skills/context-optimizer/SKILL.md +0 -588
  108. package/src/skills/figma-designer/SKILL.md +0 -149
  109. package/src/skills/figma-implementer/SKILL.md +0 -148
  110. package/src/skills/figma-mcp-connector/SKILL.md +0 -136
  111. package/src/skills/figma-to-code/SKILL.md +0 -128
  112. package/src/skills/spec-kit-expert/SKILL.md +0 -1010
@@ -0,0 +1,392 @@
1
+ ---
2
+ name: prometheus-configuration
3
+ description: Set up Prometheus for comprehensive metric collection, storage, and monitoring of infrastructure and applications. Use when implementing metrics collection, setting up monitoring infrastructure, or configuring alerting systems.
4
+ ---
5
+
6
+ # Prometheus Configuration
7
+
8
+ Complete guide to Prometheus setup, metric collection, scrape configuration, and recording rules.
9
+
10
+ ## Purpose
11
+
12
+ Configure Prometheus for comprehensive metric collection, alerting, and monitoring of infrastructure and applications.
13
+
14
+ ## When to Use
15
+
16
+ - Set up Prometheus monitoring
17
+ - Configure metric scraping
18
+ - Create recording rules
19
+ - Design alert rules
20
+ - Implement service discovery
21
+
22
+ ## Prometheus Architecture
23
+
24
+ ```
25
+ ┌──────────────┐
26
+ │ Applications │ ← Instrumented with client libraries
27
+ └──────┬───────┘
28
+ │ /metrics endpoint
29
+
30
+ ┌──────────────┐
31
+ │ Prometheus │ ← Scrapes metrics periodically
32
+ │ Server │
33
+ └──────┬───────┘
34
+
35
+ ├─→ AlertManager (alerts)
36
+ ├─→ Grafana (visualization)
37
+ └─→ Long-term storage (Thanos/Cortex)
38
+ ```
39
+
40
+ ## Installation
41
+
42
+ ### Kubernetes with Helm
43
+
44
+ ```bash
45
+ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
46
+ helm repo update
47
+
48
+ helm install prometheus prometheus-community/kube-prometheus-stack \
49
+ --namespace monitoring \
50
+ --create-namespace \
51
+ --set prometheus.prometheusSpec.retention=30d \
52
+ --set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage=50Gi
53
+ ```
54
+
55
+ ### Docker Compose
56
+
57
+ ```yaml
58
+ version: '3.8'
59
+ services:
60
+ prometheus:
61
+ image: prom/prometheus:latest
62
+ ports:
63
+ - "9090:9090"
64
+ volumes:
65
+ - ./prometheus.yml:/etc/prometheus/prometheus.yml
66
+ - prometheus-data:/prometheus
67
+ command:
68
+ - '--config.file=/etc/prometheus/prometheus.yml'
69
+ - '--storage.tsdb.path=/prometheus'
70
+ - '--storage.tsdb.retention.time=30d'
71
+
72
+ volumes:
73
+ prometheus-data:
74
+ ```
75
+
76
+ ## Configuration File
77
+
78
+ **prometheus.yml:**
79
+ ```yaml
80
+ global:
81
+ scrape_interval: 15s
82
+ evaluation_interval: 15s
83
+ external_labels:
84
+ cluster: 'production'
85
+ region: 'us-west-2'
86
+
87
+ # Alertmanager configuration
88
+ alerting:
89
+ alertmanagers:
90
+ - static_configs:
91
+ - targets:
92
+ - alertmanager:9093
93
+
94
+ # Load rules files
95
+ rule_files:
96
+ - /etc/prometheus/rules/*.yml
97
+
98
+ # Scrape configurations
99
+ scrape_configs:
100
+ # Prometheus itself
101
+ - job_name: 'prometheus'
102
+ static_configs:
103
+ - targets: ['localhost:9090']
104
+
105
+ # Node exporters
106
+ - job_name: 'node-exporter'
107
+ static_configs:
108
+ - targets:
109
+ - 'node1:9100'
110
+ - 'node2:9100'
111
+ - 'node3:9100'
112
+ relabel_configs:
113
+ - source_labels: [__address__]
114
+ target_label: instance
115
+ regex: '([^:]+)(:[0-9]+)?'
116
+ replacement: '${1}'
117
+
118
+ # Kubernetes pods with annotations
119
+ - job_name: 'kubernetes-pods'
120
+ kubernetes_sd_configs:
121
+ - role: pod
122
+ relabel_configs:
123
+ - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
124
+ action: keep
125
+ regex: true
126
+ - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
127
+ action: replace
128
+ target_label: __metrics_path__
129
+ regex: (.+)
130
+ - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
131
+ action: replace
132
+ regex: ([^:]+)(?::\d+)?;(\d+)
133
+ replacement: $1:$2
134
+ target_label: __address__
135
+ - source_labels: [__meta_kubernetes_namespace]
136
+ action: replace
137
+ target_label: namespace
138
+ - source_labels: [__meta_kubernetes_pod_name]
139
+ action: replace
140
+ target_label: pod
141
+
142
+ # Application metrics
143
+ - job_name: 'my-app'
144
+ static_configs:
145
+ - targets:
146
+ - 'app1.example.com:9090'
147
+ - 'app2.example.com:9090'
148
+ metrics_path: '/metrics'
149
+ scheme: 'https'
150
+ tls_config:
151
+ ca_file: /etc/prometheus/ca.crt
152
+ cert_file: /etc/prometheus/client.crt
153
+ key_file: /etc/prometheus/client.key
154
+ ```
155
+
156
+ **Reference:** See `assets/prometheus.yml.template`
157
+
158
+ ## Scrape Configurations
159
+
160
+ ### Static Targets
161
+
162
+ ```yaml
163
+ scrape_configs:
164
+ - job_name: 'static-targets'
165
+ static_configs:
166
+ - targets: ['host1:9100', 'host2:9100']
167
+ labels:
168
+ env: 'production'
169
+ region: 'us-west-2'
170
+ ```
171
+
172
+ ### File-based Service Discovery
173
+
174
+ ```yaml
175
+ scrape_configs:
176
+ - job_name: 'file-sd'
177
+ file_sd_configs:
178
+ - files:
179
+ - /etc/prometheus/targets/*.json
180
+ - /etc/prometheus/targets/*.yml
181
+ refresh_interval: 5m
182
+ ```
183
+
184
+ **targets/production.json:**
185
+ ```json
186
+ [
187
+ {
188
+ "targets": ["app1:9090", "app2:9090"],
189
+ "labels": {
190
+ "env": "production",
191
+ "service": "api"
192
+ }
193
+ }
194
+ ]
195
+ ```
196
+
197
+ ### Kubernetes Service Discovery
198
+
199
+ ```yaml
200
+ scrape_configs:
201
+ - job_name: 'kubernetes-services'
202
+ kubernetes_sd_configs:
203
+ - role: service
204
+ relabel_configs:
205
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
206
+ action: keep
207
+ regex: true
208
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
209
+ action: replace
210
+ target_label: __scheme__
211
+ regex: (https?)
212
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
213
+ action: replace
214
+ target_label: __metrics_path__
215
+ regex: (.+)
216
+ ```
217
+
218
+ **Reference:** See `references/scrape-configs.md`
219
+
220
+ ## Recording Rules
221
+
222
+ Create pre-computed metrics for frequently queried expressions:
223
+
224
+ ```yaml
225
+ # /etc/prometheus/rules/recording_rules.yml
226
+ groups:
227
+ - name: api_metrics
228
+ interval: 15s
229
+ rules:
230
+ # HTTP request rate per service
231
+ - record: job:http_requests:rate5m
232
+ expr: sum by (job) (rate(http_requests_total[5m]))
233
+
234
+ # Error rate percentage
235
+ - record: job:http_requests_errors:rate5m
236
+ expr: sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
237
+
238
+ - record: job:http_requests_error_rate:percentage
239
+ expr: |
240
+ (job:http_requests_errors:rate5m / job:http_requests:rate5m) * 100
241
+
242
+ # P95 latency
243
+ - record: job:http_request_duration:p95
244
+ expr: |
245
+ histogram_quantile(0.95,
246
+ sum by (job, le) (rate(http_request_duration_seconds_bucket[5m]))
247
+ )
248
+
249
+ - name: resource_metrics
250
+ interval: 30s
251
+ rules:
252
+ # CPU utilization percentage
253
+ - record: instance:node_cpu:utilization
254
+ expr: |
255
+ 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
256
+
257
+ # Memory utilization percentage
258
+ - record: instance:node_memory:utilization
259
+ expr: |
260
+ 100 - ((node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100)
261
+
262
+ # Disk usage percentage
263
+ - record: instance:node_disk:utilization
264
+ expr: |
265
+ 100 - ((node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100)
266
+ ```
267
+
268
+ **Reference:** See `references/recording-rules.md`
269
+
270
+ ## Alert Rules
271
+
272
+ ```yaml
273
+ # /etc/prometheus/rules/alert_rules.yml
274
+ groups:
275
+ - name: availability
276
+ interval: 30s
277
+ rules:
278
+ - alert: ServiceDown
279
+ expr: up{job="my-app"} == 0
280
+ for: 1m
281
+ labels:
282
+ severity: critical
283
+ annotations:
284
+ summary: "Service {{ $labels.instance }} is down"
285
+ description: "{{ $labels.job }} has been down for more than 1 minute"
286
+
287
+ - alert: HighErrorRate
288
+ expr: job:http_requests_error_rate:percentage > 5
289
+ for: 5m
290
+ labels:
291
+ severity: warning
292
+ annotations:
293
+ summary: "High error rate for {{ $labels.job }}"
294
+ description: "Error rate is {{ $value }}% (threshold: 5%)"
295
+
296
+ - alert: HighLatency
297
+ expr: job:http_request_duration:p95 > 1
298
+ for: 5m
299
+ labels:
300
+ severity: warning
301
+ annotations:
302
+ summary: "High latency for {{ $labels.job }}"
303
+ description: "P95 latency is {{ $value }}s (threshold: 1s)"
304
+
305
+ - name: resources
306
+ interval: 1m
307
+ rules:
308
+ - alert: HighCPUUsage
309
+ expr: instance:node_cpu:utilization > 80
310
+ for: 5m
311
+ labels:
312
+ severity: warning
313
+ annotations:
314
+ summary: "High CPU usage on {{ $labels.instance }}"
315
+ description: "CPU usage is {{ $value }}%"
316
+
317
+ - alert: HighMemoryUsage
318
+ expr: instance:node_memory:utilization > 85
319
+ for: 5m
320
+ labels:
321
+ severity: warning
322
+ annotations:
323
+ summary: "High memory usage on {{ $labels.instance }}"
324
+ description: "Memory usage is {{ $value }}%"
325
+
326
+ - alert: DiskSpaceLow
327
+ expr: instance:node_disk:utilization > 90
328
+ for: 5m
329
+ labels:
330
+ severity: critical
331
+ annotations:
332
+ summary: "Low disk space on {{ $labels.instance }}"
333
+ description: "Disk usage is {{ $value }}%"
334
+ ```
335
+
336
+ ## Validation
337
+
338
+ ```bash
339
+ # Validate configuration
340
+ promtool check config prometheus.yml
341
+
342
+ # Validate rules
343
+ promtool check rules /etc/prometheus/rules/*.yml
344
+
345
+ # Test query
346
+ promtool query instant http://localhost:9090 'up'
347
+ ```
348
+
349
+ **Reference:** See `scripts/validate-prometheus.sh`
350
+
351
+ ## Best Practices
352
+
353
+ 1. **Use consistent naming** for metrics (prefix_name_unit)
354
+ 2. **Set appropriate scrape intervals** (15-60s typical)
355
+ 3. **Use recording rules** for expensive queries
356
+ 4. **Implement high availability** (multiple Prometheus instances)
357
+ 5. **Configure retention** based on storage capacity
358
+ 6. **Use relabeling** for metric cleanup
359
+ 7. **Monitor Prometheus itself**
360
+ 8. **Implement federation** for large deployments
361
+ 9. **Use Thanos/Cortex** for long-term storage
362
+ 10. **Document custom metrics**
363
+
364
+ ## Troubleshooting
365
+
366
+ **Check scrape targets:**
367
+ ```bash
368
+ curl http://localhost:9090/api/v1/targets
369
+ ```
370
+
371
+ **Check configuration:**
372
+ ```bash
373
+ curl http://localhost:9090/api/v1/status/config
374
+ ```
375
+
376
+ **Test query:**
377
+ ```bash
378
+ curl 'http://localhost:9090/api/v1/query?query=up'
379
+ ```
380
+
381
+ ## Reference Files
382
+
383
+ - `assets/prometheus.yml.template` - Complete configuration template
384
+ - `references/scrape-configs.md` - Scrape configuration patterns
385
+ - `references/recording-rules.md` - Recording rule examples
386
+ - `scripts/validate-prometheus.sh` - Validation script
387
+
388
+ ## Related Skills
389
+
390
+ - `grafana-dashboards` - For visualization
391
+ - `slo-implementation` - For SLO monitoring
392
+ - `distributed-tracing` - For request tracing
@@ -0,0 +1,329 @@
1
+ ---
2
+ name: slo-implementation
3
+ description: Define and implement Service Level Indicators (SLIs) and Service Level Objectives (SLOs) with error budgets and alerting. Use when establishing reliability targets, implementing SRE practices, or measuring service performance.
4
+ ---
5
+
6
+ # SLO Implementation
7
+
8
+ Framework for defining and implementing Service Level Indicators (SLIs), Service Level Objectives (SLOs), and error budgets.
9
+
10
+ ## Purpose
11
+
12
+ Implement measurable reliability targets using SLIs, SLOs, and error budgets to balance reliability with innovation velocity.
13
+
14
+ ## When to Use
15
+
16
+ - Define service reliability targets
17
+ - Measure user-perceived reliability
18
+ - Implement error budgets
19
+ - Create SLO-based alerts
20
+ - Track reliability goals
21
+
22
+ ## SLI/SLO/SLA Hierarchy
23
+
24
+ ```
25
+ SLA (Service Level Agreement)
26
+ ↓ Contract with customers
27
+ SLO (Service Level Objective)
28
+ ↓ Internal reliability target
29
+ SLI (Service Level Indicator)
30
+ ↓ Actual measurement
31
+ ```
32
+
33
+ ## Defining SLIs
34
+
35
+ ### Common SLI Types
36
+
37
+ #### 1. Availability SLI
38
+ ```promql
39
+ # Successful requests / Total requests
40
+ sum(rate(http_requests_total{status!~"5.."}[28d]))
41
+ /
42
+ sum(rate(http_requests_total[28d]))
43
+ ```
44
+
45
+ #### 2. Latency SLI
46
+ ```promql
47
+ # Requests below latency threshold / Total requests
48
+ sum(rate(http_request_duration_seconds_bucket{le="0.5"}[28d]))
49
+ /
50
+ sum(rate(http_request_duration_seconds_count[28d]))
51
+ ```
52
+
53
+ #### 3. Durability SLI
54
+ ```promql
55
+ # Successful writes / Total writes
56
+ sum(storage_writes_successful_total)
57
+ /
58
+ sum(storage_writes_total)
59
+ ```
60
+
61
+ **Reference:** See `references/slo-definitions.md`
62
+
63
+ ## Setting SLO Targets
64
+
65
+ ### Availability SLO Examples
66
+
67
+ | SLO % | Downtime/Month | Downtime/Year |
68
+ |-------|----------------|---------------|
69
+ | 99% | 7.2 hours | 3.65 days |
70
+ | 99.9% | 43.2 minutes | 8.76 hours |
71
+ | 99.95%| 21.6 minutes | 4.38 hours |
72
+ | 99.99%| 4.32 minutes | 52.56 minutes |
73
+
74
+ ### Choose Appropriate SLOs
75
+
76
+ **Consider:**
77
+ - User expectations
78
+ - Business requirements
79
+ - Current performance
80
+ - Cost of reliability
81
+ - Competitor benchmarks
82
+
83
+ **Example SLOs:**
84
+ ```yaml
85
+ slos:
86
+ - name: api_availability
87
+ target: 99.9
88
+ window: 28d
89
+ sli: |
90
+ sum(rate(http_requests_total{status!~"5.."}[28d]))
91
+ /
92
+ sum(rate(http_requests_total[28d]))
93
+
94
+ - name: api_latency_p95
95
+ target: 99
96
+ window: 28d
97
+ sli: |
98
+ sum(rate(http_request_duration_seconds_bucket{le="0.5"}[28d]))
99
+ /
100
+ sum(rate(http_request_duration_seconds_count[28d]))
101
+ ```
102
+
103
+ ## Error Budget Calculation
104
+
105
+ ### Error Budget Formula
106
+
107
+ ```
108
+ Error Budget = 1 - SLO Target
109
+ ```
110
+
111
+ **Example:**
112
+ - SLO: 99.9% availability
113
+ - Error Budget: 0.1% = 43.2 minutes/month
114
+ - Current Error: 0.05% = 21.6 minutes/month
115
+ - Remaining Budget: 50%
116
+
117
+ ### Error Budget Policy
118
+
119
+ ```yaml
120
+ error_budget_policy:
121
+ - remaining_budget: 100%
122
+ action: Normal development velocity
123
+ - remaining_budget: 50%
124
+ action: Consider postponing risky changes
125
+ - remaining_budget: 10%
126
+ action: Freeze non-critical changes
127
+ - remaining_budget: 0%
128
+ action: Feature freeze, focus on reliability
129
+ ```
130
+
131
+ **Reference:** See `references/error-budget.md`
132
+
133
+ ## SLO Implementation
134
+
135
+ ### Prometheus Recording Rules
136
+
137
+ ```yaml
138
+ # SLI Recording Rules
139
+ groups:
140
+ - name: sli_rules
141
+ interval: 30s
142
+ rules:
143
+ # Availability SLI
144
+ - record: sli:http_availability:ratio
145
+ expr: |
146
+ sum(rate(http_requests_total{status!~"5.."}[28d]))
147
+ /
148
+ sum(rate(http_requests_total[28d]))
149
+
150
+ # Latency SLI (requests < 500ms)
151
+ - record: sli:http_latency:ratio
152
+ expr: |
153
+ sum(rate(http_request_duration_seconds_bucket{le="0.5"}[28d]))
154
+ /
155
+ sum(rate(http_request_duration_seconds_count[28d]))
156
+
157
+ - name: slo_rules
158
+ interval: 5m
159
+ rules:
160
+ # SLO compliance (1 = meeting SLO, 0 = violating)
161
+ - record: slo:http_availability:compliance
162
+ expr: sli:http_availability:ratio >= bool 0.999
163
+
164
+ - record: slo:http_latency:compliance
165
+ expr: sli:http_latency:ratio >= bool 0.99
166
+
167
+ # Error budget remaining (percentage)
168
+ - record: slo:http_availability:error_budget_remaining
169
+ expr: |
170
+ (sli:http_availability:ratio - 0.999) / (1 - 0.999) * 100
171
+
172
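+ # Error budget burn rate (5m window shown; record the same expression over [30m], [1h] and [6h] as slo:http_availability:burn_rate_30m, burn_rate_1h and burn_rate_6h, which the alerting rules below reference)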
173
+ - record: slo:http_availability:burn_rate_5m
174
+ expr: |
175
+ (1 - (
176
+ sum(rate(http_requests_total{status!~"5.."}[5m]))
177
+ /
178
+ sum(rate(http_requests_total[5m]))
179
+ )) / (1 - 0.999)
180
+ ```
181
+
182
+ ### SLO Alerting Rules
183
+
184
+ ```yaml
185
+ groups:
186
+ - name: slo_alerts
187
+ interval: 1m
188
+ rules:
189
+ # Fast burn: 14.4x rate, 1 hour window
190
+ # Consumes 2% error budget in 1 hour
191
+ - alert: SLOErrorBudgetBurnFast
192
+ expr: |
193
+ slo:http_availability:burn_rate_1h > 14.4
194
+ and
195
+ slo:http_availability:burn_rate_5m > 14.4
196
+ for: 2m
197
+ labels:
198
+ severity: critical
199
+ annotations:
200
+ summary: "Fast error budget burn detected"
201
+ description: "Error budget burning at {{ $value }}x rate"
202
+
203
+ # Slow burn: 6x rate, 6 hour window
204
+ # Consumes 5% error budget in 6 hours
205
+ - alert: SLOErrorBudgetBurnSlow
206
+ expr: |
207
+ slo:http_availability:burn_rate_6h > 6
208
+ and
209
+ slo:http_availability:burn_rate_30m > 6
210
+ for: 15m
211
+ labels:
212
+ severity: warning
213
+ annotations:
214
+ summary: "Slow error budget burn detected"
215
+ description: "Error budget burning at {{ $value }}x rate"
216
+
217
+ # Error budget exhausted
218
+ - alert: SLOErrorBudgetExhausted
219
+ expr: slo:http_availability:error_budget_remaining < 0
220
+ for: 5m
221
+ labels:
222
+ severity: critical
223
+ annotations:
224
+ summary: "SLO error budget exhausted"
225
+ description: "Error budget remaining: {{ $value }}%"
226
+ ```
227
+
228
+ ## SLO Dashboard
229
+
230
+ **Grafana Dashboard Structure:**
231
+
232
+ ```
233
+ ┌────────────────────────────────────┐
234
+ │ SLO Compliance (Current) │
235
+ │ ✓ 99.95% (Target: 99.9%) │
236
+ ├────────────────────────────────────┤
237
+ │ Error Budget Remaining: 65% │
238
+ │ ████████░░ 65% │
239
+ ├────────────────────────────────────┤
240
+ │ SLI Trend (28 days) │
241
+ │ [Time series graph] │
242
+ ├────────────────────────────────────┤
243
+ │ Burn Rate Analysis │
244
+ │ [Burn rate by time window] │
245
+ └────────────────────────────────────┘
246
+ ```
247
+
248
+ **Example Queries:**
249
+
250
+ ```promql
251
+ # Current SLO compliance
252
+ sli:http_availability:ratio * 100
253
+
254
+ # Error budget remaining
255
+ slo:http_availability:error_budget_remaining
256
+
257
+ # Days until error budget exhausted (at the average burn rate over the 28-day window)
258
+ (slo:http_availability:error_budget_remaining / 100)
259
+ *
260
+ 28
261
+ /
262
+ (1 - sli:http_availability:ratio) * (1 - 0.999)
263
+ ```
264
+
265
+ ## Multi-Window Burn Rate Alerts
266
+
267
+ ```yaml
268
+ # Combination of short and long windows reduces false positives
269
+ rules:
270
+ - alert: SLOBurnRateHigh
271
+ expr: |
272
+ (
273
+ slo:http_availability:burn_rate_1h > 14.4
274
+ and
275
+ slo:http_availability:burn_rate_5m > 14.4
276
+ )
277
+ or
278
+ (
279
+ slo:http_availability:burn_rate_6h > 6
280
+ and
281
+ slo:http_availability:burn_rate_30m > 6
282
+ )
283
+ labels:
284
+ severity: critical
285
+ ```
286
+
287
+ ## SLO Review Process
288
+
289
+ ### Weekly Review
290
+ - Current SLO compliance
291
+ - Error budget status
292
+ - Trend analysis
293
+ - Incident impact
294
+
295
+ ### Monthly Review
296
+ - SLO achievement
297
+ - Error budget usage
298
+ - Incident postmortems
299
+ - SLO adjustments
300
+
301
+ ### Quarterly Review
302
+ - SLO relevance
303
+ - Target adjustments
304
+ - Process improvements
305
+ - Tooling enhancements
306
+
307
+ ## Best Practices
308
+
309
+ 1. **Start with user-facing services**
310
+ 2. **Use multiple SLIs** (availability, latency, etc.)
311
+ 3. **Set achievable SLOs** (don't aim for 100%)
312
+ 4. **Implement multi-window alerts** to reduce noise
313
+ 5. **Track error budget** consistently
314
+ 6. **Review SLOs regularly**
315
+ 7. **Document SLO decisions**
316
+ 8. **Align with business goals**
317
+ 9. **Automate SLO reporting**
318
+ 10. **Use SLOs for prioritization**
319
+
320
+ ## Reference Files
321
+
322
+ - `assets/slo-template.md` - SLO definition template
323
+ - `references/slo-definitions.md` - SLO definition patterns
324
+ - `references/error-budget.md` - Error budget calculations
325
+
326
+ ## Related Skills
327
+
328
+ - `prometheus-configuration` - For metric collection
329
+ - `grafana-dashboards` - For SLO visualization