@sylix/coworker 2.0.10 → 2.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. package/dist/commands/slash/config.d.ts.map +1 -1
  2. package/dist/commands/slash/config.js +23 -5
  3. package/dist/commands/slash/config.js.map +1 -1
  4. package/dist/commands/slash/todo.js +1 -1
  5. package/dist/commands/slash/todo.js.map +1 -1
  6. package/dist/core/CoWorkerAgent.d.ts.map +1 -1
  7. package/dist/core/CoWorkerAgent.js +6 -3
  8. package/dist/core/CoWorkerAgent.js.map +1 -1
  9. package/dist/permissions/PermissionInterceptor.js +1 -1
  10. package/dist/permissions/PermissionInterceptor.js.map +1 -1
  11. package/dist/skills/defaults/accessibility/screen-reader-testing.md +545 -0
  12. package/dist/skills/defaults/accessibility/wcag-audit-patterns.md +555 -0
  13. package/dist/skills/defaults/ai-ml/rag.md +276 -0
  14. package/dist/skills/defaults/backend-development/api-design-principles.md +528 -0
  15. package/dist/skills/defaults/backend-development/api-design.md +285 -0
  16. package/dist/skills/defaults/backend-development/architecture-patterns.md +494 -0
  17. package/dist/skills/defaults/backend-development/async-python.md +237 -0
  18. package/dist/skills/defaults/backend-development/auth-implementation-patterns.md +638 -0
  19. package/dist/skills/defaults/backend-development/bazel-build-optimization.md +387 -0
  20. package/dist/skills/defaults/backend-development/billing-automation/SKILL.md +566 -0
  21. package/dist/skills/defaults/backend-development/code-review-excellence.md +538 -0
  22. package/dist/skills/defaults/backend-development/cqrs-implementation.md +554 -0
  23. package/dist/skills/defaults/backend-development/database-design.md +305 -0
  24. package/dist/skills/defaults/backend-development/debugging-strategies.md +536 -0
  25. package/dist/skills/defaults/backend-development/e2e-testing-patterns.md +544 -0
  26. package/dist/skills/defaults/backend-development/error-handling-patterns.md +641 -0
  27. package/dist/skills/defaults/backend-development/fastapi-templates.md +559 -0
  28. package/dist/skills/defaults/backend-development/fastapi.md +309 -0
  29. package/dist/skills/defaults/backend-development/git-advanced-workflows.md +405 -0
  30. package/dist/skills/defaults/backend-development/microservices-patterns.md +595 -0
  31. package/dist/skills/defaults/backend-development/microservices.md +284 -0
  32. package/dist/skills/defaults/backend-development/monorepo-management.md +623 -0
  33. package/dist/skills/defaults/backend-development/nodejs-backend-patterns.md +1048 -0
  34. package/dist/skills/defaults/backend-development/nx-workspace-patterns.md +457 -0
  35. package/dist/skills/defaults/backend-development/paypal-integration/SKILL.md +478 -0
  36. package/dist/skills/defaults/backend-development/pci-compliance/SKILL.md +480 -0
  37. package/dist/skills/defaults/backend-development/python-anti-patterns.md +349 -0
  38. package/dist/skills/defaults/backend-development/python-background-jobs.md +364 -0
  39. package/dist/skills/defaults/backend-development/python-code-style.md +360 -0
  40. package/dist/skills/defaults/backend-development/python-configuration.md +368 -0
  41. package/dist/skills/defaults/backend-development/python-design-patterns.md +296 -0
  42. package/dist/skills/defaults/backend-development/python-error-handling.md +323 -0
  43. package/dist/skills/defaults/backend-development/python-packaging.md +887 -0
  44. package/dist/skills/defaults/backend-development/python-performance-optimization.md +874 -0
  45. package/dist/skills/defaults/backend-development/python-project-structure.md +252 -0
  46. package/dist/skills/defaults/backend-development/python-resilience.md +376 -0
  47. package/dist/skills/defaults/backend-development/python-resource-management.md +421 -0
  48. package/dist/skills/defaults/backend-development/python-type-safety.md +428 -0
  49. package/dist/skills/defaults/backend-development/sql-optimization-patterns.md +509 -0
  50. package/dist/skills/defaults/backend-development/stripe-integration/SKILL.md +522 -0
  51. package/dist/skills/defaults/backend-development/turborepo-caching.md +376 -0
  52. package/dist/skills/defaults/blockchain/defi-protocol-templates.md +430 -0
  53. package/dist/skills/defaults/blockchain/nft-standards.md +364 -0
  54. package/dist/skills/defaults/blockchain/solidity-security.md +514 -0
  55. package/dist/skills/defaults/blockchain/web3-testing.md +360 -0
  56. package/dist/skills/defaults/business/competitive-landscape/SKILL.md +527 -0
  57. package/dist/skills/defaults/business/market-sizing-analysis/SKILL.md +451 -0
  58. package/dist/skills/defaults/business/startup-financial-modeling/SKILL.md +494 -0
  59. package/dist/skills/defaults/business/startup-metrics-framework/SKILL.md +564 -0
  60. package/dist/skills/defaults/business/team-composition-analysis.md +437 -0
  61. package/dist/skills/defaults/compliance/employment-contract-templates/SKILL.md +527 -0
  62. package/dist/skills/defaults/compliance/gdpr-data-handling/SKILL.md +630 -0
  63. package/dist/skills/defaults/data-engineering/airflow-dag-patterns.md +436 -0
  64. package/dist/skills/defaults/data-engineering/airflow.md +519 -0
  65. package/dist/skills/defaults/data-engineering/data-quality.md +583 -0
  66. package/dist/skills/defaults/data-engineering/dbt-transformation-patterns.md +482 -0
  67. package/dist/skills/defaults/data-engineering/dbt.md +556 -0
  68. package/dist/skills/defaults/data-engineering/ml-pipeline-workflow/SKILL.md +247 -0
  69. package/dist/skills/defaults/data-engineering/spark-optimization.md +348 -0
  70. package/dist/skills/defaults/data-engineering/spark.md +411 -0
  71. package/dist/skills/defaults/database/postgresql.md +202 -0
  72. package/dist/skills/defaults/debugging/systematic-debugging.md +249 -0
  73. package/dist/skills/defaults/devops/architecture-decision-records.md +448 -0
  74. package/dist/skills/defaults/devops/changelog-automation.md +580 -0
  75. package/dist/skills/defaults/devops/cicd.md +314 -0
  76. package/dist/skills/defaults/devops/cloud.md +263 -0
  77. package/dist/skills/defaults/devops/code-review-excellence.md +299 -0
  78. package/dist/skills/defaults/devops/cost-optimization.md +295 -0
  79. package/dist/skills/defaults/devops/deployment-pipeline-design.md +356 -0
  80. package/dist/skills/defaults/devops/docker.md +281 -0
  81. package/dist/skills/defaults/devops/git-workflows.md +205 -0
  82. package/dist/skills/defaults/devops/github-actions.md +311 -0
  83. package/dist/skills/defaults/devops/gitlab-ci-patterns.md +266 -0
  84. package/dist/skills/defaults/devops/hybrid-cloud-networking.md +241 -0
  85. package/dist/skills/defaults/devops/istio-traffic-management.md +327 -0
  86. package/dist/skills/defaults/devops/kubernetes.md +339 -0
  87. package/dist/skills/defaults/devops/linkerd-patterns.md +311 -0
  88. package/dist/skills/defaults/devops/multi-cloud-architecture.md +181 -0
  89. package/dist/skills/defaults/devops/observability.md +243 -0
  90. package/dist/skills/defaults/devops/openapi-spec-generation.md +1024 -0
  91. package/dist/skills/defaults/devops/postmortem-writing.md +396 -0
  92. package/dist/skills/defaults/devops/prometheus-configuration.md +265 -0
  93. package/dist/skills/defaults/devops/secrets-management.md +341 -0
  94. package/dist/skills/defaults/devops/service-mesh-observability.md +385 -0
  95. package/dist/skills/defaults/devops/terraform-module-library.md +244 -0
  96. package/dist/skills/defaults/finance/backtesting-frameworks/SKILL.md +663 -0
  97. package/dist/skills/defaults/finance/risk-metrics-calculation/SKILL.md +557 -0
  98. package/dist/skills/defaults/frontend/accessibility-compliance.md +420 -0
  99. package/dist/skills/defaults/frontend/design-system-patterns.md +337 -0
  100. package/dist/skills/defaults/frontend/interaction-design.md +327 -0
  101. package/dist/skills/defaults/frontend/javascript.md +311 -0
  102. package/dist/skills/defaults/frontend/modern-javascript-patterns.md +927 -0
  103. package/dist/skills/defaults/frontend/react-native-design.md +440 -0
  104. package/dist/skills/defaults/frontend/react.md +345 -0
  105. package/dist/skills/defaults/frontend/responsive-design.md +472 -0
  106. package/dist/skills/defaults/frontend/tailwind-design-system.md +337 -0
  107. package/dist/skills/defaults/frontend/typescript-advanced-types.md +724 -0
  108. package/dist/skills/defaults/frontend/typescript.md +334 -0
  109. package/dist/skills/defaults/frontend/visual-design-foundations.md +326 -0
  110. package/dist/skills/defaults/frontend/web-component-design.md +279 -0
  111. package/dist/skills/defaults/game-development/godot-gdscript-patterns.md +188 -0
  112. package/dist/skills/defaults/game-development/unity-ecs-patterns.md +594 -0
  113. package/dist/skills/defaults/kubernetes/gitops-workflow.md +285 -0
  114. package/dist/skills/defaults/kubernetes/gitops.md +280 -0
  115. package/dist/skills/defaults/kubernetes/helm-chart-scaffolding.md +553 -0
  116. package/dist/skills/defaults/kubernetes/helm.md +343 -0
  117. package/dist/skills/defaults/kubernetes/k8s-manifest-generator.md +501 -0
  118. package/dist/skills/defaults/kubernetes/k8s-security-policies.md +342 -0
  119. package/dist/skills/defaults/kubernetes/manifests.md +330 -0
  120. package/dist/skills/defaults/kubernetes/security.md +337 -0
  121. package/dist/skills/defaults/llm-application/embedding-strategies.md +608 -0
  122. package/dist/skills/defaults/llm-application/hybrid-search-implementation.md +570 -0
  123. package/dist/skills/defaults/llm-application/hybrid-search.md +570 -0
  124. package/dist/skills/defaults/llm-application/langchain-architecture.md +666 -0
  125. package/dist/skills/defaults/llm-application/langchain.md +259 -0
  126. package/dist/skills/defaults/llm-application/llm-evaluation.md +695 -0
  127. package/dist/skills/defaults/llm-application/prompt-engineering-patterns.md +449 -0
  128. package/dist/skills/defaults/llm-application/prompt-engineering.md +219 -0
  129. package/dist/skills/defaults/llm-application/rag-implementation.md +434 -0
  130. package/dist/skills/defaults/llm-application/similarity-search-patterns.md +560 -0
  131. package/dist/skills/defaults/llm-application/similarity-search.md +560 -0
  132. package/dist/skills/defaults/llm-application/vector-index-tuning.md +523 -0
  133. package/dist/skills/defaults/mobile/mobile-android-design.md +440 -0
  134. package/dist/skills/defaults/mobile/mobile-ios-design.md +266 -0
  135. package/dist/skills/defaults/monitoring/distributed-tracing.md +436 -0
  136. package/dist/skills/defaults/monitoring/grafana-dashboards.md +370 -0
  137. package/dist/skills/defaults/monitoring/prometheus-configuration.md +379 -0
  138. package/dist/skills/defaults/monitoring/slo-implementation.md +323 -0
  139. package/dist/skills/defaults/refactoring/code-refactoring.md +349 -0
  140. package/dist/skills/defaults/security/anti-reversing-techniques/SKILL.md +559 -0
  141. package/dist/skills/defaults/security/auditor.md +168 -0
  142. package/dist/skills/defaults/security/binary-analysis-patterns/SKILL.md +438 -0
  143. package/dist/skills/defaults/security/memory-forensics/SKILL.md +483 -0
  144. package/dist/skills/defaults/security/mtls-configuration.md +349 -0
  145. package/dist/skills/defaults/security/protocol-reverse-engineering/SKILL.md +520 -0
  146. package/dist/skills/defaults/security/sast-configuration.md +182 -0
  147. package/dist/skills/defaults/security/security.md +313 -0
  148. package/dist/skills/defaults/security/stride-analysis.md +273 -0
  149. package/dist/skills/defaults/security/threat-mitigation-mapping.md +290 -0
  150. package/dist/skills/defaults/systems/bash-defensive-patterns/SKILL.md +539 -0
  151. package/dist/skills/defaults/systems/bats-testing-patterns/SKILL.md +631 -0
  152. package/dist/skills/defaults/systems/go-concurrency-patterns.md +657 -0
  153. package/dist/skills/defaults/systems/memory-safety-patterns.md +605 -0
  154. package/dist/skills/defaults/systems/rust-async-patterns.md +519 -0
  155. package/dist/skills/defaults/systems/shellcheck-configuration/SKILL.md +456 -0
  156. package/dist/skills/defaults/team-collaboration/multi-reviewer-patterns.md +126 -0
  157. package/dist/skills/defaults/team-collaboration/parallel-feature-development.md +151 -0
  158. package/dist/skills/defaults/testing/javascript-testing-patterns.md +1021 -0
  159. package/dist/skills/defaults/testing/python-testing-patterns.md +351 -0
  160. package/dist/skills/defaults/testing/testing.md +332 -0
  161. package/dist/skills/defaults/workflows/context-driven-development.md +384 -0
  162. package/dist/skills/defaults/workflows/track-management.md +592 -0
  163. package/dist/skills/defaults/workflows/workflow-patterns.md +622 -0
  164. package/dist/skills/index.d.ts +11 -0
  165. package/dist/skills/index.d.ts.map +1 -0
  166. package/dist/skills/index.js +129 -0
  167. package/dist/skills/index.js.map +1 -0
  168. package/dist/utils/character.js +6 -9
  169. package/dist/utils/character.js.map +1 -1
  170. package/dist/utils/contextManager.js +3 -7
  171. package/dist/utils/contextManager.js.map +1 -1
  172. package/dist/utils/inputbar.d.ts.map +1 -1
  173. package/dist/utils/inputbar.js +8 -1
  174. package/dist/utils/inputbar.js.map +1 -1
  175. package/dist/utils/output.d.ts.map +1 -1
  176. package/dist/utils/output.js +3 -35
  177. package/dist/utils/output.js.map +1 -1
  178. package/package.json +1 -1
@@ -0,0 +1,379 @@
1
+ ---
2
+ name: prometheus-configuration
3
+ description: Set up Prometheus for comprehensive metric collection, storage, and monitoring of infrastructure and applications. Use when implementing metrics collection, setting up monitoring infrastructure, or configuring alerting systems.
4
+ ---
5
+
6
+ # Prometheus Configuration
7
+
8
+ Complete guide to Prometheus setup, metric collection, scrape configuration, and recording rules.
9
+
10
+ ## Purpose
11
+
12
+ Configure Prometheus for comprehensive metric collection, alerting, and monitoring of infrastructure and applications.
13
+
14
+ ## When to Use
15
+
16
+ - Set up Prometheus monitoring
17
+ - Configure metric scraping
18
+ - Create recording rules
19
+ - Design alert rules
20
+ - Implement service discovery
21
+
22
+ ## Prometheus Architecture
23
+
24
+ ```
25
+ ┌──────────────┐
26
+ │ Applications │ ← Instrumented with client libraries
27
+ └──────┬───────┘
28
+ │ /metrics endpoint
29
+
30
+ ┌──────────────┐
31
+ │ Prometheus │ ← Scrapes metrics periodically
32
+ │ Server │
33
+ └──────┬───────┘
34
+
35
+ ├─→ AlertManager (alerts)
36
+ ├─→ Grafana (visualization)
37
+ └─→ Long-term storage (Thanos/Cortex)
38
+ ```
39
+
40
+ ## Installation
41
+
42
+ ### Kubernetes with Helm
43
+
44
+ ```bash
45
+ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
46
+ helm repo update
47
+
48
+ helm install prometheus prometheus-community/kube-prometheus-stack \
49
+ --namespace monitoring \
50
+ --create-namespace \
51
+ --set prometheus.prometheusSpec.retention=30d \
52
+ --set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage=50Gi
53
+ ```
54
+
55
+ ### Docker Compose
56
+
57
+ ```yaml
58
+ version: "3.8"
59
+ services:
60
+ prometheus:
61
+ image: prom/prometheus:latest
62
+ ports:
63
+ - "9090:9090"
64
+ volumes:
65
+ - ./prometheus.yml:/etc/prometheus/prometheus.yml
66
+ - prometheus-data:/prometheus
67
+ command:
68
+ - "--config.file=/etc/prometheus/prometheus.yml"
69
+ - "--storage.tsdb.path=/prometheus"
70
+ - "--storage.tsdb.retention.time=30d"
71
+
72
+ volumes:
73
+ prometheus-data:
74
+ ```
75
+
76
+ ## Configuration File
77
+
78
+ **prometheus.yml:**
79
+
80
+ ```yaml
81
+ global:
82
+ scrape_interval: 15s
83
+ evaluation_interval: 15s
84
+ external_labels:
85
+ cluster: "production"
86
+ region: "us-west-2"
87
+
88
+ # Alertmanager configuration
89
+ alerting:
90
+ alertmanagers:
91
+ - static_configs:
92
+ - targets:
93
+ - alertmanager:9093
94
+
95
+ # Load rules files
96
+ rule_files:
97
+ - /etc/prometheus/rules/*.yml
98
+
99
+ # Scrape configurations
100
+ scrape_configs:
101
+ # Prometheus itself
102
+ - job_name: "prometheus"
103
+ static_configs:
104
+ - targets: ["localhost:9090"]
105
+
106
+ # Node exporters
107
+ - job_name: "node-exporter"
108
+ static_configs:
109
+ - targets:
110
+ - "node1:9100"
111
+ - "node2:9100"
112
+ - "node3:9100"
113
+ relabel_configs:
114
+ - source_labels: [__address__]
115
+ target_label: instance
116
+ regex: "([^:]+)(:[0-9]+)?"
117
+ replacement: "${1}"
118
+
119
+ # Kubernetes pods with annotations
120
+ - job_name: "kubernetes-pods"
121
+ kubernetes_sd_configs:
122
+ - role: pod
123
+ relabel_configs:
124
+ - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
125
+ action: keep
126
+ regex: true
127
+ - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
128
+ action: replace
129
+ target_label: __metrics_path__
130
+ regex: (.+)
131
+ - source_labels:
132
+ [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
133
+ action: replace
134
+ regex: ([^:]+)(?::\d+)?;(\d+)
135
+ replacement: $1:$2
136
+ target_label: __address__
137
+ - source_labels: [__meta_kubernetes_namespace]
138
+ action: replace
139
+ target_label: namespace
140
+ - source_labels: [__meta_kubernetes_pod_name]
141
+ action: replace
142
+ target_label: pod
143
+
144
+ # Application metrics
145
+ - job_name: "my-app"
146
+ static_configs:
147
+ - targets:
148
+ - "app1.example.com:9090"
149
+ - "app2.example.com:9090"
150
+ metrics_path: "/metrics"
151
+ scheme: "https"
152
+ tls_config:
153
+ ca_file: /etc/prometheus/ca.crt
154
+ cert_file: /etc/prometheus/client.crt
155
+ key_file: /etc/prometheus/client.key
156
+ ```
157
+
158
+ ## Scrape Configurations
159
+
160
+ ### Static Targets
161
+
162
+ ```yaml
163
+ scrape_configs:
164
+ - job_name: "static-targets"
165
+ static_configs:
166
+ - targets: ["host1:9100", "host2:9100"]
167
+ labels:
168
+ env: "production"
169
+ region: "us-west-2"
170
+ ```
171
+
172
+ ### File-based Service Discovery
173
+
174
+ ```yaml
175
+ scrape_configs:
176
+ - job_name: "file-sd"
177
+ file_sd_configs:
178
+ - files:
179
+ - /etc/prometheus/targets/*.json
180
+ - /etc/prometheus/targets/*.yml
181
+ refresh_interval: 5m
182
+ ```
183
+
184
+ **targets/production.json:**
185
+
186
+ ```json
187
+ [
188
+ {
189
+ "targets": ["app1:9090", "app2:9090"],
190
+ "labels": {
191
+ "env": "production",
192
+ "service": "api"
193
+ }
194
+ }
195
+ ]
196
+ ```
197
+
198
+ ### Kubernetes Service Discovery
199
+
200
+ ```yaml
201
+ scrape_configs:
202
+ - job_name: "kubernetes-services"
203
+ kubernetes_sd_configs:
204
+ - role: service
205
+ relabel_configs:
206
+ - source_labels:
207
+ [__meta_kubernetes_service_annotation_prometheus_io_scrape]
208
+ action: keep
209
+ regex: true
210
+ - source_labels:
211
+ [__meta_kubernetes_service_annotation_prometheus_io_scheme]
212
+ action: replace
213
+ target_label: __scheme__
214
+ regex: (https?)
215
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
216
+ action: replace
217
+ target_label: __metrics_path__
218
+ regex: (.+)
219
+ ```
220
+
221
+ ## Recording Rules
222
+
223
+ Create pre-computed metrics for frequently queried expressions:
224
+
225
+ ```yaml
226
+ # /etc/prometheus/rules/recording_rules.yml
227
+ groups:
228
+ - name: api_metrics
229
+ interval: 15s
230
+ rules:
231
+ # HTTP request rate per service
232
+ - record: job:http_requests:rate5m
233
+ expr: sum by (job) (rate(http_requests_total[5m]))
234
+
235
+ # 5xx error request rate per service (used for the error rate percentage below)
236
+ - record: job:http_requests_errors:rate5m
237
+ expr: sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
238
+
239
+ - record: job:http_requests_error_rate:percentage
240
+ expr: |
241
+ (job:http_requests_errors:rate5m / job:http_requests:rate5m) * 100
242
+
243
+ # P95 latency
244
+ - record: job:http_request_duration:p95
245
+ expr: |
246
+ histogram_quantile(0.95,
247
+ sum by (job, le) (rate(http_request_duration_seconds_bucket[5m]))
248
+ )
249
+
250
+ - name: resource_metrics
251
+ interval: 30s
252
+ rules:
253
+ # CPU utilization percentage
254
+ - record: instance:node_cpu:utilization
255
+ expr: |
256
+ 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
257
+
258
+ # Memory utilization percentage
259
+ - record: instance:node_memory:utilization
260
+ expr: |
261
+ 100 - ((node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100)
262
+
263
+ # Disk usage percentage
264
+ - record: instance:node_disk:utilization
265
+ expr: |
266
+ 100 - ((node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100)
267
+ ```
268
+
269
+ ## Alert Rules
270
+
271
+ ```yaml
272
+ # /etc/prometheus/rules/alert_rules.yml
273
+ groups:
274
+ - name: availability
275
+ interval: 30s
276
+ rules:
277
+ - alert: ServiceDown
278
+ expr: up{job="my-app"} == 0
279
+ for: 1m
280
+ labels:
281
+ severity: critical
282
+ annotations:
283
+ summary: "Service {{ $labels.instance }} is down"
284
+ description: "{{ $labels.job }} has been down for more than 1 minute"
285
+
286
+ - alert: HighErrorRate
287
+ expr: job:http_requests_error_rate:percentage > 5
288
+ for: 5m
289
+ labels:
290
+ severity: warning
291
+ annotations:
292
+ summary: "High error rate for {{ $labels.job }}"
293
+ description: "Error rate is {{ $value }}% (threshold: 5%)"
294
+
295
+ - alert: HighLatency
296
+ expr: job:http_request_duration:p95 > 1
297
+ for: 5m
298
+ labels:
299
+ severity: warning
300
+ annotations:
301
+ summary: "High latency for {{ $labels.job }}"
302
+ description: "P95 latency is {{ $value }}s (threshold: 1s)"
303
+
304
+ - name: resources
305
+ interval: 1m
306
+ rules:
307
+ - alert: HighCPUUsage
308
+ expr: instance:node_cpu:utilization > 80
309
+ for: 5m
310
+ labels:
311
+ severity: warning
312
+ annotations:
313
+ summary: "High CPU usage on {{ $labels.instance }}"
314
+ description: "CPU usage is {{ $value }}%"
315
+
316
+ - alert: HighMemoryUsage
317
+ expr: instance:node_memory:utilization > 85
318
+ for: 5m
319
+ labels:
320
+ severity: warning
321
+ annotations:
322
+ summary: "High memory usage on {{ $labels.instance }}"
323
+ description: "Memory usage is {{ $value }}%"
324
+
325
+ - alert: DiskSpaceLow
326
+ expr: instance:node_disk:utilization > 90
327
+ for: 5m
328
+ labels:
329
+ severity: critical
330
+ annotations:
331
+ summary: "Low disk space on {{ $labels.instance }}"
332
+ description: "Disk usage is {{ $value }}%"
333
+ ```
334
+
335
+ ## Validation
336
+
337
+ ```bash
338
+ # Validate configuration
339
+ promtool check config prometheus.yml
340
+
341
+ # Validate rules
342
+ promtool check rules /etc/prometheus/rules/*.yml
343
+
344
+ # Test query
345
+ promtool query instant http://localhost:9090 'up'
346
+ ```
347
+
348
+ ## Best Practices
349
+
350
+ 1. **Use consistent naming** for metrics (prefix_name_unit)
351
+ 2. **Set appropriate scrape intervals** (15-60s typical)
352
+ 3. **Use recording rules** for expensive queries
353
+ 4. **Implement high availability** (multiple Prometheus instances)
354
+ 5. **Configure retention** based on storage capacity
355
+ 6. **Use relabeling** for metric cleanup
356
+ 7. **Monitor Prometheus itself**
357
+ 8. **Implement federation** for large deployments
358
+ 9. **Use Thanos/Cortex** for long-term storage
359
+ 10. **Document custom metrics**
360
+
361
+ ## Troubleshooting
362
+
363
+ **Check scrape targets:**
364
+
365
+ ```bash
366
+ curl http://localhost:9090/api/v1/targets
367
+ ```
368
+
369
+ **Check configuration:**
370
+
371
+ ```bash
372
+ curl http://localhost:9090/api/v1/status/config
373
+ ```
374
+
375
+ **Test query:**
376
+
377
+ ```bash
378
+ curl 'http://localhost:9090/api/v1/query?query=up'
379
+ ```
@@ -0,0 +1,323 @@
1
+ ---
2
+ name: slo-implementation
3
+ description: Define and implement Service Level Indicators (SLIs) and Service Level Objectives (SLOs) with error budgets and alerting. Use when establishing reliability targets, implementing SRE practices, or measuring service performance.
4
+ ---
5
+
6
+ # SLO Implementation
7
+
8
+ Framework for defining and implementing Service Level Indicators (SLIs), Service Level Objectives (SLOs), and error budgets.
9
+
10
+ ## Purpose
11
+
12
+ Implement measurable reliability targets using SLIs, SLOs, and error budgets to balance reliability with innovation velocity.
13
+
14
+ ## When to Use
15
+
16
+ - Define service reliability targets
17
+ - Measure user-perceived reliability
18
+ - Implement error budgets
19
+ - Create SLO-based alerts
20
+ - Track reliability goals
21
+
22
+ ## SLI/SLO/SLA Hierarchy
23
+
24
+ ```
25
+ SLA (Service Level Agreement)
26
+ ↓ Contract with customers
27
+ SLO (Service Level Objective)
28
+ ↓ Internal reliability target
29
+ SLI (Service Level Indicator)
30
+ ↓ Actual measurement
31
+ ```
32
+
33
+ ## Defining SLIs
34
+
35
+ ### Common SLI Types
36
+
37
+ #### 1. Availability SLI
38
+
39
+ ```promql
40
+ # Successful requests / Total requests
41
+ sum(rate(http_requests_total{status!~"5.."}[28d]))
42
+ /
43
+ sum(rate(http_requests_total[28d]))
44
+ ```
45
+
46
+ #### 2. Latency SLI
47
+
48
+ ```promql
49
+ # Requests below latency threshold / Total requests
50
+ sum(rate(http_request_duration_seconds_bucket{le="0.5"}[28d]))
51
+ /
52
+ sum(rate(http_request_duration_seconds_count[28d]))
53
+ ```
54
+
55
+ #### 3. Durability SLI
56
+
57
+ ```promql
58
+ # Successful writes / Total writes
59
+ sum(storage_writes_successful_total)
60
+ /
61
+ sum(storage_writes_total)
62
+ ```
63
+
64
+ ## Setting SLO Targets
65
+
66
+ ### Availability SLO Examples
67
+
68
+ | SLO % | Downtime/Month | Downtime/Year |
69
+ | ------ | -------------- | ------------- |
70
+ | 99% | 7.2 hours | 3.65 days |
71
+ | 99.9% | 43.2 minutes | 8.76 hours |
72
+ | 99.95% | 21.6 minutes | 4.38 hours |
73
+ | 99.99% | 4.32 minutes | 52.56 minutes |
74
+
75
+ ### Choose Appropriate SLOs
76
+
77
+ **Consider:**
78
+
79
+ - User expectations
80
+ - Business requirements
81
+ - Current performance
82
+ - Cost of reliability
83
+ - Competitor benchmarks
84
+
85
+ **Example SLOs:**
86
+
87
+ ```yaml
88
+ slos:
89
+ - name: api_availability
90
+ target: 99.9
91
+ window: 28d
92
+ sli: |
93
+ sum(rate(http_requests_total{status!~"5.."}[28d]))
94
+ /
95
+ sum(rate(http_requests_total[28d]))
96
+
97
+ - name: api_latency_p99
98
+ target: 99
99
+ window: 28d
100
+ sli: |
101
+ sum(rate(http_request_duration_seconds_bucket{le="0.5"}[28d]))
102
+ /
103
+ sum(rate(http_request_duration_seconds_count[28d]))
104
+ ```
105
+
106
+ ## Error Budget Calculation
107
+
108
+ ### Error Budget Formula
109
+
110
+ ```
111
+ Error Budget = 1 - SLO Target
112
+ ```
113
+
114
+ **Example:**
115
+
116
+ - SLO: 99.9% availability
117
+ - Error Budget: 0.1% = 43.2 minutes/month
118
+ - Current Error: 0.05% = 21.6 minutes/month
119
+ - Remaining Budget: 50%
120
+
121
+ ### Error Budget Policy
122
+
123
+ ```yaml
124
+ error_budget_policy:
125
+ - remaining_budget: 100%
126
+ action: Normal development velocity
127
+ - remaining_budget: 50%
128
+ action: Consider postponing risky changes
129
+ - remaining_budget: 10%
130
+ action: Freeze non-critical changes
131
+ - remaining_budget: 0%
132
+ action: Feature freeze, focus on reliability
133
+ ```
134
+
135
+ ## SLO Implementation
136
+
137
+ ### Prometheus Recording Rules
138
+
139
+ ```yaml
140
+ # SLI Recording Rules
141
+ groups:
142
+ - name: sli_rules
143
+ interval: 30s
144
+ rules:
145
+ # Availability SLI
146
+ - record: sli:http_availability:ratio
147
+ expr: |
148
+ sum(rate(http_requests_total{status!~"5.."}[28d]))
149
+ /
150
+ sum(rate(http_requests_total[28d]))
151
+
152
+ # Latency SLI (requests < 500ms)
153
+ - record: sli:http_latency:ratio
154
+ expr: |
155
+ sum(rate(http_request_duration_seconds_bucket{le="0.5"}[28d]))
156
+ /
157
+ sum(rate(http_request_duration_seconds_count[28d]))
158
+
159
+ - name: slo_rules
160
+ interval: 5m
161
+ rules:
162
+ # SLO compliance (1 = meeting SLO, 0 = violating)
163
+ - record: slo:http_availability:compliance
164
+ expr: sli:http_availability:ratio >= bool 0.999
165
+
166
+ - record: slo:http_latency:compliance
167
+ expr: sli:http_latency:ratio >= bool 0.99
168
+
169
+ # Error budget remaining (percentage)
170
+ - record: slo:http_availability:error_budget_remaining
171
+ expr: |
172
+ (sli:http_availability:ratio - 0.999) / (1 - 0.999) * 100
173
+
174
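+ # Error budget burn rate (5m window shown; the SLO alerts also reference burn_rate_30m, burn_rate_1h and burn_rate_6h — record those the same way with [30m], [1h] and [6h] ranges)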
175
+ - record: slo:http_availability:burn_rate_5m
176
+ expr: |
177
+ (1 - (
178
+ sum(rate(http_requests_total{status!~"5.."}[5m]))
179
+ /
180
+ sum(rate(http_requests_total[5m]))
181
+ )) / (1 - 0.999)
182
+ ```
183
+
184
+ ### SLO Alerting Rules
185
+
186
+ ```yaml
187
+ groups:
188
+ - name: slo_alerts
189
+ interval: 1m
190
+ rules:
191
+ # Fast burn: 14.4x rate, 1 hour window
192
+ # Consumes 2% error budget in 1 hour
193
+ - alert: SLOErrorBudgetBurnFast
194
+ expr: |
195
+ slo:http_availability:burn_rate_1h > 14.4
196
+ and
197
+ slo:http_availability:burn_rate_5m > 14.4
198
+ for: 2m
199
+ labels:
200
+ severity: critical
201
+ annotations:
202
+ summary: "Fast error budget burn detected"
203
+ description: "Error budget burning at {{ $value }}x rate"
204
+
205
+ # Slow burn: 6x rate, 6 hour window
206
+ # Consumes 5% error budget in 6 hours
207
+ - alert: SLOErrorBudgetBurnSlow
208
+ expr: |
209
+ slo:http_availability:burn_rate_6h > 6
210
+ and
211
+ slo:http_availability:burn_rate_30m > 6
212
+ for: 15m
213
+ labels:
214
+ severity: warning
215
+ annotations:
216
+ summary: "Slow error budget burn detected"
217
+ description: "Error budget burning at {{ $value }}x rate"
218
+
219
+ # Error budget exhausted
220
+ - alert: SLOErrorBudgetExhausted
221
+ expr: slo:http_availability:error_budget_remaining < 0
222
+ for: 5m
223
+ labels:
224
+ severity: critical
225
+ annotations:
226
+ summary: "SLO error budget exhausted"
227
+ description: "Error budget remaining: {{ $value }}%"
228
+ ```
229
+
230
+ ## SLO Dashboard
231
+
232
+ **Grafana Dashboard Structure:**
233
+
234
+ ```
235
+ ┌────────────────────────────────────┐
236
+ │ SLO Compliance (Current) │
237
+ │ ✓ 99.95% (Target: 99.9%) │
238
+ ├────────────────────────────────────┤
239
+ │ Error Budget Remaining: 65% │
240
+ │ ████████░░ 65% │
241
+ ├────────────────────────────────────┤
242
+ │ SLI Trend (28 days) │
243
+ │ [Time series graph] │
244
+ ├────────────────────────────────────┤
245
+ │ Burn Rate Analysis │
246
+ │ [Burn rate by time window] │
247
+ └────────────────────────────────────┘
248
+ ```
249
+
250
+ **Example Queries:**
251
+
252
+ ```promql
253
+ # Current SLO compliance
254
+ sli:http_availability:ratio * 100
255
+
256
+ # Error budget remaining
257
+ slo:http_availability:error_budget_remaining
258
+
259
+ # Days until error budget exhausted (at the average burn rate over the 28-day window)
260
+ (slo:http_availability:error_budget_remaining / 100)
261
+ *
262
+ 28
263
+ /
264
+ (1 - sli:http_availability:ratio) * (1 - 0.999)
265
+ ```
266
+
267
+ ## Multi-Window Burn Rate Alerts
268
+
269
+ ```yaml
270
+ # Combination of short and long windows reduces false positives
271
+ rules:
272
+ - alert: SLOBurnRateHigh
273
+ expr: |
274
+ (
275
+ slo:http_availability:burn_rate_1h > 14.4
276
+ and
277
+ slo:http_availability:burn_rate_5m > 14.4
278
+ )
279
+ or
280
+ (
281
+ slo:http_availability:burn_rate_6h > 6
282
+ and
283
+ slo:http_availability:burn_rate_30m > 6
284
+ )
285
+ labels:
286
+ severity: critical
287
+ ```
288
+
289
+ ## SLO Review Process
290
+
291
+ ### Weekly Review
292
+
293
+ - Current SLO compliance
294
+ - Error budget status
295
+ - Trend analysis
296
+ - Incident impact
297
+
298
+ ### Monthly Review
299
+
300
+ - SLO achievement
301
+ - Error budget usage
302
+ - Incident postmortems
303
+ - SLO adjustments
304
+
305
+ ### Quarterly Review
306
+
307
+ - SLO relevance
308
+ - Target adjustments
309
+ - Process improvements
310
+ - Tooling enhancements
311
+
312
+ ## Best Practices
313
+
314
+ 1. **Start with user-facing services**
315
+ 2. **Use multiple SLIs** (availability, latency, etc.)
316
+ 3. **Set achievable SLOs** (don't aim for 100%)
317
+ 4. **Implement multi-window alerts** to reduce noise
318
+ 5. **Track error budget** consistently
319
+ 6. **Review SLOs regularly**
320
+ 7. **Document SLO decisions**
321
+ 8. **Align with business goals**
322
+ 9. **Automate SLO reporting**
323
+ 10. **Use SLOs for prioritization**