proagents 1.6.17 → 1.6.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +169 -0
- package/COMMANDS.md +595 -0
- package/README.md +13 -23
- package/package.json +2 -7
- package/.proagents/ai-models/README.md +0 -141
- package/.proagents/ai-models/cost-management.md +0 -362
- package/.proagents/ai-models/fallbacks.md +0 -342
- package/.proagents/ai-models/model-config.md +0 -318
- package/.proagents/ai-models/task-routing.md +0 -503
- package/.proagents/ai-training/README.md +0 -155
- package/.proagents/ai-training/continuous-learning.md +0 -413
- package/.proagents/ai-training/domain-knowledge.md +0 -378
- package/.proagents/ai-training/pattern-learning.md +0 -455
- package/.proagents/ai-training/training-data.md +0 -337
- package/.proagents/ai-training/user-preferences.md +0 -346
- package/.proagents/approval-workflows/README.md +0 -146
- package/.proagents/approval-workflows/approval-config.md +0 -332
- package/.proagents/approval-workflows/approval-stages.md +0 -503
- package/.proagents/approval-workflows/emergency-bypass.md +0 -351
- package/.proagents/approval-workflows/examples.md +0 -859
- package/.proagents/approval-workflows/notifications.md +0 -320
- package/.proagents/compliance/README.md +0 -206
- package/.proagents/compliance/access-control.md +0 -310
- package/.proagents/compliance/audit-logging.md +0 -444
- package/.proagents/compliance/compliance-frameworks.md +0 -429
- package/.proagents/compliance/reports.md +0 -491
- package/.proagents/compliance/retention-policies.md +0 -454
- package/.proagents/config-versioning/README.md +0 -120
- package/.proagents/config-versioning/changelog.md +0 -300
- package/.proagents/config-versioning/rollback.md +0 -283
- package/.proagents/config-versioning/versioning.md +0 -330
- package/.proagents/contract-testing/README.md +0 -223
- package/.proagents/contract-testing/contract-testing.md +0 -614
- package/.proagents/contract-testing/pact-integration.md +0 -507
- package/.proagents/contract-testing/schema-validation.md +0 -565
- package/.proagents/dependency-management/README.md +0 -140
- package/.proagents/dependency-management/automation.md +0 -363
- package/.proagents/dependency-management/compatibility.md +0 -319
- package/.proagents/dependency-management/security-scanning.md +0 -413
- package/.proagents/dependency-management/update-policies.md +0 -374
- package/.proagents/disaster-recovery/README.md +0 -247
- package/.proagents/disaster-recovery/automation.md +0 -366
- package/.proagents/disaster-recovery/backup-recovery.md +0 -571
- package/.proagents/disaster-recovery/incident-response.md +0 -565
- package/.proagents/disaster-recovery/rollback-procedures.md +0 -499
- package/.proagents/disaster-recovery/runbooks.md +0 -603
- package/.proagents/disaster-recovery/scenarios.md +0 -892
- package/.proagents/disaster-recovery/testing.md +0 -438
- package/.proagents/environments/README.md +0 -244
- package/.proagents/environments/configuration.md +0 -437
- package/.proagents/environments/promotion.md +0 -434
- package/.proagents/environments/setup.md +0 -420
- package/.proagents/examples/README.md +0 -55
- package/.proagents/examples/backend-nodejs/README.md +0 -188
- package/.proagents/examples/backend-nodejs/complete-conversation.md +0 -601
- package/.proagents/examples/backend-nodejs/proagents.config.yaml +0 -415
- package/.proagents/examples/backend-nodejs/workflow-example.md +0 -909
- package/.proagents/examples/fullstack-nextjs/README.md +0 -155
- package/.proagents/examples/fullstack-nextjs/complete-conversation.md +0 -604
- package/.proagents/examples/fullstack-nextjs/proagents.config.yaml +0 -287
- package/.proagents/examples/fullstack-nextjs/workflow-example.md +0 -553
- package/.proagents/examples/mobile-react-native/README.md +0 -171
- package/.proagents/examples/mobile-react-native/complete-conversation.md +0 -825
- package/.proagents/examples/mobile-react-native/proagents.config.yaml +0 -330
- package/.proagents/examples/mobile-react-native/workflow-example.md +0 -723
- package/.proagents/examples/web-frontend-react/README.md +0 -125
- package/.proagents/examples/web-frontend-react/complete-conversation.md +0 -556
- package/.proagents/examples/web-frontend-react/proagents.config.yaml +0 -183
- package/.proagents/examples/web-frontend-react/workflow-example.md +0 -603
- package/.proagents/existing-projects/README.md +0 -65
- package/.proagents/existing-projects/challenges.md +0 -861
- package/.proagents/existing-projects/coexistence-mode.md +0 -483
- package/.proagents/existing-projects/compatibility-assessment.md +0 -541
- package/.proagents/existing-projects/gradual-adoption.md +0 -515
- package/.proagents/existing-projects/migration-strategies.md +0 -788
- package/.proagents/existing-projects/pattern-reconciliation.md +0 -489
- package/.proagents/existing-projects/team-onboarding.md +0 -617
- package/.proagents/existing-projects/technical-debt-handling.md +0 -644
- package/.proagents/feature-flags/README.md +0 -263
- package/.proagents/feature-flags/ab-testing.md +0 -413
- package/.proagents/feature-flags/configuration.md +0 -420
- package/.proagents/feature-flags/kill-switches.md +0 -444
- package/.proagents/feature-flags/rollout-strategies.md +0 -392
- package/.proagents/history.log +0 -12
- package/.proagents/i18n/README.md +0 -133
- package/.proagents/i18n/extraction.md +0 -433
- package/.proagents/i18n/tms-integration.md +0 -332
- package/.proagents/i18n/translation-workflow.md +0 -413
- package/.proagents/i18n/validation.md +0 -355
- package/.proagents/logging/README.md +0 -276
- package/.proagents/logging/aggregation.md +0 -475
- package/.proagents/logging/log-levels.md +0 -376
- package/.proagents/logging/sensitive-data.md +0 -423
- package/.proagents/logging/structured-logging.md +0 -406
- package/.proagents/metrics/README.md +0 -69
- package/.proagents/metrics/code-quality-kpis.md +0 -461
- package/.proagents/metrics/deployment-metrics.md +0 -517
- package/.proagents/metrics/developer-productivity.md +0 -368
- package/.proagents/metrics/learning-effectiveness.md +0 -478
- package/.proagents/migrations/README.md +0 -77
- package/.proagents/migrations/from-claude-projects.md +0 -313
- package/.proagents/migrations/from-cursor-rules.md +0 -345
- package/.proagents/migrations/from-custom-workflows.md +0 -410
- package/.proagents/monitoring/README.md +0 -308
- package/.proagents/monitoring/alerting.md +0 -449
- package/.proagents/monitoring/dashboards.md +0 -454
- package/.proagents/monitoring/health-checks.md +0 -436
- package/.proagents/monitoring/metrics.md +0 -434
- package/.proagents/multi-project/README.md +0 -170
- package/.proagents/multi-project/coordinated-deploy.md +0 -510
- package/.proagents/multi-project/cross-project-deps.md +0 -395
- package/.proagents/multi-project/unified-changelog.md +0 -477
- package/.proagents/multi-project/walkthroughs/monorepo-setup.md +0 -787
- package/.proagents/multi-project/workspace-config.md +0 -408
- package/.proagents/notifications/README.md +0 -151
- package/.proagents/notifications/channels.md +0 -457
- package/.proagents/notifications/preferences.md +0 -415
- package/.proagents/notifications/routing.md +0 -449
- package/.proagents/notifications/scheduling.md +0 -425
- package/.proagents/notifications/templates.md +0 -446
- package/.proagents/offline-mode/README.md +0 -145
- package/.proagents/offline-mode/caching.md +0 -344
- package/.proagents/offline-mode/offline-operations.md +0 -312
- package/.proagents/offline-mode/queue-specifications.md +0 -679
- package/.proagents/offline-mode/sync.md +0 -475
- package/.proagents/parallel-features/README.md +0 -85
- package/.proagents/parallel-features/conflict-detection.md +0 -226
- package/.proagents/parallel-features/dependency-management.md +0 -392
- package/.proagents/parallel-features/merge-coordination.md +0 -506
- package/.proagents/parallel-features/tracking-system.md +0 -416
- package/.proagents/performance/README.md +0 -59
- package/.proagents/performance/bundle-analysis.md +0 -375
- package/.proagents/performance/load-testing.md +0 -563
- package/.proagents/performance/runtime-metrics.md +0 -489
- package/.proagents/performance/web-vitals.md +0 -425
- package/.proagents/plugins/README.md +0 -139
- package/.proagents/plugins/creating-plugins.md +0 -504
- package/.proagents/plugins/plugin-api.md +0 -467
- package/.proagents/plugins/plugin-registry.md +0 -276
- package/.proagents/reporting/README.md +0 -158
- package/.proagents/reporting/dashboards.md +0 -366
- package/.proagents/reporting/exports.md +0 -524
- package/.proagents/reporting/quality-metrics.md +0 -385
- package/.proagents/reporting/templates/README.md +0 -56
- package/.proagents/reporting/templates/dashboard-config.json +0 -187
- package/.proagents/reporting/templates/metrics-queries.md +0 -427
- package/.proagents/reporting/templates/react-dashboard.tsx +0 -544
- package/.proagents/reporting/templates/widgets.md +0 -451
- package/.proagents/reporting/velocity-metrics.md +0 -340
- package/.proagents/reverse-engineering/README.md +0 -151
- package/.proagents/reverse-engineering/architecture-extraction.md +0 -325
- package/.proagents/reverse-engineering/code-analysis.md +0 -377
- package/.proagents/reverse-engineering/dependency-mapping.md +0 -567
- package/.proagents/reverse-engineering/diagram-generation.md +0 -586
- package/.proagents/reverse-engineering/documentation-generation.md +0 -468
- package/.proagents/reverse-engineering/pattern-detection.md +0 -569
- package/.proagents/reverse-engineering/quality-assessment.md +0 -733
- package/.proagents/secrets/README.md +0 -278
- package/.proagents/secrets/access-control.md +0 -443
- package/.proagents/secrets/rotation.md +0 -403
- package/.proagents/secrets/scanning.md +0 -487
- package/.proagents/secrets/storage.md +0 -394
- package/.proagents/webhooks/README.md +0 -126
- package/.proagents/webhooks/endpoints.md +0 -298
- package/.proagents/webhooks/events.md +0 -316
- package/.proagents/webhooks/payloads.md +0 -325
- package/.proagents/webhooks/reliability.md +0 -363
- package/.proagents/webhooks/security.md +0 -380
|
@@ -1,449 +0,0 @@
|
|
|
1
|
-
# Alerting
|
|
2
|
-
|
|
3
|
-
Configuring alerts for monitoring and incident response.
|
|
4
|
-
|
|
5
|
-
---
|
|
6
|
-
|
|
7
|
-
## Alert Configuration
|
|
8
|
-
|
|
9
|
-
### Basic Alert Rules
|
|
10
|
-
|
|
11
|
-
```yaml
|
|
12
|
-
# proagents.config.yaml
|
|
13
|
-
monitoring:
|
|
14
|
-
alerting:
|
|
15
|
-
enabled: true
|
|
16
|
-
|
|
17
|
-
rules:
|
|
18
|
-
# High error rate
|
|
19
|
-
- name: "HighErrorRate"
|
|
20
|
-
expression: "error_rate > 0.05"
|
|
21
|
-
duration: "5m"
|
|
22
|
-
severity: "critical"
|
|
23
|
-
summary: "Error rate above 5%"
|
|
24
|
-
description: "Error rate is {{ $value | humanizePercentage }}"
|
|
25
|
-
|
|
26
|
-
# Elevated latency
|
|
27
|
-
- name: "HighLatency"
|
|
28
|
-
expression: "http_request_duration_p99 > 2"
|
|
29
|
-
duration: "10m"
|
|
30
|
-
severity: "warning"
|
|
31
|
-
summary: "P99 latency above 2 seconds"
|
|
32
|
-
|
|
33
|
-
# Service down
|
|
34
|
-
- name: "ServiceDown"
|
|
35
|
-
expression: "up == 0"
|
|
36
|
-
duration: "1m"
|
|
37
|
-
severity: "critical"
|
|
38
|
-
summary: "Service {{ $labels.service }} is down"
|
|
39
|
-
```
|
|
40
|
-
|
|
41
|
-
### Alert Severity Levels
|
|
42
|
-
|
|
43
|
-
```yaml
|
|
44
|
-
monitoring:
|
|
45
|
-
alerting:
|
|
46
|
-
severities:
|
|
47
|
-
critical:
|
|
48
|
-
description: "Immediate action required"
|
|
49
|
-
color: "#FF0000"
|
|
50
|
-
pagerduty_severity: "critical"
|
|
51
|
-
response_time: "5m"
|
|
52
|
-
|
|
53
|
-
warning:
|
|
54
|
-
description: "Action required soon"
|
|
55
|
-
color: "#FFA500"
|
|
56
|
-
pagerduty_severity: "warning"
|
|
57
|
-
response_time: "30m"
|
|
58
|
-
|
|
59
|
-
info:
|
|
60
|
-
description: "For awareness"
|
|
61
|
-
color: "#0000FF"
|
|
62
|
-
pagerduty_severity: "info"
|
|
63
|
-
response_time: "24h"
|
|
64
|
-
```
|
|
65
|
-
|
|
66
|
-
---
|
|
67
|
-
|
|
68
|
-
## Alert Rules by Category
|
|
69
|
-
|
|
70
|
-
### Availability Alerts
|
|
71
|
-
|
|
72
|
-
```yaml
|
|
73
|
-
monitoring:
|
|
74
|
-
alerting:
|
|
75
|
-
rules:
|
|
76
|
-
availability:
|
|
77
|
-
# Service down
|
|
78
|
-
- name: "ServiceDown"
|
|
79
|
-
expression: "up == 0"
|
|
80
|
-
duration: "1m"
|
|
81
|
-
severity: "critical"
|
|
82
|
-
labels:
|
|
83
|
-
category: "availability"
|
|
84
|
-
|
|
85
|
-
# Health check failing
|
|
86
|
-
- name: "HealthCheckFailing"
|
|
87
|
-
expression: "health_check_status != 1"
|
|
88
|
-
duration: "2m"
|
|
89
|
-
severity: "critical"
|
|
90
|
-
|
|
91
|
-
# High error rate
|
|
92
|
-
- name: "ErrorRateHigh"
|
|
93
|
-
expression: |
|
|
94
|
-
sum(rate(http_requests_total{status=~"5.."}[5m]))
|
|
95
|
-
/
|
|
96
|
-
sum(rate(http_requests_total[5m]))
|
|
97
|
-
> 0.05
|
|
98
|
-
duration: "5m"
|
|
99
|
-
severity: "critical"
|
|
100
|
-
```
|
|
101
|
-
|
|
102
|
-
### Performance Alerts
|
|
103
|
-
|
|
104
|
-
```yaml
|
|
105
|
-
monitoring:
|
|
106
|
-
alerting:
|
|
107
|
-
rules:
|
|
108
|
-
performance:
|
|
109
|
-
# P99 latency
|
|
110
|
-
- name: "HighP99Latency"
|
|
111
|
-
expression: |
|
|
112
|
-
histogram_quantile(0.99,
|
|
113
|
-
rate(http_request_duration_seconds_bucket[5m])
|
|
114
|
-
) > 2
|
|
115
|
-
duration: "10m"
|
|
116
|
-
severity: "warning"
|
|
117
|
-
|
|
118
|
-
# Slow database queries
|
|
119
|
-
- name: "SlowDatabaseQueries"
|
|
120
|
-
expression: |
|
|
121
|
-
histogram_quantile(0.95,
|
|
122
|
-
rate(db_query_duration_seconds_bucket[5m])
|
|
123
|
-
) > 1
|
|
124
|
-
duration: "5m"
|
|
125
|
-
severity: "warning"
|
|
126
|
-
|
|
127
|
-
# High memory usage
|
|
128
|
-
- name: "HighMemoryUsage"
|
|
129
|
-
expression: |
|
|
130
|
-
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
|
|
131
|
-
/
|
|
132
|
-
node_memory_MemTotal_bytes
|
|
133
|
-
> 0.9
|
|
134
|
-
duration: "5m"
|
|
135
|
-
severity: "warning"
|
|
136
|
-
```
|
|
137
|
-
|
|
138
|
-
### Business Alerts
|
|
139
|
-
|
|
140
|
-
```yaml
|
|
141
|
-
monitoring:
|
|
142
|
-
alerting:
|
|
143
|
-
rules:
|
|
144
|
-
business:
|
|
145
|
-
# Order processing delay
|
|
146
|
-
- name: "OrderProcessingDelay"
|
|
147
|
-
expression: |
|
|
148
|
-
avg(order_processing_duration_seconds) > 60
|
|
149
|
-
duration: "10m"
|
|
150
|
-
severity: "warning"
|
|
151
|
-
summary: "Order processing taking longer than expected"
|
|
152
|
-
|
|
153
|
-
# Payment failures
|
|
154
|
-
- name: "HighPaymentFailureRate"
|
|
155
|
-
expression: |
|
|
156
|
-
sum(rate(payment_failures_total[5m]))
|
|
157
|
-
/
|
|
158
|
-
sum(rate(payment_attempts_total[5m]))
|
|
159
|
-
> 0.1
|
|
160
|
-
duration: "5m"
|
|
161
|
-
severity: "critical"
|
|
162
|
-
|
|
163
|
-
# Queue backup
|
|
164
|
-
- name: "QueueBackup"
|
|
165
|
-
expression: "queue_messages_pending > 10000"
|
|
166
|
-
duration: "15m"
|
|
167
|
-
severity: "warning"
|
|
168
|
-
```
|
|
169
|
-
|
|
170
|
-
### Security Alerts
|
|
171
|
-
|
|
172
|
-
```yaml
|
|
173
|
-
monitoring:
|
|
174
|
-
alerting:
|
|
175
|
-
rules:
|
|
176
|
-
security:
|
|
177
|
-
# High authentication failures
|
|
178
|
-
- name: "HighAuthFailures"
|
|
179
|
-
expression: |
|
|
180
|
-
sum(rate(auth_failures_total[5m])) > 100
|
|
181
|
-
duration: "2m"
|
|
182
|
-
severity: "critical"
|
|
183
|
-
labels:
|
|
184
|
-
category: "security"
|
|
185
|
-
|
|
186
|
-
# Unusual traffic pattern
|
|
187
|
-
- name: "UnusualTrafficPattern"
|
|
188
|
-
expression: |
|
|
189
|
-
rate(http_requests_total[5m]) >
|
|
190
|
-
avg_over_time(rate(http_requests_total[5m])[1d:]) * 3
|
|
191
|
-
duration: "5m"
|
|
192
|
-
severity: "warning"
|
|
193
|
-
|
|
194
|
-
# Rate limit breaches
|
|
195
|
-
- name: "RateLimitBreaches"
|
|
196
|
-
expression: |
|
|
197
|
-
sum(rate(rate_limit_exceeded_total[5m])) > 50
|
|
198
|
-
duration: "5m"
|
|
199
|
-
severity: "warning"
|
|
200
|
-
```
|
|
201
|
-
|
|
202
|
-
---
|
|
203
|
-
|
|
204
|
-
## Notification Channels
|
|
205
|
-
|
|
206
|
-
### Channel Configuration
|
|
207
|
-
|
|
208
|
-
```yaml
|
|
209
|
-
monitoring:
|
|
210
|
-
alerting:
|
|
211
|
-
channels:
|
|
212
|
-
# Slack
|
|
213
|
-
slack:
|
|
214
|
-
webhook_url_env: "SLACK_WEBHOOK_URL"
|
|
215
|
-
default_channel: "#alerts"
|
|
216
|
-
severity_channels:
|
|
217
|
-
critical: "#incidents"
|
|
218
|
-
warning: "#alerts"
|
|
219
|
-
info: "#monitoring"
|
|
220
|
-
|
|
221
|
-
# PagerDuty
|
|
222
|
-
pagerduty:
|
|
223
|
-
api_key_env: "PAGERDUTY_API_KEY"
|
|
224
|
-
service_key_env: "PAGERDUTY_SERVICE_KEY"
|
|
225
|
-
severity_mapping:
|
|
226
|
-
critical: "critical"
|
|
227
|
-
warning: "warning"
|
|
228
|
-
|
|
229
|
-
# Email
|
|
230
|
-
email:
|
|
231
|
-
smtp:
|
|
232
|
-
host: "smtp.company.com"
|
|
233
|
-
port: 587
|
|
234
|
-
from: "alerts@company.com"
|
|
235
|
-
recipients:
|
|
236
|
-
critical: ["oncall@company.com"]
|
|
237
|
-
warning: ["engineering@company.com"]
|
|
238
|
-
|
|
239
|
-
# Opsgenie
|
|
240
|
-
opsgenie:
|
|
241
|
-
api_key_env: "OPSGENIE_API_KEY"
|
|
242
|
-
priority_mapping:
|
|
243
|
-
critical: "P1"
|
|
244
|
-
warning: "P3"
|
|
245
|
-
```
|
|
246
|
-
|
|
247
|
-
### Routing Rules
|
|
248
|
-
|
|
249
|
-
```yaml
|
|
250
|
-
monitoring:
|
|
251
|
-
alerting:
|
|
252
|
-
routing:
|
|
253
|
-
# Default route
|
|
254
|
-
default:
|
|
255
|
-
channels: ["slack"]
|
|
256
|
-
repeat_interval: "4h"
|
|
257
|
-
|
|
258
|
-
# Route by severity
|
|
259
|
-
routes:
|
|
260
|
-
- match:
|
|
261
|
-
severity: "critical"
|
|
262
|
-
channels: ["pagerduty", "slack"]
|
|
263
|
-
repeat_interval: "1h"
|
|
264
|
-
|
|
265
|
-
- match:
|
|
266
|
-
severity: "warning"
|
|
267
|
-
channels: ["slack", "email"]
|
|
268
|
-
repeat_interval: "4h"
|
|
269
|
-
|
|
270
|
-
# Route by category
|
|
271
|
-
- match:
|
|
272
|
-
category: "security"
|
|
273
|
-
channels: ["pagerduty", "slack"]
|
|
274
|
-
additional_recipients:
|
|
275
|
-
slack: "#security-alerts"
|
|
276
|
-
|
|
277
|
-
# Route by service
|
|
278
|
-
- match:
|
|
279
|
-
service: "payments"
|
|
280
|
-
channels: ["pagerduty"]
|
|
281
|
-
escalation_policy: "payments-oncall"
|
|
282
|
-
```
|
|
283
|
-
|
|
284
|
-
---
|
|
285
|
-
|
|
286
|
-
## Alert Lifecycle
|
|
287
|
-
|
|
288
|
-
### Alert States
|
|
289
|
-
|
|
290
|
-
```
|
|
291
|
-
┌──────────┐    ┌──────────┐    ┌──────────┐
|
|
292
|
-
│ Inactive │───▶│ Pending  │───▶│  Firing  │
|
|
293
|
-
└──────────┘    └──────────┘    └──────────┘
|
|
294
|
-
     ▲               │              │
|
|
295
|
-
     │               ▼              │
|
|
296
|
-
     │         (duration not        │
|
|
297
|
-
     │           reached)           │
|
|
298
|
-
     │                              │
|
|
299
|
-
     └──────────────────────────────┘
|
|
300
|
-
            (condition cleared)
|
|
301
|
-
```
|
|
302
|
-
|
|
303
|
-
### Inhibition Rules
|
|
304
|
-
|
|
305
|
-
```yaml
|
|
306
|
-
monitoring:
|
|
307
|
-
alerting:
|
|
308
|
-
inhibition:
|
|
309
|
-
# Suppress warnings while a critical alert is firing for the same service
|
|
310
|
-
- source:
|
|
311
|
-
severity: "critical"
|
|
312
|
-
target:
|
|
313
|
-
severity: "warning"
|
|
314
|
-
equal: ["service"]
|
|
315
|
-
|
|
316
|
-
# Suppress downstream alerts while the upstream dependency is failing
|
|
317
|
-
- source:
|
|
318
|
-
alertname: "DatabaseDown"
|
|
319
|
-
target:
|
|
320
|
-
alertname: "HighErrorRate"
|
|
321
|
-
```
|
|
322
|
-
|
|
323
|
-
### Silencing
|
|
324
|
-
|
|
325
|
-
```yaml
|
|
326
|
-
monitoring:
|
|
327
|
-
alerting:
|
|
328
|
-
silencing:
|
|
329
|
-
# Maintenance windows
|
|
330
|
-
scheduled:
|
|
331
|
-
- name: "Weekly Maintenance"
|
|
332
|
-
matchers:
|
|
333
|
-
service: "api"
|
|
334
|
-
schedule:
|
|
335
|
-
day_of_week: "sunday"
|
|
336
|
-
start: "02:00"
|
|
337
|
-
end: "04:00"
|
|
338
|
-
timezone: "America/New_York"
|
|
339
|
-
|
|
340
|
-
# Manual silences
|
|
341
|
-
api:
|
|
342
|
-
enabled: true
|
|
343
|
-
path: "/_admin/silences"
|
|
344
|
-
auth_required: true
|
|
345
|
-
```
|
|
346
|
-
|
|
347
|
-
---
|
|
348
|
-
|
|
349
|
-
## Alert Templates
|
|
350
|
-
|
|
351
|
-
### Notification Templates
|
|
352
|
-
|
|
353
|
-
```yaml
|
|
354
|
-
monitoring:
|
|
355
|
-
alerting:
|
|
356
|
-
templates:
|
|
357
|
-
slack:
|
|
358
|
-
firing: |
|
|
359
|
-
:red_circle: *ALERT: {{ .AlertName }}*
|
|
360
|
-
Severity: {{ .Severity }}
|
|
361
|
-
Summary: {{ .Summary }}
|
|
362
|
-
Service: {{ .Labels.service }}
|
|
363
|
-
{{ if .Description }}
|
|
364
|
-
Details: {{ .Description }}
|
|
365
|
-
{{ end }}
|
|
366
|
-
<{{ .DashboardURL }}|View Dashboard>
|
|
367
|
-
|
|
368
|
-
resolved: |
|
|
369
|
-
:large_green_circle: *RESOLVED: {{ .AlertName }}*
|
|
370
|
-
Duration: {{ .Duration }}
|
|
371
|
-
Service: {{ .Labels.service }}
|
|
372
|
-
|
|
373
|
-
pagerduty:
|
|
374
|
-
title: "[{{ .Severity | upper }}] {{ .AlertName }}"
|
|
375
|
-
body: |
|
|
376
|
-
{{ .Summary }}
|
|
377
|
-
|
|
378
|
-
Service: {{ .Labels.service }}
|
|
379
|
-
Environment: {{ .Labels.environment }}
|
|
380
|
-
|
|
381
|
-
{{ .Description }}
|
|
382
|
-
```
|
|
383
|
-
|
|
384
|
-
---
|
|
385
|
-
|
|
386
|
-
## Escalation
|
|
387
|
-
|
|
388
|
-
### Escalation Policies
|
|
389
|
-
|
|
390
|
-
```yaml
|
|
391
|
-
monitoring:
|
|
392
|
-
alerting:
|
|
393
|
-
escalation:
|
|
394
|
-
policies:
|
|
395
|
-
default:
|
|
396
|
-
- wait: "5m"
|
|
397
|
-
notify: ["primary-oncall"]
|
|
398
|
-
- wait: "15m"
|
|
399
|
-
notify: ["secondary-oncall"]
|
|
400
|
-
- wait: "30m"
|
|
401
|
-
notify: ["engineering-manager"]
|
|
402
|
-
- wait: "1h"
|
|
403
|
-
notify: ["vp-engineering"]
|
|
404
|
-
|
|
405
|
-
payments:
|
|
406
|
-
- wait: "2m"
|
|
407
|
-
notify: ["payments-oncall"]
|
|
408
|
-
- wait: "10m"
|
|
409
|
-
notify: ["payments-lead", "engineering-manager"]
|
|
410
|
-
```
|
|
411
|
-
|
|
412
|
-
---
|
|
413
|
-
|
|
414
|
-
## Commands
|
|
415
|
-
|
|
416
|
-
```bash
|
|
417
|
-
# List active alerts
|
|
418
|
-
proagents alerts list
|
|
419
|
-
|
|
420
|
-
# View alert details
|
|
421
|
-
proagents alerts show HighErrorRate
|
|
422
|
-
|
|
423
|
-
# Silence alert
|
|
424
|
-
proagents alerts silence HighErrorRate --duration 1h --reason "Deploying fix"
|
|
425
|
-
|
|
426
|
-
# Remove silence
|
|
427
|
-
proagents alerts unsilence HighErrorRate
|
|
428
|
-
|
|
429
|
-
# Test alert rule
|
|
430
|
-
proagents alerts test --rule HighErrorRate
|
|
431
|
-
|
|
432
|
-
# View alert history
|
|
433
|
-
proagents alerts history --last 7d
|
|
434
|
-
|
|
435
|
-
# Check alerting configuration
|
|
436
|
-
proagents alerts validate
|
|
437
|
-
```
|
|
438
|
-
|
|
439
|
-
---
|
|
440
|
-
|
|
441
|
-
## Best Practices
|
|
442
|
-
|
|
443
|
-
1. **Alert on Symptoms**: Alert on user-facing symptoms (errors, latency, downtime) rather than on their underlying causes
|
|
444
|
-
2. **Actionable Alerts**: Every alert should have a clear action
|
|
445
|
-
3. **Avoid Alert Fatigue**: Tune thresholds to reduce noise
|
|
446
|
-
4. **Include Context**: Add runbook links and relevant details
|
|
447
|
-
5. **Use Inhibition**: Suppress lower-severity and downstream alerts while a related alert is firing to prevent alert cascades
|
|
448
|
-
6. **Test Alerts**: Regularly verify alerts work correctly
|
|
449
|
-
7. **Document Runbooks**: Link each alert to a runbook
|