agentic-qe 1.9.3 → 1.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +54 -0
- package/README.md +30 -5
- package/config/.env.otel.example +25 -0
- package/config/OTEL-QUICK-REFERENCE.md +137 -0
- package/config/README-OTEL.md +222 -0
- package/config/alerting-rules.yml +518 -0
- package/config/docker-compose.otel.yml +187 -0
- package/config/grafana/dashboards/agentic-qe-overview.json +286 -0
- package/config/grafana/provisioning/dashboards/dashboards.yml +19 -0
- package/config/grafana/provisioning/datasources/datasources.yml +53 -0
- package/config/otel-collector-config.yaml.example +145 -0
- package/config/prometheus.yml.example +106 -0
- package/dist/alerting/AlertManager.d.ts +120 -0
- package/dist/alerting/AlertManager.d.ts.map +1 -0
- package/dist/alerting/AlertManager.js +345 -0
- package/dist/alerting/AlertManager.js.map +1 -0
- package/dist/alerting/FeedbackRouter.d.ts +98 -0
- package/dist/alerting/FeedbackRouter.d.ts.map +1 -0
- package/dist/alerting/FeedbackRouter.js +331 -0
- package/dist/alerting/FeedbackRouter.js.map +1 -0
- package/dist/alerting/StrategyApplicator.d.ts +120 -0
- package/dist/alerting/StrategyApplicator.d.ts.map +1 -0
- package/dist/alerting/StrategyApplicator.js +299 -0
- package/dist/alerting/StrategyApplicator.js.map +1 -0
- package/dist/alerting/index.d.ts +68 -0
- package/dist/alerting/index.d.ts.map +1 -0
- package/dist/alerting/index.js +112 -0
- package/dist/alerting/index.js.map +1 -0
- package/dist/alerting/types.d.ts +118 -0
- package/dist/alerting/types.d.ts.map +1 -0
- package/dist/alerting/types.js +11 -0
- package/dist/alerting/types.js.map +1 -0
- package/dist/cli/init/claude-config.d.ts.map +1 -1
- package/dist/cli/init/claude-config.js +12 -7
- package/dist/cli/init/claude-config.js.map +1 -1
- package/dist/core/memory/IPatternStore.d.ts +209 -0
- package/dist/core/memory/IPatternStore.d.ts.map +1 -0
- package/dist/core/memory/IPatternStore.js +15 -0
- package/dist/core/memory/IPatternStore.js.map +1 -0
- package/dist/core/memory/MigrationTools.d.ts +192 -0
- package/dist/core/memory/MigrationTools.d.ts.map +1 -0
- package/dist/core/memory/MigrationTools.js +615 -0
- package/dist/core/memory/MigrationTools.js.map +1 -0
- package/dist/core/memory/NeuralEnhancement.d.ts +154 -0
- package/dist/core/memory/NeuralEnhancement.d.ts.map +1 -0
- package/dist/core/memory/NeuralEnhancement.js +598 -0
- package/dist/core/memory/NeuralEnhancement.js.map +1 -0
- package/dist/core/memory/PatternStoreFactory.d.ts +143 -0
- package/dist/core/memory/PatternStoreFactory.d.ts.map +1 -0
- package/dist/core/memory/PatternStoreFactory.js +370 -0
- package/dist/core/memory/PatternStoreFactory.js.map +1 -0
- package/dist/core/memory/RealAgentDBAdapter.d.ts +1 -0
- package/dist/core/memory/RealAgentDBAdapter.d.ts.map +1 -1
- package/dist/core/memory/RealAgentDBAdapter.js +28 -20
- package/dist/core/memory/RealAgentDBAdapter.js.map +1 -1
- package/dist/core/memory/RuVectorPatternStore.d.ts +198 -0
- package/dist/core/memory/RuVectorPatternStore.d.ts.map +1 -0
- package/dist/core/memory/RuVectorPatternStore.js +605 -0
- package/dist/core/memory/RuVectorPatternStore.js.map +1 -0
- package/dist/core/memory/SelfHealingMonitor.d.ts +186 -0
- package/dist/core/memory/SelfHealingMonitor.d.ts.map +1 -0
- package/dist/core/memory/SelfHealingMonitor.js +451 -0
- package/dist/core/memory/SelfHealingMonitor.js.map +1 -0
- package/dist/core/memory/SwarmMemoryManager.d.ts +62 -0
- package/dist/core/memory/SwarmMemoryManager.d.ts.map +1 -1
- package/dist/core/memory/SwarmMemoryManager.js +97 -0
- package/dist/core/memory/SwarmMemoryManager.js.map +1 -1
- package/dist/core/memory/index.d.ts +11 -0
- package/dist/core/memory/index.d.ts.map +1 -1
- package/dist/core/memory/index.js +36 -1
- package/dist/core/memory/index.js.map +1 -1
- package/dist/reasoning/RuVectorReasoningAdapter.d.ts +232 -0
- package/dist/reasoning/RuVectorReasoningAdapter.d.ts.map +1 -0
- package/dist/reasoning/RuVectorReasoningAdapter.js +585 -0
- package/dist/reasoning/RuVectorReasoningAdapter.js.map +1 -0
- package/dist/reasoning/index.d.ts +2 -0
- package/dist/reasoning/index.d.ts.map +1 -1
- package/dist/reasoning/index.js +6 -1
- package/dist/reasoning/index.js.map +1 -1
- package/dist/reporting/ResultAggregator.d.ts +107 -0
- package/dist/reporting/ResultAggregator.d.ts.map +1 -0
- package/dist/reporting/ResultAggregator.js +435 -0
- package/dist/reporting/ResultAggregator.js.map +1 -0
- package/dist/reporting/index.d.ts +48 -0
- package/dist/reporting/index.d.ts.map +1 -0
- package/dist/reporting/index.js +154 -0
- package/dist/reporting/index.js.map +1 -0
- package/dist/reporting/reporters/ControlLoopReporter.d.ts +128 -0
- package/dist/reporting/reporters/ControlLoopReporter.d.ts.map +1 -0
- package/dist/reporting/reporters/ControlLoopReporter.js +417 -0
- package/dist/reporting/reporters/ControlLoopReporter.js.map +1 -0
- package/dist/reporting/reporters/HumanReadableReporter.d.ts +140 -0
- package/dist/reporting/reporters/HumanReadableReporter.d.ts.map +1 -0
- package/dist/reporting/reporters/HumanReadableReporter.js +524 -0
- package/dist/reporting/reporters/HumanReadableReporter.js.map +1 -0
- package/dist/reporting/reporters/JSONReporter.d.ts +193 -0
- package/dist/reporting/reporters/JSONReporter.d.ts.map +1 -0
- package/dist/reporting/reporters/JSONReporter.js +324 -0
- package/dist/reporting/reporters/JSONReporter.js.map +1 -0
- package/dist/reporting/reporters/index.d.ts +14 -0
- package/dist/reporting/reporters/index.d.ts.map +1 -0
- package/dist/reporting/reporters/index.js +19 -0
- package/dist/reporting/reporters/index.js.map +1 -0
- package/dist/reporting/types.d.ts +427 -0
- package/dist/reporting/types.d.ts.map +1 -0
- package/dist/reporting/types.js +12 -0
- package/dist/reporting/types.js.map +1 -0
- package/package.json +9 -1
|
@@ -0,0 +1,518 @@
|
|
|
1
|
+
# Prometheus Alerting Rules for Agentic QE Fleet
|
|
2
|
+
# Phase 4: Autonomous Alerting & Feedback Loop System
|
|
3
|
+
# Issue: #69
|
|
4
|
+
# Version: 1.0.0
|
|
5
|
+
# Date: 2025-11-29
|
|
6
|
+
|
|
7
|
+
groups:
|
|
8
|
+
# =========================================================================
|
|
9
|
+
# QUALITY METRIC ALERTS
|
|
10
|
+
# =========================================================================
|
|
11
|
+
- name: quality_metrics
|
|
12
|
+
interval: 15s
|
|
13
|
+
rules:
|
|
14
|
+
# Test Failure Rate Alert
|
|
15
|
+
- alert: HighTestFailureRate
|
|
16
|
+
expr: |
|
|
17
|
+
(
|
|
18
|
+
sum(rate(aqe_quality_test_count{status="failed"}[5m]))
|
|
19
|
+
/
|
|
20
|
+
sum(rate(aqe_quality_test_count[5m]))
|
|
21
|
+
) > 0.05
|
|
22
|
+
for: 5m
|
|
23
|
+
labels:
|
|
24
|
+
severity: error
|
|
25
|
+
component: quality
|
|
26
|
+
alert_type: test_failure
|
|
27
|
+
feedback_action: adjust_strategy
|
|
28
|
+
annotations:
|
|
29
|
+
summary: "Test failure rate exceeds 5%"
|
|
30
|
+
description: "Test failure rate is {{ $value | humanizePercentage }} (threshold: 5%). This indicates quality degradation."
|
|
31
|
+
feedback_strategy: "increase_test_isolation"
|
|
32
|
+
feedback_focus: "failing_tests"
|
|
33
|
+
runbook_url: "https://docs.agentic-qe.io/runbooks/high-test-failure-rate"
|
|
34
|
+
|
|
35
|
+
# Coverage Drop - Critical
|
|
36
|
+
- alert: CriticalCoverageDrop
|
|
37
|
+
expr: aqe_quality_coverage_line < 80
|
|
38
|
+
for: 1m
|
|
39
|
+
labels:
|
|
40
|
+
severity: critical
|
|
41
|
+
component: quality
|
|
42
|
+
alert_type: coverage_drop
|
|
43
|
+
feedback_action: auto_remediate
|
|
44
|
+
agent_scope: qe-coverage-analyzer
|
|
45
|
+
annotations:
|
|
46
|
+
summary: "Code coverage dropped below 80%"
|
|
47
|
+
description: "Line coverage is {{ $value }}% (threshold: 80%). Immediate action required."
|
|
48
|
+
feedback_action: "generate_additional_tests"
|
|
49
|
+
feedback_target_coverage: "85.0"
|
|
50
|
+
runbook_url: "https://docs.agentic-qe.io/runbooks/coverage-drop"
|
|
51
|
+
|
|
52
|
+
# Coverage Drop - Warning
|
|
53
|
+
- alert: WarningCoverageDrop
|
|
54
|
+
expr: aqe_quality_coverage_line < 85 and aqe_quality_coverage_line >= 80
|
|
55
|
+
for: 5m
|
|
56
|
+
labels:
|
|
57
|
+
severity: warning
|
|
58
|
+
component: quality
|
|
59
|
+
alert_type: coverage_drop
|
|
60
|
+
feedback_action: adjust_strategy
|
|
61
|
+
annotations:
|
|
62
|
+
summary: "Code coverage approaching threshold"
|
|
63
|
+
description: "Line coverage is {{ $value }}% (warning at 85%, critical at 80%)"
|
|
64
|
+
feedback_strategy: "proactive_test_generation"
|
|
65
|
+
|
|
66
|
+
# Branch Coverage Drop
|
|
67
|
+
- alert: BranchCoverageLow
|
|
68
|
+
expr: aqe_quality_coverage_branch < 75
|
|
69
|
+
for: 5m
|
|
70
|
+
labels:
|
|
71
|
+
severity: warning
|
|
72
|
+
component: quality
|
|
73
|
+
alert_type: coverage_drop
|
|
74
|
+
feedback_action: adjust_strategy
|
|
75
|
+
annotations:
|
|
76
|
+
summary: "Branch coverage below threshold"
|
|
77
|
+
description: "Branch coverage is {{ $value }}% (threshold: 75%)"
|
|
78
|
+
feedback_strategy: "focus_branch_coverage"
|
|
79
|
+
|
|
80
|
+
# Flaky Tests Increasing
|
|
81
|
+
- alert: FlakyTestsIncreasing
|
|
82
|
+
expr: aqe_quality_flaky_count > 5
|
|
83
|
+
for: 1h
|
|
84
|
+
labels:
|
|
85
|
+
severity: warning
|
|
86
|
+
component: quality
|
|
87
|
+
alert_type: flaky_tests
|
|
88
|
+
feedback_action: adjust_strategy
|
|
89
|
+
agent_scope: qe-flaky-detector
|
|
90
|
+
annotations:
|
|
91
|
+
summary: "Number of flaky tests is growing"
|
|
92
|
+
description: "{{ $value }} flaky tests detected (threshold: 5). Test stability degrading."
|
|
93
|
+
feedback_strategy: "stabilize_flaky_tests"
|
|
94
|
+
feedback_analysis_depth: "deep"
|
|
95
|
+
runbook_url: "https://docs.agentic-qe.io/runbooks/flaky-tests"
|
|
96
|
+
|
|
97
|
+
# Critical Flaky Test Count
|
|
98
|
+
- alert: CriticalFlakyTestCount
|
|
99
|
+
expr: aqe_quality_flaky_count > 10
|
|
100
|
+
for: 30m
|
|
101
|
+
labels:
|
|
102
|
+
severity: error
|
|
103
|
+
component: quality
|
|
104
|
+
alert_type: flaky_tests
|
|
105
|
+
feedback_action: escalate
|
|
106
|
+
annotations:
|
|
107
|
+
summary: "Critical number of flaky tests detected"
|
|
108
|
+
description: "{{ $value }} flaky tests (critical threshold: 10). Test suite reliability compromised."
|
|
109
|
+
feedback_action: "quarantine_flaky_tests"
|
|
110
|
+
|
|
111
|
+
# Security Vulnerabilities - Critical
|
|
112
|
+
- alert: CriticalSecurityVulnerabilities
|
|
113
|
+
expr: aqe_quality_security_vulnerability_count{severity="critical"} > 0
|
|
114
|
+
for: 0s
|
|
115
|
+
labels:
|
|
116
|
+
severity: critical
|
|
117
|
+
component: security
|
|
118
|
+
alert_type: vulnerability
|
|
119
|
+
feedback_action: escalate
|
|
120
|
+
agent_scope: qe-security-scanner
|
|
121
|
+
annotations:
|
|
122
|
+
summary: "Critical security vulnerabilities detected"
|
|
123
|
+
description: "{{ $value }} critical vulnerabilities found. Deployment must be blocked."
|
|
124
|
+
feedback_notify: "security_team"
|
|
125
|
+
feedback_block_deployment: "true"
|
|
126
|
+
runbook_url: "https://docs.agentic-qe.io/runbooks/security-vulnerabilities"
|
|
127
|
+
|
|
128
|
+
# Security Vulnerabilities - High
|
|
129
|
+
- alert: HighSecurityVulnerabilities
|
|
130
|
+
expr: aqe_quality_security_vulnerability_count{severity="high"} > 0
|
|
131
|
+
for: 0s
|
|
132
|
+
labels:
|
|
133
|
+
severity: error
|
|
134
|
+
component: security
|
|
135
|
+
alert_type: vulnerability
|
|
136
|
+
feedback_action: escalate
|
|
137
|
+
agent_scope: qe-security-scanner
|
|
138
|
+
annotations:
|
|
139
|
+
summary: "High severity security vulnerabilities detected"
|
|
140
|
+
description: "{{ $value }} high severity vulnerabilities found. Immediate remediation required."
|
|
141
|
+
feedback_notify: "security_team"
|
|
142
|
+
feedback_block_deployment: "true"
|
|
143
|
+
|
|
144
|
+
# Security Vulnerabilities - Medium (with threshold)
|
|
145
|
+
- alert: MediumSecurityVulnerabilities
|
|
146
|
+
expr: aqe_quality_security_vulnerability_count{severity="medium"} > 5
|
|
147
|
+
for: 5m
|
|
148
|
+
labels:
|
|
149
|
+
severity: warning
|
|
150
|
+
component: security
|
|
151
|
+
alert_type: vulnerability
|
|
152
|
+
feedback_action: auto_remediate
|
|
153
|
+
annotations:
|
|
154
|
+
summary: "Multiple medium severity vulnerabilities"
|
|
155
|
+
description: "{{ $value }} medium severity vulnerabilities (threshold: 5)"
|
|
156
|
+
feedback_action: "schedule_security_remediation"
|
|
157
|
+
|
|
158
|
+
# Quality Gate Failure
|
|
159
|
+
- alert: QualityGateFailed
|
|
160
|
+
expr: aqe_quality_gate_pass_rate < 1.0
|
|
161
|
+
for: 1m
|
|
162
|
+
labels:
|
|
163
|
+
severity: error
|
|
164
|
+
component: quality
|
|
165
|
+
alert_type: quality_gate
|
|
166
|
+
feedback_action: adjust_strategy
|
|
167
|
+
agent_scope: qe-quality-gate
|
|
168
|
+
annotations:
|
|
169
|
+
summary: "Quality gate evaluation failed"
|
|
170
|
+
description: "Quality gate pass rate: {{ $value | humanizePercentage }} (expected: 100%)"
|
|
171
|
+
feedback_strategy: "incremental_improvement"
|
|
172
|
+
feedback_focus_areas: "coverage,complexity,security"
|
|
173
|
+
runbook_url: "https://docs.agentic-qe.io/runbooks/quality-gate-failure"
|
|
174
|
+
|
|
175
|
+
# =========================================================================
|
|
176
|
+
# PERFORMANCE METRIC ALERTS
|
|
177
|
+
# =========================================================================
|
|
178
|
+
- name: performance_metrics
|
|
179
|
+
interval: 15s
|
|
180
|
+
rules:
|
|
181
|
+
# Test Execution Slow
|
|
182
|
+
- alert: TestExecutionSlow
|
|
183
|
+
expr: |
|
|
184
|
+
histogram_quantile(0.95,
|
|
185
|
+
rate(aqe_quality_test_duration_bucket[5m])
|
|
186
|
+
) > 30000
|
|
187
|
+
for: 5m
|
|
188
|
+
labels:
|
|
189
|
+
severity: warning
|
|
190
|
+
component: performance
|
|
191
|
+
alert_type: execution_slow
|
|
192
|
+
feedback_action: adjust_strategy
|
|
193
|
+
agent_scope: qe-test-executor
|
|
194
|
+
annotations:
|
|
195
|
+
summary: "Test execution time degraded"
|
|
196
|
+
description: 'P95 test execution time is {{ printf "%.0f" $value }}ms (threshold: 30s)'  # value is in ms (threshold 30000); humanizeDuration expects seconds
|
|
197
|
+
feedback_strategy: "optimize_test_suite"
|
|
198
|
+
feedback_action: "parallel_execution"
|
|
199
|
+
runbook_url: "https://docs.agentic-qe.io/runbooks/slow-tests"
|
|
200
|
+
|
|
201
|
+
# Critical Test Execution Time
|
|
202
|
+
- alert: CriticalTestExecutionTime
|
|
203
|
+
expr: |
|
|
204
|
+
histogram_quantile(0.95,
|
|
205
|
+
rate(aqe_quality_test_duration_bucket[5m])
|
|
206
|
+
) > 60000
|
|
207
|
+
for: 3m
|
|
208
|
+
labels:
|
|
209
|
+
severity: error
|
|
210
|
+
component: performance
|
|
211
|
+
alert_type: execution_slow
|
|
212
|
+
feedback_action: auto_remediate
|
|
213
|
+
annotations:
|
|
214
|
+
summary: "Test execution critically slow"
|
|
215
|
+
description: 'P95 test execution time is {{ printf "%.0f" $value }}ms (critical threshold: 60s)'  # value is in ms (threshold 60000); humanizeDuration expects seconds
|
|
216
|
+
feedback_action: "emergency_test_optimization"
|
|
217
|
+
|
|
218
|
+
# Agent Task Timeout
|
|
219
|
+
- alert: AgentTaskTimeout
|
|
220
|
+
expr: |
|
|
221
|
+
histogram_quantile(0.95,
|
|
222
|
+
rate(aqe_agent_task_duration_bucket[10m])
|
|
223
|
+
) > 120000
|
|
224
|
+
for: 10m
|
|
225
|
+
labels:
|
|
226
|
+
severity: error
|
|
227
|
+
component: performance
|
|
228
|
+
alert_type: task_timeout
|
|
229
|
+
feedback_action: retrain_model
|
|
230
|
+
annotations:
|
|
231
|
+
summary: "Agent tasks timing out frequently"
|
|
232
|
+
description: 'P95 task duration is {{ printf "%.0f" $value }}ms (threshold: 2m)'  # value is in ms (threshold 120000); humanizeDuration expects seconds
|
|
233
|
+
feedback_focus: "task_complexity_estimation"
|
|
234
|
+
feedback_learning_rate: "0.2"
|
|
235
|
+
runbook_url: "https://docs.agentic-qe.io/runbooks/agent-timeout"
|
|
236
|
+
|
|
237
|
+
# Memory Usage High
|
|
238
|
+
- alert: HighMemoryUsage
|
|
239
|
+
expr: aqe_system_memory_usage > 500000000
|
|
240
|
+
for: 1m
|
|
241
|
+
labels:
|
|
242
|
+
severity: warning
|
|
243
|
+
component: system
|
|
244
|
+
alert_type: resource_usage
|
|
245
|
+
feedback_action: auto_remediate
|
|
246
|
+
annotations:
|
|
247
|
+
summary: "Agent memory consumption exceeds threshold"
|
|
248
|
+
description: "Memory usage is {{ $value | humanize1024 }}B (threshold: 500MB)"
|
|
249
|
+
feedback_action: "garbage_collect"
|
|
250
|
+
feedback_optimize_batch_size: "true"
|
|
251
|
+
runbook_url: "https://docs.agentic-qe.io/runbooks/high-memory"
|
|
252
|
+
|
|
253
|
+
# Critical Memory Usage
|
|
254
|
+
- alert: CriticalMemoryUsage
|
|
255
|
+
expr: aqe_system_memory_usage > 800000000
|
|
256
|
+
for: 30s
|
|
257
|
+
labels:
|
|
258
|
+
severity: critical
|
|
259
|
+
component: system
|
|
260
|
+
alert_type: resource_usage
|
|
261
|
+
feedback_action: escalate
|
|
262
|
+
annotations:
|
|
263
|
+
summary: "Critical memory usage detected"
|
|
264
|
+
description: "Memory usage is {{ $value | humanize1024 }}B (critical threshold: 800MB). OOM risk."
|
|
265
|
+
feedback_action: "emergency_memory_cleanup"
|
|
266
|
+
|
|
267
|
+
# CPU Usage High
|
|
268
|
+
- alert: HighCPUUsage
|
|
269
|
+
expr: aqe_system_cpu_usage > 80
|
|
270
|
+
for: 5m
|
|
271
|
+
labels:
|
|
272
|
+
severity: warning
|
|
273
|
+
component: system
|
|
274
|
+
alert_type: resource_usage
|
|
275
|
+
feedback_action: adjust_strategy
|
|
276
|
+
annotations:
|
|
277
|
+
summary: "High CPU utilization detected"
|
|
278
|
+
description: "CPU usage is {{ $value }}% (threshold: 80%)"
|
|
279
|
+
feedback_strategy: "reduce_concurrent_tasks"
|
|
280
|
+
|
|
281
|
+
# =========================================================================
|
|
282
|
+
# LEARNING & ADAPTATION ALERTS
|
|
283
|
+
# =========================================================================
|
|
284
|
+
- name: learning_metrics
|
|
285
|
+
interval: 30s
|
|
286
|
+
rules:
|
|
287
|
+
# Low Agent Success Rate
|
|
288
|
+
- alert: LowAgentSuccessRate
|
|
289
|
+
expr: aqe_agent_success_rate < 0.90
|
|
290
|
+
for: 1h
|
|
291
|
+
labels:
|
|
292
|
+
severity: warning
|
|
293
|
+
component: learning
|
|
294
|
+
alert_type: success_rate
|
|
295
|
+
feedback_action: retrain_model
|
|
296
|
+
annotations:
|
|
297
|
+
summary: "Agent success rate below target"
|
|
298
|
+
description: "Agent {{ $labels.agent_type }} success rate is {{ $value | humanizePercentage }} (threshold: 90%)"
|
|
299
|
+
feedback_exploration_rate: "0.3"
|
|
300
|
+
feedback_focus: "failed_task_patterns"
|
|
301
|
+
runbook_url: "https://docs.agentic-qe.io/runbooks/low-success-rate"
|
|
302
|
+
|
|
303
|
+
# Critical Agent Success Rate
|
|
304
|
+
- alert: CriticalAgentSuccessRate
|
|
305
|
+
expr: aqe_agent_success_rate < 0.70
|
|
306
|
+
for: 30m
|
|
307
|
+
labels:
|
|
308
|
+
severity: error
|
|
309
|
+
component: learning
|
|
310
|
+
alert_type: success_rate
|
|
311
|
+
feedback_action: escalate
|
|
312
|
+
annotations:
|
|
313
|
+
summary: "Critical agent success rate"
|
|
314
|
+
description: "Agent {{ $labels.agent_type }} success rate is {{ $value | humanizePercentage }} (critical: 70%)"
|
|
315
|
+
feedback_action: "emergency_retraining"
|
|
316
|
+
|
|
317
|
+
# Defect Density High
|
|
318
|
+
- alert: HighDefectDensity
|
|
319
|
+
expr: aqe_quality_defect_density > 2.0
|
|
320
|
+
for: 24h
|
|
321
|
+
labels:
|
|
322
|
+
severity: error
|
|
323
|
+
component: quality
|
|
324
|
+
alert_type: defect_density
|
|
325
|
+
feedback_action: adjust_strategy
|
|
326
|
+
agent_scope: qe-quality-analyzer
|
|
327
|
+
annotations:
|
|
328
|
+
summary: "Defect density exceeds threshold"
|
|
329
|
+
description: "Defect density is {{ $value }} per KLOC (threshold: 2.0)"
|
|
330
|
+
feedback_strategy: "increase_review_depth"
|
|
331
|
+
feedback_static_analysis: "true"
|
|
332
|
+
runbook_url: "https://docs.agentic-qe.io/runbooks/high-defect-density"
|
|
333
|
+
|
|
334
|
+
# Agent Task Failure Spike
|
|
335
|
+
- alert: AgentTaskFailureSpike
|
|
336
|
+
expr: |
|
|
337
|
+
(
|
|
338
|
+
sum(rate(aqe_agent_task_count{status="failed"}[5m])) by (agent_type)
|
|
339
|
+
/
|
|
340
|
+
sum(rate(aqe_agent_task_count[5m])) by (agent_type)
|
|
341
|
+
) > 0.20
|
|
342
|
+
for: 10m
|
|
343
|
+
labels:
|
|
344
|
+
severity: warning
|
|
345
|
+
component: learning
|
|
346
|
+
alert_type: failure_spike
|
|
347
|
+
feedback_action: retrain_model
|
|
348
|
+
annotations:
|
|
349
|
+
summary: "Agent experiencing task failure spike"
|
|
350
|
+
description: "Agent {{ $labels.agent_type }} failure rate is {{ $value | humanizePercentage }} (threshold: 20%)"
|
|
351
|
+
feedback_action: "analyze_failure_patterns"
|
|
352
|
+
|
|
353
|
+
# =========================================================================
|
|
354
|
+
# FLEET COORDINATION ALERTS
|
|
355
|
+
# =========================================================================
|
|
356
|
+
- name: fleet_coordination
|
|
357
|
+
interval: 30s
|
|
358
|
+
rules:
|
|
359
|
+
# Agent Queue Depth High
|
|
360
|
+
- alert: HighAgentQueueDepth
|
|
361
|
+
expr: aqe_system_queue_depth > 50
|
|
362
|
+
for: 5m
|
|
363
|
+
labels:
|
|
364
|
+
severity: warning
|
|
365
|
+
component: coordination
|
|
366
|
+
alert_type: queue_depth
|
|
367
|
+
feedback_action: adjust_strategy
|
|
368
|
+
annotations:
|
|
369
|
+
summary: "Agent task queue is backing up"
|
|
370
|
+
description: "Queue depth is {{ $value }} tasks (threshold: 50)"
|
|
371
|
+
feedback_strategy: "scale_agents"
|
|
372
|
+
runbook_url: "https://docs.agentic-qe.io/runbooks/queue-backup"
|
|
373
|
+
|
|
374
|
+
# Agent Queue Depth Critical
|
|
375
|
+
- alert: CriticalAgentQueueDepth
|
|
376
|
+
expr: aqe_system_queue_depth > 100
|
|
377
|
+
for: 2m
|
|
378
|
+
labels:
|
|
379
|
+
severity: error
|
|
380
|
+
component: coordination
|
|
381
|
+
alert_type: queue_depth
|
|
382
|
+
feedback_action: auto_remediate
|
|
383
|
+
annotations:
|
|
384
|
+
summary: "Critical agent queue backlog"
|
|
385
|
+
description: "Queue depth is {{ $value }} tasks (critical: 100). System overloaded."
|
|
386
|
+
feedback_action: "emergency_queue_drain"
|
|
387
|
+
|
|
388
|
+
# Database Query Slow
|
|
389
|
+
- alert: SlowDatabaseQueries
|
|
390
|
+
expr: |
|
|
391
|
+
histogram_quantile(0.95,
|
|
392
|
+
rate(aqe_system_db_query_duration_bucket[5m])
|
|
393
|
+
) > 1000
|
|
394
|
+
for: 5m
|
|
395
|
+
labels:
|
|
396
|
+
severity: warning
|
|
397
|
+
component: system
|
|
398
|
+
alert_type: database_slow
|
|
399
|
+
feedback_action: adjust_strategy
|
|
400
|
+
annotations:
|
|
401
|
+
summary: "Database queries are slow"
|
|
402
|
+
description: 'P95 query duration is {{ printf "%.0f" $value }}ms (threshold: 1s)'  # value is in ms (threshold 1000); humanizeDuration expects seconds
|
|
403
|
+
feedback_strategy: "optimize_database_access"
|
|
404
|
+
runbook_url: "https://docs.agentic-qe.io/runbooks/slow-database"
|
|
405
|
+
|
|
406
|
+
# Event Bus Latency High
|
|
407
|
+
- alert: HighEventBusLatency
|
|
408
|
+
expr: |
|
|
409
|
+
rate(aqe_system_eventbus_latency_sum[5m])
|
|
410
|
+
/
|
|
411
|
+
rate(aqe_system_eventbus_latency_count[5m])
|
|
412
|
+
> 500
|
|
413
|
+
for: 5m
|
|
414
|
+
labels:
|
|
415
|
+
severity: warning
|
|
416
|
+
component: coordination
|
|
417
|
+
alert_type: event_latency
|
|
418
|
+
feedback_action: adjust_strategy
|
|
419
|
+
annotations:
|
|
420
|
+
summary: "Event bus experiencing high latency"
|
|
421
|
+
description: 'Average event latency is {{ printf "%.0f" $value }}ms (threshold: 500ms)'  # value is in ms (threshold 500); humanizeDuration expects seconds
|
|
422
|
+
feedback_strategy: "optimize_event_handling"
|
|
423
|
+
|
|
424
|
+
# =========================================================================
|
|
425
|
+
# TOKEN COST & EFFICIENCY ALERTS
|
|
426
|
+
# =========================================================================
|
|
427
|
+
- name: cost_efficiency
|
|
428
|
+
interval: 1m
|
|
429
|
+
rules:
|
|
430
|
+
# High Token Cost Rate
|
|
431
|
+
- alert: HighTokenCostRate
|
|
432
|
+
expr: |
|
|
433
|
+
increase(aqe_agent_cost_sum[1h])  # cost accrued over the last hour; rate() would yield cost per second
|
|
434
|
+
> 10.0
|
|
435
|
+
for: 15m
|
|
436
|
+
labels:
|
|
437
|
+
severity: warning
|
|
438
|
+
component: cost
|
|
439
|
+
alert_type: token_cost
|
|
440
|
+
feedback_action: adjust_strategy
|
|
441
|
+
annotations:
|
|
442
|
+
summary: "Token costs increasing rapidly"
|
|
443
|
+
description: "Cost rate is ${{ $value }}/hour (threshold: $10/hour)"
|
|
444
|
+
feedback_strategy: "optimize_token_usage"
|
|
445
|
+
runbook_url: "https://docs.agentic-qe.io/runbooks/high-token-cost"
|
|
446
|
+
|
|
447
|
+
# Inefficient Agent Token Usage
|
|
448
|
+
- alert: InefficientAgentTokenUsage
|
|
449
|
+
expr: |
|
|
450
|
+
(
|
|
451
|
+
sum by (agent_type) (rate(aqe_agent_token_usage_sum[1h]))
|
|
452
|
+
/
|
|
453
|
+
sum by (agent_type) (rate(aqe_agent_task_count{status="success"}[1h]))
|
|
454
|
+
) > 10000
|
|
455
|
+
for: 30m
|
|
456
|
+
labels:
|
|
457
|
+
severity: warning
|
|
458
|
+
component: efficiency
|
|
459
|
+
alert_type: token_efficiency
|
|
460
|
+
feedback_action: retrain_model
|
|
461
|
+
annotations:
|
|
462
|
+
summary: "Agent using excessive tokens per successful task"
|
|
463
|
+
description: "Agent {{ $labels.agent_type }} uses {{ $value }} tokens per success (threshold: 10k)"
|
|
464
|
+
feedback_strategy: "optimize_prompt_efficiency"
|
|
465
|
+
|
|
466
|
+
# =========================================================================
|
|
467
|
+
# ALERTING SYSTEM HEALTH
|
|
468
|
+
# =========================================================================
|
|
469
|
+
- name: alerting_system_health
|
|
470
|
+
interval: 30s
|
|
471
|
+
rules:
|
|
472
|
+
# High Alert Fire Rate (Alert Fatigue)
|
|
473
|
+
- alert: AlertFatigueDetected
|
|
474
|
+
expr: |
|
|
475
|
+
sum(increase(aqe_alerting_alerts_fired[1h])) > 20  # alerts fired over the last hour; rate() would be per second
|
|
476
|
+
for: 1h
|
|
477
|
+
labels:
|
|
478
|
+
severity: warning
|
|
479
|
+
component: alerting
|
|
480
|
+
alert_type: alert_fatigue
|
|
481
|
+
annotations:
|
|
482
|
+
summary: "Excessive alerts being fired"
|
|
483
|
+
description: "{{ $value }} alerts fired in the last hour. Potential alert fatigue."
|
|
484
|
+
action: "Review and tune alert thresholds"
|
|
485
|
+
runbook_url: "https://docs.agentic-qe.io/runbooks/alert-fatigue"
|
|
486
|
+
|
|
487
|
+
# High Alert Suppression Rate
|
|
488
|
+
- alert: HighAlertSuppressionRate
|
|
489
|
+
expr: |
|
|
490
|
+
(
|
|
491
|
+
rate(aqe_alerting_alerts_suppressed[1h])
|
|
492
|
+
/
|
|
493
|
+
rate(aqe_alerting_alerts_fired[1h])
|
|
494
|
+
) > 0.5
|
|
495
|
+
for: 1h
|
|
496
|
+
labels:
|
|
497
|
+
severity: info
|
|
498
|
+
component: alerting
|
|
499
|
+
alert_type: suppression_rate
|
|
500
|
+
annotations:
|
|
501
|
+
summary: "High alert suppression rate"
|
|
502
|
+
description: "{{ $value | humanizePercentage }} of alerts are being suppressed. Review cooldown settings."
|
|
503
|
+
|
|
504
|
+
# Feedback Processing Slow
|
|
505
|
+
- alert: SlowFeedbackProcessing
|
|
506
|
+
expr: |
|
|
507
|
+
histogram_quantile(0.95,
|
|
508
|
+
rate(aqe_alerting_feedback_duration_bucket[5m])
|
|
509
|
+
) > 5000
|
|
510
|
+
for: 5m
|
|
511
|
+
labels:
|
|
512
|
+
severity: warning
|
|
513
|
+
component: alerting
|
|
514
|
+
alert_type: feedback_slow
|
|
515
|
+
annotations:
|
|
516
|
+
summary: "Feedback loop processing is slow"
|
|
517
|
+
description: 'P95 feedback processing time is {{ printf "%.0f" $value }}ms (threshold: 5s)'  # value is in ms (threshold 5000); humanizeDuration expects seconds
|
|
518
|
+
action: "Investigate feedback router performance"
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
version: '3.8'
|
|
2
|
+
|
|
3
|
+
# OTEL Observability Stack for Agentic QE Fleet
|
|
4
|
+
# Issue #71: Complete OTEL Stack Docker Compose Configuration
|
|
5
|
+
#
|
|
6
|
+
# This compose file sets up a complete observability stack:
|
|
7
|
+
# - OTEL Collector: Receives telemetry via OTLP (gRPC:4317, HTTP:4318)
|
|
8
|
+
# - Prometheus: Scrapes metrics from OTEL Collector (port 9090)
|
|
9
|
+
# - Jaeger: Distributed tracing backend (UI on port 16686)
|
|
10
|
+
# - Grafana: Visualization and dashboards (port 3001)
|
|
11
|
+
#
|
|
12
|
+
# Usage:
|
|
13
|
+
# docker-compose -f config/docker-compose.otel.yml up -d
|
|
14
|
+
# docker-compose -f docker-compose.yml -f config/docker-compose.otel.yml up -d
|
|
15
|
+
|
|
16
|
+
services:
|
|
17
|
+
# OpenTelemetry Collector
|
|
18
|
+
otel-collector:
|
|
19
|
+
image: otel/opentelemetry-collector-contrib:latest
|
|
20
|
+
container_name: agentic-qe-otel-collector
|
|
21
|
+
command: ["--config=/etc/otel-collector-config.yaml"]
|
|
22
|
+
volumes:
|
|
23
|
+
- ./otel-collector-config.yaml.example:/etc/otel-collector-config.yaml:ro
|
|
24
|
+
- otel-data:/var/log/otel
|
|
25
|
+
ports:
|
|
26
|
+
# OTLP gRPC receiver
|
|
27
|
+
- "4317:4317"
|
|
28
|
+
# OTLP HTTP receiver
|
|
29
|
+
- "4318:4318"
|
|
30
|
+
# Prometheus exporter
|
|
31
|
+
- "8889:8889"
|
|
32
|
+
# Collector metrics (self-monitoring)
|
|
33
|
+
- "8888:8888"
|
|
34
|
+
# Health check endpoint
|
|
35
|
+
- "13133:13133"
|
|
36
|
+
# pprof profiling (development only)
|
|
37
|
+
- "1777:1777"
|
|
38
|
+
# zpages debug interface (development only)
|
|
39
|
+
- "55679:55679"
|
|
40
|
+
environment:
|
|
41
|
+
- DEPLOYMENT_ENVIRONMENT=${DEPLOYMENT_ENVIRONMENT:-development}
|
|
42
|
+
networks:
|
|
43
|
+
- agentic-qe-otel
|
|
44
|
+
restart: unless-stopped
|
|
45
|
+
healthcheck:
|
|
46
|
+
test: ["CMD", "wget", "--spider", "-q", "http://localhost:13133/health"]
|
|
47
|
+
interval: 30s
|
|
48
|
+
timeout: 10s
|
|
49
|
+
retries: 3
|
|
50
|
+
start_period: 40s
|
|
51
|
+
|
|
52
|
+
# Prometheus - Metrics storage and querying
|
|
53
|
+
prometheus:
|
|
54
|
+
image: prom/prometheus:latest
|
|
55
|
+
container_name: agentic-qe-prometheus
|
|
56
|
+
command:
|
|
57
|
+
- '--config.file=/etc/prometheus/prometheus.yml'
|
|
58
|
+
- '--storage.tsdb.path=/prometheus'
|
|
59
|
+
- '--storage.tsdb.retention.time=15d'
|
|
60
|
+
- '--storage.tsdb.retention.size=10GB'
|
|
61
|
+
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
|
|
62
|
+
- '--web.console.templates=/usr/share/prometheus/consoles'
|
|
63
|
+
- '--web.enable-lifecycle'
|
|
64
|
+
volumes:
|
|
65
|
+
- ./prometheus.yml.example:/etc/prometheus/prometheus.yml:ro
|
|
66
|
+
- prometheus-data:/prometheus
|
|
67
|
+
# Optionally mount alerting rules
|
|
68
|
+
# - ./prometheus-rules:/etc/prometheus/rules:ro
|
|
69
|
+
ports:
|
|
70
|
+
- "9090:9090"
|
|
71
|
+
networks:
|
|
72
|
+
- agentic-qe-otel
|
|
73
|
+
restart: unless-stopped
|
|
74
|
+
depends_on:
|
|
75
|
+
otel-collector:
|
|
76
|
+
condition: service_started  # NOTE(review): the collector-contrib image appears to ship no wget/shell, so its wget healthcheck cannot pass; gating on service_healthy would block Prometheus (and Grafana) forever
|
|
77
|
+
healthcheck:
|
|
78
|
+
test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"]
|
|
79
|
+
interval: 30s
|
|
80
|
+
timeout: 10s
|
|
81
|
+
retries: 3
|
|
82
|
+
start_period: 30s
|
|
83
|
+
|
|
84
|
+
# Jaeger - Distributed tracing backend
|
|
85
|
+
jaeger:
|
|
86
|
+
image: jaegertracing/all-in-one:latest
|
|
87
|
+
container_name: agentic-qe-jaeger
|
|
88
|
+
environment:
|
|
89
|
+
# Collector settings
|
|
90
|
+
- COLLECTOR_OTLP_ENABLED=true
|
|
91
|
+
- COLLECTOR_ZIPKIN_HOST_PORT=:9411
|
|
92
|
+
# Storage settings (persistent Badger storage on the jaeger-data volume)
|
|
93
|
+
- SPAN_STORAGE_TYPE=badger
|
|
94
|
+
- BADGER_EPHEMERAL=false
|
|
95
|
+
- BADGER_DIRECTORY_VALUE=/badger/data
|
|
96
|
+
- BADGER_DIRECTORY_KEY=/badger/key
|
|
97
|
+
# Query settings
|
|
98
|
+
- QUERY_BASE_PATH=/
|
|
99
|
+
# Metrics backend
|
|
100
|
+
- METRICS_BACKEND=prometheus
|
|
101
|
+
- METRICS_HTTP_ROUTE=/metrics
|
|
102
|
+
volumes:
|
|
103
|
+
- jaeger-data:/badger
|
|
104
|
+
ports:
|
|
105
|
+
# Jaeger UI
|
|
106
|
+
- "16686:16686"
|
|
107
|
+
# OTLP gRPC receiver
|
|
108
|
+
- "4327:4317"
|
|
109
|
+
# OTLP HTTP receiver
|
|
110
|
+
- "4328:4318"
|
|
111
|
+
# Zipkin compatible endpoint
|
|
112
|
+
- "9411:9411"
|
|
113
|
+
# Admin port (health check, metrics)
|
|
114
|
+
- "14269:14269"
|
|
115
|
+
# Jaeger Thrift compact
|
|
116
|
+
- "6831:6831/udp"
|
|
117
|
+
# Jaeger Thrift binary
|
|
118
|
+
- "6832:6832/udp"
|
|
119
|
+
# Jaeger gRPC
|
|
120
|
+
- "14250:14250"
|
|
121
|
+
networks:
|
|
122
|
+
- agentic-qe-otel
|
|
123
|
+
restart: unless-stopped
|
|
124
|
+
healthcheck:
|
|
125
|
+
test: ["CMD", "wget", "--spider", "-q", "http://localhost:14269/"]
|
|
126
|
+
interval: 30s
|
|
127
|
+
timeout: 10s
|
|
128
|
+
retries: 3
|
|
129
|
+
start_period: 30s
|
|
130
|
+
|
|
131
|
+
# Grafana - Visualization and dashboards
|
|
132
|
+
grafana:
|
|
133
|
+
image: grafana/grafana:latest
|
|
134
|
+
container_name: agentic-qe-grafana
|
|
135
|
+
environment:
|
|
136
|
+
# Admin credentials (change in production!)
|
|
137
|
+
- GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
|
|
138
|
+
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
|
|
139
|
+
# Server settings
|
|
140
|
+
- GF_SERVER_ROOT_URL=http://localhost:3001
|
|
141
|
+
- GF_SERVER_SERVE_FROM_SUB_PATH=false
|
|
142
|
+
# Anonymous access (disabled here; enable for development only)
|
|
143
|
+
- GF_AUTH_ANONYMOUS_ENABLED=false
|
|
144
|
+
# Provisioning
|
|
145
|
+
- GF_PATHS_PROVISIONING=/etc/grafana/provisioning
|
|
146
|
+
# Plugins
|
|
147
|
+
- GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource
|
|
148
|
+
volumes:
|
|
149
|
+
# Datasource provisioning
|
|
150
|
+
- ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro
|
|
151
|
+
# Dashboard provisioning
|
|
152
|
+
- ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro
|
|
153
|
+
# Dashboard JSON files
|
|
154
|
+
- ./grafana/dashboards:/var/lib/grafana/dashboards:ro
|
|
155
|
+
# Persistent storage
|
|
156
|
+
- grafana-data:/var/lib/grafana
|
|
157
|
+
ports:
|
|
158
|
+
- "3001:3000"
|
|
159
|
+
networks:
|
|
160
|
+
- agentic-qe-otel
|
|
161
|
+
restart: unless-stopped
|
|
162
|
+
depends_on:
|
|
163
|
+
prometheus:
|
|
164
|
+
condition: service_healthy
|
|
165
|
+
jaeger:
|
|
166
|
+
condition: service_healthy
|
|
167
|
+
healthcheck:
|
|
168
|
+
test: ["CMD", "wget", "--spider", "-q", "http://localhost:3000/api/health"]
|
|
169
|
+
interval: 30s
|
|
170
|
+
timeout: 10s
|
|
171
|
+
retries: 3
|
|
172
|
+
start_period: 40s
|
|
173
|
+
|
|
174
|
+
volumes:
|
|
175
|
+
otel-data:
|
|
176
|
+
driver: local
|
|
177
|
+
prometheus-data:
|
|
178
|
+
driver: local
|
|
179
|
+
jaeger-data:
|
|
180
|
+
driver: local
|
|
181
|
+
grafana-data:
|
|
182
|
+
driver: local
|
|
183
|
+
|
|
184
|
+
networks:
|
|
185
|
+
agentic-qe-otel:
|
|
186
|
+
driver: bridge
|
|
187
|
+
name: agentic-qe-otel
|