agentic-qe 1.9.3 → 1.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. package/CHANGELOG.md +54 -0
  2. package/README.md +30 -5
  3. package/config/.env.otel.example +25 -0
  4. package/config/OTEL-QUICK-REFERENCE.md +137 -0
  5. package/config/README-OTEL.md +222 -0
  6. package/config/alerting-rules.yml +518 -0
  7. package/config/docker-compose.otel.yml +187 -0
  8. package/config/grafana/dashboards/agentic-qe-overview.json +286 -0
  9. package/config/grafana/provisioning/dashboards/dashboards.yml +19 -0
  10. package/config/grafana/provisioning/datasources/datasources.yml +53 -0
  11. package/config/otel-collector-config.yaml.example +145 -0
  12. package/config/prometheus.yml.example +106 -0
  13. package/dist/alerting/AlertManager.d.ts +120 -0
  14. package/dist/alerting/AlertManager.d.ts.map +1 -0
  15. package/dist/alerting/AlertManager.js +345 -0
  16. package/dist/alerting/AlertManager.js.map +1 -0
  17. package/dist/alerting/FeedbackRouter.d.ts +98 -0
  18. package/dist/alerting/FeedbackRouter.d.ts.map +1 -0
  19. package/dist/alerting/FeedbackRouter.js +331 -0
  20. package/dist/alerting/FeedbackRouter.js.map +1 -0
  21. package/dist/alerting/StrategyApplicator.d.ts +120 -0
  22. package/dist/alerting/StrategyApplicator.d.ts.map +1 -0
  23. package/dist/alerting/StrategyApplicator.js +299 -0
  24. package/dist/alerting/StrategyApplicator.js.map +1 -0
  25. package/dist/alerting/index.d.ts +68 -0
  26. package/dist/alerting/index.d.ts.map +1 -0
  27. package/dist/alerting/index.js +112 -0
  28. package/dist/alerting/index.js.map +1 -0
  29. package/dist/alerting/types.d.ts +118 -0
  30. package/dist/alerting/types.d.ts.map +1 -0
  31. package/dist/alerting/types.js +11 -0
  32. package/dist/alerting/types.js.map +1 -0
  33. package/dist/cli/init/claude-config.d.ts.map +1 -1
  34. package/dist/cli/init/claude-config.js +12 -7
  35. package/dist/cli/init/claude-config.js.map +1 -1
  36. package/dist/core/memory/IPatternStore.d.ts +209 -0
  37. package/dist/core/memory/IPatternStore.d.ts.map +1 -0
  38. package/dist/core/memory/IPatternStore.js +15 -0
  39. package/dist/core/memory/IPatternStore.js.map +1 -0
  40. package/dist/core/memory/MigrationTools.d.ts +192 -0
  41. package/dist/core/memory/MigrationTools.d.ts.map +1 -0
  42. package/dist/core/memory/MigrationTools.js +615 -0
  43. package/dist/core/memory/MigrationTools.js.map +1 -0
  44. package/dist/core/memory/NeuralEnhancement.d.ts +154 -0
  45. package/dist/core/memory/NeuralEnhancement.d.ts.map +1 -0
  46. package/dist/core/memory/NeuralEnhancement.js +598 -0
  47. package/dist/core/memory/NeuralEnhancement.js.map +1 -0
  48. package/dist/core/memory/PatternStoreFactory.d.ts +143 -0
  49. package/dist/core/memory/PatternStoreFactory.d.ts.map +1 -0
  50. package/dist/core/memory/PatternStoreFactory.js +370 -0
  51. package/dist/core/memory/PatternStoreFactory.js.map +1 -0
  52. package/dist/core/memory/RealAgentDBAdapter.d.ts +1 -0
  53. package/dist/core/memory/RealAgentDBAdapter.d.ts.map +1 -1
  54. package/dist/core/memory/RealAgentDBAdapter.js +28 -20
  55. package/dist/core/memory/RealAgentDBAdapter.js.map +1 -1
  56. package/dist/core/memory/RuVectorPatternStore.d.ts +198 -0
  57. package/dist/core/memory/RuVectorPatternStore.d.ts.map +1 -0
  58. package/dist/core/memory/RuVectorPatternStore.js +605 -0
  59. package/dist/core/memory/RuVectorPatternStore.js.map +1 -0
  60. package/dist/core/memory/SelfHealingMonitor.d.ts +186 -0
  61. package/dist/core/memory/SelfHealingMonitor.d.ts.map +1 -0
  62. package/dist/core/memory/SelfHealingMonitor.js +451 -0
  63. package/dist/core/memory/SelfHealingMonitor.js.map +1 -0
  64. package/dist/core/memory/SwarmMemoryManager.d.ts +62 -0
  65. package/dist/core/memory/SwarmMemoryManager.d.ts.map +1 -1
  66. package/dist/core/memory/SwarmMemoryManager.js +97 -0
  67. package/dist/core/memory/SwarmMemoryManager.js.map +1 -1
  68. package/dist/core/memory/index.d.ts +11 -0
  69. package/dist/core/memory/index.d.ts.map +1 -1
  70. package/dist/core/memory/index.js +36 -1
  71. package/dist/core/memory/index.js.map +1 -1
  72. package/dist/reasoning/RuVectorReasoningAdapter.d.ts +232 -0
  73. package/dist/reasoning/RuVectorReasoningAdapter.d.ts.map +1 -0
  74. package/dist/reasoning/RuVectorReasoningAdapter.js +585 -0
  75. package/dist/reasoning/RuVectorReasoningAdapter.js.map +1 -0
  76. package/dist/reasoning/index.d.ts +2 -0
  77. package/dist/reasoning/index.d.ts.map +1 -1
  78. package/dist/reasoning/index.js +6 -1
  79. package/dist/reasoning/index.js.map +1 -1
  80. package/dist/reporting/ResultAggregator.d.ts +107 -0
  81. package/dist/reporting/ResultAggregator.d.ts.map +1 -0
  82. package/dist/reporting/ResultAggregator.js +435 -0
  83. package/dist/reporting/ResultAggregator.js.map +1 -0
  84. package/dist/reporting/index.d.ts +48 -0
  85. package/dist/reporting/index.d.ts.map +1 -0
  86. package/dist/reporting/index.js +154 -0
  87. package/dist/reporting/index.js.map +1 -0
  88. package/dist/reporting/reporters/ControlLoopReporter.d.ts +128 -0
  89. package/dist/reporting/reporters/ControlLoopReporter.d.ts.map +1 -0
  90. package/dist/reporting/reporters/ControlLoopReporter.js +417 -0
  91. package/dist/reporting/reporters/ControlLoopReporter.js.map +1 -0
  92. package/dist/reporting/reporters/HumanReadableReporter.d.ts +140 -0
  93. package/dist/reporting/reporters/HumanReadableReporter.d.ts.map +1 -0
  94. package/dist/reporting/reporters/HumanReadableReporter.js +524 -0
  95. package/dist/reporting/reporters/HumanReadableReporter.js.map +1 -0
  96. package/dist/reporting/reporters/JSONReporter.d.ts +193 -0
  97. package/dist/reporting/reporters/JSONReporter.d.ts.map +1 -0
  98. package/dist/reporting/reporters/JSONReporter.js +324 -0
  99. package/dist/reporting/reporters/JSONReporter.js.map +1 -0
  100. package/dist/reporting/reporters/index.d.ts +14 -0
  101. package/dist/reporting/reporters/index.d.ts.map +1 -0
  102. package/dist/reporting/reporters/index.js +19 -0
  103. package/dist/reporting/reporters/index.js.map +1 -0
  104. package/dist/reporting/types.d.ts +427 -0
  105. package/dist/reporting/types.d.ts.map +1 -0
  106. package/dist/reporting/types.js +12 -0
  107. package/dist/reporting/types.js.map +1 -0
  108. package/package.json +9 -1
@@ -0,0 +1,518 @@
# Prometheus Alerting Rules for Agentic QE Fleet
# Phase 4: Autonomous Alerting & Feedback Loop System
# Issue: #69
# Version: 1.0.0
# Date: 2025-11-29
#
# Rule conventions used throughout this file:
#   labels.severity         info | warning | error | critical
#   labels.feedback_action  adjust_strategy | auto_remediate | escalate | retrain_model
#   labels.agent_scope      optional; names the agent the alert is scoped to
#   annotations.feedback_*  extra parameters attached to the alert
#                           (presumably consumed by the feedback loop - verify
#                           against the alerting module)
#
# Units: duration metrics are assumed to be recorded in milliseconds, because
# thresholds such as 30000 are described as "30s". Descriptions therefore
# print the raw millisecond value; the humanizeDuration template function
# interprets its input as seconds and would overstate it by a factor of 1000.

groups:
  # =========================================================================
  # QUALITY METRIC ALERTS
  # =========================================================================
  - name: quality_metrics
    interval: 15s
    rules:
      # Test Failure Rate Alert
      # Fraction of failed tests over all tests, aggregated fleet-wide.
      - alert: HighTestFailureRate
        expr: |
          (
            sum(rate(aqe_quality_test_count{status="failed"}[5m]))
            /
            sum(rate(aqe_quality_test_count[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: error
          component: quality
          alert_type: test_failure
          feedback_action: adjust_strategy
        annotations:
          summary: "Test failure rate exceeds 5%"
          description: "Test failure rate is {{ $value | humanizePercentage }} (threshold: 5%). This indicates quality degradation."
          feedback_strategy: "increase_test_isolation"
          feedback_focus: "failing_tests"
          runbook_url: "https://docs.agentic-qe.io/runbooks/high-test-failure-rate"

      # Coverage Drop - Critical
      - alert: CriticalCoverageDrop
        expr: aqe_quality_coverage_line < 80
        for: 1m
        labels:
          severity: critical
          component: quality
          alert_type: coverage_drop
          feedback_action: auto_remediate
          agent_scope: qe-coverage-analyzer
        annotations:
          summary: "Code coverage dropped below 80%"
          description: "Line coverage is {{ $value }}% (threshold: 80%). Immediate action required."
          feedback_action: "generate_additional_tests"
          feedback_target_coverage: "85.0"
          runbook_url: "https://docs.agentic-qe.io/runbooks/coverage-drop"

      # Coverage Drop - Warning
      # Fires only in the 80-85% band so it never overlaps the critical rule.
      - alert: WarningCoverageDrop
        expr: aqe_quality_coverage_line < 85 and aqe_quality_coverage_line >= 80
        for: 5m
        labels:
          severity: warning
          component: quality
          alert_type: coverage_drop
          feedback_action: adjust_strategy
        annotations:
          summary: "Code coverage approaching threshold"
          description: "Line coverage is {{ $value }}% (warning at 85%, critical at 80%)"
          feedback_strategy: "proactive_test_generation"

      # Branch Coverage Drop
      - alert: BranchCoverageLow
        expr: aqe_quality_coverage_branch < 75
        for: 5m
        labels:
          severity: warning
          component: quality
          alert_type: coverage_drop
          feedback_action: adjust_strategy
        annotations:
          summary: "Branch coverage below threshold"
          description: "Branch coverage is {{ $value }}% (threshold: 75%)"
          feedback_strategy: "focus_branch_coverage"

      # Flaky Tests Increasing
      - alert: FlakyTestsIncreasing
        expr: aqe_quality_flaky_count > 5
        for: 1h
        labels:
          severity: warning
          component: quality
          alert_type: flaky_tests
          feedback_action: adjust_strategy
          agent_scope: qe-flaky-detector
        annotations:
          summary: "Number of flaky tests is growing"
          description: "{{ $value }} flaky tests detected (threshold: 5). Test stability degrading."
          feedback_strategy: "stabilize_flaky_tests"
          feedback_analysis_depth: "deep"
          runbook_url: "https://docs.agentic-qe.io/runbooks/flaky-tests"

      # Critical Flaky Test Count
      - alert: CriticalFlakyTestCount
        expr: aqe_quality_flaky_count > 10
        for: 30m
        labels:
          severity: error
          component: quality
          alert_type: flaky_tests
          feedback_action: escalate
        annotations:
          summary: "Critical number of flaky tests detected"
          description: "{{ $value }} flaky tests (critical threshold: 10). Test suite reliability compromised."
          feedback_action: "quarantine_flaky_tests"

      # Security Vulnerabilities - Critical
      # "for: 0s" makes the alert fire on the first evaluation that matches.
      - alert: CriticalSecurityVulnerabilities
        expr: aqe_quality_security_vulnerability_count{severity="critical"} > 0
        for: 0s
        labels:
          severity: critical
          component: security
          alert_type: vulnerability
          feedback_action: escalate
          agent_scope: qe-security-scanner
        annotations:
          summary: "Critical security vulnerabilities detected"
          description: "{{ $value }} critical vulnerabilities found. Deployment must be blocked."
          feedback_notify: "security_team"
          feedback_block_deployment: "true"
          runbook_url: "https://docs.agentic-qe.io/runbooks/security-vulnerabilities"

      # Security Vulnerabilities - High
      - alert: HighSecurityVulnerabilities
        expr: aqe_quality_security_vulnerability_count{severity="high"} > 0
        for: 0s
        labels:
          severity: error
          component: security
          alert_type: vulnerability
          feedback_action: escalate
          agent_scope: qe-security-scanner
        annotations:
          summary: "High severity security vulnerabilities detected"
          description: "{{ $value }} high severity vulnerabilities found. Immediate remediation required."
          feedback_notify: "security_team"
          feedback_block_deployment: "true"

      # Security Vulnerabilities - Medium (with threshold)
      - alert: MediumSecurityVulnerabilities
        expr: aqe_quality_security_vulnerability_count{severity="medium"} > 5
        for: 5m
        labels:
          severity: warning
          component: security
          alert_type: vulnerability
          feedback_action: auto_remediate
        annotations:
          summary: "Multiple medium severity vulnerabilities"
          description: "{{ $value }} medium severity vulnerabilities (threshold: 5)"
          feedback_action: "schedule_security_remediation"

      # Quality Gate Failure
      - alert: QualityGateFailed
        expr: aqe_quality_gate_pass_rate < 1.0
        for: 1m
        labels:
          severity: error
          component: quality
          alert_type: quality_gate
          feedback_action: adjust_strategy
          agent_scope: qe-quality-gate
        annotations:
          summary: "Quality gate evaluation failed"
          description: "Quality gate pass rate: {{ $value | humanizePercentage }} (expected: 100%)"
          feedback_strategy: "incremental_improvement"
          feedback_focus_areas: "coverage,complexity,security"
          runbook_url: "https://docs.agentic-qe.io/runbooks/quality-gate-failure"

  # =========================================================================
  # PERFORMANCE METRIC ALERTS
  # =========================================================================
  - name: performance_metrics
    interval: 15s
    rules:
      # Test Execution Slow
      # Threshold 30000 is in milliseconds (30s); see the units note at the top.
      - alert: TestExecutionSlow
        expr: |
          histogram_quantile(0.95,
            rate(aqe_quality_test_duration_bucket[5m])
          ) > 30000
        for: 5m
        labels:
          severity: warning
          component: performance
          alert_type: execution_slow
          feedback_action: adjust_strategy
          agent_scope: qe-test-executor
        annotations:
          summary: "Test execution time degraded"
          description: 'P95 test execution time is {{ printf "%.0f" $value }}ms (threshold: 30s)'
          feedback_strategy: "optimize_test_suite"
          feedback_action: "parallel_execution"
          runbook_url: "https://docs.agentic-qe.io/runbooks/slow-tests"

      # Critical Test Execution Time
      - alert: CriticalTestExecutionTime
        expr: |
          histogram_quantile(0.95,
            rate(aqe_quality_test_duration_bucket[5m])
          ) > 60000
        for: 3m
        labels:
          severity: error
          component: performance
          alert_type: execution_slow
          feedback_action: auto_remediate
        annotations:
          summary: "Test execution critically slow"
          description: 'P95 test execution time is {{ printf "%.0f" $value }}ms (critical threshold: 60s)'
          feedback_action: "emergency_test_optimization"

      # Agent Task Timeout
      - alert: AgentTaskTimeout
        expr: |
          histogram_quantile(0.95,
            rate(aqe_agent_task_duration_bucket[10m])
          ) > 120000
        for: 10m
        labels:
          severity: error
          component: performance
          alert_type: task_timeout
          feedback_action: retrain_model
        annotations:
          summary: "Agent tasks timing out frequently"
          description: 'P95 task duration is {{ printf "%.0f" $value }}ms (threshold: 2m)'
          feedback_focus: "task_complexity_estimation"
          feedback_learning_rate: "0.2"
          runbook_url: "https://docs.agentic-qe.io/runbooks/agent-timeout"

      # Memory Usage High
      # Threshold is in bytes; humanize1024 renders it with binary prefixes.
      - alert: HighMemoryUsage
        expr: aqe_system_memory_usage > 500000000
        for: 1m
        labels:
          severity: warning
          component: system
          alert_type: resource_usage
          feedback_action: auto_remediate
        annotations:
          summary: "Agent memory consumption exceeds threshold"
          description: "Memory usage is {{ $value | humanize1024 }}B (threshold: 500MB)"
          feedback_action: "garbage_collect"
          feedback_optimize_batch_size: "true"
          runbook_url: "https://docs.agentic-qe.io/runbooks/high-memory"

      # Critical Memory Usage
      - alert: CriticalMemoryUsage
        expr: aqe_system_memory_usage > 800000000
        for: 30s
        labels:
          severity: critical
          component: system
          alert_type: resource_usage
          feedback_action: escalate
        annotations:
          summary: "Critical memory usage detected"
          description: "Memory usage is {{ $value | humanize1024 }}B (critical threshold: 800MB). OOM risk."
          feedback_action: "emergency_memory_cleanup"

      # CPU Usage High
      - alert: HighCPUUsage
        expr: aqe_system_cpu_usage > 80
        for: 5m
        labels:
          severity: warning
          component: system
          alert_type: resource_usage
          feedback_action: adjust_strategy
        annotations:
          summary: "High CPU utilization detected"
          description: "CPU usage is {{ $value }}% (threshold: 80%)"
          feedback_strategy: "reduce_concurrent_tasks"

  # =========================================================================
  # LEARNING & ADAPTATION ALERTS
  # =========================================================================
  - name: learning_metrics
    interval: 30s
    rules:
      # Low Agent Success Rate
      - alert: LowAgentSuccessRate
        expr: aqe_agent_success_rate < 0.90
        for: 1h
        labels:
          severity: warning
          component: learning
          alert_type: success_rate
          feedback_action: retrain_model
        annotations:
          summary: "Agent success rate below target"
          description: "Agent {{ $labels.agent_type }} success rate is {{ $value | humanizePercentage }} (threshold: 90%)"
          feedback_exploration_rate: "0.3"
          feedback_focus: "failed_task_patterns"
          runbook_url: "https://docs.agentic-qe.io/runbooks/low-success-rate"

      # Critical Agent Success Rate
      - alert: CriticalAgentSuccessRate
        expr: aqe_agent_success_rate < 0.70
        for: 30m
        labels:
          severity: error
          component: learning
          alert_type: success_rate
          feedback_action: escalate
        annotations:
          summary: "Critical agent success rate"
          description: "Agent {{ $labels.agent_type }} success rate is {{ $value | humanizePercentage }} (critical: 70%)"
          feedback_action: "emergency_retraining"

      # Defect Density High
      - alert: HighDefectDensity
        expr: aqe_quality_defect_density > 2.0
        for: 24h
        labels:
          severity: error
          component: quality
          alert_type: defect_density
          feedback_action: adjust_strategy
          agent_scope: qe-quality-analyzer
        annotations:
          summary: "Defect density exceeds threshold"
          description: "Defect density is {{ $value }} per KLOC (threshold: 2.0)"
          feedback_strategy: "increase_review_depth"
          feedback_static_analysis: "true"
          runbook_url: "https://docs.agentic-qe.io/runbooks/high-defect-density"

      # Agent Task Failure Spike
      # Per-agent-type failure ratio; agent_type survives aggregation so the
      # description can name the offending agent.
      - alert: AgentTaskFailureSpike
        expr: |
          (
            sum(rate(aqe_agent_task_count{status="failed"}[5m])) by (agent_type)
            /
            sum(rate(aqe_agent_task_count[5m])) by (agent_type)
          ) > 0.20
        for: 10m
        labels:
          severity: warning
          component: learning
          alert_type: failure_spike
          feedback_action: retrain_model
        annotations:
          summary: "Agent experiencing task failure spike"
          description: "Agent {{ $labels.agent_type }} failure rate is {{ $value | humanizePercentage }} (threshold: 20%)"
          feedback_action: "analyze_failure_patterns"

  # =========================================================================
  # FLEET COORDINATION ALERTS
  # =========================================================================
  - name: fleet_coordination
    interval: 30s
    rules:
      # Agent Queue Depth High
      - alert: HighAgentQueueDepth
        expr: aqe_system_queue_depth > 50
        for: 5m
        labels:
          severity: warning
          component: coordination
          alert_type: queue_depth
          feedback_action: adjust_strategy
        annotations:
          summary: "Agent task queue is backing up"
          description: "Queue depth is {{ $value }} tasks (threshold: 50)"
          feedback_strategy: "scale_agents"
          runbook_url: "https://docs.agentic-qe.io/runbooks/queue-backup"

      # Agent Queue Depth Critical
      - alert: CriticalAgentQueueDepth
        expr: aqe_system_queue_depth > 100
        for: 2m
        labels:
          severity: error
          component: coordination
          alert_type: queue_depth
          feedback_action: auto_remediate
        annotations:
          summary: "Critical agent queue backlog"
          description: "Queue depth is {{ $value }} tasks (critical: 100). System overloaded."
          feedback_action: "emergency_queue_drain"

      # Database Query Slow
      # Threshold 1000 is in milliseconds (1s).
      - alert: SlowDatabaseQueries
        expr: |
          histogram_quantile(0.95,
            rate(aqe_system_db_query_duration_bucket[5m])
          ) > 1000
        for: 5m
        labels:
          severity: warning
          component: system
          alert_type: database_slow
          feedback_action: adjust_strategy
        annotations:
          summary: "Database queries are slow"
          description: 'P95 query duration is {{ printf "%.0f" $value }}ms (threshold: 1s)'
          feedback_strategy: "optimize_database_access"
          runbook_url: "https://docs.agentic-qe.io/runbooks/slow-database"

      # Event Bus Latency High
      # Mean latency = rate of the latency sum divided by rate of the count.
      - alert: HighEventBusLatency
        expr: |
          rate(aqe_system_eventbus_latency_sum[5m])
          /
          rate(aqe_system_eventbus_latency_count[5m])
          > 500
        for: 5m
        labels:
          severity: warning
          component: coordination
          alert_type: event_latency
          feedback_action: adjust_strategy
        annotations:
          summary: "Event bus experiencing high latency"
          description: 'Average event latency is {{ printf "%.0f" $value }}ms (threshold: 500ms)'
          feedback_strategy: "optimize_event_handling"

  # =========================================================================
  # TOKEN COST & EFFICIENCY ALERTS
  # =========================================================================
  - name: cost_efficiency
    interval: 1m
    rules:
      # High Token Cost Rate
      # increase() over 1h yields cost per hour, matching the $10/hour
      # threshold; rate() would yield cost per second.
      - alert: HighTokenCostRate
        expr: |
          increase(aqe_agent_cost_sum[1h])
          > 10.0
        for: 15m
        labels:
          severity: warning
          component: cost
          alert_type: token_cost
          feedback_action: adjust_strategy
        annotations:
          summary: "Token costs increasing rapidly"
          description: "Cost rate is ${{ $value }}/hour (threshold: $10/hour)"
          feedback_strategy: "optimize_token_usage"
          runbook_url: "https://docs.agentic-qe.io/runbooks/high-token-cost"

      # Inefficient Agent Token Usage
      # Tokens spent per successful task, per agent type. The "by" clause is
      # only valid on an aggregation operator, so each rate() is wrapped in
      # sum by (agent_type).
      - alert: InefficientAgentTokenUsage
        expr: |
          (
            sum by (agent_type) (rate(aqe_agent_token_usage_sum[1h]))
            /
            sum by (agent_type) (rate(aqe_agent_task_count{status="success"}[1h]))
          ) > 10000
        for: 30m
        labels:
          severity: warning
          component: efficiency
          alert_type: token_efficiency
          feedback_action: retrain_model
        annotations:
          summary: "Agent using excessive tokens per successful task"
          description: "Agent {{ $labels.agent_type }} uses {{ $value }} tokens per success (threshold: 10k)"
          feedback_strategy: "optimize_prompt_efficiency"

  # =========================================================================
  # ALERTING SYSTEM HEALTH
  # =========================================================================
  - name: alerting_system_health
    interval: 30s
    rules:
      # High Alert Fire Rate (Alert Fatigue)
      # increase() counts alerts fired during the last hour, which is what the
      # description reports; rate() would be alerts per second.
      - alert: AlertFatigueDetected
        expr: |
          sum(increase(aqe_alerting_alerts_fired[1h])) > 20
        for: 1h
        labels:
          severity: warning
          component: alerting
          alert_type: alert_fatigue
        annotations:
          summary: "Excessive alerts being fired"
          description: "{{ $value }} alerts fired in the last hour. Potential alert fatigue."
          action: "Review and tune alert thresholds"
          runbook_url: "https://docs.agentic-qe.io/runbooks/alert-fatigue"

      # High Alert Suppression Rate
      - alert: HighAlertSuppressionRate
        expr: |
          (
            rate(aqe_alerting_alerts_suppressed[1h])
            /
            rate(aqe_alerting_alerts_fired[1h])
          ) > 0.5
        for: 1h
        labels:
          severity: info
          component: alerting
          alert_type: suppression_rate
        annotations:
          summary: "High alert suppression rate"
          description: "{{ $value | humanizePercentage }} of alerts are being suppressed. Review cooldown settings."

      # Feedback Processing Slow
      # Threshold 5000 is in milliseconds (5s).
      - alert: SlowFeedbackProcessing
        expr: |
          histogram_quantile(0.95,
            rate(aqe_alerting_feedback_duration_bucket[5m])
          ) > 5000
        for: 5m
        labels:
          severity: warning
          component: alerting
          alert_type: feedback_slow
        annotations:
          summary: "Feedback loop processing is slow"
          description: 'P95 feedback processing time is {{ printf "%.0f" $value }}ms (threshold: 5s)'
          action: "Investigate feedback router performance"
@@ -0,0 +1,187 @@
# NOTE(review): the top-level `version` key is obsolete under the Compose
# Specification; Compose v2 ignores it and prints a warning. It is kept so
# the file still loads with older docker-compose releases.
version: '3.8'

# OTEL Observability Stack for Agentic QE Fleet
# Issue #71: Complete OTEL Stack Docker Compose Configuration
#
# This compose file sets up a complete observability stack:
# - OTEL Collector: Receives telemetry via OTLP (gRPC:4317, HTTP:4318)
# - Prometheus: Scrapes metrics from OTEL Collector (port 9090)
# - Jaeger: Distributed tracing backend (UI on port 16686)
# - Grafana: Visualization and dashboards (host port 3001 -> container 3000)
#
# Usage (run from the package root):
# docker-compose -f config/docker-compose.otel.yml up -d
# docker-compose -f docker-compose.yml -f config/docker-compose.otel.yml up -d
#
# NOTE(review): when several -f files are given, Compose resolves relative
# paths against the first file's directory, so the ./... bind mounts in this
# file may not resolve to config/ in the second form - confirm before use.

# Four services on the shared agentic-qe-otel network. Start-up order:
# otel-collector -> prometheus -> grafana, with grafana also waiting on jaeger.
services:
  # OpenTelemetry Collector
  otel-collector:
    image: otel/opentelemetry-collector-contrib:latest
    container_name: agentic-qe-otel-collector
    command: ["--config=/etc/otel-collector-config.yaml"]
    volumes:
      - ./otel-collector-config.yaml.example:/etc/otel-collector-config.yaml:ro
      - otel-data:/var/log/otel
    ports:
      # OTLP gRPC receiver
      - "4317:4317"
      # OTLP HTTP receiver
      - "4318:4318"
      # Prometheus exporter
      - "8889:8889"
      # Collector metrics (self-monitoring)
      - "8888:8888"
      # Health check endpoint (probe it from the host)
      - "13133:13133"
      # pprof profiling (development only)
      - "1777:1777"
      # zpages debug interface (development only)
      - "55679:55679"
    environment:
      - DEPLOYMENT_ENVIRONMENT=${DEPLOYMENT_ENVIRONMENT:-development}
    networks:
      - agentic-qe-otel
    restart: unless-stopped
    # No container-level healthcheck: the upstream collector-contrib image
    # ships only the collector binary (no shell, wget or curl), so a
    # wget-based check can never succeed and would leave this service
    # permanently unhealthy, blocking every dependent service.

  # Prometheus - Metrics storage and querying
  prometheus:
    image: prom/prometheus:latest
    container_name: agentic-qe-prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=15d'
      - '--storage.tsdb.retention.size=10GB'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      - '--web.enable-lifecycle'
    volumes:
      - ./prometheus.yml.example:/etc/prometheus/prometheus.yml:ro
      - prometheus-data:/prometheus
      # Optionally mount the alerting rules shipped next to this file
      # (prometheus.yml must also list the path under rule_files)
      # - ./alerting-rules.yml:/etc/prometheus/rules/alerting-rules.yml:ro
    ports:
      - "9090:9090"
    networks:
      - agentic-qe-otel
    restart: unless-stopped
    depends_on:
      otel-collector:
        # The collector has no in-container healthcheck (see above), so only
        # wait for it to be started.
        condition: service_started
    healthcheck:
      test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s

  # Jaeger - Distributed tracing backend
  jaeger:
    image: jaegertracing/all-in-one:latest
    container_name: agentic-qe-jaeger
    environment:
      # Collector settings
      - COLLECTOR_OTLP_ENABLED=true
      - COLLECTOR_ZIPKIN_HOST_PORT=:9411
      # Storage settings (Badger on-disk storage, persisted in the jaeger-data volume)
      - SPAN_STORAGE_TYPE=badger
      - BADGER_EPHEMERAL=false
      - BADGER_DIRECTORY_VALUE=/badger/data
      - BADGER_DIRECTORY_KEY=/badger/key
      # Query settings
      - QUERY_BASE_PATH=/
      # Metrics backend
      - METRICS_BACKEND=prometheus
      - METRICS_HTTP_ROUTE=/metrics
    volumes:
      - jaeger-data:/badger
    ports:
      # Jaeger UI
      - "16686:16686"
      # OTLP gRPC receiver (host 4327, so it does not clash with the collector's 4317)
      - "4327:4317"
      # OTLP HTTP receiver (host 4328, so it does not clash with the collector's 4318)
      - "4328:4318"
      # Zipkin compatible endpoint
      - "9411:9411"
      # Admin port (health check, metrics)
      - "14269:14269"
      # Jaeger Thrift compact
      - "6831:6831/udp"
      # Jaeger Thrift binary
      - "6832:6832/udp"
      # Jaeger gRPC
      - "14250:14250"
    networks:
      - agentic-qe-otel
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--spider", "-q", "http://localhost:14269/"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s

  # Grafana - Visualization and dashboards
  grafana:
    image: grafana/grafana:latest
    container_name: agentic-qe-grafana
    environment:
      # Admin credentials (change in production!)
      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
      # Server settings (root URL matches the 3001 host port published below)
      - GF_SERVER_ROOT_URL=http://localhost:3001
      - GF_SERVER_SERVE_FROM_SUB_PATH=false
      # Anonymous access is disabled; enable it only for local development
      - GF_AUTH_ANONYMOUS_ENABLED=false
      # Provisioning
      - GF_PATHS_PROVISIONING=/etc/grafana/provisioning
      # Plugins
      - GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource
    volumes:
      # Datasource provisioning
      - ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro
      # Dashboard provisioning
      - ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro
      # Dashboard JSON files
      - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
      # Persistent storage
      - grafana-data:/var/lib/grafana
    ports:
      - "3001:3000"
    networks:
      - agentic-qe-otel
    restart: unless-stopped
    depends_on:
      prometheus:
        condition: service_healthy
      jaeger:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "wget", "--spider", "-q", "http://localhost:3000/api/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

# Named volumes, one per service, all on the local driver.
volumes:
  # otel-collector: /var/log/otel
  otel-data: { driver: local }
  # prometheus: /prometheus (TSDB)
  prometheus-data: { driver: local }
  # jaeger: /badger
  jaeger-data: { driver: local }
  # grafana: /var/lib/grafana
  grafana-data: { driver: local }

# Bridge network joined by every service in this file. The explicit name
# keeps it from being prefixed with the Compose project name.
networks:
  agentic-qe-otel: { name: agentic-qe-otel, driver: bridge }