agentic-qe 1.9.3 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/qe-api-contract-validator.md +95 -1336
- package/.claude/agents/qe-chaos-engineer.md +152 -1211
- package/.claude/agents/qe-code-complexity.md +144 -707
- package/.claude/agents/qe-coverage-analyzer.md +147 -743
- package/.claude/agents/qe-deployment-readiness.md +143 -1496
- package/.claude/agents/qe-flaky-test-hunter.md +132 -1529
- package/.claude/agents/qe-fleet-commander.md +12 -12
- package/.claude/agents/qe-performance-tester.md +150 -886
- package/.claude/agents/qe-production-intelligence.md +155 -1396
- package/.claude/agents/qe-quality-analyzer.md +6 -6
- package/.claude/agents/qe-quality-gate.md +151 -648
- package/.claude/agents/qe-regression-risk-analyzer.md +132 -1150
- package/.claude/agents/qe-requirements-validator.md +149 -932
- package/.claude/agents/qe-security-scanner.md +157 -797
- package/.claude/agents/qe-test-data-architect.md +96 -1365
- package/.claude/agents/qe-test-executor.md +8 -8
- package/.claude/agents/qe-test-generator.md +145 -1540
- package/.claude/agents/qe-visual-tester.md +153 -1257
- package/.claude/agents/qx-partner.md +235 -0
- package/.claude/agents/subagents/qe-code-reviewer.md +40 -136
- package/.claude/agents/subagents/qe-coverage-gap-analyzer.md +40 -480
- package/.claude/agents/subagents/qe-data-generator.md +41 -125
- package/.claude/agents/subagents/qe-flaky-investigator.md +55 -411
- package/.claude/agents/subagents/qe-integration-tester.md +53 -141
- package/.claude/agents/subagents/qe-performance-validator.md +54 -130
- package/.claude/agents/subagents/qe-security-auditor.md +56 -114
- package/.claude/agents/subagents/qe-test-data-architect-sub.md +57 -548
- package/.claude/agents/subagents/qe-test-implementer.md +58 -551
- package/.claude/agents/subagents/qe-test-refactorer.md +65 -722
- package/.claude/agents/subagents/qe-test-writer.md +63 -726
- package/.claude/skills/skills-manifest.json +632 -0
- package/.claude/skills/testability-scoring/README.md +71 -0
- package/.claude/skills/testability-scoring/SKILL.md +611 -0
- package/.claude/skills/testability-scoring/resources/templates/config.template.js +84 -0
- package/.claude/skills/testability-scoring/resources/templates/testability-scoring.spec.template.js +532 -0
- package/.claude/skills/testability-scoring/scripts/generate-html-report.js +1007 -0
- package/.claude/skills/testability-scoring/scripts/run-assessment.sh +70 -0
- package/CHANGELOG.md +116 -0
- package/README.md +59 -7
- package/config/.env.otel.example +25 -0
- package/config/OTEL-QUICK-REFERENCE.md +137 -0
- package/config/README-OTEL.md +222 -0
- package/config/alerting-rules.yml +518 -0
- package/config/docker-compose.otel.yml +187 -0
- package/config/grafana/dashboards/agentic-qe-overview.json +286 -0
- package/config/grafana/provisioning/dashboards/dashboards.yml +19 -0
- package/config/grafana/provisioning/datasources/datasources.yml +53 -0
- package/config/otel-collector-config.yaml.example +145 -0
- package/config/prometheus.yml.example +106 -0
- package/dist/agents/QXPartnerAgent.d.ts +139 -0
- package/dist/agents/QXPartnerAgent.d.ts.map +1 -0
- package/dist/agents/QXPartnerAgent.js +769 -0
- package/dist/agents/QXPartnerAgent.js.map +1 -0
- package/dist/agents/index.d.ts +1 -0
- package/dist/agents/index.d.ts.map +1 -1
- package/dist/agents/index.js +82 -2
- package/dist/agents/index.js.map +1 -1
- package/dist/alerting/AlertManager.d.ts +120 -0
- package/dist/alerting/AlertManager.d.ts.map +1 -0
- package/dist/alerting/AlertManager.js +345 -0
- package/dist/alerting/AlertManager.js.map +1 -0
- package/dist/alerting/FeedbackRouter.d.ts +98 -0
- package/dist/alerting/FeedbackRouter.d.ts.map +1 -0
- package/dist/alerting/FeedbackRouter.js +331 -0
- package/dist/alerting/FeedbackRouter.js.map +1 -0
- package/dist/alerting/StrategyApplicator.d.ts +120 -0
- package/dist/alerting/StrategyApplicator.d.ts.map +1 -0
- package/dist/alerting/StrategyApplicator.js +299 -0
- package/dist/alerting/StrategyApplicator.js.map +1 -0
- package/dist/alerting/index.d.ts +68 -0
- package/dist/alerting/index.d.ts.map +1 -0
- package/dist/alerting/index.js +112 -0
- package/dist/alerting/index.js.map +1 -0
- package/dist/alerting/types.d.ts +118 -0
- package/dist/alerting/types.d.ts.map +1 -0
- package/dist/alerting/types.js +11 -0
- package/dist/alerting/types.js.map +1 -0
- package/dist/cli/commands/debug/agent.d.ts.map +1 -1
- package/dist/cli/commands/debug/agent.js +19 -6
- package/dist/cli/commands/debug/agent.js.map +1 -1
- package/dist/cli/commands/debug/health-check.js +20 -7
- package/dist/cli/commands/debug/health-check.js.map +1 -1
- package/dist/cli/commands/init-claude-md-template.d.ts +1 -0
- package/dist/cli/commands/init-claude-md-template.d.ts.map +1 -1
- package/dist/cli/commands/init-claude-md-template.js +4 -3
- package/dist/cli/commands/init-claude-md-template.js.map +1 -1
- package/dist/cli/commands/workflow/cancel.d.ts.map +1 -1
- package/dist/cli/commands/workflow/cancel.js +4 -3
- package/dist/cli/commands/workflow/cancel.js.map +1 -1
- package/dist/cli/commands/workflow/list.d.ts.map +1 -1
- package/dist/cli/commands/workflow/list.js +4 -3
- package/dist/cli/commands/workflow/list.js.map +1 -1
- package/dist/cli/commands/workflow/pause.d.ts.map +1 -1
- package/dist/cli/commands/workflow/pause.js +4 -3
- package/dist/cli/commands/workflow/pause.js.map +1 -1
- package/dist/cli/init/claude-config.d.ts.map +1 -1
- package/dist/cli/init/claude-config.js +13 -13
- package/dist/cli/init/claude-config.js.map +1 -1
- package/dist/cli/init/claude-md.d.ts.map +1 -1
- package/dist/cli/init/claude-md.js +44 -2
- package/dist/cli/init/claude-md.js.map +1 -1
- package/dist/cli/init/database-init.js +1 -1
- package/dist/cli/init/index.d.ts.map +1 -1
- package/dist/cli/init/index.js +13 -6
- package/dist/cli/init/index.js.map +1 -1
- package/dist/cli/init/skills.d.ts.map +1 -1
- package/dist/cli/init/skills.js +2 -1
- package/dist/cli/init/skills.js.map +1 -1
- package/dist/core/memory/AgentDBIntegration.d.ts +24 -6
- package/dist/core/memory/AgentDBIntegration.d.ts.map +1 -1
- package/dist/core/memory/AgentDBIntegration.js +66 -10
- package/dist/core/memory/AgentDBIntegration.js.map +1 -1
- package/dist/core/memory/IPatternStore.d.ts +209 -0
- package/dist/core/memory/IPatternStore.d.ts.map +1 -0
- package/dist/core/memory/IPatternStore.js +15 -0
- package/dist/core/memory/IPatternStore.js.map +1 -0
- package/dist/core/memory/MigrationTools.d.ts +192 -0
- package/dist/core/memory/MigrationTools.d.ts.map +1 -0
- package/dist/core/memory/MigrationTools.js +615 -0
- package/dist/core/memory/MigrationTools.js.map +1 -0
- package/dist/core/memory/NeuralEnhancement.d.ts +154 -0
- package/dist/core/memory/NeuralEnhancement.d.ts.map +1 -0
- package/dist/core/memory/NeuralEnhancement.js +598 -0
- package/dist/core/memory/NeuralEnhancement.js.map +1 -0
- package/dist/core/memory/PatternStoreFactory.d.ts +143 -0
- package/dist/core/memory/PatternStoreFactory.d.ts.map +1 -0
- package/dist/core/memory/PatternStoreFactory.js +370 -0
- package/dist/core/memory/PatternStoreFactory.js.map +1 -0
- package/dist/core/memory/RealAgentDBAdapter.d.ts +1 -0
- package/dist/core/memory/RealAgentDBAdapter.d.ts.map +1 -1
- package/dist/core/memory/RealAgentDBAdapter.js +28 -20
- package/dist/core/memory/RealAgentDBAdapter.js.map +1 -1
- package/dist/core/memory/RuVectorPatternStore.d.ts +198 -0
- package/dist/core/memory/RuVectorPatternStore.d.ts.map +1 -0
- package/dist/core/memory/RuVectorPatternStore.js +605 -0
- package/dist/core/memory/RuVectorPatternStore.js.map +1 -0
- package/dist/core/memory/SelfHealingMonitor.d.ts +186 -0
- package/dist/core/memory/SelfHealingMonitor.d.ts.map +1 -0
- package/dist/core/memory/SelfHealingMonitor.js +451 -0
- package/dist/core/memory/SelfHealingMonitor.js.map +1 -0
- package/dist/core/memory/SwarmMemoryManager.d.ts +62 -0
- package/dist/core/memory/SwarmMemoryManager.d.ts.map +1 -1
- package/dist/core/memory/SwarmMemoryManager.js +97 -0
- package/dist/core/memory/SwarmMemoryManager.js.map +1 -1
- package/dist/core/memory/UnifiedMemoryCoordinator.d.ts +341 -0
- package/dist/core/memory/UnifiedMemoryCoordinator.d.ts.map +1 -0
- package/dist/core/memory/UnifiedMemoryCoordinator.js +986 -0
- package/dist/core/memory/UnifiedMemoryCoordinator.js.map +1 -0
- package/dist/core/memory/index.d.ts +16 -0
- package/dist/core/memory/index.d.ts.map +1 -1
- package/dist/core/memory/index.js +58 -1
- package/dist/core/memory/index.js.map +1 -1
- package/dist/core/optimization/SwarmOptimizer.d.ts +185 -0
- package/dist/core/optimization/SwarmOptimizer.d.ts.map +1 -0
- package/dist/core/optimization/SwarmOptimizer.js +631 -0
- package/dist/core/optimization/SwarmOptimizer.js.map +1 -0
- package/dist/core/optimization/index.d.ts +9 -0
- package/dist/core/optimization/index.d.ts.map +1 -0
- package/dist/core/optimization/index.js +25 -0
- package/dist/core/optimization/index.js.map +1 -0
- package/dist/core/optimization/types.d.ts +53 -0
- package/dist/core/optimization/types.d.ts.map +1 -0
- package/dist/core/optimization/types.js +6 -0
- package/dist/core/optimization/types.js.map +1 -0
- package/dist/core/orchestration/PriorityQueue.d.ts +54 -0
- package/dist/core/orchestration/PriorityQueue.d.ts.map +1 -0
- package/dist/core/orchestration/PriorityQueue.js +122 -0
- package/dist/core/orchestration/PriorityQueue.js.map +1 -0
- package/dist/core/orchestration/WorkflowOrchestrator.d.ts +176 -0
- package/dist/core/orchestration/WorkflowOrchestrator.d.ts.map +1 -0
- package/dist/core/orchestration/WorkflowOrchestrator.js +813 -0
- package/dist/core/orchestration/WorkflowOrchestrator.js.map +1 -0
- package/dist/core/orchestration/index.d.ts +7 -0
- package/dist/core/orchestration/index.d.ts.map +1 -0
- package/dist/core/orchestration/index.js +11 -0
- package/dist/core/orchestration/index.js.map +1 -0
- package/dist/core/orchestration/types.d.ts +96 -0
- package/dist/core/orchestration/types.d.ts.map +1 -0
- package/dist/core/orchestration/types.js +6 -0
- package/dist/core/orchestration/types.js.map +1 -0
- package/dist/core/skills/DynamicSkillLoader.d.ts +96 -0
- package/dist/core/skills/DynamicSkillLoader.d.ts.map +1 -0
- package/dist/core/skills/DynamicSkillLoader.js +353 -0
- package/dist/core/skills/DynamicSkillLoader.js.map +1 -0
- package/dist/core/skills/types.d.ts +118 -0
- package/dist/core/skills/types.d.ts.map +1 -0
- package/dist/core/skills/types.js +7 -0
- package/dist/core/skills/types.js.map +1 -0
- package/dist/core/transport/QUICTransport.d.ts +320 -0
- package/dist/core/transport/QUICTransport.d.ts.map +1 -0
- package/dist/core/transport/QUICTransport.js +711 -0
- package/dist/core/transport/QUICTransport.js.map +1 -0
- package/dist/core/transport/index.d.ts +40 -0
- package/dist/core/transport/index.d.ts.map +1 -0
- package/dist/core/transport/index.js +46 -0
- package/dist/core/transport/index.js.map +1 -0
- package/dist/core/transport/quic-loader.d.ts +123 -0
- package/dist/core/transport/quic-loader.d.ts.map +1 -0
- package/dist/core/transport/quic-loader.js +293 -0
- package/dist/core/transport/quic-loader.js.map +1 -0
- package/dist/core/transport/quic.d.ts +154 -0
- package/dist/core/transport/quic.d.ts.map +1 -0
- package/dist/core/transport/quic.js +214 -0
- package/dist/core/transport/quic.js.map +1 -0
- package/dist/mcp/services/AgentRegistry.d.ts.map +1 -1
- package/dist/mcp/services/AgentRegistry.js +4 -1
- package/dist/mcp/services/AgentRegistry.js.map +1 -1
- package/dist/reasoning/RuVectorReasoningAdapter.d.ts +232 -0
- package/dist/reasoning/RuVectorReasoningAdapter.d.ts.map +1 -0
- package/dist/reasoning/RuVectorReasoningAdapter.js +585 -0
- package/dist/reasoning/RuVectorReasoningAdapter.js.map +1 -0
- package/dist/reasoning/index.d.ts +2 -0
- package/dist/reasoning/index.d.ts.map +1 -1
- package/dist/reasoning/index.js +6 -1
- package/dist/reasoning/index.js.map +1 -1
- package/dist/reporting/ResultAggregator.d.ts +107 -0
- package/dist/reporting/ResultAggregator.d.ts.map +1 -0
- package/dist/reporting/ResultAggregator.js +435 -0
- package/dist/reporting/ResultAggregator.js.map +1 -0
- package/dist/reporting/index.d.ts +48 -0
- package/dist/reporting/index.d.ts.map +1 -0
- package/dist/reporting/index.js +154 -0
- package/dist/reporting/index.js.map +1 -0
- package/dist/reporting/reporters/ControlLoopReporter.d.ts +128 -0
- package/dist/reporting/reporters/ControlLoopReporter.d.ts.map +1 -0
- package/dist/reporting/reporters/ControlLoopReporter.js +417 -0
- package/dist/reporting/reporters/ControlLoopReporter.js.map +1 -0
- package/dist/reporting/reporters/HumanReadableReporter.d.ts +140 -0
- package/dist/reporting/reporters/HumanReadableReporter.d.ts.map +1 -0
- package/dist/reporting/reporters/HumanReadableReporter.js +524 -0
- package/dist/reporting/reporters/HumanReadableReporter.js.map +1 -0
- package/dist/reporting/reporters/JSONReporter.d.ts +193 -0
- package/dist/reporting/reporters/JSONReporter.d.ts.map +1 -0
- package/dist/reporting/reporters/JSONReporter.js +324 -0
- package/dist/reporting/reporters/JSONReporter.js.map +1 -0
- package/dist/reporting/reporters/index.d.ts +14 -0
- package/dist/reporting/reporters/index.d.ts.map +1 -0
- package/dist/reporting/reporters/index.js +19 -0
- package/dist/reporting/reporters/index.js.map +1 -0
- package/dist/reporting/types.d.ts +427 -0
- package/dist/reporting/types.d.ts.map +1 -0
- package/dist/reporting/types.js +12 -0
- package/dist/reporting/types.js.map +1 -0
- package/dist/types/index.d.ts +2 -1
- package/dist/types/index.d.ts.map +1 -1
- package/dist/types/index.js +2 -0
- package/dist/types/index.js.map +1 -1
- package/dist/types/qx.d.ts +397 -0
- package/dist/types/qx.d.ts.map +1 -0
- package/dist/types/qx.js +71 -0
- package/dist/types/qx.js.map +1 -0
- package/dist/visualization/api/RestEndpoints.js +1 -1
- package/dist/visualization/api/RestEndpoints.js.map +1 -1
- package/dist/visualization/api/WebSocketServer.d.ts +44 -0
- package/dist/visualization/api/WebSocketServer.d.ts.map +1 -1
- package/dist/visualization/api/WebSocketServer.js +144 -23
- package/dist/visualization/api/WebSocketServer.js.map +1 -1
- package/dist/visualization/core/DataTransformer.d.ts +10 -0
- package/dist/visualization/core/DataTransformer.d.ts.map +1 -1
- package/dist/visualization/core/DataTransformer.js +60 -5
- package/dist/visualization/core/DataTransformer.js.map +1 -1
- package/dist/visualization/emit-event.d.ts +75 -0
- package/dist/visualization/emit-event.d.ts.map +1 -0
- package/dist/visualization/emit-event.js +213 -0
- package/dist/visualization/emit-event.js.map +1 -0
- package/dist/visualization/index.d.ts +1 -0
- package/dist/visualization/index.d.ts.map +1 -1
- package/dist/visualization/index.js +7 -1
- package/dist/visualization/index.js.map +1 -1
- package/docs/reference/skills.md +63 -1
- package/package.json +12 -4
|
@@ -0,0 +1,518 @@
|
|
|
1
|
+
# Prometheus Alerting Rules for Agentic QE Fleet
|
|
2
|
+
# Phase 4: Autonomous Alerting & Feedback Loop System
|
|
3
|
+
# Issue: #69
|
|
4
|
+
# Version: 1.0.0
|
|
5
|
+
# Date: 2025-11-29
|
|
6
|
+
|
|
7
|
+
groups:
|
|
8
|
+
# =========================================================================
|
|
9
|
+
# QUALITY METRIC ALERTS
|
|
10
|
+
# =========================================================================
|
|
11
|
+
- name: quality_metrics
|
|
12
|
+
interval: 15s
|
|
13
|
+
rules:
|
|
14
|
+
# HighTestFailureRate
# Ratio of the 5m rate of failed tests to the 5m rate of all tests.
# Fires once that ratio has stayed above 0.05 for 5 minutes.
- alert: HighTestFailureRate
  expr: |
    (
      sum(rate(aqe_quality_test_count{status="failed"}[5m]))
      /
      sum(rate(aqe_quality_test_count[5m]))
    ) > 0.05
  for: 5m
  labels:
    alert_type: test_failure
    component: quality
    feedback_action: adjust_strategy
    severity: error
  # feedback_* annotations are hints for the feedback loop named in the file
  # header; their exact consumer is not visible here (TODO confirm).
  annotations:
    description: 'Test failure rate is {{ $value | humanizePercentage }} (threshold: 5%). This indicates quality degradation.'
    feedback_focus: 'failing_tests'
    feedback_strategy: 'increase_test_isolation'
    runbook_url: 'https://docs.agentic-qe.io/runbooks/high-test-failure-rate'
    summary: 'Test failure rate exceeds 5%'
|
|
34
|
+
|
|
35
|
+
# CriticalCoverageDrop
# Fires when aqe_quality_coverage_line has been below 80 for 1 minute.
# NOTE(review): feedback_action is set both as a label (auto_remediate) and as
# an annotation (generate_additional_tests) — confirm which one consumers read.
- alert: CriticalCoverageDrop
  expr: 'aqe_quality_coverage_line < 80'
  for: 1m
  labels:
    agent_scope: qe-coverage-analyzer
    alert_type: coverage_drop
    component: quality
    feedback_action: auto_remediate
    severity: critical
  annotations:
    description: 'Line coverage is {{ $value }}% (threshold: 80%). Immediate action required.'
    feedback_action: 'generate_additional_tests'
    feedback_target_coverage: '85.0'
    runbook_url: 'https://docs.agentic-qe.io/runbooks/coverage-drop'
    summary: 'Code coverage dropped below 80%'
|
|
51
|
+
|
|
52
|
+
# WarningCoverageDrop
# Fires when line coverage sits in the band [80, 85) for 5 minutes; values
# below 80 are outside this expression's range.
- alert: WarningCoverageDrop
  expr: 'aqe_quality_coverage_line < 85 and aqe_quality_coverage_line >= 80'
  for: 5m
  labels:
    alert_type: coverage_drop
    component: quality
    feedback_action: adjust_strategy
    severity: warning
  annotations:
    description: 'Line coverage is {{ $value }}% (warning at 85%, critical at 80%)'
    feedback_strategy: 'proactive_test_generation'
    summary: 'Code coverage approaching threshold'
|
|
65
|
+
|
|
66
|
+
# BranchCoverageLow
# Fires when aqe_quality_coverage_branch has been below 75 for 5 minutes.
- alert: BranchCoverageLow
  expr: 'aqe_quality_coverage_branch < 75'
  for: 5m
  labels:
    alert_type: coverage_drop
    component: quality
    feedback_action: adjust_strategy
    severity: warning
  annotations:
    description: 'Branch coverage is {{ $value }}% (threshold: 75%)'
    feedback_strategy: 'focus_branch_coverage'
    summary: 'Branch coverage below threshold'
|
|
79
|
+
|
|
80
|
+
# FlakyTestsIncreasing
# Fires when aqe_quality_flaky_count has been above 5 for 1 hour. The
# expression is a static threshold; it does not measure growth over time.
# NOTE(review): agent_scope names "qe-flaky-detector"; confirm that an agent
# with this exact name exists in the package.
- alert: FlakyTestsIncreasing
  expr: 'aqe_quality_flaky_count > 5'
  for: 1h
  labels:
    agent_scope: qe-flaky-detector
    alert_type: flaky_tests
    component: quality
    feedback_action: adjust_strategy
    severity: warning
  annotations:
    description: '{{ $value }} flaky tests detected (threshold: 5). Test stability degrading.'
    feedback_analysis_depth: 'deep'
    feedback_strategy: 'stabilize_flaky_tests'
    runbook_url: 'https://docs.agentic-qe.io/runbooks/flaky-tests'
    summary: 'Number of flaky tests is growing'
|
|
96
|
+
|
|
97
|
+
# CriticalFlakyTestCount
# Fires when aqe_quality_flaky_count has been above 10 for 30 minutes.
# NOTE(review): feedback_action is set both as a label (escalate) and as an
# annotation (quarantine_flaky_tests) — confirm which one consumers read.
- alert: CriticalFlakyTestCount
  expr: 'aqe_quality_flaky_count > 10'
  for: 30m
  labels:
    alert_type: flaky_tests
    component: quality
    feedback_action: escalate
    severity: error
  annotations:
    description: '{{ $value }} flaky tests (critical threshold: 10). Test suite reliability compromised.'
    feedback_action: 'quarantine_flaky_tests'
    summary: 'Critical number of flaky tests detected'
|
|
110
|
+
|
|
111
|
+
# CriticalSecurityVulnerabilities
# Fires immediately (for: 0s) when any series of
# aqe_quality_security_vulnerability_count with severity="critical" is above 0.
- alert: CriticalSecurityVulnerabilities
  expr: 'aqe_quality_security_vulnerability_count{severity="critical"} > 0'
  for: 0s
  labels:
    agent_scope: qe-security-scanner
    alert_type: vulnerability
    component: security
    feedback_action: escalate
    severity: critical
  annotations:
    description: '{{ $value }} critical vulnerabilities found. Deployment must be blocked.'
    # Kept as the string 'true', not a YAML boolean.
    feedback_block_deployment: 'true'
    feedback_notify: 'security_team'
    runbook_url: 'https://docs.agentic-qe.io/runbooks/security-vulnerabilities'
    summary: 'Critical security vulnerabilities detected'
|
|
127
|
+
|
|
128
|
+
# HighSecurityVulnerabilities
# Fires immediately (for: 0s) when any series of
# aqe_quality_security_vulnerability_count with severity="high" is above 0.
# The rule-level severity label (error) shares its name with the series label
# severity="high" selected in the expression.
- alert: HighSecurityVulnerabilities
  expr: 'aqe_quality_security_vulnerability_count{severity="high"} > 0'
  for: 0s
  labels:
    agent_scope: qe-security-scanner
    alert_type: vulnerability
    component: security
    feedback_action: escalate
    severity: error
  annotations:
    description: '{{ $value }} high severity vulnerabilities found. Immediate remediation required.'
    # Kept as the string 'true', not a YAML boolean.
    feedback_block_deployment: 'true'
    feedback_notify: 'security_team'
    summary: 'High severity security vulnerabilities detected'
|
|
143
|
+
|
|
144
|
+
# MediumSecurityVulnerabilities
# Fires when the severity="medium" vulnerability count has been above 5 for
# 5 minutes.
# NOTE(review): feedback_action is set both as a label (auto_remediate) and as
# an annotation (schedule_security_remediation) — confirm which one is read.
- alert: MediumSecurityVulnerabilities
  expr: 'aqe_quality_security_vulnerability_count{severity="medium"} > 5'
  for: 5m
  labels:
    alert_type: vulnerability
    component: security
    feedback_action: auto_remediate
    severity: warning
  annotations:
    description: '{{ $value }} medium severity vulnerabilities (threshold: 5)'
    feedback_action: 'schedule_security_remediation'
    summary: 'Multiple medium severity vulnerabilities'
|
|
157
|
+
|
|
158
|
+
# QualityGateFailed
# Fires when aqe_quality_gate_pass_rate has been below 1.0 for 1 minute.
# The description renders the value with humanizePercentage, so the metric is
# treated as a 0..1 ratio.
- alert: QualityGateFailed
  expr: 'aqe_quality_gate_pass_rate < 1.0'
  for: 1m
  labels:
    agent_scope: qe-quality-gate
    alert_type: quality_gate
    component: quality
    feedback_action: adjust_strategy
    severity: error
  annotations:
    description: 'Quality gate pass rate: {{ $value | humanizePercentage }} (expected: 100%)'
    feedback_focus_areas: 'coverage,complexity,security'
    feedback_strategy: 'incremental_improvement'
    runbook_url: 'https://docs.agentic-qe.io/runbooks/quality-gate-failure'
    summary: 'Quality gate evaluation failed'
|
|
174
|
+
|
|
175
|
+
# =========================================================================
|
|
176
|
+
# PERFORMANCE METRIC ALERTS
|
|
177
|
+
# =========================================================================
|
|
178
|
+
- name: performance_metrics
|
|
179
|
+
interval: 15s
|
|
180
|
+
rules:
|
|
181
|
+
# TestExecutionSlow
# Fires when the P95 of aqe_quality_test_duration (5m rate over the histogram
# buckets) has been above 30000 for 5 minutes.
# The threshold 30000 is documented as "30s", so the metric is in
# milliseconds. Prometheus' humanizeDuration expects seconds and would render
# 30000 as roughly "8h 20m", so the value is printed as plain milliseconds.
# NOTE(review): feedback_action is set both as a label (adjust_strategy) and
# as an annotation (parallel_execution) — confirm which one consumers read.
- alert: TestExecutionSlow
  expr: |
    histogram_quantile(0.95,
      rate(aqe_quality_test_duration_bucket[5m])
    ) > 30000
  for: 5m
  labels:
    agent_scope: qe-test-executor
    alert_type: execution_slow
    component: performance
    feedback_action: adjust_strategy
    severity: warning
  annotations:
    description: 'P95 test execution time is {{ printf "%.0f" $value }}ms (threshold: 30s)'
    feedback_action: 'parallel_execution'
    feedback_strategy: 'optimize_test_suite'
    runbook_url: 'https://docs.agentic-qe.io/runbooks/slow-tests'
    summary: 'Test execution time degraded'
|
|
200
|
+
|
|
201
|
+
# CriticalTestExecutionTime
# Fires when the P95 of aqe_quality_test_duration (5m rate over the histogram
# buckets) has been above 60000 for 3 minutes.
# The threshold 60000 is documented as "60s", so the metric is in
# milliseconds. humanizeDuration expects seconds and would overstate the value
# by a factor of 1000, so the value is printed as plain milliseconds.
# NOTE(review): feedback_action is set both as a label (auto_remediate) and as
# an annotation (emergency_test_optimization) — confirm which one is read.
- alert: CriticalTestExecutionTime
  expr: |
    histogram_quantile(0.95,
      rate(aqe_quality_test_duration_bucket[5m])
    ) > 60000
  for: 3m
  labels:
    alert_type: execution_slow
    component: performance
    feedback_action: auto_remediate
    severity: error
  annotations:
    description: 'P95 test execution time is {{ printf "%.0f" $value }}ms (critical threshold: 60s)'
    feedback_action: 'emergency_test_optimization'
    summary: 'Test execution critically slow'
|
|
217
|
+
|
|
218
|
+
# AgentTaskTimeout
# Fires when the P95 of aqe_agent_task_duration (10m rate over the histogram
# buckets) has been above 120000 for 10 minutes.
# The threshold 120000 is documented as "2m", so the metric is in
# milliseconds. humanizeDuration expects seconds and would overstate the value
# by a factor of 1000, so the value is printed as plain milliseconds.
- alert: AgentTaskTimeout
  expr: |
    histogram_quantile(0.95,
      rate(aqe_agent_task_duration_bucket[10m])
    ) > 120000
  for: 10m
  labels:
    alert_type: task_timeout
    component: performance
    feedback_action: retrain_model
    severity: error
  annotations:
    description: 'P95 task duration is {{ printf "%.0f" $value }}ms (threshold: 2m)'
    feedback_focus: 'task_complexity_estimation'
    # Kept as the string '0.2', not a YAML float.
    feedback_learning_rate: '0.2'
    runbook_url: 'https://docs.agentic-qe.io/runbooks/agent-timeout'
    summary: 'Agent tasks timing out frequently'
|
|
236
|
+
|
|
237
|
+
# HighMemoryUsage
# Fires when aqe_system_memory_usage has been above 500000000 for 1 minute.
# The description formats the value with humanize1024 and appends "B", so the
# metric is presumably in bytes (TODO confirm).
# NOTE(review): feedback_action is set both as a label (auto_remediate) and as
# an annotation (garbage_collect) — confirm which one consumers read.
- alert: HighMemoryUsage
  expr: 'aqe_system_memory_usage > 500000000'
  for: 1m
  labels:
    alert_type: resource_usage
    component: system
    feedback_action: auto_remediate
    severity: warning
  annotations:
    description: 'Memory usage is {{ $value | humanize1024 }}B (threshold: 500MB)'
    feedback_action: 'garbage_collect'
    feedback_optimize_batch_size: 'true'
    runbook_url: 'https://docs.agentic-qe.io/runbooks/high-memory'
    summary: 'Agent memory consumption exceeds threshold'
|
|
252
|
+
|
|
253
|
+
# CriticalMemoryUsage
# Fires when aqe_system_memory_usage has been above 800000000 for 30 seconds.
# NOTE(review): feedback_action is set both as a label (escalate) and as an
# annotation (emergency_memory_cleanup) — confirm which one consumers read.
- alert: CriticalMemoryUsage
  expr: 'aqe_system_memory_usage > 800000000'
  for: 30s
  labels:
    alert_type: resource_usage
    component: system
    feedback_action: escalate
    severity: critical
  annotations:
    description: 'Memory usage is {{ $value | humanize1024 }}B (critical threshold: 800MB). OOM risk.'
    feedback_action: 'emergency_memory_cleanup'
    summary: 'Critical memory usage detected'
|
|
266
|
+
|
|
267
|
+
# HighCPUUsage
# Fires when aqe_system_cpu_usage has been above 80 for 5 minutes. The
# description prints the raw value followed by "%".
- alert: HighCPUUsage
  expr: 'aqe_system_cpu_usage > 80'
  for: 5m
  labels:
    alert_type: resource_usage
    component: system
    feedback_action: adjust_strategy
    severity: warning
  annotations:
    description: 'CPU usage is {{ $value }}% (threshold: 80%)'
    feedback_strategy: 'reduce_concurrent_tasks'
    summary: 'High CPU utilization detected'
|
|
280
|
+
|
|
281
|
+
# =========================================================================
|
|
282
|
+
# LEARNING & ADAPTATION ALERTS
|
|
283
|
+
# =========================================================================
|
|
284
|
+
- name: learning_metrics
|
|
285
|
+
interval: 30s
|
|
286
|
+
rules:
|
|
287
|
+
# LowAgentSuccessRate
# Fires when aqe_agent_success_rate has been below 0.90 for 1 hour. The
# description reads the agent_type label from the firing series.
- alert: LowAgentSuccessRate
  expr: 'aqe_agent_success_rate < 0.90'
  for: 1h
  labels:
    alert_type: success_rate
    component: learning
    feedback_action: retrain_model
    severity: warning
  annotations:
    description: 'Agent {{ $labels.agent_type }} success rate is {{ $value | humanizePercentage }} (threshold: 90%)'
    # Kept as the string '0.3', not a YAML float.
    feedback_exploration_rate: '0.3'
    feedback_focus: 'failed_task_patterns'
    runbook_url: 'https://docs.agentic-qe.io/runbooks/low-success-rate'
    summary: 'Agent success rate below target'
|
|
302
|
+
|
|
303
|
+
# CriticalAgentSuccessRate
# Fires when aqe_agent_success_rate has been below 0.70 for 30 minutes.
# NOTE(review): feedback_action is set both as a label (escalate) and as an
# annotation (emergency_retraining) — confirm which one consumers read.
- alert: CriticalAgentSuccessRate
  expr: 'aqe_agent_success_rate < 0.70'
  for: 30m
  labels:
    alert_type: success_rate
    component: learning
    feedback_action: escalate
    severity: error
  annotations:
    description: 'Agent {{ $labels.agent_type }} success rate is {{ $value | humanizePercentage }} (critical: 70%)'
    feedback_action: 'emergency_retraining'
    summary: 'Critical agent success rate'
|
|
316
|
+
|
|
317
|
+
# HighDefectDensity
# Fires when aqe_quality_defect_density has been above 2.0 for 24 hours. The
# description labels the value as "per KLOC".
- alert: HighDefectDensity
  expr: 'aqe_quality_defect_density > 2.0'
  for: 24h
  labels:
    agent_scope: qe-quality-analyzer
    alert_type: defect_density
    component: quality
    feedback_action: adjust_strategy
    severity: error
  annotations:
    description: 'Defect density is {{ $value }} per KLOC (threshold: 2.0)'
    # Kept as the string 'true', not a YAML boolean.
    feedback_static_analysis: 'true'
    feedback_strategy: 'increase_review_depth'
    runbook_url: 'https://docs.agentic-qe.io/runbooks/high-defect-density'
    summary: 'Defect density exceeds threshold'
|
|
333
|
+
|
|
334
|
+
# AgentTaskFailureSpike
# Per agent_type: 5m rate of failed tasks divided by the 5m rate of all tasks.
# Fires when that ratio has stayed above 0.20 for 10 minutes.
# NOTE(review): feedback_action is set both as a label (retrain_model) and as
# an annotation (analyze_failure_patterns) — confirm which one is read.
- alert: AgentTaskFailureSpike
  expr: |
    (
      sum(rate(aqe_agent_task_count{status="failed"}[5m])) by (agent_type)
      /
      sum(rate(aqe_agent_task_count[5m])) by (agent_type)
    ) > 0.20
  for: 10m
  labels:
    alert_type: failure_spike
    component: learning
    feedback_action: retrain_model
    severity: warning
  annotations:
    description: 'Agent {{ $labels.agent_type }} failure rate is {{ $value | humanizePercentage }} (threshold: 20%)'
    feedback_action: 'analyze_failure_patterns'
    summary: 'Agent experiencing task failure spike'
|
|
352
|
+
|
|
353
|
+
# =========================================================================
|
|
354
|
+
# FLEET COORDINATION ALERTS
|
|
355
|
+
# =========================================================================
|
|
356
|
+
- name: fleet_coordination
|
|
357
|
+
interval: 30s
|
|
358
|
+
rules:
|
|
359
|
+
# HighAgentQueueDepth
# Fires when aqe_system_queue_depth has been above 50 for 5 minutes.
- alert: HighAgentQueueDepth
  expr: 'aqe_system_queue_depth > 50'
  for: 5m
  labels:
    alert_type: queue_depth
    component: coordination
    feedback_action: adjust_strategy
    severity: warning
  annotations:
    description: 'Queue depth is {{ $value }} tasks (threshold: 50)'
    feedback_strategy: 'scale_agents'
    runbook_url: 'https://docs.agentic-qe.io/runbooks/queue-backup'
    summary: 'Agent task queue is backing up'
|
|
373
|
+
|
|
374
|
+
# CriticalAgentQueueDepth
# Fires when aqe_system_queue_depth has been above 100 for 2 minutes.
# NOTE(review): feedback_action is set both as a label (auto_remediate) and as
# an annotation (emergency_queue_drain) — confirm which one consumers read.
- alert: CriticalAgentQueueDepth
  expr: 'aqe_system_queue_depth > 100'
  for: 2m
  labels:
    alert_type: queue_depth
    component: coordination
    feedback_action: auto_remediate
    severity: error
  annotations:
    description: 'Queue depth is {{ $value }} tasks (critical: 100). System overloaded.'
    feedback_action: 'emergency_queue_drain'
    summary: 'Critical agent queue backlog'
|
|
387
|
+
|
|
388
|
+
# SlowDatabaseQueries
# Fires when the P95 of aqe_system_db_query_duration (5m rate over the
# histogram buckets) has been above 1000 for 5 minutes.
# The threshold 1000 is documented as "1s", so the metric is in milliseconds.
# humanizeDuration expects seconds and would render 1000 as roughly "16m 40s",
# so the value is printed as plain milliseconds.
- alert: SlowDatabaseQueries
  expr: |
    histogram_quantile(0.95,
      rate(aqe_system_db_query_duration_bucket[5m])
    ) > 1000
  for: 5m
  labels:
    alert_type: database_slow
    component: system
    feedback_action: adjust_strategy
    severity: warning
  annotations:
    description: 'P95 query duration is {{ printf "%.0f" $value }}ms (threshold: 1s)'
    feedback_strategy: 'optimize_database_access'
    runbook_url: 'https://docs.agentic-qe.io/runbooks/slow-database'
    summary: 'Database queries are slow'
|
|
405
|
+
|
|
406
|
+
# Event Bus Latency High
|
|
407
|
+
- alert: HighEventBusLatency
|
|
408
|
+
expr: |
|
|
409
|
+
rate(aqe_system_eventbus_latency_sum[5m])
|
|
410
|
+
/
|
|
411
|
+
rate(aqe_system_eventbus_latency_count[5m])
|
|
412
|
+
> 500
|
|
413
|
+
for: 5m
|
|
414
|
+
labels:
|
|
415
|
+
severity: warning
|
|
416
|
+
component: coordination
|
|
417
|
+
alert_type: event_latency
|
|
418
|
+
feedback_action: adjust_strategy
|
|
419
|
+
annotations:
|
|
420
|
+
summary: "Event bus experiencing high latency"
|
|
421
|
+
description: 'Average event latency is {{ $value | printf "%.0f" }}ms (threshold: 500ms)'  # value is in ms (expr compares against 500); humanizeDuration would read it as seconds
|
|
422
|
+
feedback_strategy: "optimize_event_handling"
|
|
423
|
+
|
|
424
|
+
# =========================================================================
|
|
425
|
+
# TOKEN COST & EFFICIENCY ALERTS
|
|
426
|
+
# =========================================================================
|
|
427
|
+
- name: cost_efficiency
|
|
428
|
+
interval: 1m
|
|
429
|
+
rules:
|
|
430
|
+
# High Token Cost Rate
|
|
431
|
+
- alert: HighTokenCostRate
|
|
432
|
+
expr: |
|
|
433
|
+
increase(aqe_agent_cost_sum[1h])  # cost accrued over the last hour; rate() would yield a per-second value, not the $/hour the threshold expects
|
|
434
|
+
> 10.0
|
|
435
|
+
for: 15m
|
|
436
|
+
labels:
|
|
437
|
+
severity: warning
|
|
438
|
+
component: cost
|
|
439
|
+
alert_type: token_cost
|
|
440
|
+
feedback_action: adjust_strategy
|
|
441
|
+
annotations:
|
|
442
|
+
summary: "Token costs increasing rapidly"
|
|
443
|
+
description: "Cost rate is ${{ $value }}/hour (threshold: $10/hour)"
|
|
444
|
+
feedback_strategy: "optimize_token_usage"
|
|
445
|
+
runbook_url: "https://docs.agentic-qe.io/runbooks/high-token-cost"
|
|
446
|
+
|
|
447
|
+
# Inefficient Agent Token Usage
|
|
448
|
+
- alert: InefficientAgentTokenUsage
|
|
449
|
+
expr: |
|
|
450
|
+
(
|
|
451
|
+
sum by (agent_type) (rate(aqe_agent_token_usage_sum[1h]))  # tokens/sec per agent type; "by" is only valid on an aggregation operator
|
|
452
|
+
/
|
|
453
|
+
sum by (agent_type) (rate(aqe_agent_task_count{status="success"}[1h]))  # successes/sec per agent type; aggregating drops "status" so both sides match
|
|
454
|
+
) > 10000
|
|
455
|
+
for: 30m
|
|
456
|
+
labels:
|
|
457
|
+
severity: warning
|
|
458
|
+
component: efficiency
|
|
459
|
+
alert_type: token_efficiency
|
|
460
|
+
feedback_action: retrain_model
|
|
461
|
+
annotations:
|
|
462
|
+
summary: "Agent using excessive tokens per successful task"
|
|
463
|
+
description: "Agent {{ $labels.agent_type }} uses {{ $value }} tokens per success (threshold: 10k)"
|
|
464
|
+
feedback_strategy: "optimize_prompt_efficiency"
|
|
465
|
+
|
|
466
|
+
# =========================================================================
|
|
467
|
+
# ALERTING SYSTEM HEALTH
|
|
468
|
+
# =========================================================================
|
|
469
|
+
- name: alerting_system_health
|
|
470
|
+
interval: 30s
|
|
471
|
+
rules:
|
|
472
|
+
# High Alert Fire Rate (Alert Fatigue)
|
|
473
|
+
- alert: AlertFatigueDetected
|
|
474
|
+
expr: |
|
|
475
|
+
sum(increase(aqe_alerting_alerts_fired[1h])) > 20  # count of alerts fired in the last hour; rate() would be per-second
|
|
476
|
+
for: 1h
|
|
477
|
+
labels:
|
|
478
|
+
severity: warning
|
|
479
|
+
component: alerting
|
|
480
|
+
alert_type: alert_fatigue
|
|
481
|
+
annotations:
|
|
482
|
+
summary: "Excessive alerts being fired"
|
|
483
|
+
description: "{{ $value }} alerts fired in the last hour. Potential alert fatigue."
|
|
484
|
+
action: "Review and tune alert thresholds"
|
|
485
|
+
runbook_url: "https://docs.agentic-qe.io/runbooks/alert-fatigue"
|
|
486
|
+
|
|
487
|
+
# High Alert Suppression Rate
|
|
488
|
+
- alert: HighAlertSuppressionRate
|
|
489
|
+
expr: |
|
|
490
|
+
(
|
|
491
|
+
rate(aqe_alerting_alerts_suppressed[1h])
|
|
492
|
+
/
|
|
493
|
+
rate(aqe_alerting_alerts_fired[1h])
|
|
494
|
+
) > 0.5
|
|
495
|
+
for: 1h
|
|
496
|
+
labels:
|
|
497
|
+
severity: info
|
|
498
|
+
component: alerting
|
|
499
|
+
alert_type: suppression_rate
|
|
500
|
+
annotations:
|
|
501
|
+
summary: "High alert suppression rate"
|
|
502
|
+
description: "{{ $value | humanizePercentage }} of alerts are being suppressed. Review cooldown settings."
|
|
503
|
+
|
|
504
|
+
# Feedback Processing Slow
|
|
505
|
+
- alert: SlowFeedbackProcessing
|
|
506
|
+
expr: |
|
|
507
|
+
histogram_quantile(0.95,
|
|
508
|
+
rate(aqe_alerting_feedback_duration_bucket[5m])
|
|
509
|
+
) > 5000
|
|
510
|
+
for: 5m
|
|
511
|
+
labels:
|
|
512
|
+
severity: warning
|
|
513
|
+
component: alerting
|
|
514
|
+
alert_type: feedback_slow
|
|
515
|
+
annotations:
|
|
516
|
+
summary: "Feedback loop processing is slow"
|
|
517
|
+
description: 'P95 feedback processing time is {{ $value | printf "%.0f" }}ms (threshold: 5s)'  # value is in ms (expr compares against 5000); humanizeDuration would read it as seconds
|
|
518
|
+
action: "Investigate feedback router performance"
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
version: '3.8'
|
|
2
|
+
|
|
3
|
+
# OTEL Observability Stack for Agentic QE Fleet
|
|
4
|
+
# Issue #71: Complete OTEL Stack Docker Compose Configuration
|
|
5
|
+
#
|
|
6
|
+
# This compose file sets up a complete observability stack:
|
|
7
|
+
# - OTEL Collector: Receives telemetry via OTLP (gRPC:4317, HTTP:4318)
|
|
8
|
+
# - Prometheus: Scrapes metrics from OTEL Collector (port 9090)
|
|
9
|
+
# - Jaeger: Distributed tracing backend (UI on port 16686)
|
|
10
|
+
# - Grafana: Visualization and dashboards (port 3001)
|
|
11
|
+
#
|
|
12
|
+
# Usage:
|
|
13
|
+
# docker-compose -f config/docker-compose.otel.yml up -d
|
|
14
|
+
# docker-compose -f docker-compose.yml -f config/docker-compose.otel.yml up -d
|
|
15
|
+
|
|
16
|
+
services:
|
|
17
|
+
# OpenTelemetry Collector
|
|
18
|
+
otel-collector:
|
|
19
|
+
image: otel/opentelemetry-collector-contrib:latest
|
|
20
|
+
container_name: agentic-qe-otel-collector
|
|
21
|
+
command: ["--config=/etc/otel-collector-config.yaml"]
|
|
22
|
+
volumes:
|
|
23
|
+
- ./otel-collector-config.yaml.example:/etc/otel-collector-config.yaml:ro
|
|
24
|
+
- otel-data:/var/log/otel
|
|
25
|
+
ports:
|
|
26
|
+
# OTLP gRPC receiver
|
|
27
|
+
- "4317:4317"
|
|
28
|
+
# OTLP HTTP receiver
|
|
29
|
+
- "4318:4318"
|
|
30
|
+
# Prometheus exporter
|
|
31
|
+
- "8889:8889"
|
|
32
|
+
# Collector metrics (self-monitoring)
|
|
33
|
+
- "8888:8888"
|
|
34
|
+
# Health check endpoint
|
|
35
|
+
- "13133:13133"
|
|
36
|
+
# pprof profiling (development only)
|
|
37
|
+
- "1777:1777"
|
|
38
|
+
# zpages debug interface (development only)
|
|
39
|
+
- "55679:55679"
|
|
40
|
+
environment:
|
|
41
|
+
- DEPLOYMENT_ENVIRONMENT=${DEPLOYMENT_ENVIRONMENT:-development}
|
|
42
|
+
networks:
|
|
43
|
+
- agentic-qe-otel
|
|
44
|
+
restart: unless-stopped
|
|
45
|
+
healthcheck:
|
|
46
|
+
test: ["CMD", "wget", "--spider", "-q", "http://localhost:13133/health"]
|
|
47
|
+
interval: 30s
|
|
48
|
+
timeout: 10s
|
|
49
|
+
retries: 3
|
|
50
|
+
start_period: 40s
|
|
51
|
+
|
|
52
|
+
# Prometheus - Metrics storage and querying
|
|
53
|
+
prometheus:
|
|
54
|
+
image: prom/prometheus:latest
|
|
55
|
+
container_name: agentic-qe-prometheus
|
|
56
|
+
command:
|
|
57
|
+
- '--config.file=/etc/prometheus/prometheus.yml'
|
|
58
|
+
- '--storage.tsdb.path=/prometheus'
|
|
59
|
+
- '--storage.tsdb.retention.time=15d'
|
|
60
|
+
- '--storage.tsdb.retention.size=10GB'
|
|
61
|
+
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
|
|
62
|
+
- '--web.console.templates=/usr/share/prometheus/consoles'
|
|
63
|
+
- '--web.enable-lifecycle'
|
|
64
|
+
volumes:
|
|
65
|
+
- ./prometheus.yml.example:/etc/prometheus/prometheus.yml:ro
|
|
66
|
+
- prometheus-data:/prometheus
|
|
67
|
+
# Optionally mount alerting rules
|
|
68
|
+
# - ./prometheus-rules:/etc/prometheus/rules:ro
|
|
69
|
+
ports:
|
|
70
|
+
- "9090:9090"
|
|
71
|
+
networks:
|
|
72
|
+
- agentic-qe-otel
|
|
73
|
+
restart: unless-stopped
|
|
74
|
+
depends_on:
|
|
75
|
+
otel-collector:
|
|
76
|
+
condition: service_started  # NOTE(review): the collector-contrib image is believed to ship without wget, so its healthcheck may never pass and service_healthy would block Prometheus forever - confirm
|
|
77
|
+
healthcheck:
|
|
78
|
+
test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"]
|
|
79
|
+
interval: 30s
|
|
80
|
+
timeout: 10s
|
|
81
|
+
retries: 3
|
|
82
|
+
start_period: 30s
|
|
83
|
+
|
|
84
|
+
# Jaeger - Distributed tracing backend
|
|
85
|
+
jaeger:
|
|
86
|
+
image: jaegertracing/all-in-one:latest
|
|
87
|
+
container_name: agentic-qe-jaeger
|
|
88
|
+
environment:
|
|
89
|
+
# Collector settings
|
|
90
|
+
- COLLECTOR_OTLP_ENABLED=true
|
|
91
|
+
- COLLECTOR_ZIPKIN_HOST_PORT=:9411
|
|
92
|
+
# Storage settings (Badger, persisted to the jaeger-data volume)
|
|
93
|
+
- SPAN_STORAGE_TYPE=badger
|
|
94
|
+
- BADGER_EPHEMERAL=false
|
|
95
|
+
- BADGER_DIRECTORY_VALUE=/badger/data
|
|
96
|
+
- BADGER_DIRECTORY_KEY=/badger/key
|
|
97
|
+
# Query settings
|
|
98
|
+
- QUERY_BASE_PATH=/
|
|
99
|
+
# Metrics backend
|
|
100
|
+
- METRICS_BACKEND=prometheus
|
|
101
|
+
- METRICS_HTTP_ROUTE=/metrics
|
|
102
|
+
volumes:
|
|
103
|
+
- jaeger-data:/badger
|
|
104
|
+
ports:
|
|
105
|
+
# Jaeger UI
|
|
106
|
+
- "16686:16686"
|
|
107
|
+
# OTLP gRPC receiver
|
|
108
|
+
- "4327:4317"
|
|
109
|
+
# OTLP HTTP receiver
|
|
110
|
+
- "4328:4318"
|
|
111
|
+
# Zipkin compatible endpoint
|
|
112
|
+
- "9411:9411"
|
|
113
|
+
# Admin port (health check, metrics)
|
|
114
|
+
- "14269:14269"
|
|
115
|
+
# Jaeger Thrift compact
|
|
116
|
+
- "6831:6831/udp"
|
|
117
|
+
# Jaeger Thrift binary
|
|
118
|
+
- "6832:6832/udp"
|
|
119
|
+
# Jaeger gRPC
|
|
120
|
+
- "14250:14250"
|
|
121
|
+
networks:
|
|
122
|
+
- agentic-qe-otel
|
|
123
|
+
restart: unless-stopped
|
|
124
|
+
healthcheck:
|
|
125
|
+
test: ["CMD", "wget", "--spider", "-q", "http://localhost:14269/"]
|
|
126
|
+
interval: 30s
|
|
127
|
+
timeout: 10s
|
|
128
|
+
retries: 3
|
|
129
|
+
start_period: 30s
|
|
130
|
+
|
|
131
|
+
# Grafana - Visualization and dashboards
|
|
132
|
+
grafana:
|
|
133
|
+
image: grafana/grafana:latest
|
|
134
|
+
container_name: agentic-qe-grafana
|
|
135
|
+
environment:
|
|
136
|
+
# Admin credentials (change in production!)
|
|
137
|
+
- GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
|
|
138
|
+
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
|
|
139
|
+
# Server settings
|
|
140
|
+
- GF_SERVER_ROOT_URL=http://localhost:3001
|
|
141
|
+
- GF_SERVER_SERVE_FROM_SUB_PATH=false
|
|
142
|
+
# Anonymous access (disabled here; enable for development only)
|
|
143
|
+
- GF_AUTH_ANONYMOUS_ENABLED=false
|
|
144
|
+
# Provisioning
|
|
145
|
+
- GF_PATHS_PROVISIONING=/etc/grafana/provisioning
|
|
146
|
+
# Plugins
|
|
147
|
+
- GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource
|
|
148
|
+
volumes:
|
|
149
|
+
# Datasource provisioning
|
|
150
|
+
- ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro
|
|
151
|
+
# Dashboard provisioning
|
|
152
|
+
- ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro
|
|
153
|
+
# Dashboard JSON files
|
|
154
|
+
- ./grafana/dashboards:/var/lib/grafana/dashboards:ro
|
|
155
|
+
# Persistent storage
|
|
156
|
+
- grafana-data:/var/lib/grafana
|
|
157
|
+
ports:
|
|
158
|
+
- "3001:3000"
|
|
159
|
+
networks:
|
|
160
|
+
- agentic-qe-otel
|
|
161
|
+
restart: unless-stopped
|
|
162
|
+
depends_on:
|
|
163
|
+
prometheus:
|
|
164
|
+
condition: service_healthy
|
|
165
|
+
jaeger:
|
|
166
|
+
condition: service_healthy
|
|
167
|
+
healthcheck:
|
|
168
|
+
test: ["CMD", "wget", "--spider", "-q", "http://localhost:3000/api/health"]
|
|
169
|
+
interval: 30s
|
|
170
|
+
timeout: 10s
|
|
171
|
+
retries: 3
|
|
172
|
+
start_period: 40s
|
|
173
|
+
|
|
174
|
+
volumes:
|
|
175
|
+
otel-data:
|
|
176
|
+
driver: local
|
|
177
|
+
prometheus-data:
|
|
178
|
+
driver: local
|
|
179
|
+
jaeger-data:
|
|
180
|
+
driver: local
|
|
181
|
+
grafana-data:
|
|
182
|
+
driver: local
|
|
183
|
+
|
|
184
|
+
networks:
|
|
185
|
+
agentic-qe-otel:
|
|
186
|
+
driver: bridge
|
|
187
|
+
name: agentic-qe-otel
|