claude-flow-novice 2.2.4 → 2.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +9 -8
- package/scripts/.claude-flow/metrics/agent-metrics.json +1 -0
- package/scripts/.claude-flow/metrics/performance.json +9 -0
- package/scripts/.claude-flow/metrics/task-metrics.json +10 -0
- package/scripts/CLEANUP_OPTIMIZATION_REPORT.json +312 -0
- package/scripts/CLEANUP_PERFORMANCE_OPTIMIZATION.md +387 -0
- package/scripts/CLEANUP_QUICK_START.md +268 -0
- package/scripts/CLEANUP_TEST_RESULTS.md +205 -0
- package/scripts/README.md +339 -0
- package/scripts/ace-query.sh +384 -0
- package/scripts/agent-token-analysis.js +430 -0
- package/scripts/auto-setup.js +332 -0
- package/scripts/build/README.md +167 -0
- package/scripts/build/build-config.js +27 -0
- package/scripts/build/build-prompt-copier.sh +30 -0
- package/scripts/build/performance-monitor.js +869 -0
- package/scripts/build/prepare-publish.js +150 -0
- package/scripts/build/typescript-fixer.js +621 -0
- package/scripts/build/unified-builder.sh +428 -0
- package/scripts/build/update-bin-version.js +32 -0
- package/scripts/build/validate-agents.js +238 -0
- package/scripts/build-index.js +43 -0
- package/scripts/build-orchestrator.js +320 -0
- package/scripts/check-routing-stats.cjs +122 -0
- package/scripts/ci-validation.js +375 -0
- package/scripts/cleanup-blocking-coordination.sh +420 -0
- package/scripts/cleanup-idle-sessions.sh +59 -0
- package/scripts/collect-build-metrics.js +65 -0
- package/scripts/demo/README.md +79 -0
- package/scripts/demo/autoscaling-demo-simplified.js +963 -0
- package/scripts/demo/comprehensive-dashboard-test.js +693 -0
- package/scripts/demo/confidence-log.js +87 -0
- package/scripts/demo/confidence-report.js +82 -0
- package/scripts/demo/demo-multi-swarm-coordination.js +325 -0
- package/scripts/demo/demo-production-deployment.js +399 -0
- package/scripts/demo/demo-visualization-system.js +149 -0
- package/scripts/demo/performance-analysis.cjs +71 -0
- package/scripts/demo/performance-analysis.js +71 -0
- package/scripts/demo/test-autoscaling-demo.js +314 -0
- package/scripts/dependency-optimizer.js +349 -0
- package/scripts/dependency-security-assessment.js +331 -0
- package/scripts/deploy-sdk.sh +176 -0
- package/scripts/deployment-readiness-report.json +179 -0
- package/scripts/dev/README.md +264 -0
- package/scripts/dev/claude-flow-wrapper.sh +35 -0
- package/scripts/dev/claude-monitor.py +419 -0
- package/scripts/dev/claude-sparc.sh +562 -0
- package/scripts/dev/claude-wrapper.sh +17 -0
- package/scripts/dev/demo-phase3-compliance.js +172 -0
- package/scripts/dev/demo-task-system.ts +224 -0
- package/scripts/dev/deployment-validator.js +315 -0
- package/scripts/dev/spawn-claude-terminal.sh +32 -0
- package/scripts/dev/start-portal.sh +506 -0
- package/scripts/dev/start-web-ui.js +15 -0
- package/scripts/dev/stop-portal.sh +311 -0
- package/scripts/dev/validate-examples.ts +288 -0
- package/scripts/dev/validate-phase2.cjs +451 -0
- package/scripts/dev/validate-phase2.js +785 -0
- package/scripts/dev/validate-phase3.cjs +208 -0
- package/scripts/dev/validate-security-remediation.js +1 -0
- package/scripts/ecosystem.config.cjs +90 -0
- package/scripts/fix-js-extensions.js +167 -0
- package/scripts/generate-basic-types.js +73 -0
- package/scripts/generate-changelog.js +318 -0
- package/scripts/git-hooks/pre-commit.sh +143 -0
- package/scripts/health-checks.js +634 -0
- package/scripts/hook-wrapper.sh +54 -0
- package/scripts/install/README.md +375 -0
- package/scripts/install/REDIS_SETUP_VALIDATION.json +245 -0
- package/scripts/install/check-prerequisites.js +303 -0
- package/scripts/install/config-wizard.js +606 -0
- package/scripts/install/dependency-checker.js +385 -0
- package/scripts/install/health-check.js +765 -0
- package/scripts/install/install.js +256 -0
- package/scripts/install/installation-benchmark.js +461 -0
- package/scripts/install/quick-install.js +720 -0
- package/scripts/install/quick-start-wizard.js +295 -0
- package/scripts/install/redis-cli.js +289 -0
- package/scripts/install/redis-install-guides.md +407 -0
- package/scripts/install/redis-setup.js +559 -0
- package/scripts/install/redis-test.js +278 -0
- package/scripts/install/service-manager.js +672 -0
- package/scripts/install/setup.js +832 -0
- package/scripts/install/uninstall.js +526 -0
- package/scripts/install/update.js +461 -0
- package/scripts/install-pre-commit-hook.sh +127 -0
- package/scripts/legacy/README.md +272 -0
- package/scripts/legacy/batch-fix-ts.sh +54 -0
- package/scripts/legacy/build-migration.sh +105 -0
- package/scripts/legacy/build-monitor.js +209 -0
- package/scripts/legacy/build-with-filter.sh +84 -0
- package/scripts/legacy/build-workaround.sh +71 -0
- package/scripts/legacy/fix-ts-advanced.js +358 -0
- package/scripts/legacy/fix-ts-final.sh +50 -0
- package/scripts/legacy/fix-ts-targeted.sh +49 -0
- package/scripts/legacy/fix-typescript-errors.js +305 -0
- package/scripts/legacy/force-build.sh +63 -0
- package/scripts/legacy/optimize-performance.js +400 -0
- package/scripts/legacy/performance-monitor.js +263 -0
- package/scripts/legacy/performance-monitoring.js +532 -0
- package/scripts/legacy/performance-test-runner.js +645 -0
- package/scripts/legacy/quick-fix-ts.js +281 -0
- package/scripts/legacy/safe-build.sh +63 -0
- package/scripts/memory-monitor-coordinator.js +322 -0
- package/scripts/migrate-to-sdk.sh +520 -0
- package/scripts/migration/QUICK-START.md +189 -0
- package/scripts/migration/QUICK-START.md.backup-1760135091363 +189 -0
- package/scripts/migration/README.md +464 -0
- package/scripts/migration/TASK-1.3.2-COMPLETION-REPORT.md +500 -0
- package/scripts/migration/TASK-1.3.2-COMPLETION-REPORT.md.backup-1760135091348 +500 -0
- package/scripts/migration/UPDATE-PATHS-README.md +464 -0
- package/scripts/migration/UPDATE-PATHS-README.md.backup-1760135091337 +464 -0
- package/scripts/migration/example-patterns.json +19 -0
- package/scripts/migration/install-arm64.js +78 -0
- package/scripts/migration/install.js +83 -0
- package/scripts/migration/migrate-hooks.js +173 -0
- package/scripts/migration/migration-examples.ts +318 -0
- package/scripts/migration/reorganize-workspace.js +504 -0
- package/scripts/migration/test-update-paths.js +359 -0
- package/scripts/migration/update-paths.js +664 -0
- package/scripts/migration/validate-migration.js +647 -0
- package/scripts/monitor-loop.sh +65 -0
- package/scripts/monitor-memory.sh +47 -0
- package/scripts/monitor-migration.js +339 -0
- package/scripts/monitor.py +43 -0
- package/scripts/monitoring/README.md +178 -0
- package/scripts/monitoring/alert-monitor.sh +220 -0
- package/scripts/monitoring/analyze-resources.sh +199 -0
- package/scripts/monitoring/dashboards/rate-limiting-dashboard.json +211 -0
- package/scripts/monitoring/dynamic-monitor.sh +85 -0
- package/scripts/monitoring/launch-stability-test.sh +184 -0
- package/scripts/monitoring/monitor-test.sh +93 -0
- package/scripts/monitoring/pre-test-validation.sh +208 -0
- package/scripts/monitoring/quick-test-alerting.sh +118 -0
- package/scripts/monitoring/quick-test-rate-limiting.sh +206 -0
- package/scripts/monitoring/rate-limiting-monitor.sh +380 -0
- package/scripts/monitoring/resource-monitor.sh +126 -0
- package/scripts/monitoring/stability-monitor.js +429 -0
- package/scripts/monitoring/test-monitor-quick.sh +54 -0
- package/scripts/monitoring/view-alerts.sh +307 -0
- package/scripts/npm-metrics-collector.js +482 -0
- package/scripts/npm-package-validation.cjs +299 -0
- package/scripts/optimization/build-optimizer.js +438 -0
- package/scripts/optimization/config-validator.js +761 -0
- package/scripts/optimization/test-optimization.js +432 -0
- package/scripts/optimization/unified-activation.js +839 -0
- package/scripts/optimize-package-swarm.js +54 -0
- package/scripts/performance/ACTIVATION_COMMANDS.md +292 -0
- package/scripts/performance/sqlite-enhanced-activation.sh +583 -0
- package/scripts/performance/test-enhanced-backend.sh +504 -0
- package/scripts/performance-monitor.js +644 -0
- package/scripts/performance-test-runner.js +698 -0
- package/scripts/post-deployment-monitoring.js +350 -0
- package/scripts/post-edit-pipeline.js +2091 -0
- package/scripts/post-install-claude-md.js +78 -0
- package/scripts/postinstall.js +79 -0
- package/scripts/pre-publish-validation.cjs +212 -0
- package/scripts/pre-publish-validation.js +429 -0
- package/scripts/redis-lua/cleanup-blocking-coordination.lua +198 -0
- package/scripts/release-announcement.js +425 -0
- package/scripts/release-notification.js +248 -0
- package/scripts/release-rollback.js +376 -0
- package/scripts/release-validation.js +460 -0
- package/scripts/rollback-sdk.sh +66 -0
- package/scripts/run-production-validation.ts +590 -0
- package/scripts/run-stability-validation.sh +687 -0
- package/scripts/security/README.md +339 -0
- package/scripts/security/deployment-validation.cjs +279 -0
- package/scripts/security/envelope-encryption-confidence-report.cjs +422 -0
- package/scripts/security/install-git-hooks.sh +132 -0
- package/scripts/security/install-git-secrets.sh +295 -0
- package/scripts/security/rotate-api-keys.js +469 -0
- package/scripts/security/ruv-swarm-safe.js +74 -0
- package/scripts/security/security-audit.cjs +538 -0
- package/scripts/security/setup-redis-auth.sh +397 -0
- package/scripts/security/validate-envelope-encryption.cjs +340 -0
- package/scripts/security-scan.js +492 -0
- package/scripts/src/web/frontend/.claude-flow/metrics/agent-metrics.json +1 -0
- package/scripts/src/web/frontend/.claude-flow/metrics/performance.json +9 -0
- package/scripts/src/web/frontend/.claude-flow/metrics/task-metrics.json +10 -0
- package/scripts/switch-api.sh +158 -0
- package/scripts/sync-agents.js +290 -0
- package/scripts/test/50-agent-test.js +625 -0
- package/scripts/test/NEW_STABILITY_TEST_GUIDE.md +407 -0
- package/scripts/test/README.md +236 -0
- package/scripts/test/STABILITY_TEST_EXAMPLE.md +347 -0
- package/scripts/test/STABILITY_TEST_README.md +480 -0
- package/scripts/test/agent-worker.js +309 -0
- package/scripts/test/ai-coordination-test.js +650 -0
- package/scripts/test/ai-mesh-coordination-test.js +416 -0
- package/scripts/test/check-links.ts +274 -0
- package/scripts/test/check-performance-regression.ts +168 -0
- package/scripts/test/cli-agent-coordination-test.js +313 -0
- package/scripts/test/coordinator-multilingual-test.js +396 -0
- package/scripts/test/coordinator-transparency-demo.js +585 -0
- package/scripts/test/coverage-report.ts +692 -0
- package/scripts/test/generate-swarm-tests.js +633 -0
- package/scripts/test/integration-test-validation.cjs +253 -0
- package/scripts/test/load-test-swarm.js +576 -0
- package/scripts/test/mesh-coordination-zero-overlap-test.js +740 -0
- package/scripts/test/multilingual-hello-world-test.js +390 -0
- package/scripts/test/quick-multilingual-demo.js +464 -0
- package/scripts/test/real-agent-test.js +312 -0
- package/scripts/test/run-phase3-compliance-tests.js +427 -0
- package/scripts/test/run-stability-test-examples.sh +292 -0
- package/scripts/test/stability-results/stability-metrics.jsonl +83 -0
- package/scripts/test/stability-results/stability-test-report.json +128 -0
- package/scripts/test/stability-results/stability-test.log +1827 -0
- package/scripts/test/stability-test-50-agents.js +734 -0
- package/scripts/test/test-batch-tasks.ts +29 -0
- package/scripts/test/test-byzantine-resolution.js +246 -0
- package/scripts/test/test-claude-spawn-options.sh +63 -0
- package/scripts/test/test-cli-wizard.js +331 -0
- package/scripts/test/test-comprehensive.js +401 -0
- package/scripts/test/test-coordination-features.ts +238 -0
- package/scripts/test/test-fallback-systems.js +276 -0
- package/scripts/test/test-init-command.ts +302 -0
- package/scripts/test/test-mcp.ts +251 -0
- package/scripts/test/test-runner.ts +568 -0
- package/scripts/test/test-swarm-integration.sh +92 -0
- package/scripts/test/test-swarm.ts +142 -0
- package/scripts/test/validation-summary.ts +408 -0
- package/scripts/test-cleanup-performance.sh +416 -0
- package/scripts/test-dashboard-auth.cjs +203 -0
- package/scripts/test-docker-deployment.sh +207 -0
- package/scripts/test-npm-package.cjs +167 -0
- package/scripts/test-provider-routing.cjs +226 -0
- package/scripts/test-routing-telemetry.cjs +147 -0
- package/scripts/test-runner.cjs +154 -0
- package/scripts/test-zai-10k.cjs +81 -0
- package/scripts/test-zai-api.cjs +191 -0
- package/scripts/test-zai-diagnostic.cjs +151 -0
- package/scripts/test-zai-final.cjs +128 -0
- package/scripts/test-zai-with-env.cjs +85 -0
- package/scripts/utils/README.md +261 -0
- package/scripts/utils/clean-build-artifacts.sh +94 -0
- package/scripts/utils/cleanup-root.sh +69 -0
- package/scripts/utils/fix-cliffy-imports.js +307 -0
- package/scripts/utils/fix-duplicate-imports.js +114 -0
- package/scripts/utils/fix-error-handling.cjs +70 -0
- package/scripts/utils/fix-import-paths.js +104 -0
- package/scripts/utils/fix-imports.js +116 -0
- package/scripts/utils/fix-shebang.js +78 -0
- package/scripts/utils/fix-test-modules.js +27 -0
- package/scripts/utils/fix-timezone-issue-246.js +200 -0
- package/scripts/utils/fix-ts-comprehensive.py +182 -0
- package/scripts/utils/fix-ts-targeted-batch.js +250 -0
- package/scripts/utils/remove-benchmark-conflicts.sh +140 -0
- package/scripts/utils/simple-test-fixer.js +190 -0
- package/scripts/utils/validate-metrics-structure.cjs +144 -0
- package/scripts/validate-agent-hooks.js +506 -0
- package/scripts/validate-changelog.js +241 -0
- package/scripts/validate-coordination-cli.js +69 -0
- package/scripts/validate-coordination-toggle-integration.cjs +501 -0
- package/scripts/validate-docker-infrastructure.sh +502 -0
- package/scripts/validate-entry-points.js +300 -0
- package/scripts/validate-stage3-performance.ts +377 -0
- package/scripts/validate-template-bundling.js +180 -0
- package/scripts/validation/README.md +33 -0
- package/scripts/validation/acl-security-validation.cjs +214 -0
- package/scripts/validation/acl-security-validation.js +402 -0
- package/scripts/validation/byzantine-verification.js +407 -0
- package/scripts/validation/final-phase-2-consensus.cjs +219 -0
- package/scripts/validation/final-security-validation.js +791 -0
- package/scripts/validation/final-wasm-validation.cjs +840 -0
- package/scripts/validation/integration-test-analysis.js +105 -0
- package/scripts/validation/phase-0-comprehensive-validation.js +474 -0
- package/scripts/validation/phase-0-consensus-report.js +139 -0
- package/scripts/validation/phase-0-final-report.js +112 -0
- package/scripts/validation/phase-0-redis-consensus-report.js +129 -0
- package/scripts/validation/phase-0-validation-improved.js +490 -0
- package/scripts/validation/phase-0-validation-test.js +65 -0
- package/scripts/validation/phase-1-consensus-report.cjs +342 -0
- package/scripts/validation/phase-1-consensus-validation.cjs +551 -0
- package/scripts/validation/phase-1-consensus-validation.js +551 -0
- package/scripts/validation/phase-2-consensus-report.cjs +186 -0
- package/scripts/validation/phase-2-validation.cjs +171 -0
- package/scripts/validation/phase-2-validation.js +171 -0
- package/scripts/validation/phase-4-consensus-report.js +181 -0
- package/scripts/validation/phase-4-final-validation.js +351 -0
- package/scripts/validation/phase-5-consensus-report.cjs +113 -0
- package/scripts/validation/phase-5-consensus-report.js +113 -0
- package/scripts/validation/security-analysis.js +49 -0
- package/scripts/validation/security-validation.js +492 -0
- package/scripts/validation/simple-security-validation.js +464 -0
- package/scripts/verify-installation.js +112 -0
- package/scripts/verify-mcp-server.js +86 -0
- package/scripts/verify-sdk-phase1.cjs +293 -0
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# scripts/monitoring/alert-monitor.sh - Continuous monitoring daemon with alerting
# Phase 1 Sprint 1.1: Monitoring loop integration

# Abort on command failure, on use of an unset variable, and on a failure
# anywhere inside a pipeline.
set -euo pipefail

# ==============================================================================
# CONFIGURATION
# ==============================================================================

# Absolute directory of this script, and the library directory two levels up.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LIB_DIR="${SCRIPT_DIR}/../../lib"

# Every setting below keeps a non-empty value inherited from the environment
# and otherwise falls back to the default shown.
: "${METRICS_FILE:=/dev/shm/cfn-metrics.jsonl}"
: "${ALERT_LOG_FILE:=/dev/shm/cfn-alerts.jsonl}"
: "${MONITOR_PID_FILE:=/dev/shm/alert-monitor.pid}"

# Check interval (seconds)
: "${CHECK_INTERVAL:=30}"

# Cleanup retention (hours)
: "${ALERT_RETENTION_HOURS:=24}"
: "${METRICS_RETENTION_HOURS:=48}"

# ==============================================================================
# DEPENDENCIES
# ==============================================================================

# The alerting library is mandatory: stop immediately when it is missing.
if [ ! -f "$LIB_DIR/alerting.sh" ]; then
    echo "[ERROR] Alerting library not found at $LIB_DIR/alerting.sh" >&2
    exit 1
fi

# shellcheck source=../../lib/alerting.sh
source "$LIB_DIR/alerting.sh"
|
|
36
|
+
|
|
37
|
+
# ==============================================================================
|
|
38
|
+
# SIGNAL HANDLERS
|
|
39
|
+
# ==============================================================================
|
|
40
|
+
|
|
41
|
+
# cleanup - Signal handler: announce shutdown, drop the PID file, exit cleanly
cleanup() {
    printf '%s\n' "[INFO] Shutting down alert monitor (PID: $$)" >&2
    rm -f -- "$MONITOR_PID_FILE"
    exit 0
}

# Run the handler on termination requests and on Ctrl-C.
trap cleanup TERM INT
|
|
48
|
+
|
|
49
|
+
# ==============================================================================
|
|
50
|
+
# MONITORING FUNCTIONS
|
|
51
|
+
# ==============================================================================
|
|
52
|
+
|
|
53
|
+
# start_monitor - Begin continuous threshold monitoring
#
# Writes this shell's PID to $MONITOR_PID_FILE, then loops forever:
#   1. when $METRICS_FILE exists, runs check_thresholds on it and prefixes every
#      output line with a UTC timestamp;
#   2. every 100th iteration, runs cleanup_old_data;
#   3. sleeps $CHECK_INTERVAL seconds.
#
# check_thresholds is not defined in this script (presumably it comes from the
# sourced alerting library - confirm). The function never returns on its own.
start_monitor() {
    local iteration=0

    echo "[INFO] Alert monitor started (PID: $$)" >&2
    echo "[INFO] Check interval: ${CHECK_INTERVAL}s" >&2
    echo "[INFO] Metrics file: $METRICS_FILE" >&2
    echo "[INFO] Alert log: $ALERT_LOG_FILE" >&2

    # Write PID file
    echo $$ > "$MONITOR_PID_FILE"

    while true; do
        iteration=$((iteration + 1))

        # Check thresholds
        if [ -f "$METRICS_FILE" ]; then
            # The script runs under `set -euo pipefail`, so without the `||`
            # branch a single non-zero exit from check_thresholds would end
            # the daemon and leave a stale PID file behind.
            check_thresholds "$METRICS_FILE" 2>&1 | while IFS= read -r line; do
                echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] $line"
            done || echo "[WARN] Threshold check failed (iteration $iteration)" >&2
        fi

        # Periodic cleanup (every 100 iterations)
        if [ $((iteration % 100)) -eq 0 ]; then
            echo "[INFO] Running periodic cleanup (iteration $iteration)" >&2
            # A failed cleanup is logged, not fatal: monitoring must continue.
            cleanup_old_data || echo "[WARN] Periodic cleanup failed (iteration $iteration)" >&2
        fi

        # Sleep until next check
        sleep "$CHECK_INTERVAL"
    done
}
|
|
85
|
+
|
|
86
|
+
# cleanup_old_data - Remove old metrics and alerts
#
# Alerts:  when $ALERT_LOG_FILE exists, calls clear_old_alerts with
#          $ALERT_RETENTION_HOURS and logs how many lines disappeared.
#          clear_old_alerts is not defined in this script (presumably provided
#          by the sourced alerting library - confirm).
# Metrics: when $METRICS_FILE exists, keeps only records whose .timestamp is
#          >= (now - $METRICS_RETENTION_HOURS hours) and logs how many lines
#          were dropped. The comparison is a string comparison done by jq.
#
# The metrics file is replaced ONLY when jq succeeds. The output redirection
# creates the temp file even when jq fails or is not installed, so merely
# testing for the temp file's existence would overwrite the metrics with
# empty or partial output.
cleanup_old_data() {
    local lines_before lines_after

    # Clear old alerts
    if [ -f "$ALERT_LOG_FILE" ]; then
        lines_before=$(wc -l < "$ALERT_LOG_FILE" 2>/dev/null || echo "0")

        clear_old_alerts "$ALERT_RETENTION_HOURS"

        lines_after=$(wc -l < "$ALERT_LOG_FILE" 2>/dev/null || echo "0")

        echo "[INFO] Cleared $((lines_before - lines_after)) old alerts" >&2
    fi

    # Clear old metrics
    if [ -f "$METRICS_FILE" ]; then
        lines_before=$(wc -l < "$METRICS_FILE" 2>/dev/null || echo "0")

        # Try GNU date first, then BSD date; if neither works fall back to the
        # epoch so that no record is considered old.
        local cutoff_time
        cutoff_time=$(date -u -d "$METRICS_RETENTION_HOURS hours ago" +"%Y-%m-%dT%H:%M:%S" 2>/dev/null || \
            date -u -v-"${METRICS_RETENTION_HOURS}H" +"%Y-%m-%dT%H:%M:%S" 2>/dev/null || \
            echo "1970-01-01T00:00:00")

        local temp_file="${METRICS_FILE}.tmp"
        if jq -c --arg cutoff "$cutoff_time" \
            'select(.timestamp >= $cutoff)' \
            "$METRICS_FILE" > "$temp_file" 2>/dev/null; then
            mv "$temp_file" "$METRICS_FILE"

            lines_after=$(wc -l < "$METRICS_FILE" 2>/dev/null || echo "0")

            echo "[INFO] Cleared $((lines_before - lines_after)) old metrics" >&2
        else
            # Keep the existing metrics untouched and discard the partial output.
            rm -f "$temp_file"
            echo "[WARN] Metrics cleanup skipped: jq could not filter $METRICS_FILE" >&2
        fi
    fi
}
|
|
126
|
+
|
|
127
|
+
# get_monitor_status - Report whether the monitor is running
#
# Prints one line on stdout:
#   "stopped"            - no PID file                        (returns 1)
#   "running (PID: N)"   - PID file names a live process      (returns 0)
#   "stale (...)"        - PID file names a dead process; the
#                          file is deleted as a side effect   (returns 1)
get_monitor_status() {
    if [ ! -f "$MONITOR_PID_FILE" ]; then
        echo "stopped"
        return 1
    fi

    local recorded_pid
    recorded_pid=$(<"$MONITOR_PID_FILE")

    # `kill -0` sends no signal; it only tests that the process can be signalled.
    if ! kill -0 "$recorded_pid" 2>/dev/null; then
        echo "stale (PID file exists but process not running)"
        rm -f "$MONITOR_PID_FILE"
        return 1
    fi

    echo "running (PID: $recorded_pid)"
    return 0
}
|
|
146
|
+
|
|
147
|
+
# stop_monitor - Stop a running monitor
#
# Without a PID file, only logs that nothing is running. Otherwise sends
# SIGTERM to the recorded PID, polls up to 10 times at 0.5 s intervals for the
# process to exit, and escalates to SIGKILL if it is still alive. On a graceful
# stop the function returns as soon as the process is gone; in every other
# case with a PID file present, the PID file is removed before returning.
stop_monitor() {
    if [ ! -f "$MONITOR_PID_FILE" ]; then
        echo "[INFO] No monitor running" >&2
        return 0
    fi

    local target_pid
    local polls=0
    target_pid=$(cat "$MONITOR_PID_FILE")

    if kill -0 "$target_pid" 2>/dev/null; then
        echo "[INFO] Stopping monitor (PID: $target_pid)" >&2
        kill -TERM "$target_pid"

        # Wait for graceful shutdown (max 5 seconds)
        while [ "$polls" -lt 10 ]; do
            if ! kill -0 "$target_pid" 2>/dev/null; then
                echo "[INFO] Monitor stopped successfully" >&2
                return 0
            fi
            sleep 0.5
            polls=$((polls + 1))
        done

        # Force kill if still running
        if kill -0 "$target_pid" 2>/dev/null; then
            echo "[WARN] Monitor did not stop gracefully, forcing..." >&2
            kill -KILL "$target_pid" 2>/dev/null || true
        fi
    fi

    rm -f "$MONITOR_PID_FILE"
}
|
|
178
|
+
|
|
179
|
+
# ==============================================================================
|
|
180
|
+
# MAIN EXECUTION
|
|
181
|
+
# ==============================================================================
|
|
182
|
+
|
|
183
|
+
# ensure_not_running - Abort with an error when a live monitor owns the PID file
#
# Uses get_monitor_status rather than a bare file-existence test, so a PID file
# left behind by a crashed monitor no longer blocks start-up:
# get_monitor_status deletes a stale PID file and reports "not running".
ensure_not_running() {
    if get_monitor_status > /dev/null; then
        echo "[ERROR] Monitor already running (PID: $(cat "$MONITOR_PID_FILE"))" >&2
        exit 1
    fi
}

# Command dispatch; the first argument selects the action and defaults to "start".
case "${1:-start}" in
    start)
        ensure_not_running
        start_monitor
        ;;

    stop)
        stop_monitor
        ;;

    restart)
        stop_monitor
        sleep 1
        start_monitor
        ;;

    status)
        get_monitor_status
        ;;

    background)
        # Start in background: re-invoke this script with "start", detached
        # from the terminal, with all output captured in the log file.
        ensure_not_running
        nohup "$0" start > /dev/shm/alert-monitor.log 2>&1 &
        echo "[INFO] Monitor started in background (PID: $!)" >&2
        ;;

    *)
        echo "Usage: $0 {start|stop|restart|status|background}" >&2
        exit 1
        ;;
esac
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
#!/bin/bash
# Analysis script for resource monitoring results
# Identifies leaks, spikes, and anomalies
#
# Usage: analyze-resources.sh <csv_file>
#
# The first CSV line is skipped as a header. Columns read from the data rows
# (1-based): 2 = elapsed time, 3 = memory RSS, 6 = CPU, 7 = file descriptors,
# 8 = process count. The report is printed to stdout and also written to
# analysis-report-<timestamp>.txt next to the input file.

set -euo pipefail

CSV_FILE="${1:-}"

if [ -z "${CSV_FILE}" ] || [ ! -f "${CSV_FILE}" ]; then
    echo "Usage: $0 <csv_file>"
    echo "Example: $0 ./reports/monitoring/resource-usage-20250106_120000.csv"
    exit 1
fi

OUTPUT_DIR=$(dirname "${CSV_FILE}")
REPORT_FILE="${OUTPUT_DIR}/analysis-report-$(date +%Y%m%d_%H%M%S).txt"
RULE="========================================"

# Start from an empty report; every later line is appended.
: > "${REPORT_FILE}"

# report - Print one line to stdout and append it to the report file.
report() {
    printf '%s\n' "$1" | tee -a "${REPORT_FILE}"
}

# banner - Emit a section title framed by two rule lines.
banner() {
    report "${RULE}"
    report "$1"
    report "${RULE}"
}

# column_stats - Print "min|max|avg|first|last" for CSV column $1 of DATA.
#   $1  1-based column number
#   $2  printf format for the average (e.g. '%.2f')
# A single awk pass consumes ALL input. Pipelines that end in `head -1` can
# kill their producer with SIGPIPE on large inputs, which aborts the whole
# script under `set -e` + `pipefail`.
column_stats() {
    printf '%s\n' "${DATA}" | awk -F',' -v col="$1" -v avg_fmt="$2" '
        NR == 1 { min = max = first = $col }
        {
            value = $col
            if (value + 0 < min + 0) min = value
            if (value + 0 > max + 0) max = value
            sum += value
            last = value
        }
        END { printf "%s|%s|" avg_fmt "|%s|%s\n", min, max, sum / NR, first, last }
    '
}

# exceeds - Succeed when $1 > $2 (floating point, evaluated by bc).
# Any bc failure counts as "not exceeded".
exceeds() {
    [ "$(echo "$1 > $2" | bc -l 2>/dev/null || echo 0)" = "1" ]
}

banner "RESOURCE MONITORING ANALYSIS REPORT"
report "Input: ${CSV_FILE}"
report "Generated: $(date)"
report ""

# Skip header, get data
DATA=$(tail -n +2 "${CSV_FILE}")

if [ -z "${DATA}" ]; then
    report "ERROR: No data found in CSV file"
    exit 1
fi

# Total samples
TOTAL_SAMPLES=$(printf '%s\n' "${DATA}" | wc -l)
report "Total Samples: ${TOTAL_SAMPLES}"

# Duration (difference between the last and first elapsed values; integers expected)
stats=$(column_stats 2 '%s')
IFS='|' read -r _ _ _ FIRST_ELAPSED LAST_ELAPSED <<< "${stats}"
DURATION=$((LAST_ELAPSED - FIRST_ELAPSED))
report "Duration: ${DURATION} seconds"
report ""

# Memory RSS analysis
banner "MEMORY (RSS) ANALYSIS"

stats=$(column_stats 3 '%.2f')
IFS='|' read -r MEMORY_RSS_MIN MEMORY_RSS_MAX MEMORY_RSS_AVG MEMORY_RSS_FIRST MEMORY_RSS_LAST <<< "${stats}"
MEMORY_RSS_GROWTH=$(echo "scale=2; ${MEMORY_RSS_LAST} - ${MEMORY_RSS_FIRST}" | bc)

# With a single sample (or identical elapsed values) the duration is 0; bc
# would then print nothing for the division, so report a zero rate instead.
if [ "${DURATION}" -ne 0 ]; then
    MEMORY_RSS_GROWTH_RATE=$(echo "scale=4; ${MEMORY_RSS_GROWTH} / ${DURATION}" | bc 2>/dev/null || echo "0")
else
    MEMORY_RSS_GROWTH_RATE="0"
fi

report "Min: ${MEMORY_RSS_MIN} MB"
report "Max: ${MEMORY_RSS_MAX} MB"
report "Avg: ${MEMORY_RSS_AVG} MB"
report "First: ${MEMORY_RSS_FIRST} MB"
report "Last: ${MEMORY_RSS_LAST} MB"
report "Growth: ${MEMORY_RSS_GROWTH} MB"
report "Growth Rate: ${MEMORY_RSS_GROWTH_RATE} MB/sec"

# Memory leak detection (growth rate > 1 MB/sec)
if exceeds "${MEMORY_RSS_GROWTH_RATE}" 1.0; then
    report "⚠️ LEAK DETECTED: Memory growing at ${MEMORY_RSS_GROWTH_RATE} MB/sec"
elif exceeds "${MEMORY_RSS_GROWTH_RATE}" 0.1; then
    report "⚠️ WARNING: Slow memory growth detected (${MEMORY_RSS_GROWTH_RATE} MB/sec)"
else
    report "✅ No significant memory leak detected"
fi
report ""

# CPU analysis
banner "CPU ANALYSIS"

stats=$(column_stats 6 '%.2f')
IFS='|' read -r CPU_MIN CPU_MAX CPU_AVG _ _ <<< "${stats}"

report "Min: ${CPU_MIN}%"
report "Max: ${CPU_MAX}%"
report "Avg: ${CPU_AVG}%"

# CPU spike detection: flag when more than 5 samples exceed 80%.
# `count + 0` forces a numeric 0 when no sample matches; an unset awk variable
# prints as an empty string, which breaks the integer test below.
CPU_SPIKES=$(printf '%s\n' "${DATA}" | awk -F',' '$6 > 80 { count++ } END { print count + 0 }')
if [ "${CPU_SPIKES}" -gt 5 ]; then
    report "⚠️ CPU SPIKES: ${CPU_SPIKES} samples above 80%"
else
    report "✅ No sustained CPU spikes detected"
fi
report ""

# File descriptor analysis
banner "FILE DESCRIPTOR ANALYSIS"

stats=$(column_stats 7 '%.0f')
IFS='|' read -r FD_MIN FD_MAX FD_AVG FD_FIRST FD_LAST <<< "${stats}"
FD_GROWTH=$((FD_LAST - FD_FIRST))

report "Min: ${FD_MIN}"
report "Max: ${FD_MAX}"
report "Avg: ${FD_AVG}"
report "First: ${FD_FIRST}"
report "Last: ${FD_LAST}"
report "Growth: ${FD_GROWTH}"

# FD leak detection (growth > 100)
if [ "${FD_GROWTH}" -gt 100 ]; then
    report "⚠️ FD LEAK DETECTED: ${FD_GROWTH} unclosed file descriptors"
elif [ "${FD_GROWTH}" -gt 20 ]; then
    report "⚠️ WARNING: FD growth detected (${FD_GROWTH})"
else
    report "✅ No significant FD leak detected"
fi
report ""

# Process count analysis
banner "PROCESS COUNT ANALYSIS"

stats=$(column_stats 8 '%.0f')
IFS='|' read -r PROC_MIN PROC_MAX PROC_AVG PROC_FIRST PROC_LAST <<< "${stats}"
PROC_GROWTH=$((PROC_LAST - PROC_FIRST))

report "Min: ${PROC_MIN}"
report "Max: ${PROC_MAX}"
report "Avg: ${PROC_AVG}"
report "First: ${PROC_FIRST}"
report "Last: ${PROC_LAST}"
report "Growth: ${PROC_GROWTH}"

# Process leak detection (growth > 50)
if [ "${PROC_GROWTH}" -gt 50 ]; then
    report "⚠️ PROCESS LEAK DETECTED: ${PROC_GROWTH} orphaned processes"
elif [ "${PROC_GROWTH}" -gt 10 ]; then
    report "⚠️ WARNING: Process growth detected (${PROC_GROWTH})"
else
    report "✅ No significant process leak detected"
fi
report ""

# Anomaly summary: repeats the warning-level checks from the sections above
banner "ANOMALY SUMMARY"

ANOMALY_COUNT=0

if exceeds "${MEMORY_RSS_GROWTH_RATE}" 0.1; then
    report "• Memory growth: ${MEMORY_RSS_GROWTH_RATE} MB/sec"
    ANOMALY_COUNT=$((ANOMALY_COUNT + 1))
fi

if [ "${CPU_SPIKES}" -gt 5 ]; then
    report "• CPU spikes: ${CPU_SPIKES} samples above 80%"
    ANOMALY_COUNT=$((ANOMALY_COUNT + 1))
fi

if [ "${FD_GROWTH}" -gt 20 ]; then
    report "• FD growth: ${FD_GROWTH}"
    ANOMALY_COUNT=$((ANOMALY_COUNT + 1))
fi

if [ "${PROC_GROWTH}" -gt 10 ]; then
    report "• Process growth: ${PROC_GROWTH}"
    ANOMALY_COUNT=$((ANOMALY_COUNT + 1))
fi

if [ "${ANOMALY_COUNT}" -eq 0 ]; then
    report "✅ No anomalies detected - system healthy"
else
    report ""
    report "⚠️ Total anomalies: ${ANOMALY_COUNT}"
fi

report ""
report "${RULE}"
report "Report saved: ${REPORT_FILE}"
report "${RULE}"
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
{
|
|
2
|
+
"dashboard": {
|
|
3
|
+
"title": "CFN Rate Limiting & Backpressure Monitoring",
|
|
4
|
+
"description": "Real-time monitoring of message inbox utilization, backpressure events, and overflow alerts",
|
|
5
|
+
"version": "1.0.0",
|
|
6
|
+
"tags": ["rate-limiting", "backpressure", "inbox", "coordination"],
|
|
7
|
+
"timezone": "UTC",
|
|
8
|
+
"refresh": "10s",
|
|
9
|
+
"panels": [
|
|
10
|
+
{
|
|
11
|
+
"id": 1,
|
|
12
|
+
"title": "Inbox Utilization by Agent",
|
|
13
|
+
"type": "timeseries",
|
|
14
|
+
"gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
|
|
15
|
+
"targets": [
|
|
16
|
+
{
|
|
17
|
+
"metric": "inbox.utilization",
|
|
18
|
+
"legend": "{{agent}}",
|
|
19
|
+
"unit": "percent"
|
|
20
|
+
}
|
|
21
|
+
],
|
|
22
|
+
"thresholds": [
|
|
23
|
+
{ "value": 75, "color": "yellow", "label": "Warning" },
|
|
24
|
+
{ "value": 90, "color": "red", "label": "Critical" }
|
|
25
|
+
],
|
|
26
|
+
"description": "Message inbox utilization percentage per agent (max 100 messages)"
|
|
27
|
+
},
|
|
28
|
+
{
|
|
29
|
+
"id": 2,
|
|
30
|
+
"title": "Inbox Message Count",
|
|
31
|
+
"type": "timeseries",
|
|
32
|
+
"gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
|
|
33
|
+
"targets": [
|
|
34
|
+
{
|
|
35
|
+
"metric": "inbox.size",
|
|
36
|
+
"legend": "{{agent}}",
|
|
37
|
+
"unit": "count"
|
|
38
|
+
}
|
|
39
|
+
],
|
|
40
|
+
"thresholds": [
|
|
41
|
+
{ "value": 75, "color": "yellow", "label": "Warning (75 msgs)" },
|
|
42
|
+
{ "value": 90, "color": "red", "label": "Critical (90 msgs)" }
|
|
43
|
+
],
|
|
44
|
+
"description": "Absolute message count in agent inboxes"
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
"id": 3,
|
|
48
|
+
"title": "Backpressure Events Rate",
|
|
49
|
+
"type": "timeseries",
|
|
50
|
+
"gridPos": { "x": 0, "y": 8, "w": 12, "h": 8 },
|
|
51
|
+
"targets": [
|
|
52
|
+
{
|
|
53
|
+
"metric": "backpressure.events_per_min",
|
|
54
|
+
"legend": "Backpressure events/min",
|
|
55
|
+
"unit": "count"
|
|
56
|
+
}
|
|
57
|
+
],
|
|
58
|
+
"thresholds": [
|
|
59
|
+
{ "value": 100, "color": "yellow", "label": "Warning threshold" }
|
|
60
|
+
],
|
|
61
|
+
"description": "Rate of backpressure wait events (high rate indicates system load)"
|
|
62
|
+
},
|
|
63
|
+
{
|
|
64
|
+
"id": 4,
|
|
65
|
+
"title": "Message Send Failures",
|
|
66
|
+
"type": "timeseries",
|
|
67
|
+
"gridPos": { "x": 12, "y": 8, "w": 12, "h": 8 },
|
|
68
|
+
"targets": [
|
|
69
|
+
{
|
|
70
|
+
"metric": "coordination.send_failures_per_min",
|
|
71
|
+
"legend": "Send failures/min",
|
|
72
|
+
"unit": "count"
|
|
73
|
+
}
|
|
74
|
+
],
|
|
75
|
+
"thresholds": [
|
|
76
|
+
{ "value": 10, "color": "red", "label": "Critical threshold" }
|
|
77
|
+
],
|
|
78
|
+
"description": "Message delivery failure rate (critical if >10/min)"
|
|
79
|
+
},
|
|
80
|
+
{
|
|
81
|
+
"id": 5,
|
|
82
|
+
"title": "Inbox Overflow Events",
|
|
83
|
+
"type": "timeseries",
|
|
84
|
+
"gridPos": { "x": 0, "y": 16, "w": 12, "h": 8 },
|
|
85
|
+
"targets": [
|
|
86
|
+
{
|
|
87
|
+
"metric": "inbox.overflow_events_per_min",
|
|
88
|
+
"legend": "Overflow events/min",
|
|
89
|
+
"unit": "count"
|
|
90
|
+
}
|
|
91
|
+
],
|
|
92
|
+
"thresholds": [
|
|
93
|
+
{ "value": 1, "color": "red", "label": "Any overflow is critical" }
|
|
94
|
+
],
|
|
95
|
+
"description": "Inbox overflow events (messages dropped due to full inbox)"
|
|
96
|
+
},
|
|
97
|
+
{
|
|
98
|
+
"id": 6,
|
|
99
|
+
"title": "Alert Summary",
|
|
100
|
+
"type": "stat",
|
|
101
|
+
"gridPos": { "x": 12, "y": 16, "w": 6, "h": 8 },
|
|
102
|
+
"targets": [
|
|
103
|
+
{
|
|
104
|
+
"query": "count_alerts_by_severity",
|
|
105
|
+
"fields": ["critical", "warning", "info"]
|
|
106
|
+
}
|
|
107
|
+
],
|
|
108
|
+
"description": "Alert counts by severity level (last hour)"
|
|
109
|
+
},
|
|
110
|
+
{
|
|
111
|
+
"id": 7,
|
|
112
|
+
"title": "Top Utilized Agents",
|
|
113
|
+
"type": "table",
|
|
114
|
+
"gridPos": { "x": 18, "y": 16, "w": 6, "h": 8 },
|
|
115
|
+
"targets": [
|
|
116
|
+
{
|
|
117
|
+
"query": "top_inbox_utilization",
|
|
118
|
+
"fields": ["agent", "utilization", "message_count"],
|
|
119
|
+
"limit": 10,
|
|
120
|
+
"order": "desc"
|
|
121
|
+
}
|
|
122
|
+
],
|
|
123
|
+
"description": "Agents with highest inbox utilization"
|
|
124
|
+
}
|
|
125
|
+
],
|
|
126
|
+
"annotations": [
|
|
127
|
+
{
|
|
128
|
+
"name": "Rate Limiting Alerts",
|
|
129
|
+
"datasource": "cfn-alerts",
|
|
130
|
+
"filter": {
|
|
131
|
+
"tags": ["inbox_high_utilization", "backpressure_high_rate", "inbox_overflow_detected"]
|
|
132
|
+
},
|
|
133
|
+
"color": "red"
|
|
134
|
+
}
|
|
135
|
+
],
|
|
136
|
+
"variables": [
|
|
137
|
+
{
|
|
138
|
+
"name": "agent",
|
|
139
|
+
"type": "query",
|
|
140
|
+
"query": "SELECT DISTINCT agent FROM inbox_metrics",
|
|
141
|
+
"description": "Filter by specific agent",
|
|
142
|
+
"multi": true,
|
|
143
|
+
"includeAll": true
|
|
144
|
+
},
|
|
145
|
+
{
|
|
146
|
+
"name": "timeRange",
|
|
147
|
+
"type": "interval",
|
|
148
|
+
"options": ["5m", "15m", "1h", "6h", "24h"],
|
|
149
|
+
"default": "1h",
|
|
150
|
+
"description": "Time range for metrics"
|
|
151
|
+
}
|
|
152
|
+
]
|
|
153
|
+
},
|
|
154
|
+
"queries": {
|
|
155
|
+
"count_alerts_by_severity": {
|
|
156
|
+
"description": "Count alerts grouped by severity",
|
|
157
|
+
"source": "/dev/shm/cfn-alerts.jsonl",
|
|
158
|
+
"aggregation": "GROUP BY severity, COUNT(*)",
|
|
159
|
+
"timeWindow": "1h"
|
|
160
|
+
},
|
|
161
|
+
"top_inbox_utilization": {
|
|
162
|
+
"description": "Rank agents by inbox utilization",
|
|
163
|
+
"source": "/dev/shm/cfn-metrics.jsonl",
|
|
164
|
+
"query": "SELECT agent, MAX(CASE WHEN metric='inbox.utilization' THEN value END) as utilization, MAX(CASE WHEN metric='inbox.size' THEN value END) as message_count FROM metrics WHERE metric IN ('inbox.utilization', 'inbox.size') GROUP BY agent HAVING MAX(CASE WHEN metric='inbox.utilization' THEN value END) IS NOT NULL ORDER BY utilization DESC LIMIT 10",
|
|
165
|
+
"timeWindow": "5m"
|
|
166
|
+
}
|
|
167
|
+
},
|
|
168
|
+
"alertRules": [
|
|
169
|
+
{
|
|
170
|
+
"name": "Inbox Critical Utilization",
|
|
171
|
+
"condition": "inbox.utilization >= 90",
|
|
172
|
+
"severity": "critical",
|
|
173
|
+
"message": "Agent {{agent}} inbox at {{value}}% utilization (critical threshold: 90%)",
|
|
174
|
+
"actions": ["emit_alert", "notify_oncall"]
|
|
175
|
+
},
|
|
176
|
+
{
|
|
177
|
+
"name": "Inbox Warning Utilization",
|
|
178
|
+
"condition": "inbox.utilization >= 75 AND inbox.utilization < 90",
|
|
179
|
+
"severity": "warning",
|
|
180
|
+
"message": "Agent {{agent}} inbox at {{value}}% utilization (warning threshold: 75%)",
|
|
181
|
+
"actions": ["emit_alert"]
|
|
182
|
+
},
|
|
183
|
+
{
|
|
184
|
+
"name": "Backpressure High Rate",
|
|
185
|
+
"condition": "backpressure.events_per_min > 100",
|
|
186
|
+
"severity": "warning",
|
|
187
|
+
"message": "Backpressure events exceeding threshold: {{value}} events/min (threshold: 100/min)",
|
|
188
|
+
"actions": ["emit_alert"]
|
|
189
|
+
},
|
|
190
|
+
{
|
|
191
|
+
"name": "Message Send Failures Critical",
|
|
192
|
+
"condition": "coordination.send_failures_per_min > 10",
|
|
193
|
+
"severity": "critical",
|
|
194
|
+
"message": "Message send failures critical: {{value}} failures/min (threshold: 10/min)",
|
|
195
|
+
"actions": ["emit_alert", "notify_oncall", "trigger_incident"]
|
|
196
|
+
},
|
|
197
|
+
{
|
|
198
|
+
"name": "Inbox Overflow Detected",
|
|
199
|
+
"condition": "inbox.overflow_events_per_min > 0",
|
|
200
|
+
"severity": "critical",
|
|
201
|
+
"message": "Inbox overflow detected: {{value}} overflow events in last minute",
|
|
202
|
+
"actions": ["emit_alert", "notify_oncall", "trigger_incident"]
|
|
203
|
+
}
|
|
204
|
+
],
|
|
205
|
+
"metadata": {
|
|
206
|
+
"createdBy": "devops-engineer",
|
|
207
|
+
"phase": "1",
|
|
208
|
+
"sprint": "1.5",
|
|
209
|
+
"lastUpdated": "2025-10-06T19:35:00Z"
|
|
210
|
+
}
|
|
211
|
+
}
|