claude-flow-novice 2.2.4 → 2.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (288) hide show
  1. package/package.json +9 -8
  2. package/scripts/.claude-flow/metrics/agent-metrics.json +1 -0
  3. package/scripts/.claude-flow/metrics/performance.json +9 -0
  4. package/scripts/.claude-flow/metrics/task-metrics.json +10 -0
  5. package/scripts/CLEANUP_OPTIMIZATION_REPORT.json +312 -0
  6. package/scripts/CLEANUP_PERFORMANCE_OPTIMIZATION.md +387 -0
  7. package/scripts/CLEANUP_QUICK_START.md +268 -0
  8. package/scripts/CLEANUP_TEST_RESULTS.md +205 -0
  9. package/scripts/README.md +339 -0
  10. package/scripts/ace-query.sh +384 -0
  11. package/scripts/agent-token-analysis.js +430 -0
  12. package/scripts/auto-setup.js +332 -0
  13. package/scripts/build/README.md +167 -0
  14. package/scripts/build/build-config.js +27 -0
  15. package/scripts/build/build-prompt-copier.sh +30 -0
  16. package/scripts/build/performance-monitor.js +869 -0
  17. package/scripts/build/prepare-publish.js +150 -0
  18. package/scripts/build/typescript-fixer.js +621 -0
  19. package/scripts/build/unified-builder.sh +428 -0
  20. package/scripts/build/update-bin-version.js +32 -0
  21. package/scripts/build/validate-agents.js +238 -0
  22. package/scripts/build-index.js +43 -0
  23. package/scripts/build-orchestrator.js +320 -0
  24. package/scripts/check-routing-stats.cjs +122 -0
  25. package/scripts/ci-validation.js +375 -0
  26. package/scripts/cleanup-blocking-coordination.sh +420 -0
  27. package/scripts/cleanup-idle-sessions.sh +59 -0
  28. package/scripts/collect-build-metrics.js +65 -0
  29. package/scripts/demo/README.md +79 -0
  30. package/scripts/demo/autoscaling-demo-simplified.js +963 -0
  31. package/scripts/demo/comprehensive-dashboard-test.js +693 -0
  32. package/scripts/demo/confidence-log.js +87 -0
  33. package/scripts/demo/confidence-report.js +82 -0
  34. package/scripts/demo/demo-multi-swarm-coordination.js +325 -0
  35. package/scripts/demo/demo-production-deployment.js +399 -0
  36. package/scripts/demo/demo-visualization-system.js +149 -0
  37. package/scripts/demo/performance-analysis.cjs +71 -0
  38. package/scripts/demo/performance-analysis.js +71 -0
  39. package/scripts/demo/test-autoscaling-demo.js +314 -0
  40. package/scripts/dependency-optimizer.js +349 -0
  41. package/scripts/dependency-security-assessment.js +331 -0
  42. package/scripts/deploy-sdk.sh +176 -0
  43. package/scripts/deployment-readiness-report.json +179 -0
  44. package/scripts/dev/README.md +264 -0
  45. package/scripts/dev/claude-flow-wrapper.sh +35 -0
  46. package/scripts/dev/claude-monitor.py +419 -0
  47. package/scripts/dev/claude-sparc.sh +562 -0
  48. package/scripts/dev/claude-wrapper.sh +17 -0
  49. package/scripts/dev/demo-phase3-compliance.js +172 -0
  50. package/scripts/dev/demo-task-system.ts +224 -0
  51. package/scripts/dev/deployment-validator.js +315 -0
  52. package/scripts/dev/spawn-claude-terminal.sh +32 -0
  53. package/scripts/dev/start-portal.sh +506 -0
  54. package/scripts/dev/start-web-ui.js +15 -0
  55. package/scripts/dev/stop-portal.sh +311 -0
  56. package/scripts/dev/validate-examples.ts +288 -0
  57. package/scripts/dev/validate-phase2.cjs +451 -0
  58. package/scripts/dev/validate-phase2.js +785 -0
  59. package/scripts/dev/validate-phase3.cjs +208 -0
  60. package/scripts/dev/validate-security-remediation.js +1 -0
  61. package/scripts/ecosystem.config.cjs +90 -0
  62. package/scripts/fix-js-extensions.js +167 -0
  63. package/scripts/generate-basic-types.js +73 -0
  64. package/scripts/generate-changelog.js +318 -0
  65. package/scripts/git-hooks/pre-commit.sh +143 -0
  66. package/scripts/health-checks.js +634 -0
  67. package/scripts/hook-wrapper.sh +54 -0
  68. package/scripts/install/README.md +375 -0
  69. package/scripts/install/REDIS_SETUP_VALIDATION.json +245 -0
  70. package/scripts/install/check-prerequisites.js +303 -0
  71. package/scripts/install/config-wizard.js +606 -0
  72. package/scripts/install/dependency-checker.js +385 -0
  73. package/scripts/install/health-check.js +765 -0
  74. package/scripts/install/install.js +256 -0
  75. package/scripts/install/installation-benchmark.js +461 -0
  76. package/scripts/install/quick-install.js +720 -0
  77. package/scripts/install/quick-start-wizard.js +295 -0
  78. package/scripts/install/redis-cli.js +289 -0
  79. package/scripts/install/redis-install-guides.md +407 -0
  80. package/scripts/install/redis-setup.js +559 -0
  81. package/scripts/install/redis-test.js +278 -0
  82. package/scripts/install/service-manager.js +672 -0
  83. package/scripts/install/setup.js +832 -0
  84. package/scripts/install/uninstall.js +526 -0
  85. package/scripts/install/update.js +461 -0
  86. package/scripts/install-pre-commit-hook.sh +127 -0
  87. package/scripts/legacy/README.md +272 -0
  88. package/scripts/legacy/batch-fix-ts.sh +54 -0
  89. package/scripts/legacy/build-migration.sh +105 -0
  90. package/scripts/legacy/build-monitor.js +209 -0
  91. package/scripts/legacy/build-with-filter.sh +84 -0
  92. package/scripts/legacy/build-workaround.sh +71 -0
  93. package/scripts/legacy/fix-ts-advanced.js +358 -0
  94. package/scripts/legacy/fix-ts-final.sh +50 -0
  95. package/scripts/legacy/fix-ts-targeted.sh +49 -0
  96. package/scripts/legacy/fix-typescript-errors.js +305 -0
  97. package/scripts/legacy/force-build.sh +63 -0
  98. package/scripts/legacy/optimize-performance.js +400 -0
  99. package/scripts/legacy/performance-monitor.js +263 -0
  100. package/scripts/legacy/performance-monitoring.js +532 -0
  101. package/scripts/legacy/performance-test-runner.js +645 -0
  102. package/scripts/legacy/quick-fix-ts.js +281 -0
  103. package/scripts/legacy/safe-build.sh +63 -0
  104. package/scripts/memory-monitor-coordinator.js +322 -0
  105. package/scripts/migrate-to-sdk.sh +520 -0
  106. package/scripts/migration/QUICK-START.md +189 -0
  107. package/scripts/migration/QUICK-START.md.backup-1760135091363 +189 -0
  108. package/scripts/migration/README.md +464 -0
  109. package/scripts/migration/TASK-1.3.2-COMPLETION-REPORT.md +500 -0
  110. package/scripts/migration/TASK-1.3.2-COMPLETION-REPORT.md.backup-1760135091348 +500 -0
  111. package/scripts/migration/UPDATE-PATHS-README.md +464 -0
  112. package/scripts/migration/UPDATE-PATHS-README.md.backup-1760135091337 +464 -0
  113. package/scripts/migration/example-patterns.json +19 -0
  114. package/scripts/migration/install-arm64.js +78 -0
  115. package/scripts/migration/install.js +83 -0
  116. package/scripts/migration/migrate-hooks.js +173 -0
  117. package/scripts/migration/migration-examples.ts +318 -0
  118. package/scripts/migration/reorganize-workspace.js +504 -0
  119. package/scripts/migration/test-update-paths.js +359 -0
  120. package/scripts/migration/update-paths.js +664 -0
  121. package/scripts/migration/validate-migration.js +647 -0
  122. package/scripts/monitor-loop.sh +65 -0
  123. package/scripts/monitor-memory.sh +47 -0
  124. package/scripts/monitor-migration.js +339 -0
  125. package/scripts/monitor.py +43 -0
  126. package/scripts/monitoring/README.md +178 -0
  127. package/scripts/monitoring/alert-monitor.sh +220 -0
  128. package/scripts/monitoring/analyze-resources.sh +199 -0
  129. package/scripts/monitoring/dashboards/rate-limiting-dashboard.json +211 -0
  130. package/scripts/monitoring/dynamic-monitor.sh +85 -0
  131. package/scripts/monitoring/launch-stability-test.sh +184 -0
  132. package/scripts/monitoring/monitor-test.sh +93 -0
  133. package/scripts/monitoring/pre-test-validation.sh +208 -0
  134. package/scripts/monitoring/quick-test-alerting.sh +118 -0
  135. package/scripts/monitoring/quick-test-rate-limiting.sh +206 -0
  136. package/scripts/monitoring/rate-limiting-monitor.sh +380 -0
  137. package/scripts/monitoring/resource-monitor.sh +126 -0
  138. package/scripts/monitoring/stability-monitor.js +429 -0
  139. package/scripts/monitoring/test-monitor-quick.sh +54 -0
  140. package/scripts/monitoring/view-alerts.sh +307 -0
  141. package/scripts/npm-metrics-collector.js +482 -0
  142. package/scripts/npm-package-validation.cjs +299 -0
  143. package/scripts/optimization/build-optimizer.js +438 -0
  144. package/scripts/optimization/config-validator.js +761 -0
  145. package/scripts/optimization/test-optimization.js +432 -0
  146. package/scripts/optimization/unified-activation.js +839 -0
  147. package/scripts/optimize-package-swarm.js +54 -0
  148. package/scripts/performance/ACTIVATION_COMMANDS.md +292 -0
  149. package/scripts/performance/sqlite-enhanced-activation.sh +583 -0
  150. package/scripts/performance/test-enhanced-backend.sh +504 -0
  151. package/scripts/performance-monitor.js +644 -0
  152. package/scripts/performance-test-runner.js +698 -0
  153. package/scripts/post-deployment-monitoring.js +350 -0
  154. package/scripts/post-edit-pipeline.js +2091 -0
  155. package/scripts/post-install-claude-md.js +78 -0
  156. package/scripts/postinstall.js +79 -0
  157. package/scripts/pre-publish-validation.cjs +212 -0
  158. package/scripts/pre-publish-validation.js +429 -0
  159. package/scripts/redis-lua/cleanup-blocking-coordination.lua +198 -0
  160. package/scripts/release-announcement.js +425 -0
  161. package/scripts/release-notification.js +248 -0
  162. package/scripts/release-rollback.js +376 -0
  163. package/scripts/release-validation.js +460 -0
  164. package/scripts/rollback-sdk.sh +66 -0
  165. package/scripts/run-production-validation.ts +590 -0
  166. package/scripts/run-stability-validation.sh +687 -0
  167. package/scripts/security/README.md +339 -0
  168. package/scripts/security/deployment-validation.cjs +279 -0
  169. package/scripts/security/envelope-encryption-confidence-report.cjs +422 -0
  170. package/scripts/security/install-git-hooks.sh +132 -0
  171. package/scripts/security/install-git-secrets.sh +295 -0
  172. package/scripts/security/rotate-api-keys.js +469 -0
  173. package/scripts/security/ruv-swarm-safe.js +74 -0
  174. package/scripts/security/security-audit.cjs +538 -0
  175. package/scripts/security/setup-redis-auth.sh +397 -0
  176. package/scripts/security/validate-envelope-encryption.cjs +340 -0
  177. package/scripts/security-scan.js +492 -0
  178. package/scripts/src/web/frontend/.claude-flow/metrics/agent-metrics.json +1 -0
  179. package/scripts/src/web/frontend/.claude-flow/metrics/performance.json +9 -0
  180. package/scripts/src/web/frontend/.claude-flow/metrics/task-metrics.json +10 -0
  181. package/scripts/switch-api.sh +158 -0
  182. package/scripts/sync-agents.js +290 -0
  183. package/scripts/test/50-agent-test.js +625 -0
  184. package/scripts/test/NEW_STABILITY_TEST_GUIDE.md +407 -0
  185. package/scripts/test/README.md +236 -0
  186. package/scripts/test/STABILITY_TEST_EXAMPLE.md +347 -0
  187. package/scripts/test/STABILITY_TEST_README.md +480 -0
  188. package/scripts/test/agent-worker.js +309 -0
  189. package/scripts/test/ai-coordination-test.js +650 -0
  190. package/scripts/test/ai-mesh-coordination-test.js +416 -0
  191. package/scripts/test/check-links.ts +274 -0
  192. package/scripts/test/check-performance-regression.ts +168 -0
  193. package/scripts/test/cli-agent-coordination-test.js +313 -0
  194. package/scripts/test/coordinator-multilingual-test.js +396 -0
  195. package/scripts/test/coordinator-transparency-demo.js +585 -0
  196. package/scripts/test/coverage-report.ts +692 -0
  197. package/scripts/test/generate-swarm-tests.js +633 -0
  198. package/scripts/test/integration-test-validation.cjs +253 -0
  199. package/scripts/test/load-test-swarm.js +576 -0
  200. package/scripts/test/mesh-coordination-zero-overlap-test.js +740 -0
  201. package/scripts/test/multilingual-hello-world-test.js +390 -0
  202. package/scripts/test/quick-multilingual-demo.js +464 -0
  203. package/scripts/test/real-agent-test.js +312 -0
  204. package/scripts/test/run-phase3-compliance-tests.js +427 -0
  205. package/scripts/test/run-stability-test-examples.sh +292 -0
  206. package/scripts/test/stability-results/stability-metrics.jsonl +83 -0
  207. package/scripts/test/stability-results/stability-test-report.json +128 -0
  208. package/scripts/test/stability-results/stability-test.log +1827 -0
  209. package/scripts/test/stability-test-50-agents.js +734 -0
  210. package/scripts/test/test-batch-tasks.ts +29 -0
  211. package/scripts/test/test-byzantine-resolution.js +246 -0
  212. package/scripts/test/test-claude-spawn-options.sh +63 -0
  213. package/scripts/test/test-cli-wizard.js +331 -0
  214. package/scripts/test/test-comprehensive.js +401 -0
  215. package/scripts/test/test-coordination-features.ts +238 -0
  216. package/scripts/test/test-fallback-systems.js +276 -0
  217. package/scripts/test/test-init-command.ts +302 -0
  218. package/scripts/test/test-mcp.ts +251 -0
  219. package/scripts/test/test-runner.ts +568 -0
  220. package/scripts/test/test-swarm-integration.sh +92 -0
  221. package/scripts/test/test-swarm.ts +142 -0
  222. package/scripts/test/validation-summary.ts +408 -0
  223. package/scripts/test-cleanup-performance.sh +416 -0
  224. package/scripts/test-dashboard-auth.cjs +203 -0
  225. package/scripts/test-docker-deployment.sh +207 -0
  226. package/scripts/test-npm-package.cjs +167 -0
  227. package/scripts/test-provider-routing.cjs +226 -0
  228. package/scripts/test-routing-telemetry.cjs +147 -0
  229. package/scripts/test-runner.cjs +154 -0
  230. package/scripts/test-zai-10k.cjs +81 -0
  231. package/scripts/test-zai-api.cjs +191 -0
  232. package/scripts/test-zai-diagnostic.cjs +151 -0
  233. package/scripts/test-zai-final.cjs +128 -0
  234. package/scripts/test-zai-with-env.cjs +85 -0
  235. package/scripts/utils/README.md +261 -0
  236. package/scripts/utils/clean-build-artifacts.sh +94 -0
  237. package/scripts/utils/cleanup-root.sh +69 -0
  238. package/scripts/utils/fix-cliffy-imports.js +307 -0
  239. package/scripts/utils/fix-duplicate-imports.js +114 -0
  240. package/scripts/utils/fix-error-handling.cjs +70 -0
  241. package/scripts/utils/fix-import-paths.js +104 -0
  242. package/scripts/utils/fix-imports.js +116 -0
  243. package/scripts/utils/fix-shebang.js +78 -0
  244. package/scripts/utils/fix-test-modules.js +27 -0
  245. package/scripts/utils/fix-timezone-issue-246.js +200 -0
  246. package/scripts/utils/fix-ts-comprehensive.py +182 -0
  247. package/scripts/utils/fix-ts-targeted-batch.js +250 -0
  248. package/scripts/utils/remove-benchmark-conflicts.sh +140 -0
  249. package/scripts/utils/simple-test-fixer.js +190 -0
  250. package/scripts/utils/validate-metrics-structure.cjs +144 -0
  251. package/scripts/validate-agent-hooks.js +506 -0
  252. package/scripts/validate-changelog.js +241 -0
  253. package/scripts/validate-coordination-cli.js +69 -0
  254. package/scripts/validate-coordination-toggle-integration.cjs +501 -0
  255. package/scripts/validate-docker-infrastructure.sh +502 -0
  256. package/scripts/validate-entry-points.js +300 -0
  257. package/scripts/validate-stage3-performance.ts +377 -0
  258. package/scripts/validate-template-bundling.js +180 -0
  259. package/scripts/validation/README.md +33 -0
  260. package/scripts/validation/acl-security-validation.cjs +214 -0
  261. package/scripts/validation/acl-security-validation.js +402 -0
  262. package/scripts/validation/byzantine-verification.js +407 -0
  263. package/scripts/validation/final-phase-2-consensus.cjs +219 -0
  264. package/scripts/validation/final-security-validation.js +791 -0
  265. package/scripts/validation/final-wasm-validation.cjs +840 -0
  266. package/scripts/validation/integration-test-analysis.js +105 -0
  267. package/scripts/validation/phase-0-comprehensive-validation.js +474 -0
  268. package/scripts/validation/phase-0-consensus-report.js +139 -0
  269. package/scripts/validation/phase-0-final-report.js +112 -0
  270. package/scripts/validation/phase-0-redis-consensus-report.js +129 -0
  271. package/scripts/validation/phase-0-validation-improved.js +490 -0
  272. package/scripts/validation/phase-0-validation-test.js +65 -0
  273. package/scripts/validation/phase-1-consensus-report.cjs +342 -0
  274. package/scripts/validation/phase-1-consensus-validation.cjs +551 -0
  275. package/scripts/validation/phase-1-consensus-validation.js +551 -0
  276. package/scripts/validation/phase-2-consensus-report.cjs +186 -0
  277. package/scripts/validation/phase-2-validation.cjs +171 -0
  278. package/scripts/validation/phase-2-validation.js +171 -0
  279. package/scripts/validation/phase-4-consensus-report.js +181 -0
  280. package/scripts/validation/phase-4-final-validation.js +351 -0
  281. package/scripts/validation/phase-5-consensus-report.cjs +113 -0
  282. package/scripts/validation/phase-5-consensus-report.js +113 -0
  283. package/scripts/validation/security-analysis.js +49 -0
  284. package/scripts/validation/security-validation.js +492 -0
  285. package/scripts/validation/simple-security-validation.js +464 -0
  286. package/scripts/verify-installation.js +112 -0
  287. package/scripts/verify-mcp-server.js +86 -0
  288. package/scripts/verify-sdk-phase1.cjs +293 -0
@@ -0,0 +1,220 @@
#!/usr/bin/env bash
# scripts/monitoring/alert-monitor.sh - Continuous monitoring daemon with alerting
# Phase 1 Sprint 1.1: Monitoring loop integration

# Stop on the first failing command, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -euo pipefail

# ==============================================================================
# CONFIGURATION
# ==============================================================================

# Absolute directory of this script; the shared library lives two levels up.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LIB_DIR="${SCRIPT_DIR}/../../lib"

# Every setting below keeps a non-empty value inherited from the environment
# and otherwise falls back to the default shown.
: "${METRICS_FILE:=/dev/shm/cfn-metrics.jsonl}"
: "${ALERT_LOG_FILE:=/dev/shm/cfn-alerts.jsonl}"
: "${MONITOR_PID_FILE:=/dev/shm/alert-monitor.pid}"

# Check interval (seconds)
: "${CHECK_INTERVAL:=30}"

# Cleanup retention (hours)
: "${ALERT_RETENTION_HOURS:=24}"
: "${METRICS_RETENTION_HOURS:=48}"

# ==============================================================================
# DEPENDENCIES
# ==============================================================================

# The alerting library is mandatory: bail out early when it is missing.
if [ ! -f "$LIB_DIR/alerting.sh" ]; then
    echo "[ERROR] Alerting library not found at $LIB_DIR/alerting.sh" >&2
    exit 1
fi

# shellcheck source=../../lib/alerting.sh
source "$LIB_DIR/alerting.sh"
36
+
37
+ # ==============================================================================
38
+ # SIGNAL HANDLERS
39
+ # ==============================================================================
40
+
# cleanup - Signal handler for SIGTERM / SIGINT
#
# Logs the shutdown to stderr, deletes $MONITOR_PID_FILE only when that file
# records this process's PID, and exits with status 0.
#
# The trap below is installed at top level, i.e. for every sub-command of this
# script. Deleting the PID file unconditionally would therefore orphan a
# running monitor from its PID file whenever a short-lived invocation
# (status, stop, background) happens to be interrupted.
cleanup() {
    echo "[INFO] Shutting down alert monitor (PID: $$)" >&2

    local recorded_pid=""
    if [ -f "$MONITOR_PID_FILE" ]; then
        # A PID file that vanishes or is unreadable is treated as "not ours".
        recorded_pid=$(cat "$MONITOR_PID_FILE" 2>/dev/null || true)
    fi

    if [ "$recorded_pid" = "$$" ]; then
        rm -f "$MONITOR_PID_FILE"
    fi

    exit 0
}

trap cleanup SIGTERM SIGINT
48
+
49
+ # ==============================================================================
50
+ # MONITORING FUNCTIONS
51
+ # ==============================================================================
52
+
# start_monitor - Begin continuous threshold monitoring
#
# Writes this process's PID to $MONITOR_PID_FILE, then loops forever:
#   1. when $METRICS_FILE exists, runs check_thresholds on it and echoes every
#      output line (stdout and stderr) prefixed with a UTC timestamp;
#   2. every 100th iteration, calls cleanup_old_data;
#   3. waits $CHECK_INTERVAL seconds.
# The function never returns; the loop ends only when the process is signalled.
start_monitor() {
    local iteration=0
    local check_status

    echo "[INFO] Alert monitor started (PID: $$)" >&2
    echo "[INFO] Check interval: ${CHECK_INTERVAL}s" >&2
    echo "[INFO] Metrics file: $METRICS_FILE" >&2
    echo "[INFO] Alert log: $ALERT_LOG_FILE" >&2

    # Write PID file
    echo $$ > "$MONITOR_PID_FILE"

    while true; do
        iteration=$((iteration + 1))

        # Check thresholds
        if [ -f "$METRICS_FILE" ]; then
            # Under `set -e -o pipefail` a non-zero exit from check_thresholds
            # would terminate the daemon and leave the PID file behind.
            # Capture the status, report it, and keep monitoring instead.
            check_status=0
            check_thresholds "$METRICS_FILE" 2>&1 | while IFS= read -r line; do
                echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] $line"
            done || check_status=$?

            if [ "$check_status" -ne 0 ]; then
                echo "[WARN] Threshold check exited with status $check_status" >&2
            fi
        fi

        # Periodic cleanup (every 100 iterations)
        if [ $((iteration % 100)) -eq 0 ]; then
            echo "[INFO] Running periodic cleanup (iteration $iteration)" >&2
            cleanup_old_data
        fi

        # Sleep until next check. Bash postpones trap handlers until a
        # foreground command finishes, so a plain `sleep` would delay SIGTERM
        # handling by up to CHECK_INTERVAL seconds. Waiting on a background
        # sleep lets `wait` return as soon as a trapped signal arrives.
        sleep "$CHECK_INTERVAL" &
        wait "$!" || true
    done
}
85
+
# cleanup_old_data - Remove old metrics and alerts
#
# Alerts:  when $ALERT_LOG_FILE exists, calls clear_old_alerts with
#          $ALERT_RETENTION_HOURS and logs how many lines disappeared.
# Metrics: when $METRICS_FILE exists, keeps only the records whose .timestamp
#          compares >= the cutoff ($METRICS_RETENTION_HOURS ago, UTC) and logs
#          how many lines were dropped.
#
# The metrics file is replaced only after jq has filtered it successfully.
# If jq is missing or rejects the input, the original file is left untouched
# and a warning is logged; otherwise the (empty) temp file created by the
# redirect would overwrite every stored metric.
cleanup_old_data() {
    # Clear old alerts
    if [ -f "$ALERT_LOG_FILE" ]; then
        local alert_count_before
        alert_count_before=$(wc -l < "$ALERT_LOG_FILE" 2>/dev/null || echo "0")

        clear_old_alerts "$ALERT_RETENTION_HOURS"

        local alert_count_after
        alert_count_after=$(wc -l < "$ALERT_LOG_FILE" 2>/dev/null || echo "0")

        echo "[INFO] Cleared $((alert_count_before - alert_count_after)) old alerts" >&2
    fi

    # Clear old metrics
    if [ -f "$METRICS_FILE" ]; then
        local metrics_count_before
        metrics_count_before=$(wc -l < "$METRICS_FILE" 2>/dev/null || echo "0")

        # Try GNU date first, then BSD date; fall back to the epoch, which
        # keeps every record.
        local cutoff_time
        cutoff_time=$(date -u -d "$METRICS_RETENTION_HOURS hours ago" +"%Y-%m-%dT%H:%M:%S" 2>/dev/null || \
            date -u -v-"${METRICS_RETENTION_HOURS}H" +"%Y-%m-%dT%H:%M:%S" 2>/dev/null || \
            echo "1970-01-01T00:00:00")

        local temp_file="${METRICS_FILE}.tmp"
        if jq -c --arg cutoff "$cutoff_time" \
            'select(.timestamp >= $cutoff)' \
            "$METRICS_FILE" > "$temp_file" 2>/dev/null; then
            mv "$temp_file" "$METRICS_FILE"

            local metrics_count_after
            metrics_count_after=$(wc -l < "$METRICS_FILE" 2>/dev/null || echo "0")

            echo "[INFO] Cleared $((metrics_count_before - metrics_count_after)) old metrics" >&2
        else
            # Discard the partial output and keep the existing metrics.
            rm -f "$temp_file"
            echo "[WARN] Metrics cleanup skipped: jq could not process $METRICS_FILE" >&2
        fi
    fi
}
126
+
# get_monitor_status - Check if monitor is running
#
# Prints exactly one status line on stdout:
#   "stopped"                  - no PID file
#   "stale (...)"              - PID file present but the process cannot be
#                                signalled; the PID file is deleted
#   "running (PID: <pid>)"     - PID file present and the process is alive
# Returns 0 only for the running case, 1 otherwise.
get_monitor_status() {
    if [ ! -f "$MONITOR_PID_FILE" ]; then
        echo "stopped"
        return 1
    fi

    local recorded_pid
    recorded_pid=$(cat "$MONITOR_PID_FILE")

    # Signal 0 delivers nothing; it only probes whether the PID can be signalled.
    if ! kill -0 "$recorded_pid" 2>/dev/null; then
        echo "stale (PID file exists but process not running)"
        rm -f "$MONITOR_PID_FILE"
        return 1
    fi

    echo "running (PID: $recorded_pid)"
    return 0
}
146
+
# stop_monitor - Stop running monitor
#
# Without a PID file this only logs that nothing is running. Otherwise the
# recorded process is sent SIGTERM and polled every 0.5s, up to 10 times.
# If it disappears in that window the function returns 0 immediately without
# touching the PID file itself. If it survives, it is sent SIGKILL. In that
# case, and when the recorded process was not alive to begin with, the PID
# file is deleted. All messages go to stderr.
stop_monitor() {
    if [ ! -f "$MONITOR_PID_FILE" ]; then
        echo "[INFO] No monitor running" >&2
        return 0
    fi

    local target_pid
    target_pid=$(cat "$MONITOR_PID_FILE")

    if kill -0 "$target_pid" 2>/dev/null; then
        echo "[INFO] Stopping monitor (PID: $target_pid)" >&2
        kill -TERM "$target_pid"

        # Wait for graceful shutdown (max 5 seconds)
        local polls_left=10
        while [ "$polls_left" -gt 0 ]; do
            if ! kill -0 "$target_pid" 2>/dev/null; then
                echo "[INFO] Monitor stopped successfully" >&2
                return 0
            fi
            sleep 0.5
            polls_left=$((polls_left - 1))
        done

        # Force kill if still running
        if kill -0 "$target_pid" 2>/dev/null; then
            echo "[WARN] Monitor did not stop gracefully, forcing..." >&2
            kill -KILL "$target_pid" 2>/dev/null || true
        fi
    fi

    rm -f "$MONITOR_PID_FILE"
}
178
+
179
+ # ==============================================================================
180
+ # MAIN EXECUTION
181
+ # ==============================================================================
182
+
# _ensure_not_running - Guard used before launching a new monitor
#
# Exits with status 1 when $MONITOR_PID_FILE names a process that can still be
# signalled. A PID file whose process is gone (for example after SIGKILL) is
# stale: it is removed with a warning so that it does not block every later
# start until someone runs `status` or `stop`.
_ensure_not_running() {
    if [ ! -f "$MONITOR_PID_FILE" ]; then
        return 0
    fi

    local existing_pid
    existing_pid=$(cat "$MONITOR_PID_FILE" 2>/dev/null || true)

    if [ -n "$existing_pid" ] && kill -0 "$existing_pid" 2>/dev/null; then
        echo "[ERROR] Monitor already running (PID: $existing_pid)" >&2
        exit 1
    fi

    echo "[WARN] Removing stale PID file $MONITOR_PID_FILE" >&2
    rm -f "$MONITOR_PID_FILE"
}

# Sub-command dispatch; `start` is the default when no argument is given.
case "${1:-start}" in
    start)
        _ensure_not_running
        start_monitor
        ;;

    stop)
        stop_monitor
        ;;

    restart)
        stop_monitor
        sleep 1
        start_monitor
        ;;

    status)
        get_monitor_status
        ;;

    background)
        # Start in background: re-invoke this script with `start` under nohup.
        # Output goes to $MONITOR_LOG_FILE (default /dev/shm/alert-monitor.log).
        _ensure_not_running
        nohup "$0" start > "${MONITOR_LOG_FILE:-/dev/shm/alert-monitor.log}" 2>&1 &
        echo "[INFO] Monitor started in background (PID: $!)" >&2
        ;;

    *)
        echo "Usage: $0 {start|stop|restart|status|background}" >&2
        exit 1
        ;;
esac
@@ -0,0 +1,199 @@
#!/bin/bash
# Analysis script for resource monitoring results
# Identifies leaks, spikes, and anomalies
#
# Usage: <script> <csv_file>
# The first line of the CSV is treated as a header and skipped. The report is
# printed to stdout and also written to analysis-report-<timestamp>.txt in the
# same directory as the input file.

set -euo pipefail

CSV_FILE="${1:-}"

if [ -z "${CSV_FILE}" ] || [ ! -f "${CSV_FILE}" ]; then
    echo "Usage: $0 <csv_file>"
    echo "Example: $0 ./reports/monitoring/resource-usage-20250106_120000.csv"
    exit 1
fi

OUTPUT_DIR=$(dirname "${CSV_FILE}")
REPORT_FILE="${OUTPUT_DIR}/analysis-report-$(date +%Y%m%d_%H%M%S).txt"

# Report banner. This first tee (no -a) creates or truncates the report file.
{
    echo "========================================"
    echo "RESOURCE MONITORING ANALYSIS REPORT"
    echo "========================================"
    echo "Input: ${CSV_FILE}"
    echo "Generated: $(date)"
    echo ""
} | tee "${REPORT_FILE}"

# Skip header, get data
DATA=$(tail -n +2 "${CSV_FILE}")

if [ -z "${DATA}" ]; then
    echo "ERROR: No data found in CSV file" | tee -a "${REPORT_FILE}"
    exit 1
fi

# Total samples
TOTAL_SAMPLES=$(echo "${DATA}" | wc -l)
echo "Total Samples: ${TOTAL_SAMPLES}" | tee -a "${REPORT_FILE}"

# Duration: difference between the second CSV field of the last and first rows.
# `sed -n '1p'` is used instead of `head -1` because head closes the pipe after
# one line; with pipefail + errexit the resulting broken-pipe failure of the
# writer aborts the script once DATA outgrows the pipe buffer.
# NOTE(review): the arithmetic below assumes field 2 holds integers - confirm
# against the CSV producer.
FIRST_ELAPSED=$(echo "${DATA}" | sed -n '1p' | cut -d',' -f2)
LAST_ELAPSED=$(echo "${DATA}" | tail -1 | cut -d',' -f2)
DURATION=$((LAST_ELAPSED - FIRST_ELAPSED))

{
    echo "Duration: ${DURATION} seconds"
    echo ""
} | tee -a "${REPORT_FILE}"
43
+
# Memory RSS analysis
# Reads CSV field 3 of every data row and reports min/max/avg, first/last,
# total growth and growth per second of the sampled duration.
{
    echo "========================================"
    echo "MEMORY (RSS) ANALYSIS"
    echo "========================================"
} | tee -a "${REPORT_FILE}"

# `sed -n '1p'` replaces `head -1`: it reads its whole input, so the upstream
# writer never fails with a broken pipe under `set -o pipefail`.
MEMORY_RSS_VALUES=$(echo "${DATA}" | cut -d',' -f3)
MEMORY_RSS_MIN=$(echo "${MEMORY_RSS_VALUES}" | sort -n | sed -n '1p')
MEMORY_RSS_MAX=$(echo "${MEMORY_RSS_VALUES}" | sort -n | tail -1)
MEMORY_RSS_AVG=$(echo "${MEMORY_RSS_VALUES}" | awk '{sum+=$1; count++} END {printf "%.2f", sum/count}')

MEMORY_RSS_FIRST=$(echo "${MEMORY_RSS_VALUES}" | sed -n '1p')
MEMORY_RSS_LAST=$(echo "${MEMORY_RSS_VALUES}" | tail -1)
MEMORY_RSS_GROWTH=$(echo "scale=2; ${MEMORY_RSS_LAST} - ${MEMORY_RSS_FIRST}" | bc)

# A single-sample file (or identical elapsed values) gives DURATION=0. Dividing
# by it makes bc emit only an error, leaving the rate empty and breaking every
# comparison below, so a zero duration is reported as a zero growth rate.
if [ "${DURATION}" -ne 0 ]; then
    MEMORY_RSS_GROWTH_RATE=$(echo "scale=4; ${MEMORY_RSS_GROWTH} / ${DURATION}" | bc 2>/dev/null || echo "0")
else
    MEMORY_RSS_GROWTH_RATE="0"
fi
MEMORY_RSS_GROWTH_RATE="${MEMORY_RSS_GROWTH_RATE:-0}"

{
    echo "Min: ${MEMORY_RSS_MIN} MB"
    echo "Max: ${MEMORY_RSS_MAX} MB"
    echo "Avg: ${MEMORY_RSS_AVG} MB"
    echo "First: ${MEMORY_RSS_FIRST} MB"
    echo "Last: ${MEMORY_RSS_LAST} MB"
    echo "Growth: ${MEMORY_RSS_GROWTH} MB"
    echo "Growth Rate: ${MEMORY_RSS_GROWTH_RATE} MB/sec"
} | tee -a "${REPORT_FILE}"

# Memory leak detection (growth rate > 1 MB/sec)
# bc prints 1 or 0 for the comparison; (( )) turns that into the branch test.
if (( $(echo "${MEMORY_RSS_GROWTH_RATE} > 1.0" | bc -l 2>/dev/null || echo 0) )); then
    echo "⚠️ LEAK DETECTED: Memory growing at ${MEMORY_RSS_GROWTH_RATE} MB/sec" | tee -a "${REPORT_FILE}"
elif (( $(echo "${MEMORY_RSS_GROWTH_RATE} > 0.1" | bc -l 2>/dev/null || echo 0) )); then
    echo "⚠️ WARNING: Slow memory growth detected (${MEMORY_RSS_GROWTH_RATE} MB/sec)" | tee -a "${REPORT_FILE}"
else
    echo "✅ No significant memory leak detected" | tee -a "${REPORT_FILE}"
fi
echo "" | tee -a "${REPORT_FILE}"
76
+
# CPU analysis
# Reads CSV field 6 of every data row and reports min/max/avg plus the number
# of samples above 80%.
{
    echo "========================================"
    echo "CPU ANALYSIS"
    echo "========================================"
} | tee -a "${REPORT_FILE}"

# `sed -n '1p'` replaces `head -1` so sort never hits a closed pipe under
# `set -o pipefail`.
CPU_VALUES=$(echo "${DATA}" | cut -d',' -f6)
CPU_MIN=$(echo "${CPU_VALUES}" | sort -n | sed -n '1p')
CPU_MAX=$(echo "${CPU_VALUES}" | sort -n | tail -1)
CPU_AVG=$(echo "${CPU_VALUES}" | awk '{sum+=$1; count++} END {printf "%.2f", sum/count}')

{
    echo "Min: ${CPU_MIN}%"
    echo "Max: ${CPU_MAX}%"
    echo "Avg: ${CPU_AVG}%"
} | tee -a "${REPORT_FILE}"

# CPU spike detection: more than 5 samples above 80% in total (the samples are
# counted across the whole run; they need not be consecutive).
# `count + 0` forces numeric output: an awk variable that was never incremented
# prints as an empty string, which makes the integer test below error out.
CPU_SPIKES=$(echo "${CPU_VALUES}" | awk '{if ($1 > 80) count++} END {print count + 0}')
if [ "${CPU_SPIKES}" -gt 5 ]; then
    echo "⚠️ CPU SPIKES: ${CPU_SPIKES} samples above 80%" | tee -a "${REPORT_FILE}"
else
    echo "✅ No sustained CPU spikes detected" | tee -a "${REPORT_FILE}"
fi
echo "" | tee -a "${REPORT_FILE}"
99
+
# File descriptor analysis
# Reads CSV field 7 of every data row and reports min/max/avg, first/last and
# the growth between the first and last sample.
{
    echo "========================================"
    echo "FILE DESCRIPTOR ANALYSIS"
    echo "========================================"
} | tee -a "${REPORT_FILE}"

# `sed -n '1p'` replaces `head -1`: head closes the pipe after one line, and
# with pipefail + errexit the writer's broken-pipe failure aborts the script
# on large inputs.
FD_VALUES=$(echo "${DATA}" | cut -d',' -f7)
FD_MIN=$(echo "${FD_VALUES}" | sort -n | sed -n '1p')
FD_MAX=$(echo "${FD_VALUES}" | sort -n | tail -1)
FD_AVG=$(echo "${FD_VALUES}" | awk '{sum+=$1; count++} END {printf "%.0f", sum/count}')

FD_FIRST=$(echo "${FD_VALUES}" | sed -n '1p')
FD_LAST=$(echo "${FD_VALUES}" | tail -1)
FD_GROWTH=$((FD_LAST - FD_FIRST))

{
    echo "Min: ${FD_MIN}"
    echo "Max: ${FD_MAX}"
    echo "Avg: ${FD_AVG}"
    echo "First: ${FD_FIRST}"
    echo "Last: ${FD_LAST}"
    echo "Growth: ${FD_GROWTH}"
} | tee -a "${REPORT_FILE}"

# FD leak detection (growth > 100 is a leak, growth > 20 a warning)
if [ "${FD_GROWTH}" -gt 100 ]; then
    echo "⚠️ FD LEAK DETECTED: ${FD_GROWTH} unclosed file descriptors" | tee -a "${REPORT_FILE}"
elif [ "${FD_GROWTH}" -gt 20 ]; then
    echo "⚠️ WARNING: FD growth detected (${FD_GROWTH})" | tee -a "${REPORT_FILE}"
else
    echo "✅ No significant FD leak detected" | tee -a "${REPORT_FILE}"
fi
echo "" | tee -a "${REPORT_FILE}"
130
+
# Process count analysis
# Reads CSV field 8 of every data row and reports min/max/avg, first/last and
# the growth between the first and last sample.
{
    echo "========================================"
    echo "PROCESS COUNT ANALYSIS"
    echo "========================================"
} | tee -a "${REPORT_FILE}"

# `sed -n '1p'` replaces `head -1`: head closes the pipe after one line, and
# with pipefail + errexit the writer's broken-pipe failure aborts the script
# on large inputs.
PROC_VALUES=$(echo "${DATA}" | cut -d',' -f8)
PROC_MIN=$(echo "${PROC_VALUES}" | sort -n | sed -n '1p')
PROC_MAX=$(echo "${PROC_VALUES}" | sort -n | tail -1)
PROC_AVG=$(echo "${PROC_VALUES}" | awk '{sum+=$1; count++} END {printf "%.0f", sum/count}')

PROC_FIRST=$(echo "${PROC_VALUES}" | sed -n '1p')
PROC_LAST=$(echo "${PROC_VALUES}" | tail -1)
PROC_GROWTH=$((PROC_LAST - PROC_FIRST))

{
    echo "Min: ${PROC_MIN}"
    echo "Max: ${PROC_MAX}"
    echo "Avg: ${PROC_AVG}"
    echo "First: ${PROC_FIRST}"
    echo "Last: ${PROC_LAST}"
    echo "Growth: ${PROC_GROWTH}"
} | tee -a "${REPORT_FILE}"

# Process leak detection (growth > 50 is a leak, growth > 10 a warning)
if [ "${PROC_GROWTH}" -gt 50 ]; then
    echo "⚠️ PROCESS LEAK DETECTED: ${PROC_GROWTH} orphaned processes" | tee -a "${REPORT_FILE}"
elif [ "${PROC_GROWTH}" -gt 10 ]; then
    echo "⚠️ WARNING: Process growth detected (${PROC_GROWTH})" | tee -a "${REPORT_FILE}"
else
    echo "✅ No significant process leak detected" | tee -a "${REPORT_FILE}"
fi
echo "" | tee -a "${REPORT_FILE}"
161
+
# Anomaly summary
# Re-applies the warning-level thresholds of the sections above (memory growth
# > 0.1 MB/sec, > 5 CPU samples above 80%, FD growth > 20, process growth > 10)
# and prints one bullet per triggered check followed by the total.
{
    echo "========================================"
    echo "ANOMALY SUMMARY"
    echo "========================================"
} | tee -a "${REPORT_FILE}"

ANOMALY_COUNT=0

# The `:-0` defaults guard against an empty growth rate (zero duration) and an
# empty spike count (no sample above 80%). Without them bc receives a malformed
# expression and `[ "" -gt 5 ]` raises "integer expression expected".
if (( $(echo "${MEMORY_RSS_GROWTH_RATE:-0} > 0.1" | bc -l 2>/dev/null || echo 0) )); then
    echo "• Memory growth: ${MEMORY_RSS_GROWTH_RATE} MB/sec" | tee -a "${REPORT_FILE}"
    ANOMALY_COUNT=$((ANOMALY_COUNT + 1))
fi

if [ "${CPU_SPIKES:-0}" -gt 5 ]; then
    echo "• CPU spikes: ${CPU_SPIKES} samples above 80%" | tee -a "${REPORT_FILE}"
    ANOMALY_COUNT=$((ANOMALY_COUNT + 1))
fi

if [ "${FD_GROWTH}" -gt 20 ]; then
    echo "• FD growth: ${FD_GROWTH}" | tee -a "${REPORT_FILE}"
    ANOMALY_COUNT=$((ANOMALY_COUNT + 1))
fi

if [ "${PROC_GROWTH}" -gt 10 ]; then
    echo "• Process growth: ${PROC_GROWTH}" | tee -a "${REPORT_FILE}"
    ANOMALY_COUNT=$((ANOMALY_COUNT + 1))
fi

if [ "${ANOMALY_COUNT}" -eq 0 ]; then
    echo "✅ No anomalies detected - system healthy" | tee -a "${REPORT_FILE}"
else
    {
        echo ""
        echo "⚠️ Total anomalies: ${ANOMALY_COUNT}"
    } | tee -a "${REPORT_FILE}"
fi

# Footer: tell the reader where the report was written.
{
    echo ""
    echo "========================================"
    echo "Report saved: ${REPORT_FILE}"
    echo "========================================"
} | tee -a "${REPORT_FILE}"
@@ -0,0 +1,211 @@
1
+ {
2
+ "dashboard": {
3
+ "title": "CFN Rate Limiting & Backpressure Monitoring",
4
+ "description": "Real-time monitoring of message inbox utilization, backpressure events, and overflow alerts",
5
+ "version": "1.0.0",
6
+ "tags": ["rate-limiting", "backpressure", "inbox", "coordination"],
7
+ "timezone": "UTC",
8
+ "refresh": "10s",
9
+ "panels": [
10
+ {
11
+ "id": 1,
12
+ "title": "Inbox Utilization by Agent",
13
+ "type": "timeseries",
14
+ "gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
15
+ "targets": [
16
+ {
17
+ "metric": "inbox.utilization",
18
+ "legend": "{{agent}}",
19
+ "unit": "percent"
20
+ }
21
+ ],
22
+ "thresholds": [
23
+ { "value": 75, "color": "yellow", "label": "Warning" },
24
+ { "value": 90, "color": "red", "label": "Critical" }
25
+ ],
26
+ "description": "Message inbox utilization percentage per agent (max 100 messages)"
27
+ },
28
+ {
29
+ "id": 2,
30
+ "title": "Inbox Message Count",
31
+ "type": "timeseries",
32
+ "gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
33
+ "targets": [
34
+ {
35
+ "metric": "inbox.size",
36
+ "legend": "{{agent}}",
37
+ "unit": "count"
38
+ }
39
+ ],
40
+ "thresholds": [
41
+ { "value": 75, "color": "yellow", "label": "Warning (75 msgs)" },
42
+ { "value": 90, "color": "red", "label": "Critical (90 msgs)" }
43
+ ],
44
+ "description": "Absolute message count in agent inboxes"
45
+ },
46
+ {
47
+ "id": 3,
48
+ "title": "Backpressure Events Rate",
49
+ "type": "timeseries",
50
+ "gridPos": { "x": 0, "y": 8, "w": 12, "h": 8 },
51
+ "targets": [
52
+ {
53
+ "metric": "backpressure.events_per_min",
54
+ "legend": "Backpressure events/min",
55
+ "unit": "count"
56
+ }
57
+ ],
58
+ "thresholds": [
59
+ { "value": 100, "color": "yellow", "label": "Warning threshold" }
60
+ ],
61
+ "description": "Rate of backpressure wait events (high rate indicates system load)"
62
+ },
63
+ {
64
+ "id": 4,
65
+ "title": "Message Send Failures",
66
+ "type": "timeseries",
67
+ "gridPos": { "x": 12, "y": 8, "w": 12, "h": 8 },
68
+ "targets": [
69
+ {
70
+ "metric": "coordination.send_failures_per_min",
71
+ "legend": "Send failures/min",
72
+ "unit": "count"
73
+ }
74
+ ],
75
+ "thresholds": [
76
+ { "value": 10, "color": "red", "label": "Critical threshold" }
77
+ ],
78
+ "description": "Message delivery failure rate (critical if >10/min)"
79
+ },
80
+ {
81
+ "id": 5,
82
+ "title": "Inbox Overflow Events",
83
+ "type": "timeseries",
84
+ "gridPos": { "x": 0, "y": 16, "w": 12, "h": 8 },
85
+ "targets": [
86
+ {
87
+ "metric": "inbox.overflow_events_per_min",
88
+ "legend": "Overflow events/min",
89
+ "unit": "count"
90
+ }
91
+ ],
92
+ "thresholds": [
93
+ { "value": 1, "color": "red", "label": "Any overflow is critical" }
94
+ ],
95
+ "description": "Inbox overflow events (messages dropped due to full inbox)"
96
+ },
97
+ {
98
+ "id": 6,
99
+ "title": "Alert Summary",
100
+ "type": "stat",
101
+ "gridPos": { "x": 12, "y": 16, "w": 6, "h": 8 },
102
+ "targets": [
103
+ {
104
+ "query": "count_alerts_by_severity",
105
+ "fields": ["critical", "warning", "info"]
106
+ }
107
+ ],
108
+ "description": "Alert counts by severity level (last hour)"
109
+ },
110
+ {
111
+ "id": 7,
112
+ "title": "Top Utilized Agents",
113
+ "type": "table",
114
+ "gridPos": { "x": 18, "y": 16, "w": 6, "h": 8 },
115
+ "targets": [
116
+ {
117
+ "query": "top_inbox_utilization",
118
+ "fields": ["agent", "utilization", "message_count"],
119
+ "limit": 10,
120
+ "order": "desc"
121
+ }
122
+ ],
123
+ "description": "Agents with highest inbox utilization"
124
+ }
125
+ ],
126
+ "annotations": [
127
+ {
128
+ "name": "Rate Limiting Alerts",
129
+ "datasource": "cfn-alerts",
130
+ "filter": {
131
+ "tags": ["inbox_high_utilization", "backpressure_high_rate", "inbox_overflow_detected"]
132
+ },
133
+ "color": "red"
134
+ }
135
+ ],
136
+ "variables": [
137
+ {
138
+ "name": "agent",
139
+ "type": "query",
140
+ "query": "SELECT DISTINCT agent FROM inbox_metrics",
141
+ "description": "Filter by specific agent",
142
+ "multi": true,
143
+ "includeAll": true
144
+ },
145
+ {
146
+ "name": "timeRange",
147
+ "type": "interval",
148
+ "options": ["5m", "15m", "1h", "6h", "24h"],
149
+ "default": "1h",
150
+ "description": "Time range for metrics"
151
+ }
152
+ ]
153
+ },
154
+ "queries": {
155
+ "count_alerts_by_severity": {
156
+ "description": "Count alerts grouped by severity",
157
+ "source": "/dev/shm/cfn-alerts.jsonl",
158
+ "aggregation": "GROUP BY severity, COUNT(*)",
159
+ "timeWindow": "1h"
160
+ },
161
+ "top_inbox_utilization": {
162
+ "description": "Top 10 agents ranked by peak inbox utilization",
163
+ "source": "/dev/shm/cfn-metrics.jsonl",
164
+ "query": "SELECT agent, MAX(value) as utilization FROM metrics WHERE metric='inbox.utilization' GROUP BY agent ORDER BY utilization DESC LIMIT 10",
165
+ "timeWindow": "5m"
166
+ }
167
+ },
168
+ "alertRules": [
169
+ {
170
+ "name": "Inbox Critical Utilization",
171
+ "condition": "inbox.utilization >= 90",
172
+ "severity": "critical",
173
+ "message": "Agent {{agent}} inbox at {{value}}% utilization (critical threshold: 90%)",
174
+ "actions": ["emit_alert", "notify_oncall"]
175
+ },
176
+ {
177
+ "name": "Inbox Warning Utilization",
178
+ "condition": "inbox.utilization >= 75 AND inbox.utilization < 90",
179
+ "severity": "warning",
180
+ "message": "Agent {{agent}} inbox at {{value}}% utilization (warning threshold: 75%)",
181
+ "actions": ["emit_alert"]
182
+ },
183
+ {
184
+ "name": "Backpressure High Rate",
185
+ "condition": "backpressure.events_per_min > 100",
186
+ "severity": "warning",
187
+ "message": "Backpressure events exceeding threshold: {{value}} events/min (threshold: 100/min)",
188
+ "actions": ["emit_alert"]
189
+ },
190
+ {
191
+ "name": "Message Send Failures Critical",
192
+ "condition": "coordination.send_failures_per_min > 10",
193
+ "severity": "critical",
194
+ "message": "Message send failures critical: {{value}} failures/min (threshold: 10/min)",
195
+ "actions": ["emit_alert", "notify_oncall", "trigger_incident"]
196
+ },
197
+ {
198
+ "name": "Inbox Overflow Detected",
199
+ "condition": "inbox.overflow_events_per_min > 0",
200
+ "severity": "critical",
201
+ "message": "Inbox overflow detected: {{value}} overflow events in last minute",
202
+ "actions": ["emit_alert", "notify_oncall", "trigger_incident"]
203
+ }
204
+ ],
205
+ "metadata": {
206
+ "createdBy": "devops-engineer",
207
+ "phase": "1",
208
+ "sprint": "1.5",
209
+ "lastUpdated": "2025-10-06T19:35:00Z"
210
+ }
211
+ }