claude-flow-novice 2.14.3 → 2.14.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. package/.claude/commands/CFN_LOOP_TASK_MODE.md +4 -47
  2. package/.claude/skills/cfn-redis-coordination/demos/test-cancel-swarm.sh +0 -276
  3. package/claude-assets/commands/CFN_LOOP_TASK_MODE.md +4 -47
  4. package/claude-assets/skills/cfn-redis-coordination/demos/test-cancel-swarm.sh +0 -276
  5. package/dist/agents/agent-loader.js +165 -146
  6. package/dist/agents/agent-loader.js.map +1 -1
  7. package/dist/cli/agent-prompt-builder.js +25 -0
  8. package/dist/cli/agent-prompt-builder.js.map +1 -1
  9. package/dist/cli/config-manager.js +91 -109
  10. package/package.json +1 -1
  11. package/.claude/skills/cfn-redis-coordination/HEARTBEAT.md +0 -57
  12. package/.claude/skills/cfn-redis-coordination/HEARTBEAT_MONITORING.md +0 -267
  13. package/.claude/skills/cfn-redis-coordination/LOGGING.md +0 -260
  14. package/.claude/skills/cfn-redis-coordination/README.md +0 -65
  15. package/.claude/skills/cfn-redis-coordination/SECURITY_REVIEW.md +0 -25
  16. package/.claude/skills/cfn-redis-coordination/SHUTDOWN_HANDLING.md +0 -164
  17. package/.claude/skills/cfn-redis-coordination/SKILL.md +0 -720
  18. package/.claude/skills/cfn-redis-coordination/demos/test-dlq.sh +0 -129
  19. package/.claude/skills/cfn-redis-coordination/demos/test-iteration-feedback.sh +0 -320
  20. package/.claude/skills/cfn-redis-coordination/demos/test-orchestrator.sh +0 -249
  21. package/.claude/skills/cfn-redis-coordination/demos/test-priority-wake-phase4-unix.sh +0 -148
  22. package/.claude/skills/cfn-redis-coordination/demos/test-priority-wake-phase4.sh +0 -163
  23. package/.claude/skills/cfn-redis-coordination/demos/test-priority-wake.sh +0 -138
  24. package/.claude/skills/cfn-redis-coordination/demos/test-quick-fix.sh +0 -81
  25. package/.claude/skills/cfn-redis-coordination/demos/test-quorum-absolute.sh +0 -45
  26. package/.claude/skills/cfn-redis-coordination/demos/test-quorum-fallback.sh +0 -68
  27. package/.claude/skills/cfn-redis-coordination/demos/test-quorum-percentage.sh +0 -56
  28. package/.claude/skills/cfn-redis-coordination/demos/test-quorum-with-retry.sh +0 -81
  29. package/.claude/skills/cfn-redis-coordination/demos/test-quorum.sh +0 -57
  30. package/.claude/skills/cfn-redis-coordination/demos/test-shutdown-handling.sh +0 -187
  31. package/.claude/skills/cfn-redis-coordination/demos/test-shutdown.sh +0 -160
  32. package/.claude/skills/cfn-redis-coordination/demos/test-utils-unix.sh +0 -97
  33. package/.claude/skills/cfn-redis-coordination/demos/test-utils.sh +0 -97
  34. package/.claude/skills/cfn-redis-coordination/demos/test-waiting-mode.sh +0 -59
  35. package/.claude/skills/cfn-redis-coordination/examples/README.md +0 -73
  36. package/.claude/skills/cfn-redis-coordination/examples/grafana-dashboard.json +0 -352
  37. package/.claude/skills/cfn-redis-coordination/examples/hierarchical-pattern.sh +0 -127
  38. package/.claude/skills/cfn-redis-coordination/examples/mesh-pattern.sh +0 -171
  39. package/.claude/skills/cfn-redis-coordination/examples/timeout-handling.sh +0 -227
  40. package/.claude/skills/cfn-redis-coordination/examples/waiting-mode-pattern.sh +0 -239
  41. package/.claude/skills/cfn-redis-coordination/execute-product-owner-decision.sh +0 -258
  42. package/.claude/skills/cfn-redis-coordination/get-agent-timeout.sh +0 -177
  43. package/.claude/skills/cfn-redis-coordination/heartbeat-functions.sh +0 -137
  44. package/.claude/skills/cfn-redis-coordination/heartbeat-protocol.md +0 -106
  45. package/.claude/skills/cfn-redis-coordination/heartbeat.sh +0 -126
  46. package/.claude/skills/cfn-redis-coordination/init-swarm.sh +0 -148
  47. package/.claude/skills/cfn-redis-coordination/invoke-redis-pattern.sh +0 -220
  48. package/.claude/skills/cfn-redis-coordination/invoke-waiting-mode.sh +0 -283
  49. package/.claude/skills/cfn-redis-coordination/list-active-swarms.sh +0 -147
  50. package/.claude/skills/cfn-redis-coordination/log-event.sh +0 -109
  51. package/.claude/skills/cfn-redis-coordination/metrics-export.sh +0 -674
  52. package/.claude/skills/cfn-redis-coordination/metrics-schema.json +0 -66
  53. package/.claude/skills/cfn-redis-coordination/metrics-storage.md +0 -31
  54. package/.claude/skills/cfn-redis-coordination/monitor-cfn-violations.sh +0 -391
  55. package/.claude/skills/cfn-redis-coordination/monitor-heartbeats.sh +0 -101
  56. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop-v3.sh +0 -141
  57. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh +0 -31
  58. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.bak +0 -0
  59. package/.claude/skills/cfn-redis-coordination/priority-wake-mechanism.md +0 -75
  60. package/.claude/skills/cfn-redis-coordination/priority_wake.py +0 -134
  61. package/.claude/skills/cfn-redis-coordination/query-dlq.sh +0 -162
  62. package/.claude/skills/cfn-redis-coordination/query-logs.sh +0 -103
  63. package/.claude/skills/cfn-redis-coordination/redis-pattern.sh +0 -619
  64. package/.claude/skills/cfn-redis-coordination/retrieve-context.sh +0 -58
  65. package/.claude/skills/cfn-redis-coordination/select-specialist-agent.sh +0 -371
  66. package/.claude/skills/cfn-redis-coordination/semantic-match-tfidf.py +0 -252
  67. package/.claude/skills/cfn-redis-coordination/send-heartbeat.sh +0 -165
  68. package/.claude/skills/cfn-redis-coordination/signal.sh +0 -38
  69. package/.claude/skills/cfn-redis-coordination/store-context.sh +0 -86
  70. package/.claude/skills/cfn-redis-coordination/store-epic-context.sh +0 -123
  71. package/.claude/skills/cfn-redis-coordination/test-context-injection.sh +0 -354
  72. package/.claude/skills/cfn-redis-coordination/test-timeout-enforcement.sh +0 -513
  73. package/.claude/skills/cfn-redis-coordination/tests/convert-line-endings.sh +0 -15
  74. package/.claude/skills/cfn-redis-coordination/tests/dlq-functionality-test.sh +0 -102
  75. package/.claude/skills/cfn-redis-coordination/tests/edge-cases-test.sh +0 -99
  76. package/.claude/skills/cfn-redis-coordination/tests/integration-test.sh +0 -170
  77. package/.claude/skills/cfn-redis-coordination/tests/retry-mechanism-test.sh +0 -82
  78. package/.claude/skills/cfn-redis-coordination/tests/run-test-suite.sh +0 -92
  79. package/.claude/skills/cfn-redis-coordination/tests/run-tests.sh +0 -4
  80. package/.claude/skills/cfn-redis-coordination/tests/test-heartbeat-monitoring.sh +0 -418
  81. package/.claude/skills/cfn-redis-coordination/tests/test-heartbeat-simple.sh +0 -124
  82. package/.claude/skills/cfn-redis-coordination/tests/test-primitives.sh +0 -166
  83. package/.claude/skills/cfn-redis-coordination/tests/test-utils.sh +0 -54
  84. package/.claude/skills/cfn-redis-coordination/tests/test_utils.sh +0 -49
  85. package/.claude/skills/cfn-redis-coordination/v2_modularization/core_orchestration.sh +0 -76
  86. package/.claude/skills/cfn-redis-coordination/validate-parameters.sh +0 -492
  87. package/claude-assets/skills/cfn-redis-coordination/HEARTBEAT.md +0 -57
  88. package/claude-assets/skills/cfn-redis-coordination/HEARTBEAT_MONITORING.md +0 -267
  89. package/claude-assets/skills/cfn-redis-coordination/LOGGING.md +0 -260
  90. package/claude-assets/skills/cfn-redis-coordination/README.md +0 -65
  91. package/claude-assets/skills/cfn-redis-coordination/SECURITY_REVIEW.md +0 -25
  92. package/claude-assets/skills/cfn-redis-coordination/SHUTDOWN_HANDLING.md +0 -164
  93. package/claude-assets/skills/cfn-redis-coordination/SKILL.md +0 -720
  94. package/claude-assets/skills/cfn-redis-coordination/demos/test-dlq.sh +0 -129
  95. package/claude-assets/skills/cfn-redis-coordination/demos/test-iteration-feedback.sh +0 -320
  96. package/claude-assets/skills/cfn-redis-coordination/demos/test-orchestrator.sh +0 -249
  97. package/claude-assets/skills/cfn-redis-coordination/demos/test-priority-wake-phase4-unix.sh +0 -148
  98. package/claude-assets/skills/cfn-redis-coordination/demos/test-priority-wake-phase4.sh +0 -163
  99. package/claude-assets/skills/cfn-redis-coordination/demos/test-priority-wake.sh +0 -138
  100. package/claude-assets/skills/cfn-redis-coordination/demos/test-quick-fix.sh +0 -81
  101. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-absolute.sh +0 -45
  102. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-fallback.sh +0 -68
  103. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-percentage.sh +0 -56
  104. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-with-retry.sh +0 -81
  105. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum.sh +0 -57
  106. package/claude-assets/skills/cfn-redis-coordination/demos/test-shutdown-handling.sh +0 -187
  107. package/claude-assets/skills/cfn-redis-coordination/demos/test-shutdown.sh +0 -160
  108. package/claude-assets/skills/cfn-redis-coordination/demos/test-utils-unix.sh +0 -97
  109. package/claude-assets/skills/cfn-redis-coordination/demos/test-utils.sh +0 -97
  110. package/claude-assets/skills/cfn-redis-coordination/demos/test-waiting-mode.sh +0 -59
  111. package/claude-assets/skills/cfn-redis-coordination/examples/README.md +0 -73
  112. package/claude-assets/skills/cfn-redis-coordination/examples/grafana-dashboard.json +0 -352
  113. package/claude-assets/skills/cfn-redis-coordination/examples/hierarchical-pattern.sh +0 -127
  114. package/claude-assets/skills/cfn-redis-coordination/examples/mesh-pattern.sh +0 -171
  115. package/claude-assets/skills/cfn-redis-coordination/examples/timeout-handling.sh +0 -227
  116. package/claude-assets/skills/cfn-redis-coordination/examples/waiting-mode-pattern.sh +0 -239
  117. package/claude-assets/skills/cfn-redis-coordination/execute-product-owner-decision.sh +0 -258
  118. package/claude-assets/skills/cfn-redis-coordination/get-agent-timeout.sh +0 -177
  119. package/claude-assets/skills/cfn-redis-coordination/heartbeat-functions.sh +0 -137
  120. package/claude-assets/skills/cfn-redis-coordination/heartbeat-protocol.md +0 -106
  121. package/claude-assets/skills/cfn-redis-coordination/heartbeat.sh +0 -126
  122. package/claude-assets/skills/cfn-redis-coordination/init-swarm.sh +0 -148
  123. package/claude-assets/skills/cfn-redis-coordination/invoke-redis-pattern.sh +0 -220
  124. package/claude-assets/skills/cfn-redis-coordination/invoke-waiting-mode.sh +0 -283
  125. package/claude-assets/skills/cfn-redis-coordination/list-active-swarms.sh +0 -147
  126. package/claude-assets/skills/cfn-redis-coordination/log-event.sh +0 -109
  127. package/claude-assets/skills/cfn-redis-coordination/metrics-export.sh +0 -674
  128. package/claude-assets/skills/cfn-redis-coordination/metrics-schema.json +0 -66
  129. package/claude-assets/skills/cfn-redis-coordination/metrics-storage.md +0 -31
  130. package/claude-assets/skills/cfn-redis-coordination/monitor-cfn-violations.sh +0 -391
  131. package/claude-assets/skills/cfn-redis-coordination/monitor-heartbeats.sh +0 -101
  132. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop-v3.sh +0 -141
  133. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh +0 -31
  134. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.bak +0 -0
  135. package/claude-assets/skills/cfn-redis-coordination/priority-wake-mechanism.md +0 -75
  136. package/claude-assets/skills/cfn-redis-coordination/priority_wake.py +0 -134
  137. package/claude-assets/skills/cfn-redis-coordination/query-dlq.sh +0 -162
  138. package/claude-assets/skills/cfn-redis-coordination/query-logs.sh +0 -103
  139. package/claude-assets/skills/cfn-redis-coordination/redis-pattern.sh +0 -619
  140. package/claude-assets/skills/cfn-redis-coordination/retrieve-context.sh +0 -58
  141. package/claude-assets/skills/cfn-redis-coordination/select-specialist-agent.sh +0 -371
  142. package/claude-assets/skills/cfn-redis-coordination/semantic-match-tfidf.py +0 -252
  143. package/claude-assets/skills/cfn-redis-coordination/send-heartbeat.sh +0 -165
  144. package/claude-assets/skills/cfn-redis-coordination/signal.sh +0 -38
  145. package/claude-assets/skills/cfn-redis-coordination/store-context.sh +0 -86
  146. package/claude-assets/skills/cfn-redis-coordination/store-epic-context.sh +0 -123
  147. package/claude-assets/skills/cfn-redis-coordination/test-context-injection.sh +0 -354
  148. package/claude-assets/skills/cfn-redis-coordination/test-timeout-enforcement.sh +0 -513
  149. package/claude-assets/skills/cfn-redis-coordination/tests/convert-line-endings.sh +0 -15
  150. package/claude-assets/skills/cfn-redis-coordination/tests/dlq-functionality-test.sh +0 -102
  151. package/claude-assets/skills/cfn-redis-coordination/tests/edge-cases-test.sh +0 -99
  152. package/claude-assets/skills/cfn-redis-coordination/tests/integration-test.sh +0 -170
  153. package/claude-assets/skills/cfn-redis-coordination/tests/retry-mechanism-test.sh +0 -82
  154. package/claude-assets/skills/cfn-redis-coordination/tests/run-test-suite.sh +0 -92
  155. package/claude-assets/skills/cfn-redis-coordination/tests/run-tests.sh +0 -4
  156. package/claude-assets/skills/cfn-redis-coordination/tests/test-heartbeat-monitoring.sh +0 -418
  157. package/claude-assets/skills/cfn-redis-coordination/tests/test-heartbeat-simple.sh +0 -124
  158. package/claude-assets/skills/cfn-redis-coordination/tests/test-primitives.sh +0 -166
  159. package/claude-assets/skills/cfn-redis-coordination/tests/test-utils.sh +0 -54
  160. package/claude-assets/skills/cfn-redis-coordination/tests/test_utils.sh +0 -49
  161. package/claude-assets/skills/cfn-redis-coordination/v2_modularization/core_orchestration.sh +0 -76
  162. package/claude-assets/skills/cfn-redis-coordination/validate-parameters.sh +0 -492
@@ -1,66 +0,0 @@
1
- {
2
- "$schema": "http://json-schema.org/draft-07/schema#",
3
- "title": "CFN Loop Observability Metrics Schema",
4
- "version": "1.0.0",
5
- "type": "object",
6
- "properties": {
7
- "task_metadata": {
8
- "type": "object",
9
- "properties": {
10
- "task_id": {"type": "string"},
11
- "mode": {"enum": ["mvp", "standard", "enterprise"]},
12
- "current_iteration": {"type": "number", "minimum": 1},
13
- "start_timestamp": {"type": "number"}
14
- },
15
- "required": ["task_id", "mode"]
16
- },
17
- "iteration_metrics": {
18
- "type": "object",
19
- "properties": {
20
- "duration_ms": {"type": "number", "minimum": 0},
21
- "iteration_count": {"type": "number", "minimum": 1},
22
- "gate_pass_rate": {"type": "number", "minimum": 0, "maximum": 1}
23
- }
24
- },
25
- "agent_metrics": {
26
- "type": "object",
27
- "properties": {
28
- "latency_ms": {
29
- "type": "object",
30
- "additionalProperties": {
31
- "type": "object",
32
- "properties": {
33
- "min": {"type": "number"},
34
- "max": {"type": "number"},
35
- "avg": {"type": "number"}
36
- }
37
- }
38
- },
39
- "timeout_count": {"type": "number", "minimum": 0},
40
- "retry_count": {"type": "number", "minimum": 0},
41
- "heartbeat_miss_count": {"type": "number", "minimum": 0}
42
- }
43
- },
44
- "consensus_metrics": {
45
- "type": "object",
46
- "properties": {
47
- "loop3": {
48
- "type": "object",
49
- "properties": {
50
- "confidence_score": {"type": "number", "minimum": 0, "maximum": 1},
51
- "gate_pass": {"type": "boolean"}
52
- }
53
- },
54
- "loop2": {
55
- "type": "object",
56
- "properties": {
57
- "consensus_score": {"type": "number", "minimum": 0, "maximum": 1},
58
- "final_consensus": {"type": "boolean"}
59
- }
60
- },
61
- "quorum_fallback_count": {"type": "number", "minimum": 0}
62
- }
63
- }
64
- },
65
- "required": ["task_metadata", "iteration_metrics", "agent_metrics", "consensus_metrics"]
66
- }
@@ -1,31 +0,0 @@
1
- # CFN Loop Metrics Storage Strategy
2
-
3
- ## Redis Key Patterns
4
-
5
- ### Task-Level Metrics
6
- - `swarm:{task_id}:metrics:metadata`
7
- - `swarm:{task_id}:metrics:iteration`
8
- - `swarm:{task_id}:metrics:agent`
9
- - `swarm:{task_id}:metrics:consensus`
10
-
11
- ### Iteration-Specific Keys
12
- - `swarm:{task_id}:iteration:{iteration_number}:duration`
13
- - `swarm:{task_id}:iteration:{iteration_number}:gate_pass_rate`
14
-
15
- ### Agent-Level Keys
16
- - `swarm:{task_id}:agent:{agent_id}:latency`
17
- - `swarm:{task_id}:agent:{agent_id}:timeouts`
18
-
19
- ### Consensus Keys
20
- - `swarm:{task_id}:consensus:loop3:confidence`
21
- - `swarm:{task_id}:consensus:loop2:score`
22
-
23
- ## Storage Mechanisms
24
- - Hash (HSET): Detailed metrics
25
- - List (LPUSH): Time-series events
26
- - Sorted Set (ZADD): Ranked metrics
27
-
28
- ## Retention Policy
29
- - Default: 30 days
30
- - Can be configured via environment variable
31
- - Automatic pruning after task completion
@@ -1,391 +0,0 @@
1
- #!/bin/bash
2
- # monitor-cfn-violations.sh - Real-time CFN Loop violation detector
3
- # Part of Redis Coordination Skill
4
- #
5
- # Monitors active CFN Loop executions and detects common violations:
6
- # - Orchestrator never started
7
- # - Loop 2 started before Loop 3 complete (gate bypass)
8
- # - Missing agent completion signals
9
- # - Heartbeat monitoring not started
10
- # - Product Owner not consulted
11
- # - Coordinator timeout issues
12
- #
13
- # Alerts sent via Redis pub/sub and WebSocket (web portal integration)
14
- #
15
- # Usage: ./monitor-cfn-violations.sh [--interval 30] [--websocket-port 3001]
16
- #
17
- # Version: 1.0.0
18
- # Last Updated: 2025-10-20
19
-
20
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REDIS_HOST="${REDIS_HOST:-localhost}"
REDIS_PORT="${REDIS_PORT:-6379}"
CHECK_INTERVAL=30 # seconds between checks
WEBSOCKET_PORT=3001
# Defaults to the historical path; may be overridden through the environment.
VIOLATION_LOG="${VIOLATION_LOG:-/tmp/cfn-violations.log}"

# Print an argument error on stderr and stop the script.
_monitor_usage_error() {
  printf '%s\n' "$1" >&2
  printf 'Usage: %s [--interval SECONDS] [--websocket-port PORT]\n' "${0##*/}" >&2
  exit 1
}

#######################################
# Parse command-line arguments.
# Globals:   CHECK_INTERVAL, WEBSOCKET_PORT (written)
# Arguments: the script's command-line arguments
# Returns:   0 on success; exits 1 on an unknown option, a missing value,
#            or a value that is not a usable number
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --interval)
        # Check the count first: "$2" would trip `set -u` with no usage hint.
        (( $# >= 2 )) || _monitor_usage_error "Missing value for $1"
        [[ "$2" =~ ^[0-9]+([.][0-9]+)?$ ]] \
          || _monitor_usage_error "Invalid interval: $2"
        CHECK_INTERVAL="$2"
        shift 2
        ;;
      --websocket-port)
        (( $# >= 2 )) || _monitor_usage_error "Missing value for $1"
        [[ "$2" =~ ^[0-9]+$ ]] && (( 10#$2 >= 1 && 10#$2 <= 65535 )) \
          || _monitor_usage_error "Invalid port: $2"
        WEBSOCKET_PORT="$2"
        shift 2
        ;;
      *)
        _monitor_usage_error "Unknown argument: $1"
        ;;
    esac
  done
}

# Parse arguments
parse_args "$@"

echo "=== CFN Loop Violation Monitor ==="
echo "Redis: ${REDIS_HOST}:${REDIS_PORT}"
echo "Check interval: ${CHECK_INTERVAL}s"
echo "WebSocket port: ${WEBSOCKET_PORT}"
echo "Log: ${VIOLATION_LOG}"
echo ""

# Initialize violation log (truncates any log left by a previous run)
echo "[$(date -Iseconds)] Monitor started" > "$VIOLATION_LOG"
56
-
57
# Function: Send violation alert via Redis pub/sub
#
# Builds a JSON alert with jq, publishes it on the task-specific and the
# global violations channels, appends a line to the violation log and, when
# curl is installed, POSTs the alert to the local HTTP endpoint.
# Globals:   REDIS_HOST, REDIS_PORT, WEBSOCKET_PORT, VIOLATION_LOG (read)
# Arguments: $1 - task id
#            $2 - violation type
#            $3 - severity (critical, warning, info)
#            $4 - description
#            $5 - recommendation
#            $6 - evidence as a JSON string (empty is treated as {})
# Outputs:   one summary line on stdout; warnings on stderr
# Returns:   0 (publishing and the HTTP POST are best-effort)
send_violation_alert() {
  local task_id="$1"
  local violation_type="$2"
  local severity="$3"
  local description="$4"
  local recommendation="$5"
  local evidence="${6:-}"
  local timestamp alert channel

  timestamp=$(date -Iseconds)
  [[ -n "$evidence" ]] || evidence='{}'

  local filter='{timestamp: $ts, task_id: $tid, violation_type: $vtype, severity: $sev, description: $desc, recommendation: $rec, evidence: $ev}'
  local -a jq_args=(
    --arg ts "$timestamp"
    --arg tid "$task_id"
    --arg vtype "$violation_type"
    --arg sev "$severity"
    --arg desc "$description"
    --arg rec "$recommendation"
  )

  # Build JSON alert. If the evidence is not valid JSON, embed it as a plain
  # string instead of silently producing an empty alert.
  if ! alert=$(jq -nc "${jq_args[@]}" --argjson ev "$evidence" "$filter" 2>/dev/null); then
    alert=$(jq -nc "${jq_args[@]}" --arg ev "$evidence" "$filter") || alert=""
  fi

  if [[ -n "$alert" ]]; then
    # The message must be passed as an argument: redis-cli only reads an
    # argument from stdin when -x is given, so piping the alert into
    # "PUBLISH <channel>" sends nothing.
    for channel in "swarm:${task_id}:violations" "cfn:violations:all"; do
      redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
        PUBLISH "$channel" "$alert" >/dev/null \
        || printf 'Warning: could not publish violation to %s\n' "$channel" >&2
    done
  else
    printf 'Warning: could not build alert JSON for task %s\n' "$task_id" >&2
  fi

  # Log violation
  echo "[${timestamp}] [$severity] $violation_type: $description (task: $task_id)" >> "$VIOLATION_LOG"

  # Send to WebSocket server if available (failure is deliberately ignored)
  if [[ -n "$alert" ]] && command -v curl &>/dev/null; then
    curl -s -X POST "http://localhost:${WEBSOCKET_PORT}/api/violations" \
      -H "Content-Type: application/json" \
      -d "$alert" >/dev/null 2>&1 || true
  fi

  echo " 🚨 [$severity] $violation_type: $description"
}
107
-
108
# Function: Check if orchestrator never started
#
# Raises a critical alert when the swarm hash is more than two minutes old
# and the task's status key is still empty. An alert marker key with a
# one-hour TTL keeps the same alert from being sent repeatedly.
# Globals:   REDIS_HOST, REDIS_PORT (read)
# Arguments: $1 - Redis key of the swarm hash (fields created_at, task_id)
# Outputs:   warnings on stderr
# Returns:   0 when there is nothing to report or after alerting
check_orchestrator_not_started() {
  local swarm_id="$1"
  local created_at created_ts now_ts elapsed
  local task_id status alert_key already evidence

  # Get swarm metadata
  created_at=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    HGET "$swarm_id" created_at 2>/dev/null || echo "")
  [[ -n "$created_at" ]] || return 0 # Swarm doesn't exist, skip

  # An unparseable timestamp must not be read as epoch 0: that would make the
  # swarm look decades old and raise a false alert.
  if ! created_ts=$(date -d "$created_at" +%s 2>/dev/null); then
    printf 'Warning: cannot parse created_at "%s" of %s\n' "$created_at" "$swarm_id" >&2
    return 0
  fi
  now_ts=$(date +%s)
  elapsed=$((now_ts - created_ts))

  # If swarm exists >2 minutes but no status key, orchestrator never started
  (( elapsed > 120 )) || return 0

  task_id=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    HGET "$swarm_id" task_id 2>/dev/null || echo "")
  # A nil reply is an empty string with exit status 0, so default it here too.
  task_id="${task_id:-unknown}"

  status=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    GET "swarm:${task_id}:status" 2>/dev/null || echo "")
  [[ -z "$status" ]] || return 0

  # Check if already alerted
  alert_key="violation:${task_id}:orchestrator_not_started"
  already=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    EXISTS "$alert_key" 2>/dev/null || echo "")
  [[ "$already" != "1" ]] || return 0

  evidence=$(jq -nc \
    --arg created "$created_at" \
    --argjson elapsed "$elapsed" \
    '{
      swarm_created_at: $created,
      time_elapsed_seconds: $elapsed,
      status_key_exists: false,
      agent_keys_count: 0
    }') || evidence='{}'

  send_violation_alert \
    "$task_id" \
    "orchestrator_never_started" \
    "critical" \
    "Orchestrator was never spawned after ${elapsed}s. Coordinator may have failed at Step 2." \
    "Check coordinator logs. Ensure orchestrator spawned with run_in_background: true" \
    "$evidence"

  # Mark as alerted (TTL 1 hour); a failure here must not stop the monitor
  redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    SETEX "$alert_key" 3600 "alerted" >/dev/null \
    || printf 'Warning: could not record %s\n' "$alert_key" >&2
  return 0
}
164
-
165
# Function: Check if Loop 2 started before Loop 3 completed (gate bypass)
#
# Raises a critical alert when the loop2 "started" key has a value while the
# loop3 "complete" key is empty. An alert marker key with a one-hour TTL
# keeps the same alert from being sent repeatedly.
# Globals:   REDIS_HOST, REDIS_PORT (read)
# Arguments: $1 - task id
# Outputs:   warnings on stderr
# Returns:   0 when there is nothing to report or after alerting
check_gate_bypass() {
  local task_id="$1"
  local loop2_start loop3_complete alert_key already evidence

  # Check if Loop 2 started
  loop2_start=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    GET "swarm:${task_id}:loop2:started" 2>/dev/null || echo "")
  [[ -n "$loop2_start" ]] || return 0

  # Check if Loop 3 completed
  loop3_complete=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    GET "swarm:${task_id}:loop3:complete" 2>/dev/null || echo "")
  [[ -z "$loop3_complete" ]] || return 0

  # Compare the EXISTS reply exactly instead of grepping for a "1" anywhere.
  alert_key="violation:${task_id}:gate_bypass"
  already=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    EXISTS "$alert_key" 2>/dev/null || echo "")
  [[ "$already" != "1" ]] || return 0

  evidence=$(jq -nc \
    --arg loop2_start "$loop2_start" \
    '{
      loop2_started_at: $loop2_start,
      loop3_complete: false,
      gate_passed: false
    }') || evidence='{}'

  send_violation_alert \
    "$task_id" \
    "gate_bypass_violation" \
    "critical" \
    "Loop 2 validators started before Loop 3 gate passed. This violates CFN Loop protocol." \
    "Check orchestrator gate check logic. Loop 2 must BLPOP on gate-passed signal." \
    "$evidence"

  # Mark as alerted (TTL 1 hour); a failure here must not stop the monitor
  redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    SETEX "$alert_key" 3600 "alerted" >/dev/null \
    || printf 'Warning: could not record %s\n' "$alert_key" >&2
  return 0
}
203
-
204
# Function: Check if agents completed but orchestrator hung
#
# When the task status contains loop3_waiting or loop2_waiting, compares the
# number of "swarm:<task>:*:done" keys with the swarm's max_agents field and
# raises a critical alert once the done count reaches it.
# Globals:   REDIS_HOST, REDIS_PORT (read)
# Arguments: $1 - task id
# Outputs:   warnings on stderr
# Returns:   0 when there is nothing to report or after alerting
check_orchestrator_hang() {
  local task_id="$1"
  local status done_keys swarm_id expected alert_key already evidence

  # Get orchestrator status
  status=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    GET "swarm:${task_id}:status" 2>/dev/null || echo "")

  # Check if status indicates waiting for agents
  [[ "$status" =~ loop3_waiting|loop2_waiting ]] || return 0

  # Count done signals. Only non-blank lines count: an empty KEYS reply is
  # printed as a blank line, which `wc -l` would report as one key.
  # NOTE(review): KEYS scans the whole keyspace; `redis-cli --scan --pattern`
  # may be preferable on large instances.
  done_keys=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    KEYS "swarm:${task_id}:*:done" 2>/dev/null \
    | awk 'NF { n++ } END { print n + 0 }') || done_keys=0
  [[ "$done_keys" =~ ^[0-9]+$ ]] || done_keys=0

  # Get expected agent count. A nil reply is an empty string with exit status
  # 0, so the default swarm id is applied to empty values as well.
  swarm_id=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    GET "task:${task_id}:swarm" 2>/dev/null || echo "")
  swarm_id="${swarm_id:-swarm:swarm-${task_id}}"
  expected=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    HGET "${swarm_id}:metadata" max_agents 2>/dev/null || echo "0")
  [[ "$expected" =~ ^[0-9]+$ ]] || expected=0

  # 10# forces base 10 so values with leading zeros are not read as octal
  (( 10#$expected > 0 && 10#$done_keys >= 10#$expected )) || return 0

  # Agents completed but orchestrator still waiting
  alert_key="violation:${task_id}:orchestrator_hang"
  already=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    EXISTS "$alert_key" 2>/dev/null || echo "")
  [[ "$already" != "1" ]] || return 0

  evidence=$(jq -nc \
    --arg status "$status" \
    --arg done "$done_keys" \
    --arg expected "$expected" \
    '{
      orchestrator_status: $status,
      done_signals_count: ($done | tonumber),
      expected_agents: ($expected | tonumber)
    }') || evidence='{}'

  send_violation_alert \
    "$task_id" \
    "orchestrator_hang_with_complete_agents" \
    "critical" \
    "All agents signaled completion but orchestrator still waiting. Possible BLPOP key mismatch." \
    "Check orchestrator DONE_KEY construction. Verify agent IDs match (with iteration suffix)." \
    "$evidence"

  # Mark as alerted (TTL 1 hour); a failure here must not stop the monitor
  redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    SETEX "$alert_key" 3600 "alerted" >/dev/null \
    || printf 'Warning: could not record %s\n' "$alert_key" >&2
  return 0
}
252
-
253
# Function: Check if coordinator monitoring with timeout
#
# Raises a critical alert when the swarm metadata shows status "cancelled"
# with shutdown_reason "SIGTERM_received" between 300 and 600 seconds after
# created_at.
# Globals:   REDIS_HOST, REDIS_PORT (read)
# Arguments: $1 - task id
# Outputs:   warnings on stderr
# Returns:   0 when there is nothing to report or after alerting
check_coordinator_timeout_pattern() {
  local task_id="$1"
  local swarm_id created_at created_ts now_ts elapsed
  local status shutdown_reason alert_key already evidence

  # A nil reply is an empty string with exit status 0, so the default swarm
  # id is applied to empty values as well.
  swarm_id=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    GET "task:${task_id}:swarm" 2>/dev/null || echo "")
  swarm_id="${swarm_id:-swarm:swarm-${task_id}}"

  created_at=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    HGET "${swarm_id}:metadata" created_at 2>/dev/null || echo "")
  [[ -n "$created_at" ]] || return 0

  # Without a parseable creation time the 5-10 minute window cannot be checked
  created_ts=$(date -d "$created_at" +%s 2>/dev/null) || return 0
  now_ts=$(date +%s)
  elapsed=$((now_ts - created_ts))

  # Check if swarm cancelled with SIGTERM after ~5-10 minutes
  status=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    HGET "${swarm_id}:metadata" status 2>/dev/null || echo "")
  shutdown_reason=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    HGET "${swarm_id}:metadata" shutdown_reason 2>/dev/null || echo "")

  [[ "$status" == "cancelled" && "$shutdown_reason" == "SIGTERM_received" ]] \
    || return 0
  (( elapsed >= 300 && elapsed <= 600 )) || return 0

  alert_key="violation:${task_id}:coordinator_timeout"
  already=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    EXISTS "$alert_key" 2>/dev/null || echo "")
  [[ "$already" != "1" ]] || return 0

  evidence=$(jq -nc \
    --arg created "$created_at" \
    --arg elapsed "$elapsed" \
    --arg reason "$shutdown_reason" \
    '{
      swarm_created_at: $created,
      cancelled_after_seconds: ($elapsed | tonumber),
      shutdown_reason: $reason,
      likely_cause: "coordinator_monitoring_with_bash_timeout"
    }') || evidence='{}'

  send_violation_alert \
    "$task_id" \
    "coordinator_monitoring_timeout" \
    "critical" \
    "Coordinator cancelled after ${elapsed}s with SIGTERM. Likely wrapped monitoring in Bash() with timeout." \
    "Check coordinator template. Monitoring must use multiple tool calls in coordinator's own message loop, NOT single Bash() call." \
    "$evidence"

  # Mark as alerted (TTL 1 hour). The port must be REDIS_PORT: passing
  # REDIS_HOST as the port meant the marker was never written.
  redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    SETEX "$alert_key" 3600 "alerted" >/dev/null \
    || printf 'Warning: could not record %s\n' "$alert_key" >&2
  return 0
}
303
-
304
# Function: Check if Product Owner skipped
#
# Raises a warning alert when the loop2 "complete" key holds a timestamp more
# than 60 seconds old while the product_owner "consulted" key is still empty.
# Globals:   REDIS_HOST, REDIS_PORT (read)
# Arguments: $1 - task id
# Outputs:   warnings on stderr
# Returns:   0 when there is nothing to report or after alerting
check_product_owner_skipped() {
  local task_id="$1"
  local loop2_complete po_consulted loop2_ts now_ts elapsed
  local alert_key already evidence

  # Check if Loop 2 completed
  loop2_complete=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    GET "swarm:${task_id}:loop2:complete" 2>/dev/null || echo "")
  [[ -n "$loop2_complete" ]] || return 0

  # Check if Product Owner was consulted
  po_consulted=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    GET "swarm:${task_id}:product_owner:consulted" 2>/dev/null || echo "")
  [[ -z "$po_consulted" ]] || return 0

  # Wait 60s after Loop 2 complete to allow time for PO spawn. If the value is
  # not a date, the grace period cannot be evaluated; falling back to epoch 0
  # would alert at once with a nonsense elapsed time, so skip instead.
  if ! loop2_ts=$(date -d "$loop2_complete" +%s 2>/dev/null); then
    printf 'Warning: cannot parse loop2 completion time "%s" (task: %s)\n' \
      "$loop2_complete" "$task_id" >&2
    return 0
  fi
  now_ts=$(date +%s)
  elapsed=$((now_ts - loop2_ts))
  (( elapsed > 60 )) || return 0

  alert_key="violation:${task_id}:po_skipped"
  already=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    EXISTS "$alert_key" 2>/dev/null || echo "")
  [[ "$already" != "1" ]] || return 0

  evidence=$(jq -nc \
    --arg loop2_complete "$loop2_complete" \
    --arg elapsed "$elapsed" \
    '{
      loop2_completed_at: $loop2_complete,
      time_since_loop2_seconds: ($elapsed | tonumber),
      product_owner_consulted: false
    }') || evidence='{}'

  send_violation_alert \
    "$task_id" \
    "product_owner_not_consulted" \
    "warning" \
    "Loop 2 completed ${elapsed}s ago but Product Owner not consulted. Strategic decision skipped." \
    "Check orchestrator Product Owner spawning logic. PO should be spawned after Loop 2 consensus check." \
    "$evidence"

  # Mark as alerted (TTL 1 hour); a failure here must not stop the monitor
  redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    SETEX "$alert_key" 3600 "alerted" >/dev/null \
    || printf 'Warning: could not record %s\n' "$alert_key" >&2
  return 0
}
350
-
351
# Main monitoring loop
# Every CHECK_INTERVAL seconds: list the "swarm:*:metadata" keys, read each
# swarm's task_id and run all violation checks for it. Runs until killed.
echo "Starting violation monitoring..."
echo ""

ITERATION=0
while true; do
  ITERATION=$((ITERATION + 1))
  echo "[Check #${ITERATION}] $(date '+%H:%M:%S')"

  # Find all active swarm metadata keys
  SWARM_KEYS=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    KEYS "swarm:*:metadata" 2>/dev/null || echo "")

  if [[ -z "$SWARM_KEYS" ]]; then
    echo " No active swarms found"
  else
    SWARM_COUNT=$(printf '%s\n' "$SWARM_KEYS" | wc -l)
    echo " Monitoring $SWARM_COUNT swarm(s)..."

    # Read one key per line on fd 3. This avoids word-splitting and glob
    # expansion of key names, and keeps stdin free for the commands run by
    # the checks.
    while IFS= read -r -u 3 SWARM_KEY; do
      [[ -n "$SWARM_KEY" ]] || continue

      # Extract task ID
      TASK_ID=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
        HGET "$SWARM_KEY" task_id 2>/dev/null || echo "")
      [[ -n "$TASK_ID" ]] || continue

      # Run violation checks; one failing check must not stop the monitor
      check_orchestrator_not_started "$SWARM_KEY" \
        || echo " check_orchestrator_not_started failed for ${SWARM_KEY}" >&2
      check_gate_bypass "$TASK_ID" \
        || echo " check_gate_bypass failed for ${TASK_ID}" >&2
      check_orchestrator_hang "$TASK_ID" \
        || echo " check_orchestrator_hang failed for ${TASK_ID}" >&2
      check_coordinator_timeout_pattern "$TASK_ID" \
        || echo " check_coordinator_timeout_pattern failed for ${TASK_ID}" >&2
      check_product_owner_skipped "$TASK_ID" \
        || echo " check_product_owner_skipped failed for ${TASK_ID}" >&2
    done 3<<< "$SWARM_KEYS"
  fi

  echo " Sleeping ${CHECK_INTERVAL}s..."
  echo ""
  sleep "$CHECK_INTERVAL"
done
@@ -1,101 +0,0 @@
1
- #!/bin/bash
2
- # Redis Coordination Skill - Agent Heartbeat Monitor
3
- # Version: 1.0.0
4
- # Last Updated: 2025-10-19
5
-
6
# Strict error handling
set -euo pipefail

# Default values
TASK_ID=""
CHECK_INTERVAL=30
MISS_THRESHOLD=2
AGENTS=()

# Print an argument error on stderr and stop the script.
_heartbeat_usage_error() {
  printf '%s\n' "$1" >&2
  printf 'Usage: %s --task-id ID [--check-interval SECONDS] [--miss-threshold N] [--agents a,b,c]\n' \
    "${0##*/}" >&2
  exit 1
}

#######################################
# Parse command-line arguments.
# Globals:   TASK_ID, CHECK_INTERVAL, MISS_THRESHOLD, AGENTS (written)
# Arguments: the script's command-line arguments
# Returns:   0 on success; exits 1 on an unknown option or a missing value
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --task-id | --check-interval | --miss-threshold | --agents)
        # Check the count first: "$2" would trip `set -u` with no usage hint.
        (( $# >= 2 )) || _heartbeat_usage_error "Missing value for $1"
        ;;&
      --task-id)
        TASK_ID="$2"
        shift 2
        ;;
      --check-interval)
        CHECK_INTERVAL="$2"
        shift 2
        ;;
      --miss-threshold)
        MISS_THRESHOLD="$2"
        shift 2
        ;;
      --agents)
        # Comma-separated list of agent ids
        IFS=',' read -ra AGENTS <<< "$2"
        shift 2
        ;;
      *)
        _heartbeat_usage_error "Unknown parameter: $1"
        ;;
    esac
  done
}

# Parse command-line arguments
parse_args "$@"
40
-
41
# Validate required parameters
if [[ -z "${TASK_ID:-}" ]]; then
  echo "Error: task-id is required" >&2
  exit 1
fi

# The numeric settings are used by `sleep` and by arithmetic comparison; a
# non-numeric threshold would evaluate to 0 and flag every agent as failed.
if ! [[ "${CHECK_INTERVAL:-}" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
  echo "Error: check-interval must be a number of seconds" >&2
  exit 1
fi
if ! [[ "${MISS_THRESHOLD:-}" =~ ^[1-9][0-9]*$ ]]; then
  echo "Error: miss-threshold must be a positive integer" >&2
  exit 1
fi
46
-
47
# Consecutive missed heartbeats per agent id. Kept outside the function so the
# count survives between monitoring cycles.
declare -A HEARTBEAT_MISS_COUNTS=()

# Function to check agent heartbeat
#
# Reads the agent's heartbeat key. An empty value counts as one miss; once an
# agent reaches MISS_THRESHOLD consecutive misses, handle_agent_failure is
# called and the agent's count starts again from zero.
# Globals:   TASK_ID, MISS_THRESHOLD (read); HEARTBEAT_MISS_COUNTS (written);
#            HEARTBEAT_LOG_DIR (read, optional, default /var/log/claude-flow)
# Arguments: $1 - agent id
# Outputs:   warnings on stderr
# Returns:   0, or the status of handle_agent_failure when it is called
check_agent_heartbeat() {
  local agent_id="$1"
  local log_dir="${HEARTBEAT_LOG_DIR:-/var/log/claude-flow}"
  local last_heartbeat misses

  [[ -n "$agent_id" ]] || return 0

  # Check heartbeat key. If Redis cannot be queried, skip this cycle rather
  # than counting a miss against the agent.
  if ! last_heartbeat=$(redis-cli get "swarm:${TASK_ID}:${agent_id}:heartbeat"); then
    printf 'Warning: could not query heartbeat for agent %s\n' "$agent_id" >&2
    return 0
  fi

  if [[ -n "$last_heartbeat" ]]; then
    # Reset miss count if heartbeat exists
    HEARTBEAT_MISS_COUNTS["$agent_id"]=0
    return 0
  fi

  # No heartbeat found: increment the miss count. Plain assignment is used
  # because ((n++)) returns status 1 when n is 0, which ends the script under
  # `set -e`.
  misses=$(( ${HEARTBEAT_MISS_COUNTS["$agent_id"]:-0} + 1 ))
  HEARTBEAT_MISS_COUNTS["$agent_id"]=$misses

  # Logging is best-effort: an unwritable log must not stop monitoring.
  mkdir -p "$log_dir" 2>/dev/null || true
  {
    echo "[$(date -u)] No heartbeat detected for agent: ${agent_id}" \
      >> "${log_dir}/heartbeat-misses.log"
  } 2>/dev/null \
    || printf 'Warning: cannot write to %s/heartbeat-misses.log\n' "$log_dir" >&2

  # Trigger actions on missed heartbeats
  if (( misses >= MISS_THRESHOLD )); then
    HEARTBEAT_MISS_COUNTS["$agent_id"]=0
    handle_agent_failure "$agent_id"
  fi
}
70
-
71
# Function to handle agent failure
#
# Logs the failure, removes the agent from the task's active-agents set and
# runs the recovery script when one is present. Each step is best-effort so a
# single failure does not stop the monitor.
# Globals:   TASK_ID (read);
#            HEARTBEAT_LOG_DIR (read, optional, default /var/log/claude-flow);
#            AGENT_RECOVERY_SCRIPT (read, optional, default is the
#            cwd-relative agent-recovery.sh path)
# Arguments: $1 - agent id
# Outputs:   the redis-cli srem reply on stdout; warnings on stderr
# Returns:   0
handle_agent_failure() {
  local agent_id="$1"
  local log_dir="${HEARTBEAT_LOG_DIR:-/var/log/claude-flow}"
  local recovery_script="${AGENT_RECOVERY_SCRIPT:-./.claude/skills/cfn-redis-coordination/agent-recovery.sh}"

  # Log agent failure
  mkdir -p "$log_dir" 2>/dev/null || true
  {
    echo "[$(date -u)] CRITICAL: Agent ${agent_id} failed health check" \
      >> "${log_dir}/agent-failures.log"
  } 2>/dev/null \
    || printf 'Warning: cannot write to %s/agent-failures.log\n' "$log_dir" >&2

  # Remove from active agents
  redis-cli srem "swarm:${TASK_ID}:active-agents" "$agent_id" \
    || printf 'Warning: could not remove %s from active agents\n' "$agent_id" >&2

  # Trigger emergency recovery. The script may be absent, and the default
  # path depends on the current directory, so check before calling it.
  if [[ -x "$recovery_script" ]]; then
    "$recovery_script" \
      --task-id "$TASK_ID" \
      --agent-id "$agent_id" \
      || printf 'Warning: recovery failed for agent %s\n' "$agent_id" >&2
  else
    printf 'Warning: recovery script not found or not executable: %s\n' \
      "$recovery_script" >&2
  fi
  return 0
}
86
-
87
# Main monitoring loop
# Agents given with --agents form a fixed list. Otherwise the active-agents
# set is read again on every cycle, so membership changes are picked up
# instead of the first result being reused for the life of the process.
# The ${arr[@]+...} form keeps empty arrays safe under `set -u` on older Bash.
EXPLICIT_AGENTS=(${AGENTS[@]+"${AGENTS[@]}"})

while true; do
  if (( ${#EXPLICIT_AGENTS[@]} > 0 )); then
    AGENTS=("${EXPLICIT_AGENTS[@]}")
  else
    # If no agents specified, fetch from Redis set
    mapfile -t AGENTS < <(redis-cli smembers "swarm:${TASK_ID}:active-agents")
  fi

  # Check heartbeat for each agent
  for agent in ${AGENTS[@]+"${AGENTS[@]}"}; do
    # An empty set is printed as a blank line; skip it
    [[ -n "$agent" ]] || continue
    check_agent_heartbeat "$agent"
  done

  # Sleep before next check
  sleep "$CHECK_INTERVAL"
done