claude-flow-novice 2.14.2 → 2.14.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161)
  1. package/.claude/commands/CFN_LOOP_TASK_MODE.md +4 -47
  2. package/.claude/skills/cfn-redis-coordination/demos/test-cancel-swarm.sh +0 -276
  3. package/claude-assets/commands/CFN_LOOP_TASK_MODE.md +4 -47
  4. package/claude-assets/skills/cfn-redis-coordination/demos/test-cancel-swarm.sh +0 -276
  5. package/dist/cli/agent-prompt-builder.js +25 -0
  6. package/dist/cli/agent-prompt-builder.js.map +1 -1
  7. package/dist/cli/config-manager.js +91 -109
  8. package/package.json +1 -1
  9. package/scripts/init-project.js +1 -1
  10. package/.claude/skills/cfn-redis-coordination/HEARTBEAT.md +0 -57
  11. package/.claude/skills/cfn-redis-coordination/HEARTBEAT_MONITORING.md +0 -267
  12. package/.claude/skills/cfn-redis-coordination/LOGGING.md +0 -260
  13. package/.claude/skills/cfn-redis-coordination/README.md +0 -65
  14. package/.claude/skills/cfn-redis-coordination/SECURITY_REVIEW.md +0 -25
  15. package/.claude/skills/cfn-redis-coordination/SHUTDOWN_HANDLING.md +0 -164
  16. package/.claude/skills/cfn-redis-coordination/SKILL.md +0 -720
  17. package/.claude/skills/cfn-redis-coordination/demos/test-dlq.sh +0 -129
  18. package/.claude/skills/cfn-redis-coordination/demos/test-iteration-feedback.sh +0 -320
  19. package/.claude/skills/cfn-redis-coordination/demos/test-orchestrator.sh +0 -249
  20. package/.claude/skills/cfn-redis-coordination/demos/test-priority-wake-phase4-unix.sh +0 -148
  21. package/.claude/skills/cfn-redis-coordination/demos/test-priority-wake-phase4.sh +0 -163
  22. package/.claude/skills/cfn-redis-coordination/demos/test-priority-wake.sh +0 -138
  23. package/.claude/skills/cfn-redis-coordination/demos/test-quick-fix.sh +0 -81
  24. package/.claude/skills/cfn-redis-coordination/demos/test-quorum-absolute.sh +0 -45
  25. package/.claude/skills/cfn-redis-coordination/demos/test-quorum-fallback.sh +0 -68
  26. package/.claude/skills/cfn-redis-coordination/demos/test-quorum-percentage.sh +0 -56
  27. package/.claude/skills/cfn-redis-coordination/demos/test-quorum-with-retry.sh +0 -81
  28. package/.claude/skills/cfn-redis-coordination/demos/test-quorum.sh +0 -57
  29. package/.claude/skills/cfn-redis-coordination/demos/test-shutdown-handling.sh +0 -187
  30. package/.claude/skills/cfn-redis-coordination/demos/test-shutdown.sh +0 -160
  31. package/.claude/skills/cfn-redis-coordination/demos/test-utils-unix.sh +0 -97
  32. package/.claude/skills/cfn-redis-coordination/demos/test-utils.sh +0 -97
  33. package/.claude/skills/cfn-redis-coordination/demos/test-waiting-mode.sh +0 -59
  34. package/.claude/skills/cfn-redis-coordination/examples/README.md +0 -73
  35. package/.claude/skills/cfn-redis-coordination/examples/grafana-dashboard.json +0 -352
  36. package/.claude/skills/cfn-redis-coordination/examples/hierarchical-pattern.sh +0 -127
  37. package/.claude/skills/cfn-redis-coordination/examples/mesh-pattern.sh +0 -171
  38. package/.claude/skills/cfn-redis-coordination/examples/timeout-handling.sh +0 -227
  39. package/.claude/skills/cfn-redis-coordination/examples/waiting-mode-pattern.sh +0 -239
  40. package/.claude/skills/cfn-redis-coordination/execute-product-owner-decision.sh +0 -258
  41. package/.claude/skills/cfn-redis-coordination/get-agent-timeout.sh +0 -177
  42. package/.claude/skills/cfn-redis-coordination/heartbeat-functions.sh +0 -137
  43. package/.claude/skills/cfn-redis-coordination/heartbeat-protocol.md +0 -106
  44. package/.claude/skills/cfn-redis-coordination/heartbeat.sh +0 -126
  45. package/.claude/skills/cfn-redis-coordination/init-swarm.sh +0 -148
  46. package/.claude/skills/cfn-redis-coordination/invoke-redis-pattern.sh +0 -220
  47. package/.claude/skills/cfn-redis-coordination/invoke-waiting-mode.sh +0 -283
  48. package/.claude/skills/cfn-redis-coordination/list-active-swarms.sh +0 -147
  49. package/.claude/skills/cfn-redis-coordination/log-event.sh +0 -109
  50. package/.claude/skills/cfn-redis-coordination/metrics-export.sh +0 -674
  51. package/.claude/skills/cfn-redis-coordination/metrics-schema.json +0 -66
  52. package/.claude/skills/cfn-redis-coordination/metrics-storage.md +0 -31
  53. package/.claude/skills/cfn-redis-coordination/monitor-cfn-violations.sh +0 -391
  54. package/.claude/skills/cfn-redis-coordination/monitor-heartbeats.sh +0 -101
  55. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop-v3.sh +0 -141
  56. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh +0 -31
  57. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.bak +0 -0
  58. package/.claude/skills/cfn-redis-coordination/priority-wake-mechanism.md +0 -75
  59. package/.claude/skills/cfn-redis-coordination/priority_wake.py +0 -134
  60. package/.claude/skills/cfn-redis-coordination/query-dlq.sh +0 -162
  61. package/.claude/skills/cfn-redis-coordination/query-logs.sh +0 -103
  62. package/.claude/skills/cfn-redis-coordination/redis-pattern.sh +0 -619
  63. package/.claude/skills/cfn-redis-coordination/retrieve-context.sh +0 -58
  64. package/.claude/skills/cfn-redis-coordination/select-specialist-agent.sh +0 -371
  65. package/.claude/skills/cfn-redis-coordination/semantic-match-tfidf.py +0 -252
  66. package/.claude/skills/cfn-redis-coordination/send-heartbeat.sh +0 -165
  67. package/.claude/skills/cfn-redis-coordination/signal.sh +0 -38
  68. package/.claude/skills/cfn-redis-coordination/store-context.sh +0 -86
  69. package/.claude/skills/cfn-redis-coordination/store-epic-context.sh +0 -123
  70. package/.claude/skills/cfn-redis-coordination/test-context-injection.sh +0 -354
  71. package/.claude/skills/cfn-redis-coordination/test-timeout-enforcement.sh +0 -513
  72. package/.claude/skills/cfn-redis-coordination/tests/convert-line-endings.sh +0 -15
  73. package/.claude/skills/cfn-redis-coordination/tests/dlq-functionality-test.sh +0 -102
  74. package/.claude/skills/cfn-redis-coordination/tests/edge-cases-test.sh +0 -99
  75. package/.claude/skills/cfn-redis-coordination/tests/integration-test.sh +0 -170
  76. package/.claude/skills/cfn-redis-coordination/tests/retry-mechanism-test.sh +0 -82
  77. package/.claude/skills/cfn-redis-coordination/tests/run-test-suite.sh +0 -92
  78. package/.claude/skills/cfn-redis-coordination/tests/run-tests.sh +0 -4
  79. package/.claude/skills/cfn-redis-coordination/tests/test-heartbeat-monitoring.sh +0 -418
  80. package/.claude/skills/cfn-redis-coordination/tests/test-heartbeat-simple.sh +0 -124
  81. package/.claude/skills/cfn-redis-coordination/tests/test-primitives.sh +0 -166
  82. package/.claude/skills/cfn-redis-coordination/tests/test-utils.sh +0 -54
  83. package/.claude/skills/cfn-redis-coordination/tests/test_utils.sh +0 -49
  84. package/.claude/skills/cfn-redis-coordination/v2_modularization/core_orchestration.sh +0 -76
  85. package/.claude/skills/cfn-redis-coordination/validate-parameters.sh +0 -492
  86. package/claude-assets/skills/cfn-redis-coordination/HEARTBEAT.md +0 -57
  87. package/claude-assets/skills/cfn-redis-coordination/HEARTBEAT_MONITORING.md +0 -267
  88. package/claude-assets/skills/cfn-redis-coordination/LOGGING.md +0 -260
  89. package/claude-assets/skills/cfn-redis-coordination/README.md +0 -65
  90. package/claude-assets/skills/cfn-redis-coordination/SECURITY_REVIEW.md +0 -25
  91. package/claude-assets/skills/cfn-redis-coordination/SHUTDOWN_HANDLING.md +0 -164
  92. package/claude-assets/skills/cfn-redis-coordination/SKILL.md +0 -720
  93. package/claude-assets/skills/cfn-redis-coordination/demos/test-dlq.sh +0 -129
  94. package/claude-assets/skills/cfn-redis-coordination/demos/test-iteration-feedback.sh +0 -320
  95. package/claude-assets/skills/cfn-redis-coordination/demos/test-orchestrator.sh +0 -249
  96. package/claude-assets/skills/cfn-redis-coordination/demos/test-priority-wake-phase4-unix.sh +0 -148
  97. package/claude-assets/skills/cfn-redis-coordination/demos/test-priority-wake-phase4.sh +0 -163
  98. package/claude-assets/skills/cfn-redis-coordination/demos/test-priority-wake.sh +0 -138
  99. package/claude-assets/skills/cfn-redis-coordination/demos/test-quick-fix.sh +0 -81
  100. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-absolute.sh +0 -45
  101. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-fallback.sh +0 -68
  102. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-percentage.sh +0 -56
  103. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-with-retry.sh +0 -81
  104. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum.sh +0 -57
  105. package/claude-assets/skills/cfn-redis-coordination/demos/test-shutdown-handling.sh +0 -187
  106. package/claude-assets/skills/cfn-redis-coordination/demos/test-shutdown.sh +0 -160
  107. package/claude-assets/skills/cfn-redis-coordination/demos/test-utils-unix.sh +0 -97
  108. package/claude-assets/skills/cfn-redis-coordination/demos/test-utils.sh +0 -97
  109. package/claude-assets/skills/cfn-redis-coordination/demos/test-waiting-mode.sh +0 -59
  110. package/claude-assets/skills/cfn-redis-coordination/examples/README.md +0 -73
  111. package/claude-assets/skills/cfn-redis-coordination/examples/grafana-dashboard.json +0 -352
  112. package/claude-assets/skills/cfn-redis-coordination/examples/hierarchical-pattern.sh +0 -127
  113. package/claude-assets/skills/cfn-redis-coordination/examples/mesh-pattern.sh +0 -171
  114. package/claude-assets/skills/cfn-redis-coordination/examples/timeout-handling.sh +0 -227
  115. package/claude-assets/skills/cfn-redis-coordination/examples/waiting-mode-pattern.sh +0 -239
  116. package/claude-assets/skills/cfn-redis-coordination/execute-product-owner-decision.sh +0 -258
  117. package/claude-assets/skills/cfn-redis-coordination/get-agent-timeout.sh +0 -177
  118. package/claude-assets/skills/cfn-redis-coordination/heartbeat-functions.sh +0 -137
  119. package/claude-assets/skills/cfn-redis-coordination/heartbeat-protocol.md +0 -106
  120. package/claude-assets/skills/cfn-redis-coordination/heartbeat.sh +0 -126
  121. package/claude-assets/skills/cfn-redis-coordination/init-swarm.sh +0 -148
  122. package/claude-assets/skills/cfn-redis-coordination/invoke-redis-pattern.sh +0 -220
  123. package/claude-assets/skills/cfn-redis-coordination/invoke-waiting-mode.sh +0 -283
  124. package/claude-assets/skills/cfn-redis-coordination/list-active-swarms.sh +0 -147
  125. package/claude-assets/skills/cfn-redis-coordination/log-event.sh +0 -109
  126. package/claude-assets/skills/cfn-redis-coordination/metrics-export.sh +0 -674
  127. package/claude-assets/skills/cfn-redis-coordination/metrics-schema.json +0 -66
  128. package/claude-assets/skills/cfn-redis-coordination/metrics-storage.md +0 -31
  129. package/claude-assets/skills/cfn-redis-coordination/monitor-cfn-violations.sh +0 -391
  130. package/claude-assets/skills/cfn-redis-coordination/monitor-heartbeats.sh +0 -101
  131. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop-v3.sh +0 -141
  132. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh +0 -31
  133. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.bak +0 -0
  134. package/claude-assets/skills/cfn-redis-coordination/priority-wake-mechanism.md +0 -75
  135. package/claude-assets/skills/cfn-redis-coordination/priority_wake.py +0 -134
  136. package/claude-assets/skills/cfn-redis-coordination/query-dlq.sh +0 -162
  137. package/claude-assets/skills/cfn-redis-coordination/query-logs.sh +0 -103
  138. package/claude-assets/skills/cfn-redis-coordination/redis-pattern.sh +0 -619
  139. package/claude-assets/skills/cfn-redis-coordination/retrieve-context.sh +0 -58
  140. package/claude-assets/skills/cfn-redis-coordination/select-specialist-agent.sh +0 -371
  141. package/claude-assets/skills/cfn-redis-coordination/semantic-match-tfidf.py +0 -252
  142. package/claude-assets/skills/cfn-redis-coordination/send-heartbeat.sh +0 -165
  143. package/claude-assets/skills/cfn-redis-coordination/signal.sh +0 -38
  144. package/claude-assets/skills/cfn-redis-coordination/store-context.sh +0 -86
  145. package/claude-assets/skills/cfn-redis-coordination/store-epic-context.sh +0 -123
  146. package/claude-assets/skills/cfn-redis-coordination/test-context-injection.sh +0 -354
  147. package/claude-assets/skills/cfn-redis-coordination/test-timeout-enforcement.sh +0 -513
  148. package/claude-assets/skills/cfn-redis-coordination/tests/convert-line-endings.sh +0 -15
  149. package/claude-assets/skills/cfn-redis-coordination/tests/dlq-functionality-test.sh +0 -102
  150. package/claude-assets/skills/cfn-redis-coordination/tests/edge-cases-test.sh +0 -99
  151. package/claude-assets/skills/cfn-redis-coordination/tests/integration-test.sh +0 -170
  152. package/claude-assets/skills/cfn-redis-coordination/tests/retry-mechanism-test.sh +0 -82
  153. package/claude-assets/skills/cfn-redis-coordination/tests/run-test-suite.sh +0 -92
  154. package/claude-assets/skills/cfn-redis-coordination/tests/run-tests.sh +0 -4
  155. package/claude-assets/skills/cfn-redis-coordination/tests/test-heartbeat-monitoring.sh +0 -418
  156. package/claude-assets/skills/cfn-redis-coordination/tests/test-heartbeat-simple.sh +0 -124
  157. package/claude-assets/skills/cfn-redis-coordination/tests/test-primitives.sh +0 -166
  158. package/claude-assets/skills/cfn-redis-coordination/tests/test-utils.sh +0 -54
  159. package/claude-assets/skills/cfn-redis-coordination/tests/test_utils.sh +0 -49
  160. package/claude-assets/skills/cfn-redis-coordination/v2_modularization/core_orchestration.sh +0 -76
  161. package/claude-assets/skills/cfn-redis-coordination/validate-parameters.sh +0 -492
@@ -1,66 +0,0 @@
1
- {
2
- "$schema": "http://json-schema.org/draft-07/schema#",
3
- "title": "CFN Loop Observability Metrics Schema",
4
- "version": "1.0.0",
5
- "type": "object",
6
- "properties": {
7
- "task_metadata": {
8
- "type": "object",
9
- "properties": {
10
- "task_id": {"type": "string"},
11
- "mode": {"enum": ["mvp", "standard", "enterprise"]},
12
- "current_iteration": {"type": "number", "minimum": 1},
13
- "start_timestamp": {"type": "number"}
14
- },
15
- "required": ["task_id", "mode"]
16
- },
17
- "iteration_metrics": {
18
- "type": "object",
19
- "properties": {
20
- "duration_ms": {"type": "number", "minimum": 0},
21
- "iteration_count": {"type": "number", "minimum": 1},
22
- "gate_pass_rate": {"type": "number", "minimum": 0, "maximum": 1}
23
- }
24
- },
25
- "agent_metrics": {
26
- "type": "object",
27
- "properties": {
28
- "latency_ms": {
29
- "type": "object",
30
- "additionalProperties": {
31
- "type": "object",
32
- "properties": {
33
- "min": {"type": "number"},
34
- "max": {"type": "number"},
35
- "avg": {"type": "number"}
36
- }
37
- }
38
- },
39
- "timeout_count": {"type": "number", "minimum": 0},
40
- "retry_count": {"type": "number", "minimum": 0},
41
- "heartbeat_miss_count": {"type": "number", "minimum": 0}
42
- }
43
- },
44
- "consensus_metrics": {
45
- "type": "object",
46
- "properties": {
47
- "loop3": {
48
- "type": "object",
49
- "properties": {
50
- "confidence_score": {"type": "number", "minimum": 0, "maximum": 1},
51
- "gate_pass": {"type": "boolean"}
52
- }
53
- },
54
- "loop2": {
55
- "type": "object",
56
- "properties": {
57
- "consensus_score": {"type": "number", "minimum": 0, "maximum": 1},
58
- "final_consensus": {"type": "boolean"}
59
- }
60
- },
61
- "quorum_fallback_count": {"type": "number", "minimum": 0}
62
- }
63
- }
64
- },
65
- "required": ["task_metadata", "iteration_metrics", "agent_metrics", "consensus_metrics"]
66
- }
@@ -1,31 +0,0 @@
1
- # CFN Loop Metrics Storage Strategy
2
-
3
- ## Redis Key Patterns
4
-
5
- ### Task-Level Metrics
6
- - `swarm:{task_id}:metrics:metadata`
7
- - `swarm:{task_id}:metrics:iteration`
8
- - `swarm:{task_id}:metrics:agent`
9
- - `swarm:{task_id}:metrics:consensus`
10
-
11
- ### Iteration-Specific Keys
12
- - `swarm:{task_id}:iteration:{iteration_number}:duration`
13
- - `swarm:{task_id}:iteration:{iteration_number}:gate_pass_rate`
14
-
15
- ### Agent-Level Keys
16
- - `swarm:{task_id}:agent:{agent_id}:latency`
17
- - `swarm:{task_id}:agent:{agent_id}:timeouts`
18
-
19
- ### Consensus Keys
20
- - `swarm:{task_id}:consensus:loop3:confidence`
21
- - `swarm:{task_id}:consensus:loop2:score`
22
-
23
- ## Storage Mechanisms
24
- - Hash (HSET): Detailed metrics
25
- - List (LPUSH): Time-series events
26
- - Sorted Set (ZADD): Ranked metrics
27
-
28
- ## Retention Policy
29
- - Default: 30 days
30
- - Can be configured via environment variable
31
- - Automatic pruning after task completion
@@ -1,391 +0,0 @@
1
#!/bin/bash
# monitor-cfn-violations.sh - Real-time CFN Loop violation detector
# Part of Redis Coordination Skill
#
# Monitors active CFN Loop executions and detects common violations:
# - Orchestrator never started
# - Loop 2 started before Loop 3 complete (gate bypass)
# - Missing agent completion signals
# - Heartbeat monitoring not started
# - Product Owner not consulted
# - Coordinator timeout issues
#
# Alerts sent via Redis pub/sub and WebSocket (web portal integration)
#
# Usage: ./monitor-cfn-violations.sh [--interval 30] [--websocket-port 3001]
#
# Environment:
#   REDIS_HOST     Redis host (default: localhost)
#   REDIS_PORT     Redis port (default: 6379)
#   VIOLATION_LOG  Violation log path (default: /tmp/cfn-violations.log)
#
# Version: 1.0.0
# Last Updated: 2025-10-20

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REDIS_HOST="${REDIS_HOST:-localhost}"
REDIS_PORT="${REDIS_PORT:-6379}"
CHECK_INTERVAL=30 # seconds between checks
WEBSOCKET_PORT=3001
# Overridable so the log need not live at a fixed, predictable /tmp name.
VIOLATION_LOG="${VIOLATION_LOG:-/tmp/cfn-violations.log}"

# Parse arguments
while [[ $# -gt 0 ]]; do
  case "$1" in
    --interval | --websocket-port)
      # Without this guard a trailing option with no value aborts with a bare
      # "unbound variable" message from `set -u`.
      if [[ $# -lt 2 ]]; then
        echo "Error: $1 requires a value" >&2
        exit 1
      fi
      if [[ "$1" == "--interval" ]]; then
        CHECK_INTERVAL="$2"
      else
        WEBSOCKET_PORT="$2"
      fi
      shift 2
      ;;
    *)
      echo "Unknown argument: $1" >&2
      exit 1
      ;;
  esac
done

echo "=== CFN Loop Violation Monitor ==="
echo "Redis: ${REDIS_HOST}:${REDIS_PORT}"
echo "Check interval: ${CHECK_INTERVAL}s"
echo "WebSocket port: ${WEBSOCKET_PORT}"
echo "Log: ${VIOLATION_LOG}"
echo ""

# Initialize violation log (truncates any previous log at the same path)
echo "[$(date -Iseconds)] Monitor started" > "$VIOLATION_LOG"
56
-
57
# Function: Send violation alert via Redis pub/sub
#
# Builds one JSON alert and delivers it to the task-specific Redis channel,
# the global Redis channel, the violation log file and (when curl is
# available) the WebSocket server's HTTP endpoint.
#
# Globals:   REDIS_HOST, REDIS_PORT, WEBSOCKET_PORT, VIOLATION_LOG (read)
# Arguments: $1 - task id
#            $2 - violation type
#            $3 - severity (critical, warning, info)
#            $4 - description
#            $5 - recommendation
#            $6 - evidence, as a JSON string
# Outputs:   one summary line on stdout; delivery warnings on stderr
# Returns:   non-zero only if the alert JSON cannot be built
send_violation_alert() {
  local task_id="$1"
  local violation_type="$2"
  local severity="$3"
  local description="$4"
  local recommendation="$5"
  local evidence="$6"

  # Declared separately from the assignment so a failing command is not
  # masked by the exit status of `local`.
  local timestamp
  timestamp=$(date -Iseconds)

  local -a fields=(
    --arg ts "$timestamp"
    --arg tid "$task_id"
    --arg vtype "$violation_type"
    --arg sev "$severity"
    --arg desc "$description"
    --arg rec "$recommendation"
  )
  local filter='{
    timestamp: $ts,
    task_id: $tid,
    violation_type: $vtype,
    severity: $sev,
    description: $desc,
    recommendation: $rec,
    evidence: $ev
  }'

  # Build JSON alert. If the evidence is not valid JSON, embed it as a plain
  # string rather than losing the whole alert.
  local alert
  if ! alert=$(jq -nc "${fields[@]}" --argjson ev "$evidence" "$filter" 2>/dev/null); then
    alert=$(jq -nc "${fields[@]}" --arg ev "$evidence" "$filter") || return 1
  fi

  # Publish to the task-specific channel and to the global channel (for the
  # web portal). The message is passed as an argument: redis-cli only takes
  # an argument from stdin when -x is given, so piping the alert in left
  # PUBLISH without its message.
  local channel
  for channel in "swarm:${task_id}:violations" "cfn:violations:all"; do
    redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
      PUBLISH "$channel" "$alert" >/dev/null ||
      echo "Warning: failed to publish violation to ${channel}" >&2
  done

  # Log violation
  echo "[$(date -Iseconds)] [$severity] $violation_type: $description (task: $task_id)" >> "$VIOLATION_LOG"

  # Send to WebSocket server if available (best effort: failures are ignored)
  if command -v curl &>/dev/null; then
    curl -s -X POST "http://localhost:${WEBSOCKET_PORT}/api/violations" \
      -H "Content-Type: application/json" \
      -d "$alert" >/dev/null 2>&1 || true
  fi

  echo " 🚨 [$severity] $violation_type: $description"
}
107
-
108
# Function: Check if orchestrator never started
#
# Raises a critical "orchestrator_never_started" alert when the swarm hash
# is older than 2 minutes and swarm:<task_id>:status has no value. The alert
# is sent at most once per hour per task (dedupe key with a 3600s TTL).
#
# Globals:   REDIS_HOST, REDIS_PORT (read)
# Arguments: $1 - Redis key of the swarm hash (holds created_at and task_id)
# Returns:   0 when nothing needed doing or the alert was handled;
#            non-zero if building or sending the alert fails
check_orchestrator_not_started() {
  local swarm_id="$1"
  local -a redis=(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT")

  # Get swarm metadata. A Redis error means "cannot tell", so skip this
  # round instead of treating it as missing data.
  local created_at
  created_at=$("${redis[@]}" HGET "$swarm_id" created_at 2>/dev/null) || return 0
  if [[ -z "$created_at" ]]; then
    return 0 # Swarm doesn't exist, skip
  fi

  # Calculate time elapsed. An unparsable timestamp used to fall back to
  # epoch 0, which made every such swarm look decades old and raised a
  # false critical alert.
  local created_ts
  if ! created_ts=$(date -d "$created_at" +%s 2>/dev/null); then
    echo "Warning: cannot parse created_at '${created_at}' of ${swarm_id}; skipping" >&2
    return 0
  fi
  local now_ts elapsed
  now_ts=$(date +%s)
  elapsed=$((now_ts - created_ts))

  # Only swarms that have existed for more than 2 minutes are checked
  if [ "$elapsed" -le 120 ]; then
    return 0
  fi

  local task_id
  task_id=$("${redis[@]}" HGET "$swarm_id" task_id 2>/dev/null) || return 0
  task_id="${task_id:-unknown}"

  local status
  status=$("${redis[@]}" GET "swarm:${task_id}:status" 2>/dev/null) || return 0
  if [[ -n "$status" ]]; then
    return 0 # Status key present: orchestrator has started
  fi

  # Check if already alerted
  local alert_key="violation:${task_id}:orchestrator_not_started"
  local already_alerted
  already_alerted=$("${redis[@]}" EXISTS "$alert_key" 2>/dev/null) || return 0
  if [[ "$already_alerted" == *1* ]]; then
    return 0
  fi

  local evidence
  evidence=$(jq -nc \
    --arg created "$created_at" \
    --arg elapsed "$elapsed" \
    --arg status_exists "false" \
    '{
      swarm_created_at: $created,
      time_elapsed_seconds: ($elapsed | tonumber),
      status_key_exists: ($status_exists == "true"),
      agent_keys_count: 0
    }') || return 1

  send_violation_alert \
    "$task_id" \
    "orchestrator_never_started" \
    "critical" \
    "Orchestrator was never spawned after ${elapsed}s. Coordinator may have failed at Step 2." \
    "Check coordinator logs. Ensure orchestrator spawned with run_in_background: true" \
    "$evidence" || return 1

  # Mark as alerted (TTL 1 hour)
  "${redis[@]}" SETEX "$alert_key" 3600 "alerted" >/dev/null
}
164
-
165
# Function: Check if Loop 2 started before Loop 3 completed (gate bypass)
#
# Raises a critical "gate_bypass_violation" alert when
# swarm:<task_id>:loop2:started has a value while
# swarm:<task_id>:loop3:complete has none. The alert is sent at most once
# per hour per task (dedupe key with a 3600s TTL).
#
# Globals:   REDIS_HOST, REDIS_PORT (read)
# Arguments: $1 - task id
# Returns:   0 when nothing needed doing or the alert was handled;
#            non-zero if building or sending the alert fails
check_gate_bypass() {
  local task_id="$1"
  local -a redis=(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT")

  # Check if Loop 2 started
  local loop2_start
  loop2_start=$("${redis[@]}" GET "swarm:${task_id}:loop2:started" 2>/dev/null) || return 0
  if [[ -z "$loop2_start" ]]; then
    return 0
  fi

  # Check if Loop 3 completed. A Redis error here must not be read as
  # "Loop 3 not complete": that turned a connection blip into a false
  # critical alert.
  local loop3_complete
  loop3_complete=$("${redis[@]}" GET "swarm:${task_id}:loop3:complete" 2>/dev/null) || return 0
  if [[ -n "$loop3_complete" ]]; then
    return 0
  fi

  # Check if already alerted
  local alert_key="violation:${task_id}:gate_bypass"
  local already_alerted
  already_alerted=$("${redis[@]}" EXISTS "$alert_key" 2>/dev/null) || return 0
  if [[ "$already_alerted" == *1* ]]; then
    return 0
  fi

  local evidence
  evidence=$(jq -nc \
    --arg loop2_start "$loop2_start" \
    '{
      loop2_started_at: $loop2_start,
      loop3_complete: false,
      gate_passed: false
    }') || return 1

  send_violation_alert \
    "$task_id" \
    "gate_bypass_violation" \
    "critical" \
    "Loop 2 validators started before Loop 3 gate passed. This violates CFN Loop protocol." \
    "Check orchestrator gate check logic. Loop 2 must BLPOP on gate-passed signal." \
    "$evidence" || return 1

  # Mark as alerted (TTL 1 hour)
  "${redis[@]}" SETEX "$alert_key" 3600 "alerted" >/dev/null
}
203
-
204
# Function: Check if agents completed but orchestrator hung
#
# Raises a critical "orchestrator_hang_with_complete_agents" alert when the
# orchestrator status contains loop3_waiting or loop2_waiting while the
# number of swarm:<task_id>:*:done keys has reached the swarm's max_agents.
# The alert is sent at most once per hour per task.
#
# Globals:   REDIS_HOST, REDIS_PORT (read)
# Arguments: $1 - task id
# Returns:   0 when nothing needed doing or the alert was handled;
#            non-zero if building or sending the alert fails
check_orchestrator_hang() {
  local task_id="$1"
  local -a redis=(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT")

  # Get orchestrator status
  local status
  status=$("${redis[@]}" GET "swarm:${task_id}:status" 2>/dev/null) || return 0

  # Check if status indicates waiting for agents
  if [[ ! "$status" =~ loop3_waiting|loop2_waiting ]]; then
    return 0
  fi

  # Count done signals. --scan iterates with SCAN instead of the blocking
  # KEYS command; blank lines are not counted.
  local done_keys=0
  local key
  while IFS= read -r key; do
    if [[ -n "$key" ]]; then
      done_keys=$((done_keys + 1))
    fi
  done < <("${redis[@]}" --scan --pattern "swarm:${task_id}:*:done" 2>/dev/null)

  # Get expected agent count. The default swarm key must also apply when the
  # lookup succeeds but returns nothing; the previous `|| echo default` only
  # covered a failing redis-cli, leaving an empty swarm id for a missing key.
  local swarm_id
  swarm_id=$("${redis[@]}" GET "task:${task_id}:swarm" 2>/dev/null) || swarm_id=""
  swarm_id="${swarm_id:-swarm:swarm-${task_id}}"

  local expected
  expected=$("${redis[@]}" HGET "${swarm_id}:metadata" max_agents 2>/dev/null) || return 0
  # Empty or non-numeric values cannot be compared as integers
  if [[ ! "$expected" =~ ^[0-9]+$ ]]; then
    return 0
  fi

  if [ "$expected" -le 0 ] || [ "$done_keys" -lt "$expected" ]; then
    return 0
  fi

  # Agents completed but orchestrator still waiting
  local alert_key="violation:${task_id}:orchestrator_hang"
  local already_alerted
  already_alerted=$("${redis[@]}" EXISTS "$alert_key" 2>/dev/null) || return 0
  if [[ "$already_alerted" == *1* ]]; then
    return 0
  fi

  local evidence
  evidence=$(jq -nc \
    --arg status "$status" \
    --arg done "$done_keys" \
    --arg expected "$expected" \
    '{
      orchestrator_status: $status,
      done_signals_count: ($done | tonumber),
      expected_agents: ($expected | tonumber)
    }') || return 1

  send_violation_alert \
    "$task_id" \
    "orchestrator_hang_with_complete_agents" \
    "critical" \
    "All agents signaled completion but orchestrator still waiting. Possible BLPOP key mismatch." \
    "Check orchestrator DONE_KEY construction. Verify agent IDs match (with iteration suffix)." \
    "$evidence" || return 1

  # Mark as alerted (TTL 1 hour)
  "${redis[@]}" SETEX "$alert_key" 3600 "alerted" >/dev/null
}
252
-
253
# Function: Check if coordinator monitoring with timeout
#
# Raises a critical "coordinator_monitoring_timeout" alert when the swarm
# metadata shows status "cancelled" with shutdown_reason "SIGTERM_received"
# and the swarm was created between 300 and 600 seconds ago. The alert is
# sent at most once per hour per task.
#
# Globals:   REDIS_HOST, REDIS_PORT (read)
# Arguments: $1 - task id
# Returns:   0 when nothing needed doing or the alert was handled;
#            non-zero if building or sending the alert fails
check_coordinator_timeout_pattern() {
  local task_id="$1"
  local -a redis=(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT")

  # Resolve the swarm key. The default must also apply when the lookup
  # succeeds but returns nothing; `|| echo default` only covered a failing
  # redis-cli.
  local swarm_id
  swarm_id=$("${redis[@]}" GET "task:${task_id}:swarm" 2>/dev/null) || swarm_id=""
  swarm_id="${swarm_id:-swarm:swarm-${task_id}}"
  local metadata_key="${swarm_id}:metadata"

  local created_at
  created_at=$("${redis[@]}" HGET "$metadata_key" created_at 2>/dev/null) || return 0
  if [[ -z "$created_at" ]]; then
    return 0
  fi

  # An unparsable timestamp cannot be placed in the 300-600s window
  local created_ts
  if ! created_ts=$(date -d "$created_at" +%s 2>/dev/null); then
    echo "Warning: cannot parse created_at '${created_at}' of ${metadata_key}; skipping" >&2
    return 0
  fi
  local now_ts elapsed
  now_ts=$(date +%s)
  elapsed=$((now_ts - created_ts))

  # Check if swarm cancelled with SIGTERM after ~5-10 minutes
  local status shutdown_reason
  status=$("${redis[@]}" HGET "$metadata_key" status 2>/dev/null) || return 0
  shutdown_reason=$("${redis[@]}" HGET "$metadata_key" shutdown_reason 2>/dev/null) || return 0

  if [[ "$status" != "cancelled" || "$shutdown_reason" != "SIGTERM_received" ]]; then
    return 0
  fi
  if [ "$elapsed" -lt 300 ] || [ "$elapsed" -gt 600 ]; then
    return 0
  fi

  # Check if already alerted
  local alert_key="violation:${task_id}:coordinator_timeout"
  local already_alerted
  already_alerted=$("${redis[@]}" EXISTS "$alert_key" 2>/dev/null) || return 0
  if [[ "$already_alerted" == *1* ]]; then
    return 0
  fi

  local evidence
  evidence=$(jq -nc \
    --arg created "$created_at" \
    --arg elapsed "$elapsed" \
    --arg reason "$shutdown_reason" \
    '{
      swarm_created_at: $created,
      cancelled_after_seconds: ($elapsed | tonumber),
      shutdown_reason: $reason,
      likely_cause: "coordinator_monitoring_with_bash_timeout"
    }') || return 1

  send_violation_alert \
    "$task_id" \
    "coordinator_monitoring_timeout" \
    "critical" \
    "Coordinator cancelled after ${elapsed}s with SIGTERM. Likely wrapped monitoring in Bash() with timeout." \
    "Check coordinator template. Monitoring must use multiple tool calls in coordinator's own message loop, NOT single Bash() call." \
    "$evidence" || return 1

  # Mark as alerted (TTL 1 hour). This call previously passed the host name
  # as the -p port value, so the marker was never written.
  "${redis[@]}" SETEX "$alert_key" 3600 "alerted" >/dev/null
}
303
-
304
# Function: Check if Product Owner skipped
#
# Raises a "product_owner_not_consulted" warning when
# swarm:<task_id>:loop2:complete holds a timestamp more than 60 seconds old
# and swarm:<task_id>:product_owner:consulted has no value. The alert is
# sent at most once per hour per task.
#
# Globals:   REDIS_HOST, REDIS_PORT (read)
# Arguments: $1 - task id
# Returns:   0 when nothing needed doing or the alert was handled;
#            non-zero if building or sending the alert fails
check_product_owner_skipped() {
  local task_id="$1"
  local -a redis=(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT")

  # Check if Loop 2 completed
  local loop2_complete
  loop2_complete=$("${redis[@]}" GET "swarm:${task_id}:loop2:complete" 2>/dev/null) || return 0
  if [[ -z "$loop2_complete" ]]; then
    return 0
  fi

  # Check if Product Owner was consulted. A Redis error must not be read as
  # "not consulted".
  local po_consulted
  po_consulted=$("${redis[@]}" GET "swarm:${task_id}:product_owner:consulted" 2>/dev/null) || return 0
  if [[ -n "$po_consulted" ]]; then
    return 0
  fi

  # Wait 60s after Loop 2 complete to allow time for PO spawn. An unparsable
  # value used to fall back to epoch 0, which skipped the grace period and
  # reported a nonsensical elapsed time.
  local loop2_ts
  if ! loop2_ts=$(date -d "$loop2_complete" +%s 2>/dev/null); then
    echo "Warning: cannot parse loop2 completion time '${loop2_complete}' for task ${task_id}; skipping" >&2
    return 0
  fi
  local now_ts elapsed
  now_ts=$(date +%s)
  elapsed=$((now_ts - loop2_ts))
  if [ "$elapsed" -le 60 ]; then
    return 0
  fi

  # Check if already alerted
  local alert_key="violation:${task_id}:po_skipped"
  local already_alerted
  already_alerted=$("${redis[@]}" EXISTS "$alert_key" 2>/dev/null) || return 0
  if [[ "$already_alerted" == *1* ]]; then
    return 0
  fi

  local evidence
  evidence=$(jq -nc \
    --arg loop2_complete "$loop2_complete" \
    --arg elapsed "$elapsed" \
    '{
      loop2_completed_at: $loop2_complete,
      time_since_loop2_seconds: ($elapsed | tonumber),
      product_owner_consulted: false
    }') || return 1

  send_violation_alert \
    "$task_id" \
    "product_owner_not_consulted" \
    "warning" \
    "Loop 2 completed ${elapsed}s ago but Product Owner not consulted. Strategic decision skipped." \
    "Check orchestrator Product Owner spawning logic. PO should be spawned after Loop 2 consensus check." \
    "$evidence" || return 1

  # Mark as alerted (TTL 1 hour)
  "${redis[@]}" SETEX "$alert_key" 3600 "alerted" >/dev/null
}
350
-
351
# Main monitoring loop
#
# Every CHECK_INTERVAL seconds: list the swarm:*:metadata keys, read each
# swarm's task_id and run all violation checks against it. Runs until the
# process is terminated.
echo "Starting violation monitoring..."
echo ""

ITERATION=0
while true; do
  ITERATION=$((ITERATION + 1))
  echo "[Check #${ITERATION}] $(date '+%H:%M:%S')"

  # Find all active swarm metadata keys. Keys are read line by line into an
  # array (no word splitting or globbing), and --scan iterates with SCAN
  # instead of the blocking KEYS command.
  SWARM_KEYS=()
  while IFS= read -r SWARM_KEY; do
    if [[ -n "$SWARM_KEY" ]]; then
      SWARM_KEYS+=("$SWARM_KEY")
    fi
  done < <(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    --scan --pattern "swarm:*:metadata" 2>/dev/null || true)

  if [[ ${#SWARM_KEYS[@]} -eq 0 ]]; then
    echo " No active swarms found"
  else
    SWARM_COUNT=${#SWARM_KEYS[@]}
    echo " Monitoring $SWARM_COUNT swarm(s)..."

    for SWARM_KEY in "${SWARM_KEYS[@]}"; do
      # Extract task ID
      TASK_ID=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
        HGET "$SWARM_KEY" task_id 2>/dev/null || echo "")

      if [[ -z "$TASK_ID" ]]; then
        continue
      fi

      # Run violation checks. A failing check is reported and skipped so one
      # bad swarm cannot terminate the whole monitor under `set -e`.
      check_orchestrator_not_started "$SWARM_KEY" ||
        echo " Warning: check_orchestrator_not_started failed for ${SWARM_KEY}" >&2
      for CHECK in \
        check_gate_bypass \
        check_orchestrator_hang \
        check_coordinator_timeout_pattern \
        check_product_owner_skipped; do
        "$CHECK" "$TASK_ID" ||
          echo " Warning: ${CHECK} failed for task ${TASK_ID}" >&2
      done
    done
  fi

  echo " Sleeping ${CHECK_INTERVAL}s..."
  echo ""
  sleep "$CHECK_INTERVAL"
done
@@ -1,101 +0,0 @@
1
#!/bin/bash
# Redis Coordination Skill - Agent Heartbeat Monitor
# Version: 1.0.0
# Last Updated: 2025-10-19
#
# Usage: monitor-heartbeats.sh --task-id ID [--check-interval SECONDS]
#          [--miss-threshold COUNT] [--agents id1,id2,...]

# Strict error handling
set -euo pipefail

# Default values
TASK_ID=""
CHECK_INTERVAL=30
MISS_THRESHOLD=2
AGENTS=()

# Parse command-line arguments
while [[ $# -gt 0 ]]; do
  case "$1" in
    --task-id | --check-interval | --miss-threshold | --agents)
      # Without this guard a trailing option with no value aborts with a bare
      # "unbound variable" message from `set -u`.
      if [[ $# -lt 2 ]]; then
        echo "Error: $1 requires a value" >&2
        exit 1
      fi
      ;;
    *)
      echo "Unknown parameter: $1" >&2
      exit 1
      ;;
  esac

  case "$1" in
    --task-id)
      TASK_ID="$2"
      ;;
    --check-interval)
      CHECK_INTERVAL="$2"
      ;;
    --miss-threshold)
      # The threshold is later used in an arithmetic comparison, so only a
      # plain non-negative integer is accepted.
      if [[ ! "$2" =~ ^[0-9]+$ ]]; then
        echo "Error: --miss-threshold must be a non-negative integer" >&2
        exit 1
      fi
      MISS_THRESHOLD="$2"
      ;;
    --agents)
      # Comma-separated list of agent ids
      IFS=',' read -ra AGENTS <<< "$2"
      ;;
  esac
  shift 2
done

# Validate required parameters
if [[ -z "$TASK_ID" ]]; then
  echo "Error: task-id is required" >&2
  exit 1
fi
46
-
47
# Per-agent count of consecutive missed heartbeats. Kept outside the function
# so the counts survive from one monitoring cycle to the next; a function
# local would restart at 0 on every call and never reach MISS_THRESHOLD.
declare -A HEARTBEAT_MISS_COUNTS=()

# Function to check agent heartbeat
#
# Reads swarm:<TASK_ID>:<agent_id>:heartbeat. An empty value counts as a
# miss; any value resets the agent's miss counter. When the counter reaches
# MISS_THRESHOLD, handle_agent_failure is called and the counter restarts.
#
# Globals:   TASK_ID, MISS_THRESHOLD (read)
#            HEARTBEAT_MISS_COUNTS (read/written)
#            HEARTBEAT_LOG_DIR (read; default /var/log/claude-flow)
# Arguments: $1 - agent id
# Returns:   0, or the status of handle_agent_failure when it is invoked
check_agent_heartbeat() {
  local agent_id="$1"
  local log_dir="${HEARTBEAT_LOG_DIR:-/var/log/claude-flow}"
  local last_heartbeat
  local miss_count

  # An empty id cannot be used as an associative-array key
  if [[ -z "$agent_id" ]]; then
    return 0
  fi

  # Check heartbeat key. If Redis itself cannot be queried the agent's state
  # is unknown, so this is neither a miss nor a reason to stop the monitor.
  if ! last_heartbeat=$(redis-cli get "swarm:${TASK_ID}:${agent_id}:heartbeat"); then
    echo "Warning: could not read heartbeat for agent ${agent_id}" >&2
    return 0
  fi

  # Reset miss count if heartbeat exists
  if [[ -n "$last_heartbeat" ]]; then
    HEARTBEAT_MISS_COUNTS["$agent_id"]=0
    return 0
  fi

  # No heartbeat found: increment miss count. Plain assignment is used
  # because `((miss_count++))` returns status 1 when the old value is 0,
  # which terminates the script under `set -e`.
  miss_count=$((${HEARTBEAT_MISS_COUNTS["$agent_id"]:-0} + 1))
  HEARTBEAT_MISS_COUNTS["$agent_id"]=$miss_count

  # Logging is best effort: a missing or unwritable directory must not stop
  # the monitor.
  mkdir -p "$log_dir" 2>/dev/null || true
  echo "[$(date -u)] No heartbeat detected for agent: ${agent_id}" >> "${log_dir}/heartbeat-misses.log" ||
    echo "Warning: could not write ${log_dir}/heartbeat-misses.log" >&2

  # Trigger actions on missed heartbeats
  if ((miss_count >= MISS_THRESHOLD)); then
    # Restart the count so the failure handler runs once per threshold of
    # misses rather than on every following cycle.
    HEARTBEAT_MISS_COUNTS["$agent_id"]=0
    handle_agent_failure "$agent_id"
  fi
}
70
-
71
# Function to handle agent failure
#
# Logs the failure, removes the agent from the swarm:<TASK_ID>:active-agents
# set and runs the recovery script for it. Every step is best effort: a
# failure is reported on stderr and the remaining steps still run, so the
# calling monitor is not terminated under `set -e`.
#
# Globals:   TASK_ID (read)
#            HEARTBEAT_LOG_DIR (read; default /var/log/claude-flow)
#            AGENT_RECOVERY_SCRIPT (read; default is the path relative to the
#              current working directory that was previously hard-coded)
# Arguments: $1 - agent id
# Returns:   0
handle_agent_failure() {
  local agent_id="$1"
  local log_dir="${HEARTBEAT_LOG_DIR:-/var/log/claude-flow}"
  local recovery_script="${AGENT_RECOVERY_SCRIPT:-./.claude/skills/cfn-redis-coordination/agent-recovery.sh}"

  # Log agent failure
  mkdir -p "$log_dir" 2>/dev/null || true
  echo "[$(date -u)] CRITICAL: Agent ${agent_id} failed health check" >> "${log_dir}/agent-failures.log" ||
    echo "Warning: could not write ${log_dir}/agent-failures.log" >&2

  # Remove from active agents
  redis-cli srem "swarm:${TASK_ID}:active-agents" "$agent_id" ||
    echo "Warning: could not remove agent ${agent_id} from active agents" >&2

  # Trigger emergency recovery
  if [[ ! -x "$recovery_script" ]]; then
    echo "Warning: recovery script not found or not executable: ${recovery_script}" >&2
    return 0
  fi
  "$recovery_script" \
    --task-id "$TASK_ID" \
    --agent-id "$agent_id" ||
    echo "Warning: recovery script failed for agent ${agent_id}" >&2
}
86
-
87
# Main monitoring loop
#
# Every CHECK_INTERVAL seconds, checks the heartbeat of each monitored agent.
# Runs until the process is terminated.

# Decide once whether the agent list is discovered from Redis. Testing for an
# empty list inside the loop stopped refreshing after the first fetch filled
# AGENTS, so later membership changes were never seen.
if [[ ${#AGENTS[@]} -eq 0 ]]; then
  DISCOVER_AGENTS=1
else
  DISCOVER_AGENTS=0
fi

while true; do
  # If no agents specified, fetch from Redis set on every cycle
  if ((DISCOVER_AGENTS)); then
    mapfile -t AGENTS < <(redis-cli smembers "swarm:${TASK_ID}:active-agents")
  fi

  # Check heartbeat for each agent. The ${AGENTS[@]+...} form keeps an empty
  # list from tripping `set -u` on bash versions before 4.4.
  for agent in ${AGENTS[@]+"${AGENTS[@]}"}; do
    # Skip blank entries, e.g. a blank line of redis-cli output
    if [[ -n "$agent" ]]; then
      check_agent_heartbeat "$agent"
    fi
  done

  # Sleep before next check
  sleep "$CHECK_INTERVAL"
done