claude-flow-novice 2.14.2 → 2.14.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161)
  1. package/.claude/commands/CFN_LOOP_TASK_MODE.md +4 -47
  2. package/.claude/skills/cfn-redis-coordination/demos/test-cancel-swarm.sh +0 -276
  3. package/claude-assets/commands/CFN_LOOP_TASK_MODE.md +4 -47
  4. package/claude-assets/skills/cfn-redis-coordination/demos/test-cancel-swarm.sh +0 -276
  5. package/dist/cli/agent-prompt-builder.js +25 -0
  6. package/dist/cli/agent-prompt-builder.js.map +1 -1
  7. package/dist/cli/config-manager.js +91 -109
  8. package/package.json +1 -1
  9. package/scripts/init-project.js +1 -1
  10. package/.claude/skills/cfn-redis-coordination/HEARTBEAT.md +0 -57
  11. package/.claude/skills/cfn-redis-coordination/HEARTBEAT_MONITORING.md +0 -267
  12. package/.claude/skills/cfn-redis-coordination/LOGGING.md +0 -260
  13. package/.claude/skills/cfn-redis-coordination/README.md +0 -65
  14. package/.claude/skills/cfn-redis-coordination/SECURITY_REVIEW.md +0 -25
  15. package/.claude/skills/cfn-redis-coordination/SHUTDOWN_HANDLING.md +0 -164
  16. package/.claude/skills/cfn-redis-coordination/SKILL.md +0 -720
  17. package/.claude/skills/cfn-redis-coordination/demos/test-dlq.sh +0 -129
  18. package/.claude/skills/cfn-redis-coordination/demos/test-iteration-feedback.sh +0 -320
  19. package/.claude/skills/cfn-redis-coordination/demos/test-orchestrator.sh +0 -249
  20. package/.claude/skills/cfn-redis-coordination/demos/test-priority-wake-phase4-unix.sh +0 -148
  21. package/.claude/skills/cfn-redis-coordination/demos/test-priority-wake-phase4.sh +0 -163
  22. package/.claude/skills/cfn-redis-coordination/demos/test-priority-wake.sh +0 -138
  23. package/.claude/skills/cfn-redis-coordination/demos/test-quick-fix.sh +0 -81
  24. package/.claude/skills/cfn-redis-coordination/demos/test-quorum-absolute.sh +0 -45
  25. package/.claude/skills/cfn-redis-coordination/demos/test-quorum-fallback.sh +0 -68
  26. package/.claude/skills/cfn-redis-coordination/demos/test-quorum-percentage.sh +0 -56
  27. package/.claude/skills/cfn-redis-coordination/demos/test-quorum-with-retry.sh +0 -81
  28. package/.claude/skills/cfn-redis-coordination/demos/test-quorum.sh +0 -57
  29. package/.claude/skills/cfn-redis-coordination/demos/test-shutdown-handling.sh +0 -187
  30. package/.claude/skills/cfn-redis-coordination/demos/test-shutdown.sh +0 -160
  31. package/.claude/skills/cfn-redis-coordination/demos/test-utils-unix.sh +0 -97
  32. package/.claude/skills/cfn-redis-coordination/demos/test-utils.sh +0 -97
  33. package/.claude/skills/cfn-redis-coordination/demos/test-waiting-mode.sh +0 -59
  34. package/.claude/skills/cfn-redis-coordination/examples/README.md +0 -73
  35. package/.claude/skills/cfn-redis-coordination/examples/grafana-dashboard.json +0 -352
  36. package/.claude/skills/cfn-redis-coordination/examples/hierarchical-pattern.sh +0 -127
  37. package/.claude/skills/cfn-redis-coordination/examples/mesh-pattern.sh +0 -171
  38. package/.claude/skills/cfn-redis-coordination/examples/timeout-handling.sh +0 -227
  39. package/.claude/skills/cfn-redis-coordination/examples/waiting-mode-pattern.sh +0 -239
  40. package/.claude/skills/cfn-redis-coordination/execute-product-owner-decision.sh +0 -258
  41. package/.claude/skills/cfn-redis-coordination/get-agent-timeout.sh +0 -177
  42. package/.claude/skills/cfn-redis-coordination/heartbeat-functions.sh +0 -137
  43. package/.claude/skills/cfn-redis-coordination/heartbeat-protocol.md +0 -106
  44. package/.claude/skills/cfn-redis-coordination/heartbeat.sh +0 -126
  45. package/.claude/skills/cfn-redis-coordination/init-swarm.sh +0 -148
  46. package/.claude/skills/cfn-redis-coordination/invoke-redis-pattern.sh +0 -220
  47. package/.claude/skills/cfn-redis-coordination/invoke-waiting-mode.sh +0 -283
  48. package/.claude/skills/cfn-redis-coordination/list-active-swarms.sh +0 -147
  49. package/.claude/skills/cfn-redis-coordination/log-event.sh +0 -109
  50. package/.claude/skills/cfn-redis-coordination/metrics-export.sh +0 -674
  51. package/.claude/skills/cfn-redis-coordination/metrics-schema.json +0 -66
  52. package/.claude/skills/cfn-redis-coordination/metrics-storage.md +0 -31
  53. package/.claude/skills/cfn-redis-coordination/monitor-cfn-violations.sh +0 -391
  54. package/.claude/skills/cfn-redis-coordination/monitor-heartbeats.sh +0 -101
  55. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop-v3.sh +0 -141
  56. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh +0 -31
  57. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.bak +0 -0
  58. package/.claude/skills/cfn-redis-coordination/priority-wake-mechanism.md +0 -75
  59. package/.claude/skills/cfn-redis-coordination/priority_wake.py +0 -134
  60. package/.claude/skills/cfn-redis-coordination/query-dlq.sh +0 -162
  61. package/.claude/skills/cfn-redis-coordination/query-logs.sh +0 -103
  62. package/.claude/skills/cfn-redis-coordination/redis-pattern.sh +0 -619
  63. package/.claude/skills/cfn-redis-coordination/retrieve-context.sh +0 -58
  64. package/.claude/skills/cfn-redis-coordination/select-specialist-agent.sh +0 -371
  65. package/.claude/skills/cfn-redis-coordination/semantic-match-tfidf.py +0 -252
  66. package/.claude/skills/cfn-redis-coordination/send-heartbeat.sh +0 -165
  67. package/.claude/skills/cfn-redis-coordination/signal.sh +0 -38
  68. package/.claude/skills/cfn-redis-coordination/store-context.sh +0 -86
  69. package/.claude/skills/cfn-redis-coordination/store-epic-context.sh +0 -123
  70. package/.claude/skills/cfn-redis-coordination/test-context-injection.sh +0 -354
  71. package/.claude/skills/cfn-redis-coordination/test-timeout-enforcement.sh +0 -513
  72. package/.claude/skills/cfn-redis-coordination/tests/convert-line-endings.sh +0 -15
  73. package/.claude/skills/cfn-redis-coordination/tests/dlq-functionality-test.sh +0 -102
  74. package/.claude/skills/cfn-redis-coordination/tests/edge-cases-test.sh +0 -99
  75. package/.claude/skills/cfn-redis-coordination/tests/integration-test.sh +0 -170
  76. package/.claude/skills/cfn-redis-coordination/tests/retry-mechanism-test.sh +0 -82
  77. package/.claude/skills/cfn-redis-coordination/tests/run-test-suite.sh +0 -92
  78. package/.claude/skills/cfn-redis-coordination/tests/run-tests.sh +0 -4
  79. package/.claude/skills/cfn-redis-coordination/tests/test-heartbeat-monitoring.sh +0 -418
  80. package/.claude/skills/cfn-redis-coordination/tests/test-heartbeat-simple.sh +0 -124
  81. package/.claude/skills/cfn-redis-coordination/tests/test-primitives.sh +0 -166
  82. package/.claude/skills/cfn-redis-coordination/tests/test-utils.sh +0 -54
  83. package/.claude/skills/cfn-redis-coordination/tests/test_utils.sh +0 -49
  84. package/.claude/skills/cfn-redis-coordination/v2_modularization/core_orchestration.sh +0 -76
  85. package/.claude/skills/cfn-redis-coordination/validate-parameters.sh +0 -492
  86. package/claude-assets/skills/cfn-redis-coordination/HEARTBEAT.md +0 -57
  87. package/claude-assets/skills/cfn-redis-coordination/HEARTBEAT_MONITORING.md +0 -267
  88. package/claude-assets/skills/cfn-redis-coordination/LOGGING.md +0 -260
  89. package/claude-assets/skills/cfn-redis-coordination/README.md +0 -65
  90. package/claude-assets/skills/cfn-redis-coordination/SECURITY_REVIEW.md +0 -25
  91. package/claude-assets/skills/cfn-redis-coordination/SHUTDOWN_HANDLING.md +0 -164
  92. package/claude-assets/skills/cfn-redis-coordination/SKILL.md +0 -720
  93. package/claude-assets/skills/cfn-redis-coordination/demos/test-dlq.sh +0 -129
  94. package/claude-assets/skills/cfn-redis-coordination/demos/test-iteration-feedback.sh +0 -320
  95. package/claude-assets/skills/cfn-redis-coordination/demos/test-orchestrator.sh +0 -249
  96. package/claude-assets/skills/cfn-redis-coordination/demos/test-priority-wake-phase4-unix.sh +0 -148
  97. package/claude-assets/skills/cfn-redis-coordination/demos/test-priority-wake-phase4.sh +0 -163
  98. package/claude-assets/skills/cfn-redis-coordination/demos/test-priority-wake.sh +0 -138
  99. package/claude-assets/skills/cfn-redis-coordination/demos/test-quick-fix.sh +0 -81
  100. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-absolute.sh +0 -45
  101. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-fallback.sh +0 -68
  102. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-percentage.sh +0 -56
  103. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-with-retry.sh +0 -81
  104. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum.sh +0 -57
  105. package/claude-assets/skills/cfn-redis-coordination/demos/test-shutdown-handling.sh +0 -187
  106. package/claude-assets/skills/cfn-redis-coordination/demos/test-shutdown.sh +0 -160
  107. package/claude-assets/skills/cfn-redis-coordination/demos/test-utils-unix.sh +0 -97
  108. package/claude-assets/skills/cfn-redis-coordination/demos/test-utils.sh +0 -97
  109. package/claude-assets/skills/cfn-redis-coordination/demos/test-waiting-mode.sh +0 -59
  110. package/claude-assets/skills/cfn-redis-coordination/examples/README.md +0 -73
  111. package/claude-assets/skills/cfn-redis-coordination/examples/grafana-dashboard.json +0 -352
  112. package/claude-assets/skills/cfn-redis-coordination/examples/hierarchical-pattern.sh +0 -127
  113. package/claude-assets/skills/cfn-redis-coordination/examples/mesh-pattern.sh +0 -171
  114. package/claude-assets/skills/cfn-redis-coordination/examples/timeout-handling.sh +0 -227
  115. package/claude-assets/skills/cfn-redis-coordination/examples/waiting-mode-pattern.sh +0 -239
  116. package/claude-assets/skills/cfn-redis-coordination/execute-product-owner-decision.sh +0 -258
  117. package/claude-assets/skills/cfn-redis-coordination/get-agent-timeout.sh +0 -177
  118. package/claude-assets/skills/cfn-redis-coordination/heartbeat-functions.sh +0 -137
  119. package/claude-assets/skills/cfn-redis-coordination/heartbeat-protocol.md +0 -106
  120. package/claude-assets/skills/cfn-redis-coordination/heartbeat.sh +0 -126
  121. package/claude-assets/skills/cfn-redis-coordination/init-swarm.sh +0 -148
  122. package/claude-assets/skills/cfn-redis-coordination/invoke-redis-pattern.sh +0 -220
  123. package/claude-assets/skills/cfn-redis-coordination/invoke-waiting-mode.sh +0 -283
  124. package/claude-assets/skills/cfn-redis-coordination/list-active-swarms.sh +0 -147
  125. package/claude-assets/skills/cfn-redis-coordination/log-event.sh +0 -109
  126. package/claude-assets/skills/cfn-redis-coordination/metrics-export.sh +0 -674
  127. package/claude-assets/skills/cfn-redis-coordination/metrics-schema.json +0 -66
  128. package/claude-assets/skills/cfn-redis-coordination/metrics-storage.md +0 -31
  129. package/claude-assets/skills/cfn-redis-coordination/monitor-cfn-violations.sh +0 -391
  130. package/claude-assets/skills/cfn-redis-coordination/monitor-heartbeats.sh +0 -101
  131. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop-v3.sh +0 -141
  132. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh +0 -31
  133. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.bak +0 -0
  134. package/claude-assets/skills/cfn-redis-coordination/priority-wake-mechanism.md +0 -75
  135. package/claude-assets/skills/cfn-redis-coordination/priority_wake.py +0 -134
  136. package/claude-assets/skills/cfn-redis-coordination/query-dlq.sh +0 -162
  137. package/claude-assets/skills/cfn-redis-coordination/query-logs.sh +0 -103
  138. package/claude-assets/skills/cfn-redis-coordination/redis-pattern.sh +0 -619
  139. package/claude-assets/skills/cfn-redis-coordination/retrieve-context.sh +0 -58
  140. package/claude-assets/skills/cfn-redis-coordination/select-specialist-agent.sh +0 -371
  141. package/claude-assets/skills/cfn-redis-coordination/semantic-match-tfidf.py +0 -252
  142. package/claude-assets/skills/cfn-redis-coordination/send-heartbeat.sh +0 -165
  143. package/claude-assets/skills/cfn-redis-coordination/signal.sh +0 -38
  144. package/claude-assets/skills/cfn-redis-coordination/store-context.sh +0 -86
  145. package/claude-assets/skills/cfn-redis-coordination/store-epic-context.sh +0 -123
  146. package/claude-assets/skills/cfn-redis-coordination/test-context-injection.sh +0 -354
  147. package/claude-assets/skills/cfn-redis-coordination/test-timeout-enforcement.sh +0 -513
  148. package/claude-assets/skills/cfn-redis-coordination/tests/convert-line-endings.sh +0 -15
  149. package/claude-assets/skills/cfn-redis-coordination/tests/dlq-functionality-test.sh +0 -102
  150. package/claude-assets/skills/cfn-redis-coordination/tests/edge-cases-test.sh +0 -99
  151. package/claude-assets/skills/cfn-redis-coordination/tests/integration-test.sh +0 -170
  152. package/claude-assets/skills/cfn-redis-coordination/tests/retry-mechanism-test.sh +0 -82
  153. package/claude-assets/skills/cfn-redis-coordination/tests/run-test-suite.sh +0 -92
  154. package/claude-assets/skills/cfn-redis-coordination/tests/run-tests.sh +0 -4
  155. package/claude-assets/skills/cfn-redis-coordination/tests/test-heartbeat-monitoring.sh +0 -418
  156. package/claude-assets/skills/cfn-redis-coordination/tests/test-heartbeat-simple.sh +0 -124
  157. package/claude-assets/skills/cfn-redis-coordination/tests/test-primitives.sh +0 -166
  158. package/claude-assets/skills/cfn-redis-coordination/tests/test-utils.sh +0 -54
  159. package/claude-assets/skills/cfn-redis-coordination/tests/test_utils.sh +0 -49
  160. package/claude-assets/skills/cfn-redis-coordination/v2_modularization/core_orchestration.sh +0 -76
  161. package/claude-assets/skills/cfn-redis-coordination/validate-parameters.sh +0 -492
@@ -1,137 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- ##############################################################################
4
- # Heartbeat Monitoring Functions Library
5
- # Extracted from orchestrate-cfn-loop.sh for standalone testing
6
- ##############################################################################
7
-
8
- ##############################################################################
9
- # Heartbeat Monitoring Functions
10
- ##############################################################################
11
- declare -A MISSED_HEARTBEATS # Track missed heartbeats per agent
12
-
##############################################################################
# Report whether an agent currently has a heartbeat value stored in Redis.
# Arguments:
#   $1 - agent identifier
#   $2 - task identifier
# Returns:
#   0 (alive) if swarm:<task_id>:<agent>:heartbeat holds a value,
#   1 (dead)  if the value is empty, is the literal "(nil)", or the
#             redis-cli lookup itself failed.
##############################################################################
function check_agent_heartbeat() {
    local agent="$1"
    local task_id="$2"
    # Local on purpose: the key and payload are scratch values and must not
    # leak into (or clobber) the caller's global scope.
    local hb_key="swarm:${task_id}:${agent}:heartbeat"
    local hb_data

    # A failing lookup is deliberately treated the same as a missing key.
    hb_data=$(redis-cli GET "$hb_key" 2>/dev/null) || hb_data=""

    if [[ -z "$hb_data" || "$hb_data" == "(nil)" ]]; then
        return 1 # Dead
    fi
    return 0 # Alive
}
26
-
##############################################################################
# Print round-half-up(total * decimal / divisor) using integer arithmetic
# only, so no external calculator is needed.
# Arguments:
#   $1 - non-negative decimal literal ("85", "85.5", ".85")
#   $2 - total (non-negative integer)
#   $3 - divisor (positive integer; 100 for percentages, 1 for fractions)
##############################################################################
function _quorum_round_half_up() {
    local decimal="$1" total="$2" divisor="$3"
    local int_part="${decimal%%.*}"
    local frac_part=""
    if [[ "$decimal" == *.* ]]; then
        frac_part="${decimal#*.}"
    fi
    # decimal == numerator / scale; 10# stops leading zeros reading as octal.
    local scale=$((10 ** ${#frac_part}))
    local numerator=$((10#${int_part:-0}${frac_part}))
    echo $(((2 * 10#$total * numerator + divisor * scale) / (2 * divisor * scale)))
}

##############################################################################
# Translate a quorum specification into a required agent count.
# Arguments:
#   $1 - quorum spec, one of:
#          ""              require all agents
#          "NN%"           percentage of the total
#          "0.85" / ".85"  fraction of the total ("1.0" means all)
#          "N"             absolute agent count
#   $2 - total number of agents (non-negative integer)
# Outputs:
#   The required count on stdout; diagnostics on stderr.
# Returns:
#   0 on success; 1 if either argument is malformed or the resulting quorum
#   exceeds the total.
##############################################################################
function calculate_quorum() {
    local quorum_spec="$1"
    local total_agents="$2"
    local pct_re='^([0-9]+(\.[0-9]+)?|\.[0-9]+)%$'
    local frac_re='^(0?\.[0-9]+|1\.0+)$'
    local required

    # Shell arithmetic evaluates its operands, so reject anything non-numeric.
    if [[ ! "$total_agents" =~ ^[0-9]+$ ]]; then
        echo "Error: Total agents ($total_agents) must be a non-negative integer" >&2
        return 1
    fi

    # If no quorum specified, require all agents
    if [[ -z "$quorum_spec" ]]; then
        echo "$total_agents"
        return 0
    fi

    if [[ "$quorum_spec" =~ $pct_re ]]; then
        # Percentage (e.g., "85%"): total_agents * pct / 100, rounded half up
        # (this is rounding, not a ceiling: 70% of 3 yields 2).
        required=$(_quorum_round_half_up "${quorum_spec%\%}" "$total_agents" 100)
    elif [[ "$quorum_spec" =~ $frac_re ]]; then
        # Fraction (0.0-1.0): total_agents * fraction, rounded half up.
        required=$(_quorum_round_half_up "$quorum_spec" "$total_agents" 1)
    elif [[ "$quorum_spec" =~ ^[0-9]+$ ]]; then
        # Absolute number
        required=$((10#$quorum_spec))
    else
        echo "Error: Invalid quorum specification ($quorum_spec)" >&2
        return 1
    fi

    # Applies to every format: a quorum above the total can never be met.
    if ((required > 10#$total_agents)); then
        echo "Error: Quorum ($quorum_spec) exceeds total agents ($total_agents)" >&2
        return 1
    fi
    echo "$required"
}
56
-
##############################################################################
# Run one heartbeat sweep over a set of agents and report hung ones.
#
# An agent that fails check_agent_heartbeat has its entry in
# MISSED_HEARTBEATS incremented; a live agent has it reset to 0. From the
# second consecutive miss onwards a warning is written to stderr, followed
# by a line saying whether the owning loop still meets its quorum.
#
# Arguments:
#   $1   - task identifier
#   $2   - loop name (only used as a log prefix)
#   $3.. - agent identifiers to check
# Globals read:
#   LOOP3_FAILED_AGENTS, LOOP2_FAILED_AGENTS       (arrays) agents to skip
#   LOOP3_AGENTS, LOOP2_AGENTS                     space-separated members
#   LOOP3_COMPLETED_AGENTS, LOOP2_COMPLETED_AGENTS (arrays)
#   MIN_QUORUM_LOOP3, MIN_QUORUM_LOOP2, LOOP3_TOTAL, LOOP2_TOTAL
# Globals written:
#   MISSED_HEARTBEATS (associative array, declared by the caller),
#   AGENT, REMAINING, REQUIRED (left global, as they always have been).
# Outputs:
#   Diagnostics on stderr only.
##############################################################################
function check_heartbeats_loop() {
    local task_id="$1"
    local loop_name="$2"
    shift 2
    local agents=("$@")
    local missed timestamp

    for AGENT in "${agents[@]}"; do
        # Skip agents already marked as failed
        if [[ " ${LOOP3_FAILED_AGENTS[*]:-} ${LOOP2_FAILED_AGENTS[*]:-} " == *" ${AGENT} "* ]]; then
            continue
        fi

        if check_agent_heartbeat "$AGENT" "$task_id"; then
            MISSED_HEARTBEATS["$AGENT"]=0 # Reset counter
            continue
        fi

        missed=$((${MISSED_HEARTBEATS["$AGENT"]:-0} + 1))
        MISSED_HEARTBEATS["$AGENT"]=$missed
        if ((missed < 2)); then
            continue
        fi

        timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
        echo " [$timestamp] [$loop_name] ⚠️ $AGENT appears hung (no heartbeat for 60s)" >&2

        # Determine which loop this agent belongs to and check quorum.
        # REMAINING is the number of agents that already completed.
        if [[ " ${LOOP3_AGENTS:-} " == *" ${AGENT} "* ]]; then
            REMAINING=${#LOOP3_COMPLETED_AGENTS[@]}
            REQUIRED=$(calculate_quorum "${MIN_QUORUM_LOOP3:-}" "${LOOP3_TOTAL:-}") || REQUIRED=""
        elif [[ " ${LOOP2_AGENTS:-} " == *" ${AGENT} "* ]]; then
            # Must test AGENT here: comparing LOOP2_AGENTS with itself is
            # always true and would claim every unknown agent for Loop 2.
            REMAINING=${#LOOP2_COMPLETED_AGENTS[@]}
            REQUIRED=$(calculate_quorum "${MIN_QUORUM_LOOP2:-}" "${LOOP2_TOTAL:-}") || REQUIRED=""
        else
            continue
        fi

        # Without a numeric quorum the comparison below would be a test error.
        if [[ ! "$REQUIRED" =~ ^[0-9]+$ ]]; then
            echo " [$timestamp] [$loop_name] ⚠️ Could not determine quorum for $AGENT" >&2
            continue
        fi

        if [ "$REMAINING" -ge "$REQUIRED" ]; then
            echo " [$timestamp] [$loop_name] ℹ️ Continuing with quorum (${REMAINING}/${REQUIRED} agents)" >&2
        else
            echo " [$timestamp] [$loop_name] ⚠️ Cannot meet quorum without $AGENT (${REMAINING}/${REQUIRED})" >&2
        fi
    done
}
98
-
99
- function start_heartbeat_monitor() {
100
- local task_id="$1"
101
- local loop_name="$2"
102
- shift 2
103
- local agents=("$@")
104
-
105
- # Create marker file for this monitor
106
- local monitor_marker="/tmp/heartbeat-monitor-${task_id}-${loop_name}.active"
107
- touch "$monitor_marker"
108
-
109
- (
110
- while [ -f "$monitor_marker" ]; do
111
- # Check for shutdown
112
- if [ "${SHUTDOWN_REQUESTED:-0}" -eq 1 ]; then
113
- break
114
- fi
115
-
116
- check_heartbeats_loop "$task_id" "$loop_name" "${agents[@]}"
117
- sleep 30
118
- done
119
- ) &
120
-
121
- echo "$!" # Return PID
122
- }
123
-
124
- function stop_heartbeat_monitor() {
125
- local task_id="$1"
126
- local loop_name="$2"
127
- local monitor_pid="$3"
128
-
129
- # Remove marker file to stop the monitor loop
130
- rm -f "/tmp/heartbeat-monitor-${task_id}-${loop_name}.active"
131
-
132
- # Kill monitor process if still running
133
- if [ -n "$monitor_pid" ] && kill -0 "$monitor_pid" 2>/dev/null; then
134
- kill "$monitor_pid" 2>/dev/null || true
135
- wait "$monitor_pid" 2>/dev/null || true
136
- fi
137
- }
@@ -1,106 +0,0 @@
1
- # Heartbeat Protocol Specification
2
-
3
- ## Overview
4
- The heartbeat protocol provides a mechanism for detecting hung or unresponsive agents in distributed agent swarms using Redis as a coordination mechanism.
5
-
6
- ## Key Design Components
7
-
8
- ### 1. Heartbeat Key Pattern
9
- ```
10
- swarm:{task_id}:{agent_id}:heartbeat
11
- ```
12
-
13
- ### 2. Heartbeat Message Structure
14
- ```json
15
- {
16
- "timestamp": 1760898665,
17
- "status": "working|idle|error",
18
- "iteration": 2,
19
- "progress": 0.75,
20
- "agent_details": {
21
- "agent_id": "architect-5",
22
- "task_id": "redis-phase5-1760898665",
23
- "environment": {
24
- "cpu_usage": 0.65,
25
- "memory_usage": 0.42,
26
- "system_load": 0.3
27
- }
28
- }
29
- }
30
- ```
31
-
32
- ### 3. Heartbeat Configuration
33
- - **Update Frequency**: Every 30 seconds
34
- - **Default TTL**: 60 seconds
35
- - **Miss Threshold**: 2 consecutive missed heartbeats
36
- - **Quorum Threshold**: 70% of agents must be responsive
37
-
38
- ### 4. Heartbeat Workflow
39
- 1. Agent periodically sends heartbeat via Redis SET
40
- 2. Orchestrator monitors heartbeats in background process
41
- 3. On missed heartbeats, trigger progressive recovery mechanisms
42
-
43
- ### 5. Recovery Stages
44
- - **Stage 1 (Miss 1)**: Log warning, continue monitoring
45
- - **Stage 2 (Miss 2)**:
46
- - Check if remaining agents meet quorum
47
- - Log to Dead Letter Queue (DLQ)
48
- - Attempt soft restart of agent
49
- - **Stage 3 (Miss 3)**:
50
- - Hard restart agent
51
- - Potentially replace with standby agent
52
-
53
- ### 6. Implementation Pseudo-code
54
- ```bash
55
- # Send Heartbeat
56
- redis-cli set "swarm:${TASK_ID}:${AGENT_ID}:heartbeat" \
57
- "$(generate_heartbeat_payload)" \
58
- EX 60 # 60-second expiry
59
-
60
- # Check Heartbeats
61
- check_agent_heartbeats() {
62
- for agent in ${AGENTS[@]}; do
63
- heartbeat=$(redis-cli get "swarm:${TASK_ID}:${agent}:heartbeat")
64
- if [[ -z "$heartbeat" ]]; then
65
- handle_missed_heartbeat "$agent"
66
- fi
67
- done
68
- }
69
-
70
- handle_missed_heartbeat() {
71
- local agent="$1"
72
- local miss_count=$(get_miss_count "$agent")
73
-
74
- case "$miss_count" in
75
- 1) log_warning "$agent missed first heartbeat" ;;
76
- 2)
77
- log_dlq "$agent"
78
- attempt_soft_restart "$agent"
79
- check_quorum
80
- ;;
81
- 3)
82
- hard_restart_agent "$agent"
83
- ;;
84
- esac
85
- }
86
- ```
87
-
88
- ### 7. Monitoring and Logging
89
- - Comprehensive logging to `/var/log/claude-flow/heartbeat.log`
90
- - Prometheus metrics for heartbeat health
91
- - Grafana dashboard tracking agent responsiveness
92
-
93
- ### 8. Security Considerations
94
- - Cryptographically sign heartbeat messages
95
- - Rate limit heartbeat submissions
96
- - Validate heartbeat payload schema
97
-
98
- ## Integration Points
99
- - Redis Coordination Skill
100
- - CFN Loop Validation
101
- - Agent Spawning Mechanism
102
-
103
- ## Test Coverage
104
- - Unit tests for heartbeat generation
105
- - Integration tests for recovery mechanisms
106
- - Chaos testing (intentional agent hanging)
@@ -1,126 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Heartbeat Monitoring Script for Agent Coordination
4
- # Implements 60s TTL, 30s check interval, and quorum fallback detection
5
-
# Dependencies
# `command -v` is a shell builtin, so the lookup does not depend on `which`
# being installed.
REDIS_CLI=$(command -v redis-cli || true)
if [ -z "$REDIS_CLI" ]; then
    echo "Error: redis-cli not found. Please install Redis client." >&2
    exit 1
fi

# Logging configuration
LOG_FILE="/tmp/heartbeat-debug.log"
touch "$LOG_FILE"

# Append a timestamped line built from all arguments to $LOG_FILE.
log_debug() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$LOG_FILE"
}

# Configuration
HEARTBEAT_TTL=60   # 60 seconds TTL
CHECK_INTERVAL=30  # 30 seconds between checks
MISSED_THRESHOLD=2 # Number of missed heartbeats before considering agent hung
BUFFER_TIME=3      # Additional buffer time for timing flexibility
26
-
# Publish an "alive" marker for an agent, expiring after HEARTBEAT_TTL seconds.
# Arguments:
#   $1 - task identifier
#   $2 - agent identifier
# Outputs:
#   redis-cli's reply and a confirmation line on stdout; errors on stderr.
# Returns:
#   0 when the SETEX succeeded, 1 when it failed. Exits the script with
#   status 1 if either argument is empty.
# NOTE(review): the key written here is swarm:agent_status:<task>:<agent>;
# confirm it is the key the consumers of this heartbeat actually read.
send_heartbeat() {
    local task_id="$1"
    local agent_id="$2"

    if [ -z "$task_id" ] || [ -z "$agent_id" ]; then
        echo "Usage: $0 send --task-id TASK_ID --agent-id AGENT_ID" >&2
        exit 1
    fi

    log_debug "Sending heartbeat for task=$task_id, agent=$agent_id"

    # Use SETEX to create a key with expiration. Success is only reported
    # when the command actually succeeded.
    if ! "$REDIS_CLI" SETEX "swarm:agent_status:${task_id}:${agent_id}" "$HEARTBEAT_TTL" "alive"; then
        log_debug "Heartbeat FAILED for task=$task_id, agent=$agent_id"
        echo "Error: failed to send heartbeat for agent ${agent_id} in task ${task_id}" >&2
        return 1
    fi
    echo "Heartbeat sent for agent ${agent_id} in task ${task_id}"
}
42
-
# Report whether an agent's heartbeat key is present and unexpired.
# Arguments:
#   $1 - task identifier
#   $2 - agent identifier
# Outputs:
#   "alive" or "dead" on stdout (a dead result also triggers
#   increment_missed_counter, which may print further lines).
# Returns:
#   0 when alive, 1 when dead. Exits the script with status 1 if either
#   argument is empty.
check_heartbeat() {
    local task_id="$1"
    local agent_id="$2"

    if [ -z "$task_id" ] || [ -z "$agent_id" ]; then
        echo "Usage: $0 check --task-id TASK_ID --agent-id AGENT_ID" >&2
        exit 1
    fi

    local key="swarm:agent_status:${task_id}:${agent_id}"
    local status
    local ttl

    # Retrieve status and TTL
    status=$("$REDIS_CLI" GET "$key")
    ttl=$("$REDIS_CLI" TTL "$key")

    log_debug "Checking heartbeat for task=$task_id, agent=$agent_id: status=$status, ttl=$ttl"

    # Dead when the value is missing, the TTL is not positive, or the TTL is
    # not an integer at all (e.g. redis-cli printed an error); an unreadable
    # TTL cannot prove the agent is alive.
    if [[ -z "$status" || ! "$ttl" =~ ^-?[0-9]+$ ]] || ((ttl <= 0)); then
        log_debug "Heartbeat DEAD: status missing or expired"
        echo "dead"
        increment_missed_counter "$task_id" "$agent_id"
        return 1
    fi

    log_debug "Heartbeat ALIVE: status=$status, ttl=$ttl"
    echo "alive"
    return 0
}
74
-
# Record one missed heartbeat for an agent and flag it once the count
# reaches MISSED_THRESHOLD.
# Arguments:
#   $1 - task identifier
#   $2 - agent identifier
# Outputs:
#   redis-cli replies and, at or above the threshold, a WARN line on stdout.
# Returns:
#   0 on success, 1 if the counter could not be incremented or read.
increment_missed_counter() {
    local task_id="$1"
    local agent_id="$2"
    local missed_key="swarm:missed_heartbeats:${task_id}:${agent_id}"
    local missed_count

    # Increment missed heartbeat counter. Declared separately above so a
    # failing redis-cli is not hidden behind `local`'s own exit status.
    if ! missed_count=$("$REDIS_CLI" INCR "$missed_key") || [[ ! "$missed_count" =~ ^[0-9]+$ ]]; then
        log_debug "Missed heartbeat counter update FAILED for task=$task_id, agent=$agent_id"
        echo "Error: could not update missed heartbeat counter for agent $agent_id" >&2
        return 1
    fi

    # Set expiry for missed counter to match heartbeat TTL
    "$REDIS_CLI" EXPIRE "$missed_key" "$HEARTBEAT_TTL"

    log_debug "Missed heartbeat counter for task=$task_id, agent=$agent_id: count=$missed_count"

    if [ "$missed_count" -ge "$MISSED_THRESHOLD" ]; then
        # Trigger quorum fallback mechanism
        "$REDIS_CLI" LPUSH "swarm:${task_id}:quorum_fallback" "$agent_id"
        log_debug "QUORUM FALLBACK: Agent $agent_id missed $missed_count heartbeats"
        echo "WARN: Agent $agent_id missed $missed_count heartbeats. Quorum fallback triggered."
    fi
}
95
-
# Parse arguments

# Read "--task-id ID" / "--agent-id ID" pairs into the globals task_id and
# agent_id. Both start out empty so values exported in the environment are
# never picked up by accident. Exits with status 1 on an unknown parameter
# or on an option given without a value.
parse_heartbeat_args() {
    task_id=""
    agent_id=""
    while [[ "$#" -gt 0 ]]; do
        case "$1" in
            --task-id | --agent-id)
                if [[ "$#" -lt 2 ]]; then
                    echo "Missing value for parameter: $1" >&2
                    exit 1
                fi
                if [[ "$1" == "--task-id" ]]; then
                    task_id="$2"
                else
                    agent_id="$2"
                fi
                shift 2
                ;;
            *)
                echo "Unknown parameter passed: $1" >&2
                exit 1
                ;;
        esac
    done
}

# Dispatch on the sub-command; both sub-commands accept the same options.
case "${1:-}" in
    send)
        shift
        parse_heartbeat_args "$@"
        send_heartbeat "$task_id" "$agent_id"
        ;;
    check)
        shift
        parse_heartbeat_args "$@"
        check_heartbeat "$task_id" "$agent_id"
        ;;
    *)
        echo "Usage: $0 {send|check} --task-id TASK_ID --agent-id AGENT_ID" >&2
        exit 1
        ;;
esac
@@ -1,148 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- ##############################################################################
4
- # Initialize Swarm - Redis Coordination Primitive
5
- #
6
- # Creates swarm metadata in Redis for coordination tracking across any
7
- # multi-agent workflow (CFN Loop, independent swarms, custom orchestration).
8
- #
9
- # Usage:
10
- # ./init-swarm.sh --swarm-id <id> \
11
- # --agents <agent1,agent2,...> \
12
- # [--task-id <id>] \
13
- # [--topology <mesh|hierarchical|chain>] \
14
- # [--ttl <seconds>] \
15
- # [--metadata <json-string>]
16
- #
17
- # Per-Agent Timeout Configuration:
18
- # Agents can have custom timeout values set via Redis:
19
- #
20
- # redis-cli SETEX "swarm:<task-id>:<agent-id>:timeout" 86400 <timeout-seconds>
21
- #
22
- # If no custom timeout is set, the orchestrator will use role-based defaults:
23
- # - researcher: 7200s (2 hours)
24
- # - backend-dev, coder, frontend-dev: 3600s (1 hour)
25
- # - reviewer, tester, security: 1800s (30 minutes)
26
- # - coordinator, orchestrator, product-owner: 900s (15 minutes)
27
- # - default: 3600s (1 hour)
28
- #
29
- # Benefits:
30
- # - Namespace isolation for concurrent swarms
31
- # - Agent inventory and resource tracking
32
- # - Status monitoring (in_progress/completed)
33
- # - Automatic cleanup via TTL
34
- # - Per-agent timeout configuration via get-agent-timeout.sh
35
- ##############################################################################
36
-
37
- set -euo pipefail
38
-
39
- # Configuration
40
- SWARM_ID=""
41
- AGENTS=""
42
- TASK_ID=""
43
- TOPOLOGY="mesh"
44
- TTL=604800 # 7 days default
45
- MAX_AGENTS=""
46
- METADATA_EXTRA=""
47
- AGENT_TIMEOUTS=""
48
-
49
- # Parse arguments
50
- while [[ $# -gt 0 ]]; do
51
- case $1 in
52
- --swarm-id)
53
- SWARM_ID="$2"
54
- shift 2
55
- ;;
56
- --agents)
57
- AGENTS="$2"
58
- shift 2
59
- ;;
60
- --task-id)
61
- TASK_ID="$2"
62
- shift 2
63
- ;;
64
- --topology)
65
- TOPOLOGY="$2"
66
- shift 2
67
- ;;
68
- --ttl)
69
- TTL="$2"
70
- shift 2
71
- ;;
72
- --max-agents)
73
- MAX_AGENTS="$2"
74
- shift 2
75
- ;;
76
- --metadata)
77
- METADATA_EXTRA="$2"
78
- shift 2
79
- ;;
80
- --agent-timeouts)
81
- AGENT_TIMEOUTS="$2"
82
- shift 2
83
- ;;
84
- *)
85
- echo "Unknown option: $1"
86
- echo "Usage: $0 --swarm-id <id> --agents <agent1,agent2,...> [options]"
87
- exit 1
88
- ;;
89
- esac
90
- done
91
-
92
- # Validation
93
- if [ -z "$SWARM_ID" ] || [ -z "$AGENTS" ]; then
94
- echo "Error: Required parameters missing"
95
- echo "Usage: $0 --swarm-id <id> --agents <agent1,agent2,...>"
96
- exit 1
97
- fi
98
-
99
- # Calculate max agents if not provided
100
- if [ -z "$MAX_AGENTS" ]; then
101
- IFS=',' read -ra AGENT_ARRAY <<< "$AGENTS"
102
- MAX_AGENTS=${#AGENT_ARRAY[@]}
103
- fi
104
-
105
- # Use swarm-id as task-id if not provided
106
- if [ -z "$TASK_ID" ]; then
107
- TASK_ID="$SWARM_ID"
108
- fi
109
-
110
- echo "[Swarm] Initializing swarm: $SWARM_ID"
111
- echo "[Swarm] Topology: $TOPOLOGY"
112
- echo "[Swarm] Total agents: $MAX_AGENTS"
113
- echo "[Swarm] TTL: $TTL seconds ($(($TTL / 86400)) days)"
114
-
115
- # Create swarm metadata key
116
- METADATA_KEY="swarm:${SWARM_ID}:metadata"
117
-
118
- # Store base metadata
119
- # Extract repository name from PWD
120
- REPO_NAME=$(basename "$(pwd)")
121
-
122
- redis-cli hset "$METADATA_KEY" \
123
- swarm_id "$SWARM_ID" \
124
- task_id "$TASK_ID" \
125
- topology "$TOPOLOGY" \
126
- max_agents "$MAX_AGENTS" \
127
- agents "$AGENTS" \
128
- created_at "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
129
- status "in_progress" \
130
- repository "$REPO_NAME" \
131
- cwd "$(pwd)" > /dev/null
132
-
133
- # Add extra metadata if provided (JSON string)
134
- if [ -n "$METADATA_EXTRA" ]; then
135
- # Parse JSON and add each key-value pair
136
- echo "$METADATA_EXTRA" | jq -r 'to_entries | .[] | "\(.key) \(.value)"' | while read -r KEY VALUE; do
137
- redis-cli hset "$METADATA_KEY" "$KEY" "$VALUE" > /dev/null
138
- done
139
- fi
140
-
141
- # Set TTL
142
- redis-cli expire "$METADATA_KEY" "$TTL" > /dev/null
143
-
144
- echo "[Swarm] Registered in Redis: $METADATA_KEY"
145
- echo "[Swarm] ✅ Initialization complete"
146
-
147
- # Output swarm ID for chaining
148
- echo "$SWARM_ID"