claude-flow-novice 2.14.3 → 2.14.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/.claude/commands/CFN_LOOP_TASK_MODE.md +4 -47
  2. package/.claude/skills/cfn-redis-coordination/demos/test-cancel-swarm.sh +0 -276
  3. package/claude-assets/commands/CFN_LOOP_TASK_MODE.md +4 -47
  4. package/claude-assets/skills/cfn-redis-coordination/demos/test-cancel-swarm.sh +0 -276
  5. package/dist/agents/agent-loader.js +165 -146
  6. package/dist/agents/agent-loader.js.map +1 -1
  7. package/dist/cli/agent-prompt-builder.js +25 -0
  8. package/dist/cli/agent-prompt-builder.js.map +1 -1
  9. package/dist/cli/config-manager.js +91 -109
  10. package/package.json +1 -1
  11. package/.claude/skills/cfn-redis-coordination/HEARTBEAT.md +0 -57
  12. package/.claude/skills/cfn-redis-coordination/HEARTBEAT_MONITORING.md +0 -267
  13. package/.claude/skills/cfn-redis-coordination/LOGGING.md +0 -260
  14. package/.claude/skills/cfn-redis-coordination/README.md +0 -65
  15. package/.claude/skills/cfn-redis-coordination/SECURITY_REVIEW.md +0 -25
  16. package/.claude/skills/cfn-redis-coordination/SHUTDOWN_HANDLING.md +0 -164
  17. package/.claude/skills/cfn-redis-coordination/SKILL.md +0 -720
  18. package/.claude/skills/cfn-redis-coordination/demos/test-dlq.sh +0 -129
  19. package/.claude/skills/cfn-redis-coordination/demos/test-iteration-feedback.sh +0 -320
  20. package/.claude/skills/cfn-redis-coordination/demos/test-orchestrator.sh +0 -249
  21. package/.claude/skills/cfn-redis-coordination/demos/test-priority-wake-phase4-unix.sh +0 -148
  22. package/.claude/skills/cfn-redis-coordination/demos/test-priority-wake-phase4.sh +0 -163
  23. package/.claude/skills/cfn-redis-coordination/demos/test-priority-wake.sh +0 -138
  24. package/.claude/skills/cfn-redis-coordination/demos/test-quick-fix.sh +0 -81
  25. package/.claude/skills/cfn-redis-coordination/demos/test-quorum-absolute.sh +0 -45
  26. package/.claude/skills/cfn-redis-coordination/demos/test-quorum-fallback.sh +0 -68
  27. package/.claude/skills/cfn-redis-coordination/demos/test-quorum-percentage.sh +0 -56
  28. package/.claude/skills/cfn-redis-coordination/demos/test-quorum-with-retry.sh +0 -81
  29. package/.claude/skills/cfn-redis-coordination/demos/test-quorum.sh +0 -57
  30. package/.claude/skills/cfn-redis-coordination/demos/test-shutdown-handling.sh +0 -187
  31. package/.claude/skills/cfn-redis-coordination/demos/test-shutdown.sh +0 -160
  32. package/.claude/skills/cfn-redis-coordination/demos/test-utils-unix.sh +0 -97
  33. package/.claude/skills/cfn-redis-coordination/demos/test-utils.sh +0 -97
  34. package/.claude/skills/cfn-redis-coordination/demos/test-waiting-mode.sh +0 -59
  35. package/.claude/skills/cfn-redis-coordination/examples/README.md +0 -73
  36. package/.claude/skills/cfn-redis-coordination/examples/grafana-dashboard.json +0 -352
  37. package/.claude/skills/cfn-redis-coordination/examples/hierarchical-pattern.sh +0 -127
  38. package/.claude/skills/cfn-redis-coordination/examples/mesh-pattern.sh +0 -171
  39. package/.claude/skills/cfn-redis-coordination/examples/timeout-handling.sh +0 -227
  40. package/.claude/skills/cfn-redis-coordination/examples/waiting-mode-pattern.sh +0 -239
  41. package/.claude/skills/cfn-redis-coordination/execute-product-owner-decision.sh +0 -258
  42. package/.claude/skills/cfn-redis-coordination/get-agent-timeout.sh +0 -177
  43. package/.claude/skills/cfn-redis-coordination/heartbeat-functions.sh +0 -137
  44. package/.claude/skills/cfn-redis-coordination/heartbeat-protocol.md +0 -106
  45. package/.claude/skills/cfn-redis-coordination/heartbeat.sh +0 -126
  46. package/.claude/skills/cfn-redis-coordination/init-swarm.sh +0 -148
  47. package/.claude/skills/cfn-redis-coordination/invoke-redis-pattern.sh +0 -220
  48. package/.claude/skills/cfn-redis-coordination/invoke-waiting-mode.sh +0 -283
  49. package/.claude/skills/cfn-redis-coordination/list-active-swarms.sh +0 -147
  50. package/.claude/skills/cfn-redis-coordination/log-event.sh +0 -109
  51. package/.claude/skills/cfn-redis-coordination/metrics-export.sh +0 -674
  52. package/.claude/skills/cfn-redis-coordination/metrics-schema.json +0 -66
  53. package/.claude/skills/cfn-redis-coordination/metrics-storage.md +0 -31
  54. package/.claude/skills/cfn-redis-coordination/monitor-cfn-violations.sh +0 -391
  55. package/.claude/skills/cfn-redis-coordination/monitor-heartbeats.sh +0 -101
  56. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop-v3.sh +0 -141
  57. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh +0 -31
  58. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.bak +0 -0
  59. package/.claude/skills/cfn-redis-coordination/priority-wake-mechanism.md +0 -75
  60. package/.claude/skills/cfn-redis-coordination/priority_wake.py +0 -134
  61. package/.claude/skills/cfn-redis-coordination/query-dlq.sh +0 -162
  62. package/.claude/skills/cfn-redis-coordination/query-logs.sh +0 -103
  63. package/.claude/skills/cfn-redis-coordination/redis-pattern.sh +0 -619
  64. package/.claude/skills/cfn-redis-coordination/retrieve-context.sh +0 -58
  65. package/.claude/skills/cfn-redis-coordination/select-specialist-agent.sh +0 -371
  66. package/.claude/skills/cfn-redis-coordination/semantic-match-tfidf.py +0 -252
  67. package/.claude/skills/cfn-redis-coordination/send-heartbeat.sh +0 -165
  68. package/.claude/skills/cfn-redis-coordination/signal.sh +0 -38
  69. package/.claude/skills/cfn-redis-coordination/store-context.sh +0 -86
  70. package/.claude/skills/cfn-redis-coordination/store-epic-context.sh +0 -123
  71. package/.claude/skills/cfn-redis-coordination/test-context-injection.sh +0 -354
  72. package/.claude/skills/cfn-redis-coordination/test-timeout-enforcement.sh +0 -513
  73. package/.claude/skills/cfn-redis-coordination/tests/convert-line-endings.sh +0 -15
  74. package/.claude/skills/cfn-redis-coordination/tests/dlq-functionality-test.sh +0 -102
  75. package/.claude/skills/cfn-redis-coordination/tests/edge-cases-test.sh +0 -99
  76. package/.claude/skills/cfn-redis-coordination/tests/integration-test.sh +0 -170
  77. package/.claude/skills/cfn-redis-coordination/tests/retry-mechanism-test.sh +0 -82
  78. package/.claude/skills/cfn-redis-coordination/tests/run-test-suite.sh +0 -92
  79. package/.claude/skills/cfn-redis-coordination/tests/run-tests.sh +0 -4
  80. package/.claude/skills/cfn-redis-coordination/tests/test-heartbeat-monitoring.sh +0 -418
  81. package/.claude/skills/cfn-redis-coordination/tests/test-heartbeat-simple.sh +0 -124
  82. package/.claude/skills/cfn-redis-coordination/tests/test-primitives.sh +0 -166
  83. package/.claude/skills/cfn-redis-coordination/tests/test-utils.sh +0 -54
  84. package/.claude/skills/cfn-redis-coordination/tests/test_utils.sh +0 -49
  85. package/.claude/skills/cfn-redis-coordination/v2_modularization/core_orchestration.sh +0 -76
  86. package/.claude/skills/cfn-redis-coordination/validate-parameters.sh +0 -492
  87. package/claude-assets/skills/cfn-redis-coordination/HEARTBEAT.md +0 -57
  88. package/claude-assets/skills/cfn-redis-coordination/HEARTBEAT_MONITORING.md +0 -267
  89. package/claude-assets/skills/cfn-redis-coordination/LOGGING.md +0 -260
  90. package/claude-assets/skills/cfn-redis-coordination/README.md +0 -65
  91. package/claude-assets/skills/cfn-redis-coordination/SECURITY_REVIEW.md +0 -25
  92. package/claude-assets/skills/cfn-redis-coordination/SHUTDOWN_HANDLING.md +0 -164
  93. package/claude-assets/skills/cfn-redis-coordination/SKILL.md +0 -720
  94. package/claude-assets/skills/cfn-redis-coordination/demos/test-dlq.sh +0 -129
  95. package/claude-assets/skills/cfn-redis-coordination/demos/test-iteration-feedback.sh +0 -320
  96. package/claude-assets/skills/cfn-redis-coordination/demos/test-orchestrator.sh +0 -249
  97. package/claude-assets/skills/cfn-redis-coordination/demos/test-priority-wake-phase4-unix.sh +0 -148
  98. package/claude-assets/skills/cfn-redis-coordination/demos/test-priority-wake-phase4.sh +0 -163
  99. package/claude-assets/skills/cfn-redis-coordination/demos/test-priority-wake.sh +0 -138
  100. package/claude-assets/skills/cfn-redis-coordination/demos/test-quick-fix.sh +0 -81
  101. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-absolute.sh +0 -45
  102. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-fallback.sh +0 -68
  103. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-percentage.sh +0 -56
  104. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-with-retry.sh +0 -81
  105. package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum.sh +0 -57
  106. package/claude-assets/skills/cfn-redis-coordination/demos/test-shutdown-handling.sh +0 -187
  107. package/claude-assets/skills/cfn-redis-coordination/demos/test-shutdown.sh +0 -160
  108. package/claude-assets/skills/cfn-redis-coordination/demos/test-utils-unix.sh +0 -97
  109. package/claude-assets/skills/cfn-redis-coordination/demos/test-utils.sh +0 -97
  110. package/claude-assets/skills/cfn-redis-coordination/demos/test-waiting-mode.sh +0 -59
  111. package/claude-assets/skills/cfn-redis-coordination/examples/README.md +0 -73
  112. package/claude-assets/skills/cfn-redis-coordination/examples/grafana-dashboard.json +0 -352
  113. package/claude-assets/skills/cfn-redis-coordination/examples/hierarchical-pattern.sh +0 -127
  114. package/claude-assets/skills/cfn-redis-coordination/examples/mesh-pattern.sh +0 -171
  115. package/claude-assets/skills/cfn-redis-coordination/examples/timeout-handling.sh +0 -227
  116. package/claude-assets/skills/cfn-redis-coordination/examples/waiting-mode-pattern.sh +0 -239
  117. package/claude-assets/skills/cfn-redis-coordination/execute-product-owner-decision.sh +0 -258
  118. package/claude-assets/skills/cfn-redis-coordination/get-agent-timeout.sh +0 -177
  119. package/claude-assets/skills/cfn-redis-coordination/heartbeat-functions.sh +0 -137
  120. package/claude-assets/skills/cfn-redis-coordination/heartbeat-protocol.md +0 -106
  121. package/claude-assets/skills/cfn-redis-coordination/heartbeat.sh +0 -126
  122. package/claude-assets/skills/cfn-redis-coordination/init-swarm.sh +0 -148
  123. package/claude-assets/skills/cfn-redis-coordination/invoke-redis-pattern.sh +0 -220
  124. package/claude-assets/skills/cfn-redis-coordination/invoke-waiting-mode.sh +0 -283
  125. package/claude-assets/skills/cfn-redis-coordination/list-active-swarms.sh +0 -147
  126. package/claude-assets/skills/cfn-redis-coordination/log-event.sh +0 -109
  127. package/claude-assets/skills/cfn-redis-coordination/metrics-export.sh +0 -674
  128. package/claude-assets/skills/cfn-redis-coordination/metrics-schema.json +0 -66
  129. package/claude-assets/skills/cfn-redis-coordination/metrics-storage.md +0 -31
  130. package/claude-assets/skills/cfn-redis-coordination/monitor-cfn-violations.sh +0 -391
  131. package/claude-assets/skills/cfn-redis-coordination/monitor-heartbeats.sh +0 -101
  132. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop-v3.sh +0 -141
  133. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh +0 -31
  134. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.bak +0 -0
  135. package/claude-assets/skills/cfn-redis-coordination/priority-wake-mechanism.md +0 -75
  136. package/claude-assets/skills/cfn-redis-coordination/priority_wake.py +0 -134
  137. package/claude-assets/skills/cfn-redis-coordination/query-dlq.sh +0 -162
  138. package/claude-assets/skills/cfn-redis-coordination/query-logs.sh +0 -103
  139. package/claude-assets/skills/cfn-redis-coordination/redis-pattern.sh +0 -619
  140. package/claude-assets/skills/cfn-redis-coordination/retrieve-context.sh +0 -58
  141. package/claude-assets/skills/cfn-redis-coordination/select-specialist-agent.sh +0 -371
  142. package/claude-assets/skills/cfn-redis-coordination/semantic-match-tfidf.py +0 -252
  143. package/claude-assets/skills/cfn-redis-coordination/send-heartbeat.sh +0 -165
  144. package/claude-assets/skills/cfn-redis-coordination/signal.sh +0 -38
  145. package/claude-assets/skills/cfn-redis-coordination/store-context.sh +0 -86
  146. package/claude-assets/skills/cfn-redis-coordination/store-epic-context.sh +0 -123
  147. package/claude-assets/skills/cfn-redis-coordination/test-context-injection.sh +0 -354
  148. package/claude-assets/skills/cfn-redis-coordination/test-timeout-enforcement.sh +0 -513
  149. package/claude-assets/skills/cfn-redis-coordination/tests/convert-line-endings.sh +0 -15
  150. package/claude-assets/skills/cfn-redis-coordination/tests/dlq-functionality-test.sh +0 -102
  151. package/claude-assets/skills/cfn-redis-coordination/tests/edge-cases-test.sh +0 -99
  152. package/claude-assets/skills/cfn-redis-coordination/tests/integration-test.sh +0 -170
  153. package/claude-assets/skills/cfn-redis-coordination/tests/retry-mechanism-test.sh +0 -82
  154. package/claude-assets/skills/cfn-redis-coordination/tests/run-test-suite.sh +0 -92
  155. package/claude-assets/skills/cfn-redis-coordination/tests/run-tests.sh +0 -4
  156. package/claude-assets/skills/cfn-redis-coordination/tests/test-heartbeat-monitoring.sh +0 -418
  157. package/claude-assets/skills/cfn-redis-coordination/tests/test-heartbeat-simple.sh +0 -124
  158. package/claude-assets/skills/cfn-redis-coordination/tests/test-primitives.sh +0 -166
  159. package/claude-assets/skills/cfn-redis-coordination/tests/test-utils.sh +0 -54
  160. package/claude-assets/skills/cfn-redis-coordination/tests/test_utils.sh +0 -49
  161. package/claude-assets/skills/cfn-redis-coordination/v2_modularization/core_orchestration.sh +0 -76
  162. package/claude-assets/skills/cfn-redis-coordination/validate-parameters.sh +0 -492
@@ -1,137 +0,0 @@
#!/usr/bin/env bash

##############################################################################
# Heartbeat Monitoring Functions Library
# Extracted from orchestrate-cfn-loop.sh for standalone testing
##############################################################################

##############################################################################
# Heartbeat Monitoring Functions
##############################################################################

# Missed-heartbeat count per agent, keyed by agent name. Written by
# check_heartbeats_loop. Associative arrays require bash 4+.
declare -A MISSED_HEARTBEATS

#######################################
# Report whether an agent currently has a heartbeat value stored in Redis.
# Arguments: $1 - agent name
#            $2 - task id
# Returns:   0 if a non-empty heartbeat value exists (alive),
#            1 if the key is missing/empty or redis-cli failed (dead)
#######################################
function check_agent_heartbeat() {
  local agent="$1"
  local task_id="$2"
  # Locals rather than globals so that sourcing this library does not clobber
  # same-named variables in the caller.
  local hb_key="swarm:${task_id}:${agent}:heartbeat"
  local hb_data

  # A failing redis-cli is deliberately treated the same as "no heartbeat".
  hb_data=$(redis-cli GET "$hb_key" 2>/dev/null) || hb_data=""

  # "(nil)" is what redis-cli prints for a missing key in some output modes.
  if [[ -z "$hb_data" || "$hb_data" == "(nil)" ]]; then
    return 1 # Dead
  fi
  return 0 # Alive
}
26
-
#######################################
# Translate a quorum specification into the number of agents required.
#
# Accepted forms of the specification:
#   ""            - no quorum given: every agent is required
#   "85%", "7.5%" - percentage of total agents, rounded UP
#   "0.7", ".7"   - fraction of total agents (up to "1.0"), rounded UP
#   "3"           - absolute agent count, must not exceed the total
#
# Rounding up (ceil) means a non-zero percentage/fraction never yields a
# quorum of 0. Only integer shell arithmetic is used, so `bc` is not needed.
# Extremely long fractional parts can overflow 64-bit arithmetic.
#
# Arguments: $1 - quorum specification
#            $2 - total number of agents
# Outputs:   required agent count on stdout; error text on stderr
# Returns:   0 on success, 1 on an invalid or unsatisfiable specification
#######################################
function calculate_quorum() {
  local quorum_spec="$1"
  local total_agents="$2"

  # If no quorum specified, require all agents
  if [[ -z "$quorum_spec" ]]; then
    echo "$total_agents"
    return 0
  fi

  if [[ ! "$total_agents" =~ ^[0-9]+$ ]]; then
    echo "Error: Total agents must be a non-negative integer (got '$total_agents')" >&2
    return 1
  fi

  local decimal=""
  local scale=1
  if [[ "$quorum_spec" =~ ^([0-9]+(\.[0-9]+)?|\.[0-9]+)%$ ]]; then
    # Percentage format (e.g., "85%"): strip the suffix, divide by 100 later
    decimal="${quorum_spec%\%}"
    scale=100
  elif [[ "$quorum_spec" =~ ^(0?\.[0-9]+|1\.0+)$ ]]; then
    # Decimal in the range 0.0-1.0, treated as a fraction of the total
    decimal="$quorum_spec"
  elif [[ "$quorum_spec" =~ ^[0-9]+$ ]]; then
    # Absolute number - validate it doesn't exceed total.
    # 10# forces base 10 so values such as "08" are not parsed as octal.
    if ((10#$quorum_spec > 10#$total_agents)); then
      echo "Error: Quorum ($quorum_spec) exceeds total agents ($total_agents)" >&2
      return 1
    fi
    echo "$quorum_spec"
    return 0
  else
    echo "Error: Invalid quorum specification '$quorum_spec'" >&2
    return 1
  fi

  # Represent the decimal as numerator/denominator: "7.5" -> 75/10.
  local int_part="${decimal%%.*}"
  local frac_part=""
  if [[ "$decimal" == *.* ]]; then
    frac_part="${decimal#*.}"
  fi
  local numerator=$((10#${int_part:-0}${frac_part}))
  local denominator=$((scale * 10 ** ${#frac_part}))

  # Integer ceil(total * numerator / denominator)
  local required=$(((10#$total_agents * numerator + denominator - 1) / denominator))

  # A percentage above 100% cannot be satisfied; mirror the absolute-count check.
  if ((required > 10#$total_agents)); then
    echo "Error: Quorum ($quorum_spec) exceeds total agents ($total_agents)" >&2
    return 1
  fi

  echo "$required"
}
56
-
#######################################
# Run one heartbeat sweep over a list of agents.
#
# Each agent without a heartbeat gets its MISSED_HEARTBEATS counter bumped;
# an agent with a heartbeat gets the counter reset to 0. From the second
# miss onward a warning is written to stderr and, if the agent belongs to
# loop 3 or loop 2, the completed-agent count of that loop is compared with
# the loop's required quorum.
#
# Globals:   MISSED_HEARTBEATS (read/write, associative array)
#            LOOP3_FAILED_AGENTS, LOOP2_FAILED_AGENTS (read, arrays)
#            LOOP3_AGENTS, LOOP2_AGENTS (read, space-separated strings)
#            LOOP3_COMPLETED_AGENTS, LOOP2_COMPLETED_AGENTS (read, arrays)
#            MIN_QUORUM_LOOP3, MIN_QUORUM_LOOP2, LOOP3_TOTAL, LOOP2_TOTAL (read)
# Arguments: $1 - task id
#            $2 - loop name (used only as a log label)
#            $3... - agent names to check
# Outputs:   diagnostics on stderr
#######################################
function check_heartbeats_loop() {
  local task_id="$1"
  local loop_name="$2"
  shift 2
  local agents=("$@")
  # Loop state is local so a sweep does not clobber caller variables.
  local agent missed timestamp completed required

  for agent in "${agents[@]}"; do
    # Skip agents already marked as failed
    if [[ " ${LOOP3_FAILED_AGENTS[@]:-} ${LOOP2_FAILED_AGENTS[@]:-} " == *" ${agent} "* ]]; then
      continue
    fi

    if check_agent_heartbeat "$agent" "$task_id"; then
      MISSED_HEARTBEATS["$agent"]=0 # Reset counter
      continue
    fi

    missed=$((${MISSED_HEARTBEATS["$agent"]:-0} + 1))
    MISSED_HEARTBEATS["$agent"]=$missed
    if ((missed < 2)); then
      continue
    fi

    timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
    echo " [$timestamp] [$loop_name] ⚠️ $AGENT_LABEL_PREFIX$agent appears hung (no heartbeat for 60s)" >&2

    # Determine which loop this agent belongs to and check quorum.
    # The loop-2 test must look for the agent itself; comparing LOOP2_AGENTS
    # with itself would classify every unknown agent as a loop-2 member.
    if [[ " ${LOOP3_AGENTS:-} " == *" ${agent} "* ]]; then
      completed=${#LOOP3_COMPLETED_AGENTS[@]}
      required=$(calculate_quorum "${MIN_QUORUM_LOOP3:-}" "${LOOP3_TOTAL:-}") || required=""
    elif [[ " ${LOOP2_AGENTS:-} " == *" ${agent} "* ]]; then
      completed=${#LOOP2_COMPLETED_AGENTS[@]}
      required=$(calculate_quorum "${MIN_QUORUM_LOOP2:-}" "${LOOP2_TOTAL:-}") || required=""
    else
      continue
    fi

    # Without a numeric requirement the comparison below would be meaningless.
    if [[ ! "$required" =~ ^[0-9]+$ ]]; then
      echo " [$timestamp] [$loop_name] ⚠️ Unable to determine quorum for $agent" >&2
      continue
    fi

    # "completed" is the number of agents that already finished in this loop.
    if ((completed >= required)); then
      echo " [$timestamp] [$loop_name] ℹ️ Continuing with quorum (${completed}/${required} agents)" >&2
    else
      echo " [$timestamp] [$loop_name] ⚠️ Cannot meet quorum without $agent (${completed}/${required})" >&2
    fi
  done
}
# Optional text inserted before the agent name in the "appears hung" warning;
# empty by default so the message matches the historical format exactly.
: "${AGENT_LABEL_PREFIX:=}"
98
-
#######################################
# Start a background loop that sweeps agent heartbeats every 30 seconds.
#
# The loop keeps running while its marker file exists; removing
# /tmp/heartbeat-monitor-<task_id>-<loop_name>.active stops it after the
# current sleep finishes.
#
# Globals:   SHUTDOWN_REQUESTED (read)
# Arguments: $1 - task id
#            $2 - loop name
#            $3... - agent names to monitor
# Outputs:   PID of the background monitor on stdout
# Returns:   1 if the marker file could not be created
#######################################
function start_heartbeat_monitor() {
  local task_id="$1"
  local loop_name="$2"
  shift 2
  local agents=("$@")

  # Create marker file for this monitor
  local monitor_marker="/tmp/heartbeat-monitor-${task_id}-${loop_name}.active"
  touch "$monitor_marker" || return 1

  (
    while [ -f "$monitor_marker" ]; do
      # Check for shutdown.
      # NOTE(review): this subshell holds its own copy of SHUTDOWN_REQUESTED
      # taken when it was forked, so a later change in the parent shell is
      # presumably not observed here; the marker file is the stop signal.
      if [ "${SHUTDOWN_REQUESTED:-0}" -eq 1 ]; then
        break
      fi

      check_heartbeats_loop "$task_id" "$loop_name" "${agents[@]}"
      sleep 30
    done
  ) >/dev/null &
  # stdout of the loop is detached on purpose: otherwise a caller using
  # pid=$(start_heartbeat_monitor ...) would block until the monitor exits,
  # because the background job would keep the substitution's pipe open.

  echo "$!" # Return PID
}
123
-
#######################################
# Stop a heartbeat monitor started by start_heartbeat_monitor.
#
# Arguments: $1 - task id
#            $2 - loop name
#            $3 - PID of the monitor (optional)
# Returns:   0
#######################################
function stop_heartbeat_monitor() {
  local task_id="$1"
  local loop_name="$2"
  local monitor_pid="${3:-}"

  # Remove marker file to stop the monitor loop
  rm -f -- "/tmp/heartbeat-monitor-${task_id}-${loop_name}.active"

  # Only a strictly positive integer is a usable PID. In particular "0"
  # would make `kill` signal the caller's entire process group.
  if [[ ! "$monitor_pid" =~ ^[1-9][0-9]*$ ]]; then
    return 0
  fi

  # Kill monitor process if still running
  if kill -0 "$monitor_pid" 2>/dev/null; then
    kill "$monitor_pid" 2>/dev/null || true
    # wait fails harmlessly when the monitor is not a child of this shell.
    wait "$monitor_pid" 2>/dev/null || true
  fi
}
@@ -1,106 +0,0 @@
1
- # Heartbeat Protocol Specification
2
-
3
- ## Overview
4
- The heartbeat protocol provides a mechanism for detecting hung or unresponsive agents in distributed agent swarms using Redis as a coordination mechanism.
5
-
6
- ## Key Design Components
7
-
8
- ### 1. Heartbeat Key Pattern
9
- ```
10
- swarm:{task_id}:{agent_id}:heartbeat
11
- ```
12
-
13
- ### 2. Heartbeat Message Structure
14
- ```json
15
- {
16
- "timestamp": 1760898665,
17
- "status": "working|idle|error",
18
- "iteration": 2,
19
- "progress": 0.75,
20
- "agent_details": {
21
- "agent_id": "architect-5",
22
- "task_id": "redis-phase5-1760898665",
23
- "environment": {
24
- "cpu_usage": 0.65,
25
- "memory_usage": 0.42,
26
- "system_load": 0.3
27
- }
28
- }
29
- }
30
- ```
31
-
32
- ### 3. Heartbeat Configuration
33
- - **Update Frequency**: Every 30 seconds
34
- - **Default TTL**: 60 seconds
35
- - **Miss Threshold**: 2 consecutive missed heartbeats
36
- - **Quorum Threshold**: 70% of agents must be responsive
37
-
38
- ### 4. Heartbeat Workflow
39
- 1. Each agent refreshes its heartbeat key every 30 seconds via Redis `SET ... EX 60`, so the key expires on its own if the agent stops reporting
40
- 2. Orchestrator monitors heartbeats in background process
41
- 3. On missed heartbeats, trigger progressive recovery mechanisms
42
-
43
- ### 5. Recovery Stages
44
- - **Stage 1 (Miss 1)**: Log warning, continue monitoring
45
- - **Stage 2 (Miss 2)**:
46
- - Check if remaining agents meet quorum
47
- - Log to Dead Letter Queue (DLQ)
48
- - Attempt soft restart of agent
49
- - **Stage 3 (Miss 3)**:
50
- - Hard restart agent
51
- - Potentially replace with standby agent
52
-
53
- ### 6. Implementation Pseudo-code
54
- ```bash
55
- # Send Heartbeat
56
- redis-cli set "swarm:${TASK_ID}:${AGENT_ID}:heartbeat" \
57
- "$(generate_heartbeat_payload)" \
58
- EX 60 # 60-second expiry
59
-
60
- # Check Heartbeats
61
- check_agent_heartbeats() {
62
- for agent in ${AGENTS[@]}; do
63
- heartbeat=$(redis-cli get "swarm:${TASK_ID}:${agent}:heartbeat")
64
- if [[ -z "$heartbeat" ]]; then
65
- handle_missed_heartbeat "$agent"
66
- fi
67
- done
68
- }
69
-
70
- handle_missed_heartbeat() {
71
- local agent="$1"
72
- local miss_count=$(get_miss_count "$agent")
73
-
74
- case "$miss_count" in
75
- 1) log_warning "$agent missed first heartbeat" ;;
76
- 2)
77
- log_dlq "$agent"
78
- attempt_soft_restart "$agent"
79
- check_quorum
80
- ;;
81
- 3)
82
- hard_restart_agent "$agent"
83
- ;;
84
- esac
85
- }
86
- ```
87
-
88
- ### 7. Monitoring and Logging
89
- - Comprehensive logging to `/var/log/claude-flow/heartbeat.log`
90
- - Prometheus metrics for heartbeat health
91
- - Grafana dashboard tracking agent responsiveness
92
-
93
- ### 8. Security Considerations
94
- - Cryptographically sign heartbeat messages
95
- - Rate limit heartbeat submissions
96
- - Validate heartbeat payload schema
97
-
98
- ## Integration Points
99
- - Redis Coordination Skill
100
- - CFN Loop Validation
101
- - Agent Spawning Mechanism
102
-
103
- ## Test Coverage
104
- - Unit tests for heartbeat generation
105
- - Integration tests for recovery mechanisms
106
- - Chaos testing (intentional agent hanging)
@@ -1,126 +0,0 @@
#!/bin/bash

# Heartbeat Monitoring Script for Agent Coordination
# Implements 60s TTL, 30s check interval, and quorum fallback detection

# Dependencies
# `command -v` is the shell builtin for locating a command; `which` is an
# external tool that is not guaranteed to exist.
REDIS_CLI=$(command -v redis-cli)
if [ -z "$REDIS_CLI" ]; then
  echo "Error: redis-cli not found. Please install Redis client." >&2
  exit 1
fi

# Logging configuration
# HEARTBEAT_LOG_FILE may be exported to redirect the debug log; the default
# stays at the historical shared path.
LOG_FILE="${HEARTBEAT_LOG_FILE:-/tmp/heartbeat-debug.log}"
touch "$LOG_FILE"

#######################################
# Append a timestamped line to the debug log.
# Globals:   LOG_FILE (read)
# Arguments: $@ - message text
#######################################
log_debug() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >>"$LOG_FILE"
}

# Configuration
HEARTBEAT_TTL=60   # 60 seconds TTL
CHECK_INTERVAL=30  # 30 seconds between checks
MISSED_THRESHOLD=2 # Number of missed heartbeats before considering agent hung
BUFFER_TIME=3      # Additional buffer time for timing flexibility
26
-
#######################################
# Publish a heartbeat for one agent by writing an expiring Redis key.
#
# NOTE(review): the key written here is swarm:agent_status:<task>:<agent>,
# whereas heartbeat-functions.sh reads swarm:<task>:<agent>:heartbeat.
# Confirm which consumers are expected to see these heartbeats.
#
# Globals:   REDIS_CLI (read), HEARTBEAT_TTL (read)
# Arguments: $1 - task id
#            $2 - agent id
# Outputs:   the redis-cli reply and a confirmation line on stdout;
#            usage and error text on stderr
# Returns:   0 on success, 1 if the Redis write failed.
#            Exits 1 when an argument is missing.
#######################################
send_heartbeat() {
  local task_id="$1"
  local agent_id="$2"

  if [[ -z "$task_id" || -z "$agent_id" ]]; then
    echo "Usage: $0 send --task-id TASK_ID --agent-id AGENT_ID" >&2
    exit 1
  fi

  log_debug "Sending heartbeat for task=$task_id, agent=$agent_id"

  # Use SETEX to create a key with expiration. Only report success when the
  # write actually went through.
  if ! "$REDIS_CLI" SETEX "swarm:agent_status:${task_id}:${agent_id}" "$HEARTBEAT_TTL" "alive"; then
    log_debug "Heartbeat FAILED for task=$task_id, agent=$agent_id"
    echo "Error: failed to send heartbeat for agent ${agent_id} in task ${task_id}" >&2
    return 1
  fi

  echo "Heartbeat sent for agent ${agent_id} in task ${task_id}"
}
42
-
#######################################
# Check whether an agent's heartbeat key is present and unexpired.
#
# A dead result bumps the agent's missed-heartbeat counter; an alive result
# clears it, so the counter only reflects consecutive misses.
#
# Globals:   REDIS_CLI (read)
# Arguments: $1 - task id
#            $2 - agent id
# Outputs:   "alive" or "dead" on stdout (followed by any output of
#            increment_missed_counter in the dead case); usage on stderr
# Returns:   0 if alive, 1 if dead. Exits 1 when an argument is missing.
#######################################
check_heartbeat() {
  local task_id="$1"
  local agent_id="$2"

  if [[ -z "$task_id" || -z "$agent_id" ]]; then
    echo "Usage: $0 check --task-id TASK_ID --agent-id AGENT_ID" >&2
    exit 1
  fi

  local key="swarm:agent_status:${task_id}:${agent_id}"
  local status
  local ttl

  # Retrieve status and TTL; a failing redis-cli counts as "no data".
  status=$("$REDIS_CLI" GET "$key") || status=""
  ttl=$("$REDIS_CLI" TTL "$key") || ttl=""

  log_debug "Checking heartbeat for task=$task_id, agent=$agent_id: status=$status, ttl=$ttl"

  # Dead when the value is missing, the TTL reply is not an integer (e.g.
  # redis-cli failed), or the TTL is not positive. Redis answers TTL with -2
  # for a missing key and -1 for a key without expiry.
  if [[ -z "$status" || ! "$ttl" =~ ^-?[0-9]+$ ]] || ((ttl <= 0)); then
    log_debug "Heartbeat DEAD: status missing or expired"
    echo "dead"
    increment_missed_counter "$task_id" "$agent_id"
    return 1
  fi

  log_debug "Heartbeat ALIVE: status=$status, ttl=$ttl"
  # Best effort: forget earlier misses so that only consecutive misses can
  # reach the threshold. The reply is discarded to keep stdout at "alive".
  "$REDIS_CLI" DEL "swarm:missed_heartbeats:${task_id}:${agent_id}" >/dev/null 2>&1 || true
  echo "alive"
  return 0
}
74
-
#######################################
# Record one missed heartbeat for an agent and trigger the quorum fallback
# once the count reaches MISSED_THRESHOLD.
#
# Globals:   REDIS_CLI, HEARTBEAT_TTL, MISSED_THRESHOLD (read)
# Arguments: $1 - task id
#            $2 - agent id
# Outputs:   a WARN line on stdout when the fallback is triggered;
#            error text on stderr
# Returns:   0 on success, 1 if the counter could not be incremented
#######################################
increment_missed_counter() {
  local task_id="$1"
  local agent_id="$2"
  local missed_key="swarm:missed_heartbeats:${task_id}:${agent_id}"
  local missed_count

  # Increment missed heartbeat counter. Declared separately so a redis-cli
  # failure is not hidden behind the exit status of `local`.
  if ! missed_count=$("$REDIS_CLI" INCR "$missed_key") ||
    [[ ! "$missed_count" =~ ^[0-9]+$ ]]; then
    log_debug "Missed heartbeat counter update FAILED for task=$task_id, agent=$agent_id"
    echo "Error: could not update missed heartbeat counter for agent $agent_id" >&2
    return 1
  fi

  # Set expiry for missed counter to match heartbeat TTL. The reply is
  # discarded so it does not mix with the caller's status output.
  "$REDIS_CLI" EXPIRE "$missed_key" "$HEARTBEAT_TTL" >/dev/null

  log_debug "Missed heartbeat counter for task=$task_id, agent=$agent_id: count=$missed_count"

  if ((missed_count >= MISSED_THRESHOLD)); then
    # Trigger quorum fallback mechanism.
    # NOTE(review): this pushes again on every miss past the threshold, so
    # the list can contain the same agent several times.
    "$REDIS_CLI" LPUSH "swarm:${task_id}:quorum_fallback" "$agent_id" >/dev/null
    log_debug "QUORUM FALLBACK: Agent $agent_id missed $missed_count heartbeats"
    echo "WARN: Agent $agent_id missed $missed_count heartbeats. Quorum fallback triggered."
  fi
}
95
-
#######################################
# Parse the --task-id / --agent-id flags shared by both subcommands.
# Globals:   task_id, agent_id (written; reset to empty before parsing)
# Arguments: $@ - the flags that follow the subcommand
# Outputs:   error text on stderr
# Returns:   exits 1 on an unknown flag or a flag without a value
#######################################
parse_id_flags() {
  task_id=""
  agent_id=""

  while [[ "$#" -gt 0 ]]; do
    case "$1" in
      --task-id | --agent-id)
        if [[ "$#" -lt 2 ]]; then
          echo "Missing value for parameter: $1" >&2
          exit 1
        fi
        if [[ "$1" == "--task-id" ]]; then
          task_id="$2"
        else
          agent_id="$2"
        fi
        shift 2
        ;;
      *)
        echo "Unknown parameter passed: $1" >&2
        exit 1
        ;;
    esac
  done
}

# Parse arguments: the first word selects the subcommand, the remaining
# words are handed to the shared flag parser.
case "${1:-}" in
  send)
    shift
    parse_id_flags "$@"
    send_heartbeat "$task_id" "$agent_id"
    ;;
  check)
    shift
    parse_id_flags "$@"
    check_heartbeat "$task_id" "$agent_id"
    ;;
  *)
    echo "Usage: $0 {send|check} --task-id TASK_ID --agent-id AGENT_ID" >&2
    exit 1
    ;;
esac
@@ -1,148 +0,0 @@
#!/usr/bin/env bash

##############################################################################
# Initialize Swarm - Redis Coordination Primitive
#
# Creates swarm metadata in Redis for coordination tracking across any
# multi-agent workflow (CFN Loop, independent swarms, custom orchestration).
#
# Usage:
#   ./init-swarm.sh --swarm-id <id> \
#                   --agents <agent1,agent2,...> \
#                   [--task-id <id>] \
#                   [--topology <mesh|hierarchical|chain>] \
#                   [--ttl <seconds>] \
#                   [--max-agents <count>] \
#                   [--metadata <json-string>] \
#                   [--agent-timeouts <value>]
#
# Per-Agent Timeout Configuration:
#   Agents can have custom timeout values set via Redis:
#
#   redis-cli SETEX "swarm:<task-id>:<agent-id>:timeout" 86400 <timeout-seconds>
#
#   If no custom timeout is set, the orchestrator will use role-based defaults:
#   - researcher: 7200s (2 hours)
#   - backend-dev, coder, frontend-dev: 3600s (1 hour)
#   - reviewer, tester, security: 1800s (30 minutes)
#   - coordinator, orchestrator, product-owner: 900s (15 minutes)
#   - default: 3600s (1 hour)
#
# Benefits:
#   - Namespace isolation for concurrent swarms
#   - Agent inventory and resource tracking
#   - Status monitoring (in_progress/completed)
#   - Automatic cleanup via TTL
#   - Per-agent timeout configuration via get-agent-timeout.sh
#
# Output:
#   Progress lines on stdout, with the swarm id as the LAST stdout line so
#   the script can be chained. Errors go to stderr with exit status 1.
##############################################################################

set -euo pipefail

# Configuration
SWARM_ID=""
AGENTS=""
TASK_ID=""
TOPOLOGY="mesh"
TTL=604800 # 7 days default
MAX_AGENTS=""
METADATA_EXTRA=""
# NOTE(review): --agent-timeouts is accepted below but nothing in this script
# reads the value afterwards; confirm whether it should be applied here.
AGENT_TIMEOUTS=""

# Print the one-line usage summary to stderr.
usage() {
  echo "Usage: $0 --swarm-id <id> --agents <agent1,agent2,...> [options]" >&2
}

# Abort with a readable message when an option is the last word on the
# command line. Without this, `set -u` would stop the script with a bare
# "unbound variable" error on "$2".
require_value() {
  if [[ $# -lt 2 ]]; then
    echo "Error: $1 requires a value" >&2
    usage
    exit 1
  fi
}

# Parse arguments
while [[ $# -gt 0 ]]; do
  case "$1" in
    --swarm-id)
      require_value "$@"
      SWARM_ID="$2"
      shift 2
      ;;
    --agents)
      require_value "$@"
      AGENTS="$2"
      shift 2
      ;;
    --task-id)
      require_value "$@"
      TASK_ID="$2"
      shift 2
      ;;
    --topology)
      require_value "$@"
      TOPOLOGY="$2"
      shift 2
      ;;
    --ttl)
      require_value "$@"
      TTL="$2"
      shift 2
      ;;
    --max-agents)
      require_value "$@"
      MAX_AGENTS="$2"
      shift 2
      ;;
    --metadata)
      require_value "$@"
      METADATA_EXTRA="$2"
      shift 2
      ;;
    --agent-timeouts)
      require_value "$@"
      AGENT_TIMEOUTS="$2"
      shift 2
      ;;
    *)
      echo "Unknown option: $1" >&2
      usage
      exit 1
      ;;
  esac
done

# Validation
if [[ -z "$SWARM_ID" || -z "$AGENTS" ]]; then
  echo "Error: Required parameters missing" >&2
  echo "Usage: $0 --swarm-id <id> --agents <agent1,agent2,...>" >&2
  exit 1
fi

# TTL is used in shell arithmetic and passed to EXPIRE, so it must be a plain
# positive decimal integer (no sign, no leading zero, no expression).
if [[ ! "$TTL" =~ ^[1-9][0-9]*$ ]]; then
  echo "Error: --ttl must be a positive integer number of seconds (got '$TTL')" >&2
  exit 1
fi

# Calculate max agents if not provided
if [[ -z "$MAX_AGENTS" ]]; then
  IFS=',' read -ra AGENT_ARRAY <<<"$AGENTS"
  MAX_AGENTS=${#AGENT_ARRAY[@]}
fi

# Use swarm-id as task-id if not provided
if [[ -z "$TASK_ID" ]]; then
  TASK_ID="$SWARM_ID"
fi

echo "[Swarm] Initializing swarm: $SWARM_ID"
echo "[Swarm] Topology: $TOPOLOGY"
echo "[Swarm] Total agents: $MAX_AGENTS"
echo "[Swarm] TTL: $TTL seconds ($((TTL / 86400)) days)"

# Create swarm metadata key
METADATA_KEY="swarm:${SWARM_ID}:metadata"

# Store base metadata
# Extract repository name from PWD
REPO_NAME=$(basename "$(pwd)")

redis-cli hset "$METADATA_KEY" \
  swarm_id "$SWARM_ID" \
  task_id "$TASK_ID" \
  topology "$TOPOLOGY" \
  max_agents "$MAX_AGENTS" \
  agents "$AGENTS" \
  created_at "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
  status "in_progress" \
  repository "$REPO_NAME" \
  cwd "$(pwd)" >/dev/null

# Add extra metadata if provided (JSON string)
if [[ -n "$METADATA_EXTRA" ]]; then
  if ! command -v jq >/dev/null; then
    echo "Error: jq is required to process --metadata" >&2
    exit 1
  fi

  # Parse JSON and add each key-value pair. Each output line is
  # "<key> <value>"; the first whitespace run separates the two, so keys
  # containing whitespace are not supported.
  printf '%s\n' "$METADATA_EXTRA" |
    jq -r 'to_entries | .[] | "\(.key) \(.value)"' |
    while read -r KEY VALUE; do
      # stdin is detached so redis-cli can never consume the remaining lines.
      redis-cli hset "$METADATA_KEY" "$KEY" "$VALUE" >/dev/null </dev/null
    done
fi

# Set TTL
redis-cli expire "$METADATA_KEY" "$TTL" >/dev/null

echo "[Swarm] Registered in Redis: $METADATA_KEY"
echo "[Swarm] ✅ Initialization complete"

# Output swarm ID for chaining
echo "$SWARM_ID"