claude-flow-novice 2.14.3 → 2.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/CFN_LOOP_TASK_MODE.md +4 -47
- package/.claude/skills/cfn-redis-coordination/demos/test-cancel-swarm.sh +0 -276
- package/claude-assets/commands/CFN_LOOP_TASK_MODE.md +4 -47
- package/claude-assets/skills/cfn-redis-coordination/demos/test-cancel-swarm.sh +0 -276
- package/dist/agents/agent-loader.js +165 -146
- package/dist/agents/agent-loader.js.map +1 -1
- package/dist/cli/agent-prompt-builder.js +25 -0
- package/dist/cli/agent-prompt-builder.js.map +1 -1
- package/dist/cli/config-manager.js +91 -109
- package/package.json +1 -1
- package/.claude/skills/cfn-redis-coordination/HEARTBEAT.md +0 -57
- package/.claude/skills/cfn-redis-coordination/HEARTBEAT_MONITORING.md +0 -267
- package/.claude/skills/cfn-redis-coordination/LOGGING.md +0 -260
- package/.claude/skills/cfn-redis-coordination/README.md +0 -65
- package/.claude/skills/cfn-redis-coordination/SECURITY_REVIEW.md +0 -25
- package/.claude/skills/cfn-redis-coordination/SHUTDOWN_HANDLING.md +0 -164
- package/.claude/skills/cfn-redis-coordination/SKILL.md +0 -720
- package/.claude/skills/cfn-redis-coordination/demos/test-dlq.sh +0 -129
- package/.claude/skills/cfn-redis-coordination/demos/test-iteration-feedback.sh +0 -320
- package/.claude/skills/cfn-redis-coordination/demos/test-orchestrator.sh +0 -249
- package/.claude/skills/cfn-redis-coordination/demos/test-priority-wake-phase4-unix.sh +0 -148
- package/.claude/skills/cfn-redis-coordination/demos/test-priority-wake-phase4.sh +0 -163
- package/.claude/skills/cfn-redis-coordination/demos/test-priority-wake.sh +0 -138
- package/.claude/skills/cfn-redis-coordination/demos/test-quick-fix.sh +0 -81
- package/.claude/skills/cfn-redis-coordination/demos/test-quorum-absolute.sh +0 -45
- package/.claude/skills/cfn-redis-coordination/demos/test-quorum-fallback.sh +0 -68
- package/.claude/skills/cfn-redis-coordination/demos/test-quorum-percentage.sh +0 -56
- package/.claude/skills/cfn-redis-coordination/demos/test-quorum-with-retry.sh +0 -81
- package/.claude/skills/cfn-redis-coordination/demos/test-quorum.sh +0 -57
- package/.claude/skills/cfn-redis-coordination/demos/test-shutdown-handling.sh +0 -187
- package/.claude/skills/cfn-redis-coordination/demos/test-shutdown.sh +0 -160
- package/.claude/skills/cfn-redis-coordination/demos/test-utils-unix.sh +0 -97
- package/.claude/skills/cfn-redis-coordination/demos/test-utils.sh +0 -97
- package/.claude/skills/cfn-redis-coordination/demos/test-waiting-mode.sh +0 -59
- package/.claude/skills/cfn-redis-coordination/examples/README.md +0 -73
- package/.claude/skills/cfn-redis-coordination/examples/grafana-dashboard.json +0 -352
- package/.claude/skills/cfn-redis-coordination/examples/hierarchical-pattern.sh +0 -127
- package/.claude/skills/cfn-redis-coordination/examples/mesh-pattern.sh +0 -171
- package/.claude/skills/cfn-redis-coordination/examples/timeout-handling.sh +0 -227
- package/.claude/skills/cfn-redis-coordination/examples/waiting-mode-pattern.sh +0 -239
- package/.claude/skills/cfn-redis-coordination/execute-product-owner-decision.sh +0 -258
- package/.claude/skills/cfn-redis-coordination/get-agent-timeout.sh +0 -177
- package/.claude/skills/cfn-redis-coordination/heartbeat-functions.sh +0 -137
- package/.claude/skills/cfn-redis-coordination/heartbeat-protocol.md +0 -106
- package/.claude/skills/cfn-redis-coordination/heartbeat.sh +0 -126
- package/.claude/skills/cfn-redis-coordination/init-swarm.sh +0 -148
- package/.claude/skills/cfn-redis-coordination/invoke-redis-pattern.sh +0 -220
- package/.claude/skills/cfn-redis-coordination/invoke-waiting-mode.sh +0 -283
- package/.claude/skills/cfn-redis-coordination/list-active-swarms.sh +0 -147
- package/.claude/skills/cfn-redis-coordination/log-event.sh +0 -109
- package/.claude/skills/cfn-redis-coordination/metrics-export.sh +0 -674
- package/.claude/skills/cfn-redis-coordination/metrics-schema.json +0 -66
- package/.claude/skills/cfn-redis-coordination/metrics-storage.md +0 -31
- package/.claude/skills/cfn-redis-coordination/monitor-cfn-violations.sh +0 -391
- package/.claude/skills/cfn-redis-coordination/monitor-heartbeats.sh +0 -101
- package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop-v3.sh +0 -141
- package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh +0 -31
- package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.bak +0 -0
- package/.claude/skills/cfn-redis-coordination/priority-wake-mechanism.md +0 -75
- package/.claude/skills/cfn-redis-coordination/priority_wake.py +0 -134
- package/.claude/skills/cfn-redis-coordination/query-dlq.sh +0 -162
- package/.claude/skills/cfn-redis-coordination/query-logs.sh +0 -103
- package/.claude/skills/cfn-redis-coordination/redis-pattern.sh +0 -619
- package/.claude/skills/cfn-redis-coordination/retrieve-context.sh +0 -58
- package/.claude/skills/cfn-redis-coordination/select-specialist-agent.sh +0 -371
- package/.claude/skills/cfn-redis-coordination/semantic-match-tfidf.py +0 -252
- package/.claude/skills/cfn-redis-coordination/send-heartbeat.sh +0 -165
- package/.claude/skills/cfn-redis-coordination/signal.sh +0 -38
- package/.claude/skills/cfn-redis-coordination/store-context.sh +0 -86
- package/.claude/skills/cfn-redis-coordination/store-epic-context.sh +0 -123
- package/.claude/skills/cfn-redis-coordination/test-context-injection.sh +0 -354
- package/.claude/skills/cfn-redis-coordination/test-timeout-enforcement.sh +0 -513
- package/.claude/skills/cfn-redis-coordination/tests/convert-line-endings.sh +0 -15
- package/.claude/skills/cfn-redis-coordination/tests/dlq-functionality-test.sh +0 -102
- package/.claude/skills/cfn-redis-coordination/tests/edge-cases-test.sh +0 -99
- package/.claude/skills/cfn-redis-coordination/tests/integration-test.sh +0 -170
- package/.claude/skills/cfn-redis-coordination/tests/retry-mechanism-test.sh +0 -82
- package/.claude/skills/cfn-redis-coordination/tests/run-test-suite.sh +0 -92
- package/.claude/skills/cfn-redis-coordination/tests/run-tests.sh +0 -4
- package/.claude/skills/cfn-redis-coordination/tests/test-heartbeat-monitoring.sh +0 -418
- package/.claude/skills/cfn-redis-coordination/tests/test-heartbeat-simple.sh +0 -124
- package/.claude/skills/cfn-redis-coordination/tests/test-primitives.sh +0 -166
- package/.claude/skills/cfn-redis-coordination/tests/test-utils.sh +0 -54
- package/.claude/skills/cfn-redis-coordination/tests/test_utils.sh +0 -49
- package/.claude/skills/cfn-redis-coordination/v2_modularization/core_orchestration.sh +0 -76
- package/.claude/skills/cfn-redis-coordination/validate-parameters.sh +0 -492
- package/claude-assets/skills/cfn-redis-coordination/HEARTBEAT.md +0 -57
- package/claude-assets/skills/cfn-redis-coordination/HEARTBEAT_MONITORING.md +0 -267
- package/claude-assets/skills/cfn-redis-coordination/LOGGING.md +0 -260
- package/claude-assets/skills/cfn-redis-coordination/README.md +0 -65
- package/claude-assets/skills/cfn-redis-coordination/SECURITY_REVIEW.md +0 -25
- package/claude-assets/skills/cfn-redis-coordination/SHUTDOWN_HANDLING.md +0 -164
- package/claude-assets/skills/cfn-redis-coordination/SKILL.md +0 -720
- package/claude-assets/skills/cfn-redis-coordination/demos/test-dlq.sh +0 -129
- package/claude-assets/skills/cfn-redis-coordination/demos/test-iteration-feedback.sh +0 -320
- package/claude-assets/skills/cfn-redis-coordination/demos/test-orchestrator.sh +0 -249
- package/claude-assets/skills/cfn-redis-coordination/demos/test-priority-wake-phase4-unix.sh +0 -148
- package/claude-assets/skills/cfn-redis-coordination/demos/test-priority-wake-phase4.sh +0 -163
- package/claude-assets/skills/cfn-redis-coordination/demos/test-priority-wake.sh +0 -138
- package/claude-assets/skills/cfn-redis-coordination/demos/test-quick-fix.sh +0 -81
- package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-absolute.sh +0 -45
- package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-fallback.sh +0 -68
- package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-percentage.sh +0 -56
- package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-with-retry.sh +0 -81
- package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum.sh +0 -57
- package/claude-assets/skills/cfn-redis-coordination/demos/test-shutdown-handling.sh +0 -187
- package/claude-assets/skills/cfn-redis-coordination/demos/test-shutdown.sh +0 -160
- package/claude-assets/skills/cfn-redis-coordination/demos/test-utils-unix.sh +0 -97
- package/claude-assets/skills/cfn-redis-coordination/demos/test-utils.sh +0 -97
- package/claude-assets/skills/cfn-redis-coordination/demos/test-waiting-mode.sh +0 -59
- package/claude-assets/skills/cfn-redis-coordination/examples/README.md +0 -73
- package/claude-assets/skills/cfn-redis-coordination/examples/grafana-dashboard.json +0 -352
- package/claude-assets/skills/cfn-redis-coordination/examples/hierarchical-pattern.sh +0 -127
- package/claude-assets/skills/cfn-redis-coordination/examples/mesh-pattern.sh +0 -171
- package/claude-assets/skills/cfn-redis-coordination/examples/timeout-handling.sh +0 -227
- package/claude-assets/skills/cfn-redis-coordination/examples/waiting-mode-pattern.sh +0 -239
- package/claude-assets/skills/cfn-redis-coordination/execute-product-owner-decision.sh +0 -258
- package/claude-assets/skills/cfn-redis-coordination/get-agent-timeout.sh +0 -177
- package/claude-assets/skills/cfn-redis-coordination/heartbeat-functions.sh +0 -137
- package/claude-assets/skills/cfn-redis-coordination/heartbeat-protocol.md +0 -106
- package/claude-assets/skills/cfn-redis-coordination/heartbeat.sh +0 -126
- package/claude-assets/skills/cfn-redis-coordination/init-swarm.sh +0 -148
- package/claude-assets/skills/cfn-redis-coordination/invoke-redis-pattern.sh +0 -220
- package/claude-assets/skills/cfn-redis-coordination/invoke-waiting-mode.sh +0 -283
- package/claude-assets/skills/cfn-redis-coordination/list-active-swarms.sh +0 -147
- package/claude-assets/skills/cfn-redis-coordination/log-event.sh +0 -109
- package/claude-assets/skills/cfn-redis-coordination/metrics-export.sh +0 -674
- package/claude-assets/skills/cfn-redis-coordination/metrics-schema.json +0 -66
- package/claude-assets/skills/cfn-redis-coordination/metrics-storage.md +0 -31
- package/claude-assets/skills/cfn-redis-coordination/monitor-cfn-violations.sh +0 -391
- package/claude-assets/skills/cfn-redis-coordination/monitor-heartbeats.sh +0 -101
- package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop-v3.sh +0 -141
- package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh +0 -31
- package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.bak +0 -0
- package/claude-assets/skills/cfn-redis-coordination/priority-wake-mechanism.md +0 -75
- package/claude-assets/skills/cfn-redis-coordination/priority_wake.py +0 -134
- package/claude-assets/skills/cfn-redis-coordination/query-dlq.sh +0 -162
- package/claude-assets/skills/cfn-redis-coordination/query-logs.sh +0 -103
- package/claude-assets/skills/cfn-redis-coordination/redis-pattern.sh +0 -619
- package/claude-assets/skills/cfn-redis-coordination/retrieve-context.sh +0 -58
- package/claude-assets/skills/cfn-redis-coordination/select-specialist-agent.sh +0 -371
- package/claude-assets/skills/cfn-redis-coordination/semantic-match-tfidf.py +0 -252
- package/claude-assets/skills/cfn-redis-coordination/send-heartbeat.sh +0 -165
- package/claude-assets/skills/cfn-redis-coordination/signal.sh +0 -38
- package/claude-assets/skills/cfn-redis-coordination/store-context.sh +0 -86
- package/claude-assets/skills/cfn-redis-coordination/store-epic-context.sh +0 -123
- package/claude-assets/skills/cfn-redis-coordination/test-context-injection.sh +0 -354
- package/claude-assets/skills/cfn-redis-coordination/test-timeout-enforcement.sh +0 -513
- package/claude-assets/skills/cfn-redis-coordination/tests/convert-line-endings.sh +0 -15
- package/claude-assets/skills/cfn-redis-coordination/tests/dlq-functionality-test.sh +0 -102
- package/claude-assets/skills/cfn-redis-coordination/tests/edge-cases-test.sh +0 -99
- package/claude-assets/skills/cfn-redis-coordination/tests/integration-test.sh +0 -170
- package/claude-assets/skills/cfn-redis-coordination/tests/retry-mechanism-test.sh +0 -82
- package/claude-assets/skills/cfn-redis-coordination/tests/run-test-suite.sh +0 -92
- package/claude-assets/skills/cfn-redis-coordination/tests/run-tests.sh +0 -4
- package/claude-assets/skills/cfn-redis-coordination/tests/test-heartbeat-monitoring.sh +0 -418
- package/claude-assets/skills/cfn-redis-coordination/tests/test-heartbeat-simple.sh +0 -124
- package/claude-assets/skills/cfn-redis-coordination/tests/test-primitives.sh +0 -166
- package/claude-assets/skills/cfn-redis-coordination/tests/test-utils.sh +0 -54
- package/claude-assets/skills/cfn-redis-coordination/tests/test_utils.sh +0 -49
- package/claude-assets/skills/cfn-redis-coordination/v2_modularization/core_orchestration.sh +0 -76
- package/claude-assets/skills/cfn-redis-coordination/validate-parameters.sh +0 -492
|
@@ -1,137 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
|
|
2
|
-
|
|
3
|
-
##############################################################################
|
|
4
|
-
# Heartbeat Monitoring Functions Library
|
|
5
|
-
# Extracted from orchestrate-cfn-loop.sh for standalone testing
|
|
6
|
-
##############################################################################
|
|
7
|
-
|
|
8
|
-
##############################################################################
|
|
9
|
-
# Heartbeat Monitoring Functions
|
|
10
|
-
##############################################################################
|
|
11
|
-
# Consecutive missed-heartbeat count per agent, keyed by agent ID.
# check_heartbeats_loop adds 1 on every miss and writes 0 back as soon as the
# agent's heartbeat is seen again.
declare -A MISSED_HEARTBEATS # Track missed heartbeats per agent
|
|
12
|
-
|
|
13
|
-
#######################################
# Report whether an agent currently has a heartbeat key in Redis.
# Arguments:
#   $1 - agent ID
#   $2 - task ID
# Returns:
#   0 if swarm:<task_id>:<agent>:heartbeat holds a value (alive)
#   1 if the value is empty / "(nil)" or redis-cli itself failed (dead)
#######################################
function check_agent_heartbeat() {
  local agent="$1"
  local task_id="$2"
  # Kept local so repeated calls do not leak state into the caller's scope.
  local hb_key="swarm:${task_id}:${agent}:heartbeat"
  local hb_data

  # A failing redis-cli is treated exactly like a missing key, even if it
  # printed something: liveness cannot be confirmed in that case.
  hb_data=$(redis-cli GET "$hb_key" 2>/dev/null) || hb_data=""

  if [[ -z "$hb_data" || "$hb_data" == "(nil)" ]]; then
    return 1 # Dead
  fi
  return 0 # Alive
}
|
|
26
|
-
|
|
27
|
-
#######################################
# Convert a quorum specification into an absolute agent count.
# Arguments:
#   $1 - quorum spec, one of:
#          ""               require every agent
#          "85%" / "62.5%"  percentage of the total
#          ".67" / "0.67"   fraction of the total
#          "3"              absolute number of agents
#   $2 - total number of agents
# Outputs:
#   The required agent count on stdout; error text on stderr.
# Returns:
#   0 on success, 1 if the spec is malformed or yields more than $2 agents.
#
# NOTE(review): the previous comments described the percentage/fraction math
# as ceil(), but the arithmetic has always rounded half up (70% of 3 -> 2).
# The rounding is kept as-is so existing quorum sizes do not change; confirm
# whether ceil() was actually intended.
#######################################
function calculate_quorum() {
  local quorum_spec="$1"
  local total_agents="$2"
  local whole frac numer denom total required

  # If no quorum specified, require all agents
  if [[ -z "$quorum_spec" ]]; then
    echo "$total_agents"
    return 0
  fi

  if [[ ! "$total_agents" =~ ^[0-9]+$ ]]; then
    echo "Error: Total agents must be a non-negative integer (got '$total_agents')" >&2
    return 1
  fi
  total=$((10#$total_agents)) # force base 10 so "08" is not read as octal

  # Decimals are handled as numer/denom in integer arithmetic, which gives the
  # same round-half-up results as before without depending on `bc`.
  # Fraction digits are capped at 9 to stay inside 64-bit arithmetic.
  if [[ "$quorum_spec" =~ ^([0-9]+)(\.([0-9]+))?%$ ]]; then
    # Percentage format (e.g., "85%"): round(total * pct / 100)
    whole="${BASH_REMATCH[1]}"
    frac="${BASH_REMATCH[3]:0:9}"
    denom=$((10 ** ${#frac}))
    numer=$((10#${whole}${frac}))
    required=$(((2 * total * numer + 100 * denom) / (200 * denom)))
  elif [[ "$quorum_spec" =~ ^0?\.([0-9]+)$ ]]; then
    # Fraction format (0.0-1.0): round(total * fraction)
    frac="${BASH_REMATCH[1]:0:9}"
    denom=$((10 ** ${#frac}))
    numer=$((10#$frac))
    required=$(((2 * total * numer + denom) / (2 * denom)))
  elif [[ "$quorum_spec" =~ ^[0-9]+$ ]]; then
    # Absolute number
    required=$((10#$quorum_spec))
  else
    echo "Error: Invalid quorum specification '$quorum_spec'" >&2
    return 1
  fi

  # A quorum larger than the swarm can never be met, whichever form was used.
  if ((required > total)); then
    echo "Error: Quorum ($quorum_spec) exceeds total agents ($total_agents)" >&2
    return 1
  fi

  echo "$required"
}
|
|
56
|
-
|
|
57
|
-
#######################################
# Run one heartbeat sweep over a set of agents.
# Globals:
#   MISSED_HEARTBEATS (read/written) - per-agent consecutive miss counter
#   LOOP3_FAILED_AGENTS, LOOP2_FAILED_AGENTS (read) - arrays of agents to skip
#   LOOP3_AGENTS, LOOP2_AGENTS (read) - space-separated agent lists
#   LOOP3_COMPLETED_AGENTS, LOOP2_COMPLETED_AGENTS (read) - arrays
#   MIN_QUORUM_LOOP3, MIN_QUORUM_LOOP2, LOOP3_TOTAL, LOOP2_TOTAL (read)
# Arguments:
#   $1 - task ID
#   $2 - loop name (used only as a log prefix)
#   $3... - agent IDs to check
# Outputs:
#   Warnings and quorum status lines on stderr; nothing on stdout.
#######################################
function check_heartbeats_loop() {
  local task_id="$1"
  local loop_name="$2"
  shift 2
  local agents=("$@")
  local agent misses timestamp remaining required min_quorum loop_total

  for agent in "${agents[@]}"; do
    # Skip agents already marked as failed
    if [[ " ${LOOP3_FAILED_AGENTS[@]} ${LOOP2_FAILED_AGENTS[@]} " =~ " ${agent} " ]]; then
      continue
    fi

    if check_agent_heartbeat "$agent" "$task_id"; then
      MISSED_HEARTBEATS["$agent"]=0 # Reset counter
      continue
    fi

    misses=$((${MISSED_HEARTBEATS["$agent"]:-0} + 1))
    MISSED_HEARTBEATS["$agent"]=$misses

    # A single miss is tolerated; only report from the second one onwards.
    if ((misses < 2)); then
      continue
    fi

    timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
    echo " [$timestamp] [$loop_name] ⚠️ $agent appears hung (no heartbeat for 60s)" >&2

    # Determine which loop this agent belongs to and check quorum.
    # The Loop 2 test used to compare LOOP2_AGENTS with itself, which is
    # always true, so agents belonging to neither loop were evaluated against
    # the Loop 2 quorum and the final `continue` was unreachable.
    if [[ " ${LOOP3_AGENTS-} " =~ " ${agent} " ]]; then
      remaining=${#LOOP3_COMPLETED_AGENTS[@]}
      min_quorum="${MIN_QUORUM_LOOP3-}"
      loop_total="${LOOP3_TOTAL-}"
    elif [[ " ${LOOP2_AGENTS-} " =~ " ${agent} " ]]; then
      remaining=${#LOOP2_COMPLETED_AGENTS[@]}
      min_quorum="${MIN_QUORUM_LOOP2-}"
      loop_total="${LOOP2_TOTAL-}"
    else
      continue
    fi

    # Without a numeric requirement the comparison below would itself error
    # out, so report the configuration problem instead.
    if ! required=$(calculate_quorum "$min_quorum" "$loop_total") \
      || [[ ! "$required" =~ ^[0-9]+$ ]]; then
      echo " [$timestamp] [$loop_name] ⚠️ Cannot evaluate quorum for $agent (invalid quorum configuration)" >&2
      continue
    fi

    if ((remaining >= required)); then
      echo " [$timestamp] [$loop_name] ℹ️ Continuing with quorum (${remaining}/${required} agents)" >&2
    else
      echo " [$timestamp] [$loop_name] ⚠️ Cannot meet quorum without $agent (${remaining}/${required})" >&2
    fi
  done
}
|
|
98
|
-
|
|
99
|
-
#######################################
# Start a background loop that sweeps agent heartbeats every 30 seconds.
# The loop keeps running while its marker file exists; removing the marker
# (see stop_heartbeat_monitor) ends it after the current sleep.
# Arguments:
#   $1 - task ID
#   $2 - loop name
#   $3... - agent IDs to monitor
# Outputs:
#   PID of the background monitor on stdout.
#######################################
function start_heartbeat_monitor() {
  local task_id="$1"
  local loop_name="$2"
  shift 2
  local agents=("$@")

  # Create marker file for this monitor
  local monitor_marker="/tmp/heartbeat-monitor-${task_id}-${loop_name}.active"
  touch "$monitor_marker"

  # stdout of the background job is pointed at stderr. Otherwise the job keeps
  # the caller's stdout open, and `pid=$(start_heartbeat_monitor ...)` would
  # not return until the monitor itself exits.
  (
    while [ -f "$monitor_marker" ]; do
      # Check for shutdown.
      # NOTE(review): this subshell sees the value SHUTDOWN_REQUESTED had when
      # the monitor was started; later changes in the parent are not visible
      # here, so the marker file is the dependable stop signal.
      if [ "${SHUTDOWN_REQUESTED:-0}" -eq 1 ]; then
        break
      fi

      check_heartbeats_loop "$task_id" "$loop_name" "${agents[@]}"
      sleep 30
    done
  ) >&2 &

  echo "$!" # Return PID
}
|
|
123
|
-
|
|
124
|
-
#######################################
# Stop a monitor started by start_heartbeat_monitor.
# Arguments:
#   $1 - task ID
#   $2 - loop name
#   $3 - PID of the monitor process (may be empty)
#######################################
function stop_heartbeat_monitor() {
  local task_id="$1" loop_name="$2" monitor_pid="$3"
  local marker_path="/tmp/heartbeat-monitor-${task_id}-${loop_name}.active"

  # Deleting the marker makes the monitor's while-loop condition fail.
  rm -f "$marker_path"

  # Nothing else to do without a PID, or when the process is already gone.
  [ -n "$monitor_pid" ] || return 0
  kill -0 "$monitor_pid" 2>/dev/null || return 0

  # Best effort: the process may exit between the probe and the signal.
  kill "$monitor_pid" 2>/dev/null || true
  wait "$monitor_pid" 2>/dev/null || true
}
|
|
@@ -1,106 +0,0 @@
|
|
|
1
|
-
# Heartbeat Protocol Specification
|
|
2
|
-
|
|
3
|
-
## Overview
|
|
4
|
-
The heartbeat protocol detects hung or unresponsive agents in distributed agent swarms. Each agent periodically refreshes a short-lived key in Redis, and the orchestrator treats a missing key as a missed heartbeat.
|
|
5
|
-
|
|
6
|
-
## Key Design Components
|
|
7
|
-
|
|
8
|
-
### 1. Heartbeat Key Pattern
|
|
9
|
-
```
|
|
10
|
-
swarm:{task_id}:{agent_id}:heartbeat
|
|
11
|
-
```
|
|
12
|
-
|
|
13
|
-
### 2. Heartbeat Message Structure
|
|
14
|
-
```json
|
|
15
|
-
{
|
|
16
|
-
"timestamp": 1760898665,
|
|
17
|
-
"status": "working|idle|error",
|
|
18
|
-
"iteration": 2,
|
|
19
|
-
"progress": 0.75,
|
|
20
|
-
"agent_details": {
|
|
21
|
-
"agent_id": "architect-5",
|
|
22
|
-
"task_id": "redis-phase5-1760898665",
|
|
23
|
-
"environment": {
|
|
24
|
-
"cpu_usage": 0.65,
|
|
25
|
-
"memory_usage": 0.42,
|
|
26
|
-
"system_load": 0.3
|
|
27
|
-
}
|
|
28
|
-
}
|
|
29
|
-
}
|
|
30
|
-
```
|
|
31
|
-
|
|
32
|
-
### 3. Heartbeat Configuration
|
|
33
|
-
- **Update Frequency**: Every 30 seconds
|
|
34
|
-
- **Default TTL**: 60 seconds
|
|
35
|
-
- **Miss Threshold**: 2 consecutive missed heartbeats
|
|
36
|
-
- **Quorum Threshold**: 70% of agents must be responsive
|
|
37
|
-
|
|
38
|
-
### 4. Heartbeat Workflow
|
|
39
|
-
1. Each agent periodically refreshes its heartbeat key via Redis `SET` with a 60-second expiry (`EX 60`)
|
|
40
|
-
2. Orchestrator monitors heartbeats in background process
|
|
41
|
-
3. On missed heartbeats, trigger progressive recovery mechanisms
|
|
42
|
-
|
|
43
|
-
### 5. Recovery Stages
|
|
44
|
-
- **Stage 1 (Miss 1)**: Log warning, continue monitoring
|
|
45
|
-
- **Stage 2 (Miss 2)**:
|
|
46
|
-
- Check if remaining agents meet quorum
|
|
47
|
-
- Log to Dead Letter Queue (DLQ)
|
|
48
|
-
- Attempt soft restart of agent
|
|
49
|
-
- **Stage 3 (Miss 3)**:
|
|
50
|
-
- Hard restart agent
|
|
51
|
-
- Potentially replace with standby agent
|
|
52
|
-
|
|
53
|
-
### 6. Implementation Pseudo-code
|
|
54
|
-
```bash
|
|
55
|
-
# Send Heartbeat
|
|
56
|
-
redis-cli set "swarm:${TASK_ID}:${AGENT_ID}:heartbeat" \
|
|
57
|
-
"$(generate_heartbeat_payload)" \
|
|
58
|
-
EX 60 # 60-second expiry
|
|
59
|
-
|
|
60
|
-
# Check Heartbeats
|
|
61
|
-
check_agent_heartbeats() {
|
|
62
|
-
for agent in ${AGENTS[@]}; do
|
|
63
|
-
heartbeat=$(redis-cli get "swarm:${TASK_ID}:${agent}:heartbeat")
|
|
64
|
-
if [[ -z "$heartbeat" ]]; then
|
|
65
|
-
handle_missed_heartbeat "$agent"
|
|
66
|
-
fi
|
|
67
|
-
done
|
|
68
|
-
}
|
|
69
|
-
|
|
70
|
-
handle_missed_heartbeat() {
|
|
71
|
-
local agent="$1"
|
|
72
|
-
local miss_count=$(get_miss_count "$agent")
|
|
73
|
-
|
|
74
|
-
case "$miss_count" in
|
|
75
|
-
1) log_warning "$agent missed first heartbeat" ;;
|
|
76
|
-
2)
|
|
77
|
-
log_dlq "$agent"
|
|
78
|
-
attempt_soft_restart "$agent"
|
|
79
|
-
check_quorum
|
|
80
|
-
;;
|
|
81
|
-
3)
|
|
82
|
-
hard_restart_agent "$agent"
|
|
83
|
-
;;
|
|
84
|
-
esac
|
|
85
|
-
}
|
|
86
|
-
```
|
|
87
|
-
|
|
88
|
-
### 7. Monitoring and Logging
|
|
89
|
-
- Comprehensive logging to `/var/log/claude-flow/heartbeat.log`
|
|
90
|
-
- Prometheus metrics for heartbeat health
|
|
91
|
-
- Grafana dashboard tracking agent responsiveness
|
|
92
|
-
|
|
93
|
-
### 8. Security Considerations
|
|
94
|
-
- Cryptographically sign heartbeat messages
|
|
95
|
-
- Rate limit heartbeat submissions
|
|
96
|
-
- Validate heartbeat payload schema
|
|
97
|
-
|
|
98
|
-
## Integration Points
|
|
99
|
-
- Redis Coordination Skill
|
|
100
|
-
- CFN Loop Validation
|
|
101
|
-
- Agent Spawning Mechanism
|
|
102
|
-
|
|
103
|
-
## Test Coverage
|
|
104
|
-
- Unit tests for heartbeat generation
|
|
105
|
-
- Integration tests for recovery mechanisms
|
|
106
|
-
- Chaos testing (intentional agent hanging)
|
|
@@ -1,126 +0,0 @@
|
|
|
1
|
-
#!/bin/bash
|
|
2
|
-
|
|
3
|
-
# Heartbeat Monitoring Script for Agent Coordination
|
|
4
|
-
# Implements 60s TTL, 30s check interval, and quorum fallback detection
|
|
5
|
-
|
|
6
|
-
# Dependencies
# `command -v` is the shell's own lookup; unlike `which` it needs no external
# program. `|| true` keeps the assignment from failing so the check below can
# print the friendly message.
REDIS_CLI=$(command -v redis-cli || true)
if [[ -z "$REDIS_CLI" ]]; then
  echo "Error: redis-cli not found. Please install Redis client." >&2
  exit 1
fi

# Logging configuration
# Every log_debug call appends to this file; it is created up front.
LOG_FILE="/tmp/heartbeat-debug.log"
touch "$LOG_FILE"
|
|
16
|
-
|
|
17
|
-
# Append one line to $LOG_FILE in the form "[YYYY-MM-DD HH:MM:SS] <message>".
# All arguments are joined with single spaces to form the message.
log_debug() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$*" >> "$LOG_FILE"
}
|
|
20
|
-
|
|
21
|
-
# Configuration
# Marked readonly: nothing in this script reassigns these values.
readonly HEARTBEAT_TTL=60   # 60 seconds TTL
readonly CHECK_INTERVAL=30  # 30 seconds between checks
readonly MISSED_THRESHOLD=2 # Number of missed heartbeats before considering agent hung
# NOTE(review): BUFFER_TIME is not referenced anywhere else in this script.
readonly BUFFER_TIME=3 # Additional buffer time for timing flexibility
|
|
26
|
-
|
|
27
|
-
#######################################
# Publish a heartbeat for one agent.
# Globals:
#   REDIS_CLI (read), HEARTBEAT_TTL (read)
# Arguments:
#   $1 - task ID
#   $2 - agent ID
# Outputs:
#   The redis-cli reply followed by a confirmation line on stdout;
#   usage and failure messages on stderr.
# Returns:
#   0 on success, 1 if redis-cli reports failure. Exits 1 when an ID is
#   missing.
#######################################
send_heartbeat() {
  local task_id="$1"
  local agent_id="$2"

  if [[ -z "$task_id" || -z "$agent_id" ]]; then
    echo "Usage: $0 send --task-id TASK_ID --agent-id AGENT_ID" >&2
    exit 1
  fi

  log_debug "Sending heartbeat for task=$task_id, agent=$agent_id"

  # Use SETEX to create a key with expiration.
  # The exit status is checked so that "Heartbeat sent" is only printed when
  # redis-cli did not fail.
  if ! "$REDIS_CLI" SETEX "swarm:agent_status:${task_id}:${agent_id}" "$HEARTBEAT_TTL" "alive"; then
    log_debug "Heartbeat FAILED for task=$task_id, agent=$agent_id"
    echo "Error: failed to send heartbeat for agent ${agent_id} in task ${task_id}" >&2
    return 1
  fi

  echo "Heartbeat sent for agent ${agent_id} in task ${task_id}"
}
|
|
42
|
-
|
|
43
|
-
#######################################
# Report whether an agent's heartbeat key is present and unexpired.
# Globals:
#   REDIS_CLI (read)
# Arguments:
#   $1 - task ID
#   $2 - agent ID
# Outputs:
#   "alive" or "dead" on stdout (plus whatever increment_missed_counter
#   prints); usage text on stderr.
# Returns:
#   0 when alive, 1 when dead. Exits 1 when an ID is missing.
#######################################
check_heartbeat() {
  local task_id="$1"
  local agent_id="$2"

  if [[ -z "$task_id" || -z "$agent_id" ]]; then
    echo "Usage: $0 check --task-id TASK_ID --agent-id AGENT_ID" >&2
    exit 1
  fi

  local key="swarm:agent_status:${task_id}:${agent_id}"
  local status
  local ttl

  # Retrieve status and TTL
  status=$("$REDIS_CLI" GET "$key")
  ttl=$("$REDIS_CLI" TTL "$key")

  log_debug "Checking heartbeat for task=$task_id, agent=$agent_id: status=$status, ttl=$ttl"

  # The agent is alive only when a status exists and the TTL is a positive
  # integer. A non-integer TTL (e.g. an error message) used to make the
  # numeric test itself fail, which reported the agent as alive.
  if [[ -z "$status" || ! "$ttl" =~ ^-?[0-9]+$ ]] || [ "$ttl" -le 0 ]; then
    log_debug "Heartbeat DEAD: status missing or expired"
    echo "dead"
    increment_missed_counter "$task_id" "$agent_id"
    return 1
  fi

  log_debug "Heartbeat ALIVE: status=$status, ttl=$ttl"
  echo "alive"
  return 0
}
|
|
74
|
-
|
|
75
|
-
#######################################
# Count one missed heartbeat for an agent and flag it for quorum fallback
# once the count reaches MISSED_THRESHOLD.
# Globals:
#   REDIS_CLI (read), HEARTBEAT_TTL (read), MISSED_THRESHOLD (read)
# Arguments:
#   $1 - task ID
#   $2 - agent ID
# Outputs:
#   A WARN line on stdout when the threshold is reached; errors on stderr.
#   Replies from EXPIRE/LPUSH are discarded so they do not mix with the
#   "dead"/"alive" output of check_heartbeat.
# Returns:
#   0 on success, 1 if the counter could not be incremented.
#######################################
increment_missed_counter() {
  local task_id="$1"
  local agent_id="$2"
  local missed_key="swarm:missed_heartbeats:${task_id}:${agent_id}"
  local missed_count

  # Increment missed heartbeat counter.
  # Declared separately from the assignment so a redis-cli failure is not
  # masked by `local`; a non-numeric reply is treated as a failure too.
  if ! missed_count=$("$REDIS_CLI" INCR "$missed_key") \
    || [[ ! "$missed_count" =~ ^[0-9]+$ ]]; then
    log_debug "Could not increment missed heartbeat counter for task=$task_id, agent=$agent_id"
    echo "Error: could not update missed heartbeat counter for agent $agent_id" >&2
    return 1
  fi

  # Set expiry for missed counter to match heartbeat TTL
  "$REDIS_CLI" EXPIRE "$missed_key" "$HEARTBEAT_TTL" > /dev/null

  log_debug "Missed heartbeat counter for task=$task_id, agent=$agent_id: count=$missed_count"

  if [ "$missed_count" -ge "$MISSED_THRESHOLD" ]; then
    # Trigger quorum fallback mechanism
    "$REDIS_CLI" LPUSH "swarm:${task_id}:quorum_fallback" "$agent_id" > /dev/null
    log_debug "QUORUM FALLBACK: Agent $agent_id missed $missed_count heartbeats"
    echo "WARN: Agent $agent_id missed $missed_count heartbeats. Quorum fallback triggered."
  fi

  return 0
}
|
|
95
|
-
|
|
96
|
-
# Parse arguments
|
|
97
|
-
#######################################
# Parse "--task-id ID" / "--agent-id ID" (in any order) into the script-level
# variables task_id and agent_id. Shared by the `send` and `check`
# subcommands, which previously each carried a copy of this loop.
# Arguments:
#   the command-line words that follow the subcommand
# Exits 1 on an unknown option or on an option that has no value.
#######################################
parse_heartbeat_args() {
  while [[ "$#" -gt 0 ]]; do
    case "$1" in
      --task-id | --agent-id)
        # Without this check a trailing option silently produced an empty ID.
        if [[ "$#" -lt 2 ]]; then
          echo "Missing value for parameter: $1" >&2
          exit 1
        fi
        if [[ "$1" == "--task-id" ]]; then
          task_id="$2"
        else
          agent_id="$2"
        fi
        shift 2
        ;;
      *)
        echo "Unknown parameter passed: $1" >&2
        exit 1
        ;;
    esac
  done
}

# Subcommand dispatch: the first word selects the action, the rest are options.
case "${1:-}" in
  send)
    shift
    parse_heartbeat_args "$@"
    send_heartbeat "$task_id" "$agent_id"
    ;;
  check)
    shift
    parse_heartbeat_args "$@"
    check_heartbeat "$task_id" "$agent_id"
    ;;
  *)
    echo "Usage: $0 {send|check} --task-id TASK_ID --agent-id AGENT_ID" >&2
    exit 1
    ;;
esac
|
|
@@ -1,148 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
|
|
2
|
-
|
|
3
|
-
##############################################################################
|
|
4
|
-
# Initialize Swarm - Redis Coordination Primitive
|
|
5
|
-
#
|
|
6
|
-
# Creates swarm metadata in Redis for coordination tracking across any
|
|
7
|
-
# multi-agent workflow (CFN Loop, independent swarms, custom orchestration).
|
|
8
|
-
#
|
|
9
|
-
# Usage:
|
|
10
|
-
# ./init-swarm.sh --swarm-id <id> \
|
|
11
|
-
# --agents <agent1,agent2,...> \
|
|
12
|
-
# [--task-id <id>] \
|
|
13
|
-
# [--topology <mesh|hierarchical|chain>] \
|
|
14
|
-
# [--ttl <seconds>] \
|
|
15
|
-
# [--metadata <json-string>]
|
|
16
|
-
#
|
|
17
|
-
# Per-Agent Timeout Configuration:
|
|
18
|
-
# Agents can have custom timeout values set via Redis:
|
|
19
|
-
#
|
|
20
|
-
# redis-cli SETEX "swarm:<task-id>:<agent-id>:timeout" 86400 <timeout-seconds>
|
|
21
|
-
#
|
|
22
|
-
# If no custom timeout is set, the orchestrator will use role-based defaults:
|
|
23
|
-
# - researcher: 7200s (2 hours)
|
|
24
|
-
# - backend-dev, coder, frontend-dev: 3600s (1 hour)
|
|
25
|
-
# - reviewer, tester, security: 1800s (30 minutes)
|
|
26
|
-
# - coordinator, orchestrator, product-owner: 900s (15 minutes)
|
|
27
|
-
# - default: 3600s (1 hour)
|
|
28
|
-
#
|
|
29
|
-
# Benefits:
|
|
30
|
-
# - Namespace isolation for concurrent swarms
|
|
31
|
-
# - Agent inventory and resource tracking
|
|
32
|
-
# - Status monitoring (in_progress/completed)
|
|
33
|
-
# - Automatic cleanup via TTL
|
|
34
|
-
# - Per-agent timeout configuration via get-agent-timeout.sh
|
|
35
|
-
##############################################################################
|
|
36
|
-
|
|
37
|
-
set -euo pipefail

# Configuration
SWARM_ID=""
AGENTS=""
TASK_ID=""
TOPOLOGY="mesh"
TTL=604800 # 7 days default
MAX_AGENTS=""
METADATA_EXTRA=""
# Parsed so the option is accepted; nothing later in this script reads it.
AGENT_TIMEOUTS=""

# Print each argument as its own line on stderr, then abort with status 1.
# stdout is reserved for progress lines and the final swarm ID.
fail() {
  printf '%s\n' "$@" >&2
  exit 1
}

# Parse arguments
while [[ $# -gt 0 ]]; do
  case "$1" in
    --swarm-id | --agents | --task-id | --topology | --ttl | --max-agents | --metadata | --agent-timeouts)
      # Under `set -u`, reading a missing "$2" would abort with an
      # "unbound variable" message instead of a usage hint.
      if [[ $# -lt 2 ]]; then
        fail "Error: Option $1 requires a value" \
          "Usage: $0 --swarm-id <id> --agents <agent1,agent2,...> [options]"
      fi
      case "$1" in
        --swarm-id) SWARM_ID="$2" ;;
        --agents) AGENTS="$2" ;;
        --task-id) TASK_ID="$2" ;;
        --topology) TOPOLOGY="$2" ;;
        --ttl) TTL="$2" ;;
        --max-agents) MAX_AGENTS="$2" ;;
        --metadata) METADATA_EXTRA="$2" ;;
        --agent-timeouts) AGENT_TIMEOUTS="$2" ;;
      esac
      shift 2
      ;;
    *)
      fail "Unknown option: $1" \
        "Usage: $0 --swarm-id <id> --agents <agent1,agent2,...> [options]"
      ;;
  esac
done

# Validation
if [[ -z "$SWARM_ID" || -z "$AGENTS" ]]; then
  fail "Error: Required parameters missing" \
    "Usage: $0 --swarm-id <id> --agents <agent1,agent2,...>"
fi

# TTL is used in shell arithmetic below and passed to EXPIRE, so it must be a
# plain positive integer. Arbitrary text inside $(( )) is evaluated as an
# expression, and a TTL of 0 would make EXPIRE delete the key at once.
if [[ ! "$TTL" =~ ^[1-9][0-9]*$ ]]; then
  fail "Error: --ttl must be a positive integer number of seconds (got '$TTL')"
fi

if [[ -n "$MAX_AGENTS" && ! "$MAX_AGENTS" =~ ^[0-9]+$ ]]; then
  fail "Error: --max-agents must be a non-negative integer (got '$MAX_AGENTS')"
fi

# Check the extra metadata before anything is written. A parse failure after
# the base hash is stored would abort the script and leave the key in Redis
# without a TTL.
if [[ -n "$METADATA_EXTRA" ]]; then
  if ! jq -e 'type == "object"' <<< "$METADATA_EXTRA" > /dev/null 2>&1; then
    fail "Error: --metadata must be a JSON object"
  fi
fi

# Calculate max agents if not provided
if [[ -z "$MAX_AGENTS" ]]; then
  IFS=',' read -ra AGENT_ARRAY <<< "$AGENTS"
  MAX_AGENTS=${#AGENT_ARRAY[@]}
fi

# Use swarm-id as task-id if not provided
if [[ -z "$TASK_ID" ]]; then
  TASK_ID="$SWARM_ID"
fi

echo "[Swarm] Initializing swarm: $SWARM_ID"
echo "[Swarm] Topology: $TOPOLOGY"
echo "[Swarm] Total agents: $MAX_AGENTS"
echo "[Swarm] TTL: $TTL seconds ($((TTL / 86400)) days)"

# Create swarm metadata key
METADATA_KEY="swarm:${SWARM_ID}:metadata"

# Store base metadata
# Extract repository name from PWD
REPO_NAME=$(basename "$(pwd)")

redis-cli hset "$METADATA_KEY" \
  swarm_id "$SWARM_ID" \
  task_id "$TASK_ID" \
  topology "$TOPOLOGY" \
  max_agents "$MAX_AGENTS" \
  agents "$AGENTS" \
  created_at "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
  status "in_progress" \
  repository "$REPO_NAME" \
  cwd "$(pwd)" > /dev/null

# Add extra metadata if provided (JSON string)
if [[ -n "$METADATA_EXTRA" ]]; then
  # Keys are read one per line and each value is fetched by key, so keys with
  # spaces and values with embedded whitespace are stored intact. String
  # values are stored raw; any other JSON type is stored as its JSON text.
  while IFS= read -r META_KEY; do
    META_VALUE=$(jq -r --arg k "$META_KEY" \
      '.[$k] | if type == "string" then . else tojson end' <<< "$METADATA_EXTRA")
    # stdin is detached so redis-cli cannot consume the remaining key list.
    redis-cli hset "$METADATA_KEY" "$META_KEY" "$META_VALUE" > /dev/null < /dev/null
  done < <(jq -r 'keys_unsorted[]' <<< "$METADATA_EXTRA")
fi

# Set TTL
redis-cli expire "$METADATA_KEY" "$TTL" > /dev/null

echo "[Swarm] Registered in Redis: $METADATA_KEY"
echo "[Swarm] ✅ Initialization complete"

# Output swarm ID for chaining
echo "$SWARM_ID"
|