claude-flow-novice 2.14.3 → 2.14.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/CFN_LOOP_TASK_MODE.md +4 -47
- package/.claude/commands/seo/SEO_TASK_MODE.md +892 -0
- package/.claude/commands/seo/seo-blog.md +428 -0
- package/.claude/commands/seo/seo-landing.md +91 -0
- package/.claude/commands/seo/seo-product.md +104 -0
- package/.claude/skills/cfn-redis-coordination/demos/test-cancel-swarm.sh +0 -276
- package/claude-assets/agents/cfn-dev-team/coordinators/epic-creator.md +120 -0
- package/claude-assets/agents/cfn-seo-team/AGENT_CREATION_REPORT.md +481 -0
- package/claude-assets/agents/cfn-seo-team/DELEGATION_MATRIX.md +371 -0
- package/claude-assets/agents/cfn-seo-team/HUMANIZER_PROMPTS.md +536 -0
- package/claude-assets/agents/cfn-seo-team/INTEGRATION_REQUIREMENTS.md +642 -0
- package/claude-assets/agents/cfn-seo-team/cfn-seo-coordinator.md +414 -0
- package/claude-assets/agents/cfn-seo-team/competitive-seo-analyst.md +423 -0
- package/claude-assets/agents/cfn-seo-team/content-atomization-specialist.md +580 -0
- package/claude-assets/agents/cfn-seo-team/content-seo-strategist.md +245 -0
- package/claude-assets/agents/cfn-seo-team/eeat-content-auditor.md +389 -0
- package/claude-assets/agents/cfn-seo-team/geo-optimization-expert.md +269 -0
- package/claude-assets/agents/cfn-seo-team/link-building-specialist.md +291 -0
- package/claude-assets/agents/cfn-seo-team/local-seo-optimizer.md +333 -0
- package/claude-assets/agents/cfn-seo-team/programmatic-seo-engineer.md +244 -0
- package/claude-assets/agents/cfn-seo-team/schema-markup-engineer.md +430 -0
- package/claude-assets/agents/cfn-seo-team/seo-analytics-specialist.md +376 -0
- package/claude-assets/agents/cfn-seo-team/seo-validators/accessibility-validator.md +565 -0
- package/claude-assets/agents/cfn-seo-team/seo-validators/audience-validator.md +484 -0
- package/claude-assets/agents/cfn-seo-team/seo-validators/branding-validator.md +452 -0
- package/claude-assets/agents/cfn-seo-team/seo-validators/humanizer-validator.md +333 -0
- package/claude-assets/agents/cfn-seo-team/technical-seo-specialist.md +228 -0
- package/claude-assets/commands/CFN_LOOP_TASK_MODE.md +4 -47
- package/claude-assets/commands/seo/SEO_TASK_MODE.md +892 -0
- package/claude-assets/commands/seo/seo-blog.md +428 -0
- package/claude-assets/commands/seo/seo-landing.md +91 -0
- package/claude-assets/commands/seo/seo-product.md +104 -0
- package/claude-assets/skills/cfn-redis-coordination/demos/test-cancel-swarm.sh +0 -276
- package/claude-assets/skills/seo-orchestration/SKILL.md +292 -0
- package/claude-assets/skills/seo-orchestration/orchestrate-seo.sh +566 -0
- package/claude-assets/skills/seo-orchestration/orchestrate-seo.sh.backup +755 -0
- package/claude-assets/skills/seo-orchestration/validate-consensus.sh +270 -0
- package/dist/agents/agent-loader.js +165 -146
- package/dist/agents/agent-loader.js.map +1 -1
- package/dist/cli/agent-prompt-builder.js +25 -0
- package/dist/cli/agent-prompt-builder.js.map +1 -1
- package/package.json +1 -1
- package/.claude/skills/cfn-redis-coordination/HEARTBEAT.md +0 -57
- package/.claude/skills/cfn-redis-coordination/HEARTBEAT_MONITORING.md +0 -267
- package/.claude/skills/cfn-redis-coordination/LOGGING.md +0 -260
- package/.claude/skills/cfn-redis-coordination/README.md +0 -65
- package/.claude/skills/cfn-redis-coordination/SECURITY_REVIEW.md +0 -25
- package/.claude/skills/cfn-redis-coordination/SHUTDOWN_HANDLING.md +0 -164
- package/.claude/skills/cfn-redis-coordination/SKILL.md +0 -720
- package/.claude/skills/cfn-redis-coordination/demos/test-dlq.sh +0 -129
- package/.claude/skills/cfn-redis-coordination/demos/test-iteration-feedback.sh +0 -320
- package/.claude/skills/cfn-redis-coordination/demos/test-orchestrator.sh +0 -249
- package/.claude/skills/cfn-redis-coordination/demos/test-priority-wake-phase4-unix.sh +0 -148
- package/.claude/skills/cfn-redis-coordination/demos/test-priority-wake-phase4.sh +0 -163
- package/.claude/skills/cfn-redis-coordination/demos/test-priority-wake.sh +0 -138
- package/.claude/skills/cfn-redis-coordination/demos/test-quick-fix.sh +0 -81
- package/.claude/skills/cfn-redis-coordination/demos/test-quorum-absolute.sh +0 -45
- package/.claude/skills/cfn-redis-coordination/demos/test-quorum-fallback.sh +0 -68
- package/.claude/skills/cfn-redis-coordination/demos/test-quorum-percentage.sh +0 -56
- package/.claude/skills/cfn-redis-coordination/demos/test-quorum-with-retry.sh +0 -81
- package/.claude/skills/cfn-redis-coordination/demos/test-quorum.sh +0 -57
- package/.claude/skills/cfn-redis-coordination/demos/test-shutdown-handling.sh +0 -187
- package/.claude/skills/cfn-redis-coordination/demos/test-shutdown.sh +0 -160
- package/.claude/skills/cfn-redis-coordination/demos/test-utils-unix.sh +0 -97
- package/.claude/skills/cfn-redis-coordination/demos/test-utils.sh +0 -97
- package/.claude/skills/cfn-redis-coordination/demos/test-waiting-mode.sh +0 -59
- package/.claude/skills/cfn-redis-coordination/examples/README.md +0 -73
- package/.claude/skills/cfn-redis-coordination/examples/grafana-dashboard.json +0 -352
- package/.claude/skills/cfn-redis-coordination/examples/hierarchical-pattern.sh +0 -127
- package/.claude/skills/cfn-redis-coordination/examples/mesh-pattern.sh +0 -171
- package/.claude/skills/cfn-redis-coordination/examples/timeout-handling.sh +0 -227
- package/.claude/skills/cfn-redis-coordination/examples/waiting-mode-pattern.sh +0 -239
- package/.claude/skills/cfn-redis-coordination/execute-product-owner-decision.sh +0 -258
- package/.claude/skills/cfn-redis-coordination/get-agent-timeout.sh +0 -177
- package/.claude/skills/cfn-redis-coordination/heartbeat-functions.sh +0 -137
- package/.claude/skills/cfn-redis-coordination/heartbeat-protocol.md +0 -106
- package/.claude/skills/cfn-redis-coordination/heartbeat.sh +0 -126
- package/.claude/skills/cfn-redis-coordination/init-swarm.sh +0 -148
- package/.claude/skills/cfn-redis-coordination/invoke-redis-pattern.sh +0 -220
- package/.claude/skills/cfn-redis-coordination/invoke-waiting-mode.sh +0 -283
- package/.claude/skills/cfn-redis-coordination/list-active-swarms.sh +0 -147
- package/.claude/skills/cfn-redis-coordination/log-event.sh +0 -109
- package/.claude/skills/cfn-redis-coordination/metrics-export.sh +0 -674
- package/.claude/skills/cfn-redis-coordination/metrics-schema.json +0 -66
- package/.claude/skills/cfn-redis-coordination/metrics-storage.md +0 -31
- package/.claude/skills/cfn-redis-coordination/monitor-cfn-violations.sh +0 -391
- package/.claude/skills/cfn-redis-coordination/monitor-heartbeats.sh +0 -101
- package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop-v3.sh +0 -141
- package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh +0 -31
- package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.bak +0 -0
- package/.claude/skills/cfn-redis-coordination/priority-wake-mechanism.md +0 -75
- package/.claude/skills/cfn-redis-coordination/priority_wake.py +0 -134
- package/.claude/skills/cfn-redis-coordination/query-dlq.sh +0 -162
- package/.claude/skills/cfn-redis-coordination/query-logs.sh +0 -103
- package/.claude/skills/cfn-redis-coordination/redis-pattern.sh +0 -619
- package/.claude/skills/cfn-redis-coordination/retrieve-context.sh +0 -58
- package/.claude/skills/cfn-redis-coordination/select-specialist-agent.sh +0 -371
- package/.claude/skills/cfn-redis-coordination/semantic-match-tfidf.py +0 -252
- package/.claude/skills/cfn-redis-coordination/send-heartbeat.sh +0 -165
- package/.claude/skills/cfn-redis-coordination/signal.sh +0 -38
- package/.claude/skills/cfn-redis-coordination/store-context.sh +0 -86
- package/.claude/skills/cfn-redis-coordination/store-epic-context.sh +0 -123
- package/.claude/skills/cfn-redis-coordination/test-context-injection.sh +0 -354
- package/.claude/skills/cfn-redis-coordination/test-timeout-enforcement.sh +0 -513
- package/.claude/skills/cfn-redis-coordination/tests/convert-line-endings.sh +0 -15
- package/.claude/skills/cfn-redis-coordination/tests/dlq-functionality-test.sh +0 -102
- package/.claude/skills/cfn-redis-coordination/tests/edge-cases-test.sh +0 -99
- package/.claude/skills/cfn-redis-coordination/tests/integration-test.sh +0 -170
- package/.claude/skills/cfn-redis-coordination/tests/retry-mechanism-test.sh +0 -82
- package/.claude/skills/cfn-redis-coordination/tests/run-test-suite.sh +0 -92
- package/.claude/skills/cfn-redis-coordination/tests/run-tests.sh +0 -4
- package/.claude/skills/cfn-redis-coordination/tests/test-heartbeat-monitoring.sh +0 -418
- package/.claude/skills/cfn-redis-coordination/tests/test-heartbeat-simple.sh +0 -124
- package/.claude/skills/cfn-redis-coordination/tests/test-primitives.sh +0 -166
- package/.claude/skills/cfn-redis-coordination/tests/test-utils.sh +0 -54
- package/.claude/skills/cfn-redis-coordination/tests/test_utils.sh +0 -49
- package/.claude/skills/cfn-redis-coordination/v2_modularization/core_orchestration.sh +0 -76
- package/.claude/skills/cfn-redis-coordination/validate-parameters.sh +0 -492
- package/claude-assets/skills/cfn-redis-coordination/HEARTBEAT.md +0 -57
- package/claude-assets/skills/cfn-redis-coordination/HEARTBEAT_MONITORING.md +0 -267
- package/claude-assets/skills/cfn-redis-coordination/LOGGING.md +0 -260
- package/claude-assets/skills/cfn-redis-coordination/README.md +0 -65
- package/claude-assets/skills/cfn-redis-coordination/SECURITY_REVIEW.md +0 -25
- package/claude-assets/skills/cfn-redis-coordination/SHUTDOWN_HANDLING.md +0 -164
- package/claude-assets/skills/cfn-redis-coordination/SKILL.md +0 -720
- package/claude-assets/skills/cfn-redis-coordination/demos/test-dlq.sh +0 -129
- package/claude-assets/skills/cfn-redis-coordination/demos/test-iteration-feedback.sh +0 -320
- package/claude-assets/skills/cfn-redis-coordination/demos/test-orchestrator.sh +0 -249
- package/claude-assets/skills/cfn-redis-coordination/demos/test-priority-wake-phase4-unix.sh +0 -148
- package/claude-assets/skills/cfn-redis-coordination/demos/test-priority-wake-phase4.sh +0 -163
- package/claude-assets/skills/cfn-redis-coordination/demos/test-priority-wake.sh +0 -138
- package/claude-assets/skills/cfn-redis-coordination/demos/test-quick-fix.sh +0 -81
- package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-absolute.sh +0 -45
- package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-fallback.sh +0 -68
- package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-percentage.sh +0 -56
- package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum-with-retry.sh +0 -81
- package/claude-assets/skills/cfn-redis-coordination/demos/test-quorum.sh +0 -57
- package/claude-assets/skills/cfn-redis-coordination/demos/test-shutdown-handling.sh +0 -187
- package/claude-assets/skills/cfn-redis-coordination/demos/test-shutdown.sh +0 -160
- package/claude-assets/skills/cfn-redis-coordination/demos/test-utils-unix.sh +0 -97
- package/claude-assets/skills/cfn-redis-coordination/demos/test-utils.sh +0 -97
- package/claude-assets/skills/cfn-redis-coordination/demos/test-waiting-mode.sh +0 -59
- package/claude-assets/skills/cfn-redis-coordination/examples/README.md +0 -73
- package/claude-assets/skills/cfn-redis-coordination/examples/grafana-dashboard.json +0 -352
- package/claude-assets/skills/cfn-redis-coordination/examples/hierarchical-pattern.sh +0 -127
- package/claude-assets/skills/cfn-redis-coordination/examples/mesh-pattern.sh +0 -171
- package/claude-assets/skills/cfn-redis-coordination/examples/timeout-handling.sh +0 -227
- package/claude-assets/skills/cfn-redis-coordination/examples/waiting-mode-pattern.sh +0 -239
- package/claude-assets/skills/cfn-redis-coordination/execute-product-owner-decision.sh +0 -258
- package/claude-assets/skills/cfn-redis-coordination/get-agent-timeout.sh +0 -177
- package/claude-assets/skills/cfn-redis-coordination/heartbeat-functions.sh +0 -137
- package/claude-assets/skills/cfn-redis-coordination/heartbeat-protocol.md +0 -106
- package/claude-assets/skills/cfn-redis-coordination/heartbeat.sh +0 -126
- package/claude-assets/skills/cfn-redis-coordination/init-swarm.sh +0 -148
- package/claude-assets/skills/cfn-redis-coordination/invoke-redis-pattern.sh +0 -220
- package/claude-assets/skills/cfn-redis-coordination/invoke-waiting-mode.sh +0 -283
- package/claude-assets/skills/cfn-redis-coordination/list-active-swarms.sh +0 -147
- package/claude-assets/skills/cfn-redis-coordination/log-event.sh +0 -109
- package/claude-assets/skills/cfn-redis-coordination/metrics-export.sh +0 -674
- package/claude-assets/skills/cfn-redis-coordination/metrics-schema.json +0 -66
- package/claude-assets/skills/cfn-redis-coordination/metrics-storage.md +0 -31
- package/claude-assets/skills/cfn-redis-coordination/monitor-cfn-violations.sh +0 -391
- package/claude-assets/skills/cfn-redis-coordination/monitor-heartbeats.sh +0 -101
- package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop-v3.sh +0 -141
- package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh +0 -31
- package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.bak +0 -0
- package/claude-assets/skills/cfn-redis-coordination/priority-wake-mechanism.md +0 -75
- package/claude-assets/skills/cfn-redis-coordination/priority_wake.py +0 -134
- package/claude-assets/skills/cfn-redis-coordination/query-dlq.sh +0 -162
- package/claude-assets/skills/cfn-redis-coordination/query-logs.sh +0 -103
- package/claude-assets/skills/cfn-redis-coordination/redis-pattern.sh +0 -619
- package/claude-assets/skills/cfn-redis-coordination/retrieve-context.sh +0 -58
- package/claude-assets/skills/cfn-redis-coordination/select-specialist-agent.sh +0 -371
- package/claude-assets/skills/cfn-redis-coordination/semantic-match-tfidf.py +0 -252
- package/claude-assets/skills/cfn-redis-coordination/send-heartbeat.sh +0 -165
- package/claude-assets/skills/cfn-redis-coordination/signal.sh +0 -38
- package/claude-assets/skills/cfn-redis-coordination/store-context.sh +0 -86
- package/claude-assets/skills/cfn-redis-coordination/store-epic-context.sh +0 -123
- package/claude-assets/skills/cfn-redis-coordination/test-context-injection.sh +0 -354
- package/claude-assets/skills/cfn-redis-coordination/test-timeout-enforcement.sh +0 -513
- package/claude-assets/skills/cfn-redis-coordination/tests/convert-line-endings.sh +0 -15
- package/claude-assets/skills/cfn-redis-coordination/tests/dlq-functionality-test.sh +0 -102
- package/claude-assets/skills/cfn-redis-coordination/tests/edge-cases-test.sh +0 -99
- package/claude-assets/skills/cfn-redis-coordination/tests/integration-test.sh +0 -170
- package/claude-assets/skills/cfn-redis-coordination/tests/retry-mechanism-test.sh +0 -82
- package/claude-assets/skills/cfn-redis-coordination/tests/run-test-suite.sh +0 -92
- package/claude-assets/skills/cfn-redis-coordination/tests/run-tests.sh +0 -4
- package/claude-assets/skills/cfn-redis-coordination/tests/test-heartbeat-monitoring.sh +0 -418
- package/claude-assets/skills/cfn-redis-coordination/tests/test-heartbeat-simple.sh +0 -124
- package/claude-assets/skills/cfn-redis-coordination/tests/test-primitives.sh +0 -166
- package/claude-assets/skills/cfn-redis-coordination/tests/test-utils.sh +0 -54
- package/claude-assets/skills/cfn-redis-coordination/tests/test_utils.sh +0 -49
- package/claude-assets/skills/cfn-redis-coordination/v2_modularization/core_orchestration.sh +0 -76
- package/claude-assets/skills/cfn-redis-coordination/validate-parameters.sh +0 -492
|
@@ -1,66 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
-
"title": "CFN Loop Observability Metrics Schema",
|
|
4
|
-
"version": "1.0.0",
|
|
5
|
-
"type": "object",
|
|
6
|
-
"properties": {
|
|
7
|
-
"task_metadata": {
|
|
8
|
-
"type": "object",
|
|
9
|
-
"properties": {
|
|
10
|
-
"task_id": {"type": "string"},
|
|
11
|
-
"mode": {"enum": ["mvp", "standard", "enterprise"]},
|
|
12
|
-
"current_iteration": {"type": "number", "minimum": 1},
|
|
13
|
-
"start_timestamp": {"type": "number"}
|
|
14
|
-
},
|
|
15
|
-
"required": ["task_id", "mode"]
|
|
16
|
-
},
|
|
17
|
-
"iteration_metrics": {
|
|
18
|
-
"type": "object",
|
|
19
|
-
"properties": {
|
|
20
|
-
"duration_ms": {"type": "number", "minimum": 0},
|
|
21
|
-
"iteration_count": {"type": "number", "minimum": 1},
|
|
22
|
-
"gate_pass_rate": {"type": "number", "minimum": 0, "maximum": 1}
|
|
23
|
-
}
|
|
24
|
-
},
|
|
25
|
-
"agent_metrics": {
|
|
26
|
-
"type": "object",
|
|
27
|
-
"properties": {
|
|
28
|
-
"latency_ms": {
|
|
29
|
-
"type": "object",
|
|
30
|
-
"additionalProperties": {
|
|
31
|
-
"type": "object",
|
|
32
|
-
"properties": {
|
|
33
|
-
"min": {"type": "number"},
|
|
34
|
-
"max": {"type": "number"},
|
|
35
|
-
"avg": {"type": "number"}
|
|
36
|
-
}
|
|
37
|
-
}
|
|
38
|
-
},
|
|
39
|
-
"timeout_count": {"type": "number", "minimum": 0},
|
|
40
|
-
"retry_count": {"type": "number", "minimum": 0},
|
|
41
|
-
"heartbeat_miss_count": {"type": "number", "minimum": 0}
|
|
42
|
-
}
|
|
43
|
-
},
|
|
44
|
-
"consensus_metrics": {
|
|
45
|
-
"type": "object",
|
|
46
|
-
"properties": {
|
|
47
|
-
"loop3": {
|
|
48
|
-
"type": "object",
|
|
49
|
-
"properties": {
|
|
50
|
-
"confidence_score": {"type": "number", "minimum": 0, "maximum": 1},
|
|
51
|
-
"gate_pass": {"type": "boolean"}
|
|
52
|
-
}
|
|
53
|
-
},
|
|
54
|
-
"loop2": {
|
|
55
|
-
"type": "object",
|
|
56
|
-
"properties": {
|
|
57
|
-
"consensus_score": {"type": "number", "minimum": 0, "maximum": 1},
|
|
58
|
-
"final_consensus": {"type": "boolean"}
|
|
59
|
-
}
|
|
60
|
-
},
|
|
61
|
-
"quorum_fallback_count": {"type": "number", "minimum": 0}
|
|
62
|
-
}
|
|
63
|
-
}
|
|
64
|
-
},
|
|
65
|
-
"required": ["task_metadata", "iteration_metrics", "agent_metrics", "consensus_metrics"]
|
|
66
|
-
}
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
# CFN Loop Metrics Storage Strategy
|
|
2
|
-
|
|
3
|
-
## Redis Key Patterns
|
|
4
|
-
|
|
5
|
-
### Task-Level Metrics
|
|
6
|
-
- `swarm:{task_id}:metrics:metadata`
|
|
7
|
-
- `swarm:{task_id}:metrics:iteration`
|
|
8
|
-
- `swarm:{task_id}:metrics:agent`
|
|
9
|
-
- `swarm:{task_id}:metrics:consensus`
|
|
10
|
-
|
|
11
|
-
### Iteration-Specific Keys
|
|
12
|
-
- `swarm:{task_id}:iteration:{iteration_number}:duration`
|
|
13
|
-
- `swarm:{task_id}:iteration:{iteration_number}:gate_pass_rate`
|
|
14
|
-
|
|
15
|
-
### Agent-Level Keys
|
|
16
|
-
- `swarm:{task_id}:agent:{agent_id}:latency`
|
|
17
|
-
- `swarm:{task_id}:agent:{agent_id}:timeouts`
|
|
18
|
-
|
|
19
|
-
### Consensus Keys
|
|
20
|
-
- `swarm:{task_id}:consensus:loop3:confidence`
|
|
21
|
-
- `swarm:{task_id}:consensus:loop2:score`
|
|
22
|
-
|
|
23
|
-
## Storage Mechanisms
|
|
24
|
-
- Hash (HSET): Detailed metrics
|
|
25
|
-
- List (LPUSH): Time-series events
|
|
26
|
-
- Sorted Set (ZADD): Ranked metrics
|
|
27
|
-
|
|
28
|
-
## Retention Policy
|
|
29
|
-
- Default: 30 days
|
|
30
|
-
- Can be configured via environment variable
|
|
31
|
-
- Automatic pruning after task completion
|
|
@@ -1,391 +0,0 @@
|
|
|
1
|
-
#!/bin/bash
|
|
2
|
-
# monitor-cfn-violations.sh - Real-time CFN Loop violation detector
|
|
3
|
-
# Part of Redis Coordination Skill
|
|
4
|
-
#
|
|
5
|
-
# Monitors active CFN Loop executions and detects common violations:
|
|
6
|
-
# - Orchestrator never started
|
|
7
|
-
# - Loop 2 started before Loop 3 complete (gate bypass)
|
|
8
|
-
# - Missing agent completion signals
|
|
9
|
-
# - Heartbeat monitoring not started
|
|
10
|
-
# - Product Owner not consulted
|
|
11
|
-
# - Coordinator timeout issues
|
|
12
|
-
#
|
|
13
|
-
# Alerts sent via Redis pub/sub and WebSocket (web portal integration)
|
|
14
|
-
#
|
|
15
|
-
# Usage: ./monitor-cfn-violations.sh [--interval 30] [--websocket-port 3001]
|
|
16
|
-
#
|
|
17
|
-
# Version: 1.0.0
|
|
18
|
-
# Last Updated: 2025-10-20
|
|
19
|
-
|
|
20
|
-
set -euo pipefail

# Absolute directory of this script.
# NOTE(review): not referenced in the visible part of this file; kept in case
# later or sourced code relies on it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REDIS_HOST="${REDIS_HOST:-localhost}"
REDIS_PORT="${REDIS_PORT:-6379}"
CHECK_INTERVAL=30  # seconds between checks
WEBSOCKET_PORT=3001
# Overridable through the environment so the log does not have to live at a
# fixed, predictable path in /tmp. The default is unchanged.
VIOLATION_LOG="${VIOLATION_LOG:-/tmp/cfn-violations.log}"

# Parse arguments.
# Each option needs a value; checking "$#" first avoids the opaque
# "unbound variable" abort that `set -u` produces when the value is missing.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --interval)
      if [[ $# -lt 2 || ! "$2" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
        echo "Error: --interval requires a numeric value in seconds" >&2
        exit 1
      fi
      CHECK_INTERVAL="$2"
      shift 2
      ;;
    --websocket-port)
      if [[ $# -lt 2 || ! "$2" =~ ^[0-9]+$ ]]; then
        echo "Error: --websocket-port requires a numeric port" >&2
        exit 1
      fi
      WEBSOCKET_PORT="$2"
      shift 2
      ;;
    *)
      # Diagnostics belong on stderr so they are not mistaken for output.
      echo "Unknown argument: $1" >&2
      exit 1
      ;;
  esac
done

echo "=== CFN Loop Violation Monitor ==="
echo "Redis: ${REDIS_HOST}:${REDIS_PORT}"
echo "Check interval: ${CHECK_INTERVAL}s"
echo "WebSocket port: ${WEBSOCKET_PORT}"
echo "Log: ${VIOLATION_LOG}"
echo ""

# Initialize violation log (truncates any previous log)
echo "[$(date -Iseconds)] Monitor started" > "$VIOLATION_LOG"
|
|
56
|
-
|
|
57
|
-
# Function: Send violation alert via Redis pub/sub
#
# Builds one JSON alert and delivers it to:
#   - Redis channel "swarm:<task_id>:violations"
#   - Redis channel "cfn:violations:all"
#   - the violation log file (one human-readable line)
#   - POST http://localhost:<WEBSOCKET_PORT>/api/violations (best effort)
#   - stdout (one summary line)
#
# Globals:   REDIS_HOST, REDIS_PORT, WEBSOCKET_PORT, VIOLATION_LOG (read)
# Arguments: $1 task id
#            $2 violation type
#            $3 severity (critical, warning, info)
#            $4 description
#            $5 recommendation
#            $6 evidence (JSON string)
send_violation_alert() {
  local task_id="$1"
  local violation_type="$2"
  local severity="$3"
  local description="$4"
  local recommendation="$5"
  local evidence="$6"

  local timestamp
  timestamp=$(date -Iseconds)

  # Build JSON alert
  local -a jq_args=(
    --arg ts "$timestamp"
    --arg tid "$task_id"
    --arg vtype "$violation_type"
    --arg sev "$severity"
    --arg desc "$description"
    --arg rec "$recommendation"
  )
  local jq_filter='{
      timestamp: $ts,
      task_id: $tid,
      violation_type: $vtype,
      severity: $sev,
      description: $desc,
      recommendation: $rec,
      evidence: $ev
    }'
  local alert
  if ! alert=$(jq -nc "${jq_args[@]}" --argjson ev "$evidence" "$jq_filter" 2>/dev/null); then
    # --argjson rejects anything that is not valid JSON. Ship the evidence as
    # a plain string rather than publishing an empty alert.
    if ! alert=$(jq -nc "${jq_args[@]}" --arg ev "$evidence" "$jq_filter"); then
      echo "Warning: could not build JSON alert for ${violation_type} (task: ${task_id})" >&2
      alert=""
    fi
  fi

  # Publish to task-specific channel.
  # The message must be passed as an argument: redis-cli ignores stdin unless
  # -x is given, so piping the alert in left PUBLISH without its message.
  redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    PUBLISH "swarm:${task_id}:violations" "$alert" >/dev/null

  # Publish to global violations channel (for web portal)
  redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    PUBLISH "cfn:violations:all" "$alert" >/dev/null

  # Log violation
  echo "[${timestamp}] [$severity] $violation_type: $description (task: $task_id)" >> "$VIOLATION_LOG"

  # Send to WebSocket server if available; delivery failures are ignored on
  # purpose because the endpoint is optional.
  if command -v curl &>/dev/null; then
    curl -s -X POST "http://localhost:${WEBSOCKET_PORT}/api/violations" \
      -H "Content-Type: application/json" \
      -d "$alert" >/dev/null 2>&1 || true
  fi

  echo " 🚨 [$severity] $violation_type: $description"
}
|
|
107
|
-
|
|
108
|
-
# Function: Check if orchestrator never started
#
# Raises a critical "orchestrator_never_started" alert when the swarm hash has
# a created_at older than 120 seconds and "swarm:<task_id>:status" is empty.
# The alert is raised at most once per hour per task (dedup key with TTL 3600).
#
# Globals:   REDIS_HOST, REDIS_PORT (read)
# Arguments: $1 Redis key of the swarm hash holding created_at and task_id
# Returns:   0 when nothing needs reporting or the alert was recorded
check_orchestrator_not_started() {
  local swarm_id="$1"

  # Get swarm metadata
  local created_at
  created_at=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    HGET "$swarm_id" created_at 2>/dev/null || echo "")

  if [[ -z "$created_at" ]]; then
    return 0  # Swarm doesn't exist, skip
  fi

  # Convert created_at to epoch seconds. All-digit values are taken as epoch
  # seconds already (date -d does not read those as a timestamp).
  local created_ts
  if [[ "$created_at" =~ ^[0-9]+$ ]]; then
    created_ts=$((10#$created_at))
  elif ! created_ts=$(date -d "$created_at" +%s 2>/dev/null); then
    # Falling back to 0 here would make the swarm look decades old and raise
    # a false critical alert, so skip the check instead.
    echo "Warning: cannot parse created_at '${created_at}' for ${swarm_id}" >&2
    return 0
  fi

  # Calculate time elapsed
  local now_ts elapsed
  now_ts=$(date +%s)
  elapsed=$((now_ts - created_ts))

  # If swarm exists >2 minutes but no status key, orchestrator never started
  if (( elapsed <= 120 )); then
    return 0
  fi

  local task_id
  task_id=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    HGET "$swarm_id" task_id 2>/dev/null || echo "unknown")

  local status_key="swarm:${task_id}:status"
  local status
  status=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    GET "$status_key" 2>/dev/null || echo "")

  if [[ -n "$status" ]]; then
    return 0
  fi

  # Check if already alerted
  local alert_key="violation:${task_id}:orchestrator_not_started"
  if redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" EXISTS "$alert_key" | grep -q "1"; then
    return 0
  fi

  local evidence
  evidence=$(jq -nc \
    --arg created "$created_at" \
    --arg elapsed "$elapsed" \
    --arg status_exists "false" \
    '{
      swarm_created_at: $created,
      time_elapsed_seconds: ($elapsed | tonumber),
      status_key_exists: ($status_exists == "true"),
      agent_keys_count: 0
    }') || evidence='{}'

  send_violation_alert \
    "$task_id" \
    "orchestrator_never_started" \
    "critical" \
    "Orchestrator was never spawned after ${elapsed}s. Coordinator may have failed at Step 2." \
    "Check coordinator logs. Ensure orchestrator spawned with run_in_background: true" \
    "$evidence"

  # Mark as alerted (TTL 1 hour)
  redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    SETEX "$alert_key" 3600 "alerted" >/dev/null
}
|
|
164
|
-
|
|
165
|
-
# Function: Check if Loop 2 started before Loop 3 completed (gate bypass)
#
# Reads "swarm:<task_id>:loop2:started" and "swarm:<task_id>:loop3:complete".
# When the first is set and the second is empty, a critical
# "gate_bypass_violation" alert is sent, unless the dedup key
# "violation:<task_id>:gate_bypass" already exists. The dedup key is then
# stored with a 3600 second TTL.
#
# Globals:   REDIS_HOST, REDIS_PORT (read)
# Arguments: $1 task id
check_gate_bypass() {
  local task_id="$1"
  local started_at gate_done dedup_key payload

  # Nothing to verify until Loop 2 has started.
  started_at=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    GET "swarm:${task_id}:loop2:started" 2>/dev/null || echo "")
  [ -n "$started_at" ] || return 0

  # Loop 3 finished: no violation.
  gate_done=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    GET "swarm:${task_id}:loop3:complete" 2>/dev/null || echo "")
  [ -z "$gate_done" ] || return 0

  # Already reported: stay quiet.
  dedup_key="violation:${task_id}:gate_bypass"
  if redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" EXISTS "$dedup_key" | grep -q "1"; then
    return 0
  fi

  # A jq failure is tolerated here, leaving the payload empty.
  payload=$(jq -nc \
    --arg loop2_start "$started_at" \
    '{
      loop2_started_at: $loop2_start,
      loop3_complete: false,
      gate_passed: false
    }') || true

  send_violation_alert \
    "$task_id" \
    "gate_bypass_violation" \
    "critical" \
    "Loop 2 validators started before Loop 3 gate passed. This violates CFN Loop protocol." \
    "Check orchestrator gate check logic. Loop 2 must BLPOP on gate-passed signal." \
    "$payload"

  redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    SETEX "$dedup_key" 3600 "alerted" >/dev/null
}
|
|
203
|
-
|
|
204
|
-
# Function: Check if agents completed but orchestrator hung
#
# When "swarm:<task_id>:status" matches loop3_waiting or loop2_waiting, counts
# the keys matching "swarm:<task_id>:*:done" and compares that with max_agents
# from "<swarm_id>:metadata". If at least max_agents done keys exist, a
# critical "orchestrator_hang_with_complete_agents" alert is sent (deduplicated
# for 3600 seconds).
#
# Globals:   REDIS_HOST, REDIS_PORT (read)
# Arguments: $1 task id
check_orchestrator_hang() {
  local task_id="$1"

  # Get orchestrator status
  local status
  status=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    GET "swarm:${task_id}:status" 2>/dev/null || echo "")

  # Only relevant while the orchestrator reports that it waits for agents
  if [[ ! "$status" =~ loop3_waiting|loop2_waiting ]]; then
    return 0
  fi

  # Count done signals. Count non-empty lines only: for an empty result
  # redis-cli still prints a blank line, which `wc -l` reported as one key.
  local done_keys
  done_keys=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    KEYS "swarm:${task_id}:*:done" 2>/dev/null | grep -c . || true)
  [[ "$done_keys" =~ ^[0-9]+$ ]] || done_keys=0

  # Get expected agent count. The default swarm id must also apply when the
  # lookup succeeds but the key does not exist (empty reply).
  local swarm_id
  swarm_id=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    GET "task:${task_id}:swarm" 2>/dev/null || true)
  swarm_id="${swarm_id:-swarm:swarm-${task_id}}"

  local expected
  expected=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    HGET "${swarm_id}:metadata" max_agents 2>/dev/null || echo "0")
  # A missing or non-numeric max_agents means "unknown"; treat it as 0 so the
  # comparison below cannot fail with "integer expression expected".
  [[ "$expected" =~ ^[0-9]+$ ]] || expected=0
  expected=$((10#$expected))

  if (( expected <= 0 || done_keys < expected )); then
    return 0
  fi

  # Agents completed but orchestrator still waiting
  local alert_key="violation:${task_id}:orchestrator_hang"
  if redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" EXISTS "$alert_key" | grep -q "1"; then
    return 0
  fi

  local evidence
  evidence=$(jq -nc \
    --arg status "$status" \
    --arg done "$done_keys" \
    --arg expected "$expected" \
    '{
      orchestrator_status: $status,
      done_signals_count: ($done | tonumber),
      expected_agents: ($expected | tonumber)
    }') || evidence='{}'

  send_violation_alert \
    "$task_id" \
    "orchestrator_hang_with_complete_agents" \
    "critical" \
    "All agents signaled completion but orchestrator still waiting. Possible BLPOP key mismatch." \
    "Check orchestrator DONE_KEY construction. Verify agent IDs match (with iteration suffix)." \
    "$evidence"

  redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    SETEX "$alert_key" 3600 "alerted" >/dev/null
}
|
|
252
|
-
|
|
253
|
-
# Function: Check whether the coordinator was killed by a monitoring timeout
#
# Reads created_at, status and shutdown_reason from "<swarm_id>:metadata".
# A critical "coordinator_monitoring_timeout" alert is sent when status is
# "cancelled", shutdown_reason is "SIGTERM_received" and the swarm is between
# 300 and 600 seconds old. The alert is deduplicated for 3600 seconds.
#
# Globals:   REDIS_HOST, REDIS_PORT (read)
# Arguments: $1 task id
check_coordinator_timeout_pattern() {
  local task_id="$1"

  # Resolve the swarm id. The default must also apply when the lookup
  # succeeds but the key does not exist (empty reply).
  local swarm_id
  swarm_id=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    GET "task:${task_id}:swarm" 2>/dev/null || true)
  swarm_id="${swarm_id:-swarm:swarm-${task_id}}"

  local created_at
  created_at=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    HGET "${swarm_id}:metadata" created_at 2>/dev/null || echo "")

  if [[ -z "$created_at" ]]; then
    return 0
  fi

  # Convert created_at to epoch seconds; all-digit values are taken as epoch
  # seconds already. Without a usable timestamp the age window cannot be
  # evaluated, so skip.
  local created_ts
  if [[ "$created_at" =~ ^[0-9]+$ ]]; then
    created_ts=$((10#$created_at))
  elif ! created_ts=$(date -d "$created_at" +%s 2>/dev/null); then
    return 0
  fi

  local now_ts elapsed
  now_ts=$(date +%s)
  elapsed=$((now_ts - created_ts))

  # Check if swarm cancelled with SIGTERM after ~5-10 minutes
  local status
  status=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    HGET "${swarm_id}:metadata" status 2>/dev/null || echo "")
  local shutdown_reason
  shutdown_reason=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    HGET "${swarm_id}:metadata" shutdown_reason 2>/dev/null || echo "")

  if [[ "$status" != "cancelled" || "$shutdown_reason" != "SIGTERM_received" ]]; then
    return 0
  fi
  if (( elapsed < 300 || elapsed > 600 )); then
    return 0
  fi

  local alert_key="violation:${task_id}:coordinator_timeout"
  if redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" EXISTS "$alert_key" | grep -q "1"; then
    return 0
  fi

  local evidence
  evidence=$(jq -nc \
    --arg created "$created_at" \
    --arg elapsed "$elapsed" \
    --arg reason "$shutdown_reason" \
    '{
      swarm_created_at: $created,
      cancelled_after_seconds: ($elapsed | tonumber),
      shutdown_reason: $reason,
      likely_cause: "coordinator_monitoring_with_bash_timeout"
    }') || evidence='{}'

  send_violation_alert \
    "$task_id" \
    "coordinator_monitoring_timeout" \
    "critical" \
    "Coordinator cancelled after ${elapsed}s with SIGTERM. Likely wrapped monitoring in Bash() with timeout." \
    "Check coordinator template. Monitoring must use multiple tool calls in coordinator's own message loop, NOT single Bash() call." \
    "$evidence"

  # Mark as alerted (TTL 1 hour). The port must be REDIS_PORT: passing the
  # host name as the port made this write fail, so the alert repeated on
  # every check.
  redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    SETEX "$alert_key" 3600 "alerted" >/dev/null
}
|
|
303
|
-
|
|
304
|
-
# Function: Check if Product Owner skipped
#
# Reads "swarm:<task_id>:loop2:complete" and
# "swarm:<task_id>:product_owner:consulted". When the first is set, the second
# is empty and more than 60 seconds have passed since the loop2:complete
# value (parsed with `date -d`), a "product_owner_not_consulted" warning is
# sent, unless the dedup key "violation:<task_id>:po_skipped" exists. The
# dedup key is then stored with a 3600 second TTL.
#
# Globals:   REDIS_HOST, REDIS_PORT (read)
# Arguments: $1 task id
check_product_owner_skipped() {
  local task_id="$1"
  local finished_at consulted finished_ts current_ts age dedup_key payload

  # Loop 2 not finished yet: nothing to check.
  finished_at=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    GET "swarm:${task_id}:loop2:complete" 2>/dev/null || echo "")
  [ -n "$finished_at" ] || return 0

  # Product Owner already consulted: no violation.
  consulted=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    GET "swarm:${task_id}:product_owner:consulted" 2>/dev/null || echo "")
  [ -z "$consulted" ] || return 0

  # Allow 60s after Loop 2 completion for the Product Owner to be spawned.
  # A value that `date -d` cannot parse counts as epoch 0.
  finished_ts=$(date -d "$finished_at" +%s 2>/dev/null || echo "0")
  current_ts=$(date +%s) || true
  age=$((current_ts - finished_ts))
  [ "$age" -gt 60 ] || return 0

  # Already reported: stay quiet.
  dedup_key="violation:${task_id}:po_skipped"
  if redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" EXISTS "$dedup_key" | grep -q "1"; then
    return 0
  fi

  # A jq failure is tolerated here, leaving the payload empty.
  payload=$(jq -nc \
    --arg loop2_complete "$finished_at" \
    --arg elapsed "$age" \
    '{
      loop2_completed_at: $loop2_complete,
      time_since_loop2_seconds: ($elapsed | tonumber),
      product_owner_consulted: false
    }') || true

  send_violation_alert \
    "$task_id" \
    "product_owner_not_consulted" \
    "warning" \
    "Loop 2 completed ${age}s ago but Product Owner not consulted. Strategic decision skipped." \
    "Check orchestrator Product Owner spawning logic. PO should be spawned after Loop 2 consensus check." \
    "$payload"

  redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    SETEX "$dedup_key" 3600 "alerted" >/dev/null
}
|
|
350
|
-
|
|
351
|
-
# Main monitoring loop
#
# Every CHECK_INTERVAL seconds: list all "swarm:*:metadata" keys, read the
# task_id field of each and run the five violation checks for it. Runs until
# the process is terminated.
echo "Starting violation monitoring..."
echo ""

ITERATION=0
while true; do
  ITERATION=$((ITERATION + 1))
  echo "[Check #${ITERATION}] $(date '+%H:%M:%S')"

  # Find all active swarm metadata keys.
  # --scan iterates with SCAN instead of KEYS, which blocks the server while
  # it walks the whole keyspace.
  SWARM_KEYS=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
    --scan --pattern "swarm:*:metadata" 2>/dev/null || echo "")

  if [[ -z "$SWARM_KEYS" ]]; then
    echo " No active swarms found"
  else
    SWARM_COUNT=$(grep -c . <<< "$SWARM_KEYS" || true)
    echo " Monitoring $SWARM_COUNT swarm(s)..."

    # Read one key per line on fd 3: no word splitting or globbing on key
    # names, and commands in the loop body cannot consume the key list from
    # stdin.
    while IFS= read -r -u 3 SWARM_KEY; do
      if [[ -z "$SWARM_KEY" ]]; then
        continue
      fi

      # Extract task ID
      TASK_ID=$(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" \
        HGET "$SWARM_KEY" task_id 2>/dev/null || echo "")

      if [[ -z "$TASK_ID" ]]; then
        continue
      fi

      # Run violation checks
      check_orchestrator_not_started "$SWARM_KEY"
      check_gate_bypass "$TASK_ID"
      check_orchestrator_hang "$TASK_ID"
      check_coordinator_timeout_pattern "$TASK_ID"
      check_product_owner_skipped "$TASK_ID"
    done 3<<< "$SWARM_KEYS"
  fi

  echo " Sleeping ${CHECK_INTERVAL}s..."
  echo ""
  sleep "$CHECK_INTERVAL"
done
|
|
@@ -1,101 +0,0 @@
|
|
|
1
|
-
#!/bin/bash
|
|
2
|
-
# Redis Coordination Skill - Agent Heartbeat Monitor
|
|
3
|
-
# Version: 1.0.0
|
|
4
|
-
# Last Updated: 2025-10-19
|
|
5
|
-
|
|
6
|
-
# Strict error handling
|
|
7
|
-
set -euo pipefail
|
|
8
|
-
|
|
9
|
-
# Default values
TASK_ID=""
CHECK_INTERVAL=30
MISS_THRESHOLD=2
AGENTS=()

# Parse command-line arguments
#
# parse_args fills the globals above from the given argument list.
#   --task-id ID          sets TASK_ID
#   --check-interval SEC  sets CHECK_INTERVAL (passed to sleep as-is)
#   --miss-threshold N    sets MISS_THRESHOLD (non-negative integer)
#   --agents a,b,c        sets AGENTS from a comma-separated list
# Prints a message to stderr and exits 1 on an unknown flag, on a flag that
# is missing its value, or on a non-numeric miss threshold.
parse_args() {
  while [[ $# -gt 0 ]]; do
    # Every known flag takes exactly one value. Check for it up front so a
    # trailing flag yields a clear error rather than an "unbound variable"
    # or failed "shift 2" under set -u / set -e.
    case "$1" in
      --task-id | --check-interval | --miss-threshold | --agents)
        if [[ $# -lt 2 ]]; then
          echo "Error: $1 requires a value" >&2
          exit 1
        fi
        ;;
    esac

    case "$1" in
      --task-id)
        TASK_ID="$2"
        shift 2
        ;;
      --check-interval)
        CHECK_INTERVAL="$2"
        shift 2
        ;;
      --miss-threshold)
        # The threshold is later evaluated inside (( )), where an arbitrary
        # string would be interpreted as an arithmetic expression.
        if [[ ! "$2" =~ ^[0-9]+$ ]]; then
          echo "Error: --miss-threshold must be a non-negative integer" >&2
          exit 1
        fi
        MISS_THRESHOLD="$2"
        shift 2
        ;;
      --agents)
        IFS=',' read -ra AGENTS <<< "$2"
        shift 2
        ;;
      *)
        echo "Unknown parameter: $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
|
|
40
|
-
|
|
41
|
-
# Validate required parameters
# TASK_ID is part of every Redis key this script reads, so refuse to start
# without it. The diagnostic goes to stderr so it is not mixed into stdout.
if [[ -z "$TASK_ID" ]]; then
  echo "Error: task-id is required" >&2
  exit 1
fi
|
|
46
|
-
|
|
47
|
-
# Function to check agent heartbeat
#
# Reads the Redis key "swarm:${TASK_ID}:<agent>:heartbeat" and tracks, per
# agent, how many consecutive checks found it empty. When that count reaches
# MISS_THRESHOLD the counter is reset and handle_agent_failure is called, so
# recovery fires once per MISS_THRESHOLD consecutive misses.
#
# Globals:
#   TASK_ID                (read)
#   MISS_THRESHOLD         (read)
#   HEARTBEAT_MISS_COUNTS  (read/written; associative array created on demand)
#   HEARTBEAT_LOG_DIR      (read; optional, defaults to /var/log/claude-flow)
# Arguments:
#   $1 - agent id
# Returns:
#   0 normally; non-zero if redis-cli fails or handle_agent_failure fails.
check_agent_heartbeat() {
  local agent_id="$1"
  local log_dir="${HEARTBEAT_LOG_DIR:-/var/log/claude-flow}"
  local last_heartbeat
  local miss_count

  # An empty id cannot be used as an associative-array subscript.
  if [[ -z "$agent_id" ]]; then
    return 0
  fi

  # The miss counter has to survive between calls; a function-local counter
  # restarts at 0 on every check and can never reach a threshold above 1.
  # Re-declaring an existing associative array keeps its contents.
  declare -gA HEARTBEAT_MISS_COUNTS

  # Check heartbeat key
  last_heartbeat=$(redis-cli get "swarm:${TASK_ID}:${agent_id}:heartbeat") || {
    echo "Error: redis-cli failed reading heartbeat for agent: ${agent_id}" >&2
    return 1
  }

  # Reset miss count if heartbeat exists
  if [[ -n "$last_heartbeat" ]]; then
    HEARTBEAT_MISS_COUNTS["$agent_id"]=0
    return 0
  fi

  # No heartbeat found: increment the miss count. Plain assignment is used
  # because ((var++)) returns status 1 when var is 0, which would terminate
  # a script running under set -e.
  miss_count=$(( ${HEARTBEAT_MISS_COUNTS["$agent_id"]:-0} + 1 ))
  HEARTBEAT_MISS_COUNTS["$agent_id"]=$miss_count

  # Logging is best effort: an unwritable log must not stop monitoring.
  {
    mkdir -p "$log_dir" &&
      echo "[$(date -u)] No heartbeat detected for agent: ${agent_id}" >> "${log_dir}/heartbeat-misses.log"
  } 2>/dev/null ||
    echo "Warning: could not write to ${log_dir}/heartbeat-misses.log" >&2

  # Trigger actions on missed heartbeats
  if (( miss_count >= MISS_THRESHOLD )); then
    HEARTBEAT_MISS_COUNTS["$agent_id"]=0
    handle_agent_failure "$agent_id"
  fi
}
|
|
70
|
-
|
|
71
|
-
# Function to handle agent failure
#
# Records the failure, removes the agent from the Redis set
# "swarm:${TASK_ID}:active-agents" and launches the recovery script.
# Logging and set removal are best effort: if either fails, a warning is
# printed to stderr and recovery is still attempted.
#
# Globals:
#   TASK_ID            (read)
#   HEARTBEAT_LOG_DIR  (read; optional, defaults to /var/log/claude-flow)
# Arguments:
#   $1 - agent id
# Returns:
#   Exit status of the recovery script.
handle_agent_failure() {
  local agent_id="$1"
  local log_dir="${HEARTBEAT_LOG_DIR:-/var/log/claude-flow}"
  # Relative path: resolved against the current working directory.
  local recovery_script="./.claude/skills/cfn-redis-coordination/agent-recovery.sh"

  # Log agent failure
  {
    mkdir -p "$log_dir" &&
      echo "[$(date -u)] CRITICAL: Agent ${agent_id} failed health check" >> "${log_dir}/agent-failures.log"
  } 2>/dev/null ||
    echo "Warning: could not write to ${log_dir}/agent-failures.log" >&2

  # Remove from active agents
  redis-cli srem "swarm:${TASK_ID}:active-agents" "$agent_id" ||
    echo "Warning: could not remove ${agent_id} from swarm:${TASK_ID}:active-agents" >&2

  # Trigger emergency recovery
  "$recovery_script" \
    --task-id "$TASK_ID" \
    --agent-id "$agent_id"
}
|
|
86
|
-
|
|
87
|
-
# Main monitoring loop
#
# Runs check_agent_heartbeat for every monitored agent, then sleeps
# CHECK_INTERVAL seconds, forever.
#
# If agents were supplied up front (AGENTS non-empty on entry) that list is
# used unchanged. Otherwise the list is re-read from the Redis set
# "swarm:${TASK_ID}:active-agents" on every pass, so agents added or removed
# after startup are picked up. Fetching only while AGENTS is empty would
# freeze the list after the first successful read.
AGENTS_PINNED=0
if [[ ${#AGENTS[@]} -gt 0 ]]; then
  AGENTS_PINNED=1
fi

while true; do
  # If no agents specified, fetch from Redis set
  if [[ "$AGENTS_PINNED" -eq 0 ]]; then
    mapfile -t AGENTS < <(redis-cli smembers "swarm:${TASK_ID}:active-agents")
  fi

  # Check heartbeat for each agent.
  # The ${AGENTS[@]+...} form keeps an empty array from tripping set -u on
  # Bash releases older than 4.4.
  for agent in ${AGENTS[@]+"${AGENTS[@]}"}; do
    # Skip blank entries (e.g. an empty line in the redis-cli output).
    if [[ -z "$agent" ]]; then
      continue
    fi
    check_agent_heartbeat "$agent"
  done

  # Sleep before next check
  sleep "$CHECK_INTERVAL"
done
|