claude-flow-novice 2.6.0 → 2.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/adaptive-context/cfn-v3-reflection.json +21 -0
- package/.claude/agents/AGENT_LIFECYCLE.md +495 -0
- package/.claude/agents/CLAUDE.md +1002 -995
- package/.claude/agents/accessibility-advocate.md +457 -0
- package/.claude/agents/agent-principles/README.md +226 -226
- package/.claude/agents/agent-principles/agent-type-guidelines.md +10 -0
- package/.claude/agents/agent-principles/format-selection.md +10 -0
- package/.claude/agents/agent-principles/phase4-template-optimization.md +502 -494
- package/.claude/agents/agent-principles/prompt-engineering.md +8 -0
- package/.claude/agents/agent-principles/quality-metrics.md +8 -0
- package/.claude/agents/analysis/code-analyzer.md +7 -17
- package/.claude/agents/analysis/code-review/analyze-code-quality.md +2 -104
- package/.claude/agents/analysis/perf-analyzer.md +2 -196
- package/.claude/agents/context/context-curator.md +78 -84
- package/.claude/agents/context/context-reflector.md +27 -81
- package/.claude/agents/coordinators/README.md +42 -0
- package/.claude/agents/coordinators/cfn-v3-coordinator.md +440 -0
- package/.claude/agents/{product-owner-team → coordinators}/cto-agent.md +154 -187
- package/.claude/agents/coordinators/multi-sprint-coordinator.md +50 -0
- package/.claude/agents/{product-owner-team → coordinators}/product-owner-agent.md +6 -39
- package/.claude/agents/{cfn-loop → coordinators}/product-owner.md +72 -17
- package/.claude/agents/core-agents/reviewer.md +114 -135
- package/.claude/agents/custom/agent-builder.md +637 -637
- package/.claude/agents/developers/README.md +69 -0
- package/.claude/agents/developers/backend-dev.md +77 -0
- package/.claude/agents/{core-agents → developers}/coder.md +131 -26
- package/.claude/agents/developers/react-frontend-engineer.md +121 -0
- package/.claude/agents/{frontend → developers}/state-architect.md +1 -0
- package/.claude/agents/{frontend → developers}/ui-designer.md +1 -0
- package/.claude/agents/development/backend/dev-backend-api.md +0 -29
- package/.claude/agents/development/npm-package-specialist.md +355 -347
- package/.claude/agents/documentation/api-docs/docs-api-openapi.md +8 -0
- package/.claude/agents/documentation/api-docs.md +8 -0
- package/.claude/agents/github/github-commit-agent.md +125 -117
- package/.claude/agents/goal/goal-planner.md +8 -0
- package/.claude/agents/infrastructure/README.md +100 -0
- package/.claude/agents/{specialized → infrastructure}/devops-engineer.md +131 -150
- package/.claude/agents/planners/README.md +94 -0
- package/.claude/agents/{core-agents → planners}/analyst.md +1 -22
- package/.claude/agents/{planning-team → planners}/api-designer-persona.md +8 -0
- package/.claude/agents/{core-agents → planners}/architect.md +7 -20
- package/.claude/agents/{core-agents → planners}/planner.md +0 -21
- package/.claude/agents/{planning-team → planners}/security-architect-persona.md +8 -28
- package/.claude/agents/{planning-team → planners}/system-architect-persona.md +6 -38
- package/.claude/agents/{architecture → planners}/system-architect.md +12 -17
- package/.claude/agents/product-owner-team/accessibility-advocate-persona.md +132 -161
- package/.claude/agents/product-owner-team/power-user-persona.md +149 -182
- package/.claude/agents/retrospective-analyst.md +84 -0
- package/.claude/agents/reviewers/README.md +58 -0
- package/.claude/agents/{analysis → reviewers}/code-quality-validator.md +8 -17
- package/.claude/agents/reviewers/reviewer.md +181 -0
- package/.claude/agents/sparc/architecture.md +6 -25
- package/.claude/agents/sparc/pseudocode.md +6 -0
- package/.claude/agents/sparc/refinement.md +6 -0
- package/.claude/agents/sparc/specification.md +1 -0
- package/.claude/agents/specialists/README.md +60 -0
- package/.claude/agents/{core-agents → specialists}/base-template-generator.md +8 -21
- package/.claude/agents/{specialized → specialists}/cli-agent-optimizer.md +1 -1
- package/.claude/agents/{specialized → specialists}/code-booster.md +1 -0
- package/.claude/agents/{consensus → specialists}/consensus-builder.md +1 -17
- package/.claude/agents/{specialized/mobile → specialists}/mobile-dev.md +0 -20
- package/.claude/agents/{core-agents → specialists}/performance-benchmarker.md +134 -148
- package/.claude/agents/{specialized → specialists}/rust-developer.md +1 -20
- package/.claude/agents/{specialized → specialists}/rust-enterprise-developer.md +1 -20
- package/.claude/agents/{specialized → specialists}/rust-mvp-developer.md +1 -20
- package/.claude/agents/{core-agents → specialists}/security-manager.md +68 -88
- package/.claude/agents/{security → specialists}/security-specialist-existing.md +6 -57
- package/.claude/agents/{security → specialists}/security-specialist.md +6 -30
- package/.claude/agents/{specialized/mobile → specialists}/spec-mobile-react-native.md +2 -21
- package/.claude/agents/testers/README.md +94 -0
- package/.claude/agents/{testing → testers}/e2e/playwright-agent.md +1 -20
- package/.claude/agents/{testing → testers}/interaction-tester.md +1 -20
- package/.claude/agents/{testing → testers}/playwright-tester.md +1 -1
- package/.claude/agents/testers/tester.md +139 -0
- package/.claude/agents/testers/unit/tdd-london-swarm.md +49 -0
- package/.claude/agents/testers/validation/production-validator.md +33 -0
- package/.claude/agents-ignore/cfn-loop-coordinator.md +157 -0
- package/.claude/agents-ignore/cfn-loop-coordinator.md.backup +156 -0
- package/.claude/agents-ignore/coordinator.md.backup +182 -0
- package/.claude/agents-ignore/cost-savings-cfn-loop-coordinator.md +760 -0
- package/.claude/agents-ignore/cost-savings-coordinator.md +173 -0
- package/.claude/artifacts/ace-reflections/REFLECT-001-summary.json +39 -0
- package/.claude/artifacts/ace-reflections/sprint-7_$(date -u +/"%Y%m%d_%H%M%S/").json" +47 -0
- package/.claude/commands/CFN_COORDINATOR_PARAMETERS.md +10 -10
- package/.claude/commands/cfn-loop-epic.md +3 -3
- package/.claude/commands/cfn-loop-single.md +3 -3
- package/.claude/commands/cfn-loop-sprints.md +1 -1
- package/.claude/commands/cfn-loop.md +3 -3
- package/.claude/commands/cfn-mode.md +20 -0
- package/.claude/commands/write-plan.md +104 -0
- package/.claude/data/cfn-loop.db +0 -0
- package/.claude/data/cfn_loop_logs.db +0 -0
- package/.claude/hooks/BACKUP_USAGE.md +243 -0
- package/.claude/hooks/post-edit-cfn-retrospective.sh +79 -0
- package/.claude/hooks/post-edit.sh +21 -0
- package/.claude/hooks/pre-edit-backup.sh +71 -0
- package/.claude/hooks/restore-from-backup.sh +37 -0
- package/.claude/prompts/cfn-loop-context.md +115 -0
- package/.claude/prompts/loop-specific/loop2.md +50 -0
- package/.claude/prompts/loop-specific/loop3.md +43 -0
- package/.claude/prompts/loop-specific/loop4.md +54 -0
- package/.claude/root-claude-distribute/CLAUDE.md +76 -2
- package/.claude/skills/ace-system/sprint-7-lessons.json +46 -0
- package/.claude/skills/ace-system/store-reflection.sh +33 -136
- package/.claude/skills/agent-discovery/SKILL.md +40 -0
- package/.claude/skills/agent-discovery/agents-registry-clean.json +0 -0
- package/.claude/skills/agent-discovery/agents-registry-fixed.json +19 -0
- package/.claude/skills/agent-discovery/agents-registry.json +718 -0
- package/.claude/skills/agent-discovery/discover-agents.py +175 -0
- package/.claude/skills/agent-discovery/discover-agents.sh +87 -0
- package/.claude/skills/agent-discovery/invoke-registry.sh +11 -0
- package/.claude/skills/agent-discovery/temp_script.py +0 -0
- package/.claude/skills/agent-output-processing/SKILL.md +359 -0
- package/.claude/skills/agent-selector/SKILL.md +90 -0
- package/.claude/skills/agent-selector/select-agents.sh +96 -0
- package/.claude/skills/agent-spawning/agent-selection-guide.md +1 -1
- package/.claude/skills/agent-swap/SKILL.md +36 -0
- package/.claude/skills/agent-swap/recommend-swap.sh +60 -0
- package/.claude/skills/api-validation/test-endpoints.sh +54 -0
- package/.claude/skills/automatic-memory-persistence/SKILL.md +73 -0
- package/.claude/skills/automatic-memory-persistence/persist-agent-output.sh +49 -0
- package/.claude/skills/automatic-memory-persistence/query-agent-history.sh +35 -0
- package/.claude/skills/automatic-memory-persistence/test-memory-persistence.sh +235 -0
- package/.claude/skills/cfn-loop-orchestration/README.md +41 -0
- package/.claude/skills/cfn-loop-orchestration/SKILL.md +299 -0
- package/.claude/skills/cfn-loop-orchestration/helpers/auto-tune-timeouts.sh +228 -0
- package/.claude/skills/cfn-loop-orchestration/helpers/consensus.sh +84 -0
- package/.claude/skills/cfn-loop-orchestration/helpers/deliverable-verifier.sh +71 -0
- package/.claude/skills/cfn-loop-orchestration/helpers/gate-check.sh +90 -0
- package/.claude/skills/cfn-loop-orchestration/helpers/iteration-manager.sh +87 -0
- package/.claude/skills/cfn-loop-orchestration/helpers/timeout-calculator.sh +51 -0
- package/.claude/skills/cfn-loop-orchestration/inject-loop-context.sh +41 -0
- package/.claude/skills/cfn-loop-orchestration/monitor-execution.sh +156 -0
- package/.claude/skills/cfn-loop-orchestration/orchestrate.sh +840 -0
- package/.claude/skills/cfn-loop-orchestration/security_utils.sh +99 -0
- package/.claude/skills/cfn-loop-orchestration/test-cfn-orchestration.sh +281 -0
- package/.claude/skills/cfn-loop-orchestration/test-edge-cases.sh +188 -0
- package/.claude/skills/cfn-loop-validation/SKILL.md +307 -217
- package/.claude/skills/complexity-estimator/SKILL.md +96 -0
- package/.claude/skills/complexity-estimator/estimate-complexity.sh +144 -0
- package/.claude/skills/context-pruner/SKILL.md +75 -0
- package/.claude/skills/context-pruner/prune-context.sh +73 -0
- package/.claude/skills/defense-in-depth/SKILL.md +133 -0
- package/.claude/skills/dependency-extractor/SKILL.md +35 -0
- package/.claude/skills/dependency-extractor/extract-dependencies.sh +66 -0
- package/.claude/skills/epic-decomposer/SKILL.md +44 -0
- package/.claude/skills/epic-decomposer/decompose-epic.sh +104 -0
- package/.claude/skills/improvement-recommender/SKILL.md +33 -0
- package/.claude/skills/improvement-recommender/recommend-improvements.sh +92 -0
- package/.claude/skills/intervention-detector/SKILL.md +39 -0
- package/.claude/skills/intervention-detector/detect-intervention.sh +111 -0
- package/.claude/skills/intervention-orchestrator/SKILL.md +43 -0
- package/.claude/skills/intervention-orchestrator/execute-intervention.sh +59 -0
- package/.claude/skills/loop2-output-processing/SKILL.md +163 -0
- package/.claude/skills/loop2-output-processing/execute-and-extract.sh +77 -0
- package/.claude/skills/loop2-output-processing/execute-and-extract.sh.backup +36 -0
- package/.claude/skills/loop2-output-processing/parse-feedback.sh +147 -0
- package/.claude/skills/loop2-output-processing/process-validator-output.sh +275 -0
- package/.claude/skills/loop2-output-processing/test-bug27-fix.sh +200 -0
- package/.claude/skills/loop2-output-processing/test-loop2-processing.sh +113 -0
- package/.claude/skills/loop3-output-processing/AGENT_COMPLETION_PROTOCOL.md +206 -0
- package/.claude/skills/loop3-output-processing/SKILL.md +421 -0
- package/.claude/skills/loop3-output-processing/calculate-confidence.sh +28 -0
- package/.claude/skills/loop3-output-processing/execute-and-extract.sh +85 -0
- package/.claude/skills/loop3-output-processing/parse-confidence.sh +31 -0
- package/.claude/skills/loop3-output-processing/test-agent-timeout.sh +327 -0
- package/.claude/skills/loop3-output-processing/test-loop3-processing.sh +155 -0
- package/.claude/skills/loop3-output-processing/verify-deliverables.sh +42 -0
- package/.claude/skills/pattern-extraction/SKILL.md +30 -0
- package/.claude/skills/pattern-extraction/extract-patterns.sh +80 -0
- package/.claude/skills/playbook/SKILL.md +113 -0
- package/.claude/skills/playbook/init-playbook.sh +54 -0
- package/.claude/skills/playbook/playbook.db +0 -0
- package/.claude/skills/playbook/query-playbook.sh +79 -0
- package/.claude/skills/playbook/update-playbook.sh +69 -0
- package/.claude/skills/playbook-auto-update/SKILL.md +29 -0
- package/.claude/skills/playbook-auto-update/auto-update-playbook.sh +86 -0
- package/.claude/skills/product-owner-decision/SKILL.md +332 -0
- package/.claude/skills/product-owner-decision/execute-decision.sh +176 -0
- package/.claude/skills/product-owner-decision/parse-decision.sh +66 -0
- package/.claude/skills/product-owner-decision/validate-deliverables.sh +82 -0
- package/.claude/skills/redis-coordination/LOGGING.md +260 -0
- package/.claude/skills/redis-coordination/README.md +30 -29
- package/.claude/skills/redis-coordination/SKILL.md +685 -83
- package/.claude/skills/redis-coordination/analyze-task-complexity.sh +277 -0
- package/.claude/skills/redis-coordination/cfn-loop-exec.sh +468 -0
- package/.claude/skills/redis-coordination/collect-confidence-scores.sh +179 -0
- package/.claude/skills/redis-coordination/collect-results.sh +75 -0
- package/.claude/skills/redis-coordination/data/cfn-loop.db +0 -0
- package/.claude/skills/redis-coordination/{test-orchestrator.sh → demos/test-orchestrator.sh} +25 -0
- package/.claude/skills/redis-coordination/execute-product-owner-decision.sh +258 -0
- package/.claude/skills/redis-coordination/get-agent-timeout.sh +176 -176
- package/.claude/skills/redis-coordination/invoke-waiting-mode.sh +93 -227
- package/.claude/skills/redis-coordination/invoke-waiting-mode.sh.backup-p7 +423 -0
- package/.claude/skills/redis-coordination/log-event.sh +109 -0
- package/.claude/skills/redis-coordination/monitor-cfn-violations.sh +391 -0
- package/.claude/skills/redis-coordination/orchestrate-cfn-loop-v3.sh +141 -0
- package/.claude/skills/redis-coordination/orchestrate-cfn-loop.sh +31 -993
- package/.claude/skills/redis-coordination/orchestrate-cfn-loop.sh.backup +38 -0
- package/.claude/skills/redis-coordination/orchestrate-cfn-loop.sh.backup-1761167675 +1672 -0
- package/.claude/skills/redis-coordination/orchestrate-cfn-loop.sh.backup-p5 +1604 -0
- package/.claude/skills/redis-coordination/orchestrate-cfn-loop.sh.backup-phase1 +1550 -0
- package/.claude/skills/redis-coordination/orchestrate-cfn-loop.sh.backup-phase2 +1621 -0
- package/.claude/skills/redis-coordination/orchestrate-cfn-loop.sh.backup-phase3 +1621 -0
- package/.claude/skills/redis-coordination/orchestrate-cfn-loop.sh.bak +0 -0
- package/.claude/skills/redis-coordination/orchestrate-cfn-loop.sh.broken +1627 -0
- package/.claude/skills/redis-coordination/orchestrate-cfn-loop.sh.corrupted +80 -0
- package/.claude/skills/redis-coordination/orchestrate-cfn-loop.sh.deprecated +1864 -0
- package/.claude/skills/redis-coordination/query-logs.sh +103 -0
- package/.claude/skills/redis-coordination/retrieve-context.sh +58 -0
- package/.claude/skills/redis-coordination/select-specialist-agent.sh +371 -0
- package/.claude/skills/redis-coordination/semantic-match-tfidf.py +252 -0
- package/.claude/skills/redis-coordination/send-heartbeat.sh +164 -72
- package/.claude/skills/redis-coordination/signal.sh +38 -0
- package/.claude/skills/redis-coordination/store-context.sh +86 -0
- package/.claude/skills/redis-coordination/test-context-injection.sh +354 -0
- package/.claude/skills/redis-coordination/test-timeout-enforcement.sh +513 -0
- package/.claude/skills/redis-coordination/tests/convert-line-endings.sh +15 -0
- package/.claude/skills/redis-coordination/tests/dlq-functionality-test.sh +101 -101
- package/.claude/skills/redis-coordination/tests/edge-cases-test.sh +98 -98
- package/.claude/skills/redis-coordination/tests/integration-test.sh +169 -169
- package/.claude/skills/redis-coordination/tests/retry-mechanism-test.sh +81 -81
- package/.claude/skills/redis-coordination/tests/run-test-suite.sh +91 -91
- package/.claude/skills/redis-coordination/tests/run-tests.sh +4 -0
- package/.claude/skills/redis-coordination/tests/test-primitives.sh +166 -0
- package/.claude/skills/redis-coordination/tests/test-utils.sh +53 -121
- package/.claude/skills/redis-coordination/tests/test_coordination_primitives.sh.deprecated +20 -0
- package/.claude/skills/redis-coordination/tests/test_utils.sh +49 -0
- package/.claude/skills/redis-coordination/v2_modularization/core_orchestration.sh +76 -0
- package/.claude/skills/redis-coordination/validate-parameters.sh +492 -0
- package/.claude/skills/retrospective-report/SKILL.md +31 -0
- package/.claude/skills/retrospective-report/generate-report.sh +101 -0
- package/.claude/skills/run-all-skill-tests.sh +124 -0
- package/.claude/skills/scope-simplifier/SKILL.md +37 -0
- package/.claude/skills/scope-simplifier/simplify-scope.sh +68 -0
- package/.claude/skills/simplified-agent-lifecycle/COST_ANALYSIS.md +49 -0
- package/.claude/skills/simplified-agent-lifecycle/DESIGN.md +98 -0
- package/.claude/skills/simplified-agent-lifecycle/MIGRATION_PLAN.md +74 -0
- package/.claude/skills/specialist-injection/SKILL.md +41 -0
- package/.claude/skills/specialist-injection/recommend-specialist.sh +57 -0
- package/.claude/skills/sprint-execution/SKILL.md +27 -0
- package/.claude/skills/sprint-execution/execute-sprint-task.sh +59 -0
- package/.claude/skills/sprint-execution/execute-sprint.sh +65 -0
- package/.claude/skills/sprint-planner/SKILL.md +37 -0
- package/.claude/skills/sprint-planner/plan-sprint.sh +85 -0
- package/.claude/skills/standardized-error-handling/SKILL.md +56 -0
- package/.claude/skills/standardized-error-handling/capture-agent-error.sh +87 -0
- package/.claude/skills/standardized-error-handling/test-error-handling.sh +166 -0
- package/.claude/skills/task-classifier/SKILL.md +94 -0
- package/.claude/skills/task-classifier/classify-task.sh +115 -0
- package/.claude/skills/validation-templates/SKILL.md +47 -0
- package/.claude/skills/validation-templates/content.json +38 -0
- package/.claude/skills/validation-templates/data.json +38 -0
- package/.claude/skills/validation-templates/design.json +38 -0
- package/.claude/skills/validation-templates/infrastructure.json +38 -0
- package/.claude/skills/validation-templates/research.json +38 -0
- package/.claude/skills/validation-templates/software.json +38 -0
- package/.claude/skills/webapp-testing/README.md +142 -0
- package/.claude/skills/webapp-testing/SCREENSHOT_NAMING_CONVENTION.md +547 -0
- package/.claude/skills/webapp-testing/SKILL.md +877 -0
- package/.claude/skills/webapp-testing/capture-screenshot.sh +238 -0
- package/.claude/skills/webapp-testing/cfn-loop-integration.sh +265 -0
- package/.claude/skills/webapp-testing/compare-screenshots.sh +199 -0
- package/.claude/skills/webapp-testing/init-storage.sh +150 -0
- package/.claude/skills/webapp-testing/set-baseline.sh +196 -0
- package/.claude/skills/webapp-testing/test-webapp-testing.sh +233 -0
- package/.claude/spawn-pattern-examples.md +3 -3
- package/CLAUDE.md +319 -45
- package/README.md +598 -251
- package/dist/agents/agent-loader.js +146 -165
- package/dist/agents/agent-loader.js.map +1 -1
- package/dist/cli/agent-command.js +2 -0
- package/dist/cli/agent-command.js.map +1 -1
- package/dist/cli/agent-definition-parser.js +7 -0
- package/dist/cli/agent-definition-parser.js.map +1 -1
- package/dist/cli/agent-executor.js +145 -11
- package/dist/cli/agent-executor.js.map +1 -1
- package/dist/cli/agent-prompt-builder.js +81 -1
- package/dist/cli/agent-prompt-builder.js.map +1 -1
- package/dist/cli/agent-spawn.js +10 -1
- package/dist/cli/agent-spawn.js.map +1 -1
- package/dist/cli/anthropic-client.js +192 -13
- package/dist/cli/anthropic-client.js.map +1 -1
- package/dist/cli/cfn-context.js +150 -0
- package/dist/cli/cfn-context.js.map +1 -1
- package/dist/cli/cfn-fork.js +159 -0
- package/dist/cli/cfn-fork.js.map +1 -0
- package/dist/cli/cli-agent-context.js +8 -3
- package/dist/cli/cli-agent-context.js.map +1 -1
- package/dist/cli/config-manager.js +109 -91
- package/dist/cli/config-manager.js.map +1 -1
- package/dist/cli/conversation-fork.js +201 -0
- package/dist/cli/conversation-fork.js.map +1 -0
- package/dist/cli/index.js +4 -0
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/tool-definitions.js +263 -0
- package/dist/cli/tool-definitions.js.map +1 -0
- package/dist/cli/tool-executor.js +247 -0
- package/dist/cli/tool-executor.js.map +1 -0
- package/dist/hello.js +8 -0
- package/dist/hello.js.map +1 -0
- package/package.json +14 -6
- package/scripts/README.md +68 -0
- package/scripts/cfn-intervention-example.sh +21 -0
- package/scripts/migrate-test-infrastructure.sh +40 -0
- package/scripts/validate-test-migration.sh +49 -0
- package/scripts/verify-no-secrets.sh +55 -0
- package/.claude/agents/architecture/system-architect.md.backup +0 -603
- package/.claude/agents/code-booster.md +0 -131
- package/.claude/agents/consensus/performance-benchmarker.md +0 -101
- package/.claude/agents/consensus/security-manager.md +0 -107
- package/.claude/agents/context-curator.md +0 -167
- package/.claude/agents/context-reflector.md +0 -65
- package/.claude/agents/core-agents/cfn-loop-coordinator.md +0 -134
- package/.claude/agents/core-agents/code-quality-validator.md +0 -149
- package/.claude/agents/core-agents/context-curator.md +0 -452
- package/.claude/agents/core-agents/context-reflector.md +0 -273
- package/.claude/agents/core-agents/cost-savings-cfn-loop-coordinator.md +0 -235
- package/.claude/agents/core-agents/tester.md +0 -170
- package/.claude/agents/development/backend-dev.md +0 -165
- package/.claude/agents/devops/devops-engineer.md +0 -148
- package/.claude/agents/frontend/interaction-tester.md +0 -139
- package/.claude/agents/frontend/react-frontend-engineer.md +0 -9
- package/.claude/agents/personas/accessibility-advocate-persona.md +0 -107
- package/.claude/agents/testing/production-validator.md +0 -179
- package/.claude/agents/testing/tdd-london-swarm.md +0 -209
- package/.claude/agents/testing/unit/tdd-london-swarm.md +0 -43
- package/.claude/agents/testing/validation/production-validator.md +0 -43
- package/.claude/api-configs/config-current-zai-config.env +0 -62
- package/.claude/api-configs/config-test-zai-config.env +0 -62
- package/.claude/api-configs/env-backups/before-anthropic-20251020-025404.env +0 -62
- package/.claude/api-configs/env-backups/before-restore-20251020-025431.env +0 -62
- package/.claude/skills/redis-coordination/orchestrate-cfn-loop.sh.backup-1760949407 +0 -933
- package/dist/cli/cli-agent-context.test.js +0 -451
- package/dist/cli/cli-agent-context.test.js.map +0 -1
- package/dist/coordination/fleet-manager.test.js +0 -141
- package/dist/coordination/fleet-manager.test.js.map +0 -1
- package/dist/middleware/transparency-middleware.test.js +0 -184
- package/dist/middleware/transparency-middleware.test.js.map +0 -1
- /package/.claude/agents/{core-agents → developers}/researcher.md +0 -0
- /package/.claude/agents/{consensus → specialists}/crdt-synchronizer.md +0 -0
- /package/.claude/agents/{consensus → specialists}/quorum-manager.md +0 -0
- /package/.claude/agents/{consensus → specialists}/raft-manager.md +0 -0
- /package/.claude/{agents/core-agents → agents-ignore}/coordinator.md +0 -0
- /package/.claude/{agents/core-agents/cost-savings-coordinator.md → agents-ignore/cost-savings-coordinator.md.backup} +0 -0
- /package/.claude/skills/redis-coordination/{phase4-wake-queue-test-report.md → demos/phase4-wake-queue-test-report.md} +0 -0
- /package/.claude/skills/redis-coordination/{test-bzpopmin-fix.sh → demos/test-bzpopmin-fix.sh} +0 -0
- /package/.claude/skills/redis-coordination/{test-cancel-swarm.sh → demos/test-cancel-swarm.sh} +0 -0
- /package/.claude/skills/redis-coordination/{test-dlq.sh → demos/test-dlq.sh} +0 -0
- /package/.claude/skills/redis-coordination/{test-iteration-feedback.sh → demos/test-iteration-feedback.sh} +0 -0
- /package/.claude/skills/redis-coordination/{test-priority-wake-phase4-unix.sh → demos/test-priority-wake-phase4-unix.sh} +0 -0
- /package/.claude/skills/redis-coordination/{test-priority-wake-phase4.sh → demos/test-priority-wake-phase4.sh} +0 -0
- /package/.claude/skills/redis-coordination/{test-priority-wake.sh → demos/test-priority-wake.sh} +0 -0
- /package/.claude/skills/redis-coordination/{test-quick-fix.sh → demos/test-quick-fix.sh} +0 -0
- /package/.claude/skills/redis-coordination/{test-quorum-absolute.sh → demos/test-quorum-absolute.sh} +0 -0
- /package/.claude/skills/redis-coordination/{test-quorum-fallback.sh → demos/test-quorum-fallback.sh} +0 -0
- /package/.claude/skills/redis-coordination/{test-quorum-percentage.sh → demos/test-quorum-percentage.sh} +0 -0
- /package/.claude/skills/redis-coordination/{test-quorum-with-retry.sh → demos/test-quorum-with-retry.sh} +0 -0
- /package/.claude/skills/redis-coordination/{test-quorum.sh → demos/test-quorum.sh} +0 -0
- /package/.claude/skills/redis-coordination/{test-shutdown-handling.sh → demos/test-shutdown-handling.sh} +0 -0
- /package/.claude/skills/redis-coordination/{test-shutdown.sh → demos/test-shutdown.sh} +0 -0
- /package/.claude/skills/redis-coordination/{test-utils-unix.sh → demos/test-utils-unix.sh} +0 -0
- /package/.claude/skills/redis-coordination/{test-utils.sh → demos/test-utils.sh} +0 -0
- /package/.claude/skills/redis-coordination/{test-waiting-mode.sh → demos/test-waiting-mode.sh} +0 -0
|
@@ -1,933 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
|
|
2
|
-
|
|
3
|
-
##############################################################################
|
|
4
|
-
# CFN Loop Orchestration v2.0.0
|
|
5
|
-
# Manages multi-loop CFN execution with dependency tracking and consensus
|
|
6
|
-
#
|
|
7
|
-
# Usage:
|
|
8
|
-
# ./orchestrate-cfn-loop.sh --task-id <id> \
|
|
9
|
-
# --mode <mvp|standard|enterprise> \
|
|
10
|
-
# --loop3-agents <agent1,agent2,...> \
|
|
11
|
-
# --loop2-agents <agent1,agent2,...> \
|
|
12
|
-
# --product-owner <agent-id> \
|
|
13
|
-
# [--max-iterations <n>] \
|
|
14
|
-
# [--min-quorum-loop3 <n|n%|0.n>] \
|
|
15
|
-
# [--min-quorum-loop2 <n|n%|0.n>]
|
|
16
|
-
#
|
|
17
|
-
# CFN Loop Structure (CORRECTED):
|
|
18
|
-
# Loop 3 (Primary Swarm - Self Validation)
|
|
19
|
-
# ↓
|
|
20
|
-
# IF Loop 3 self-validation gate FAILS → RELAUNCH Loop 3 (skip Loop 2)
|
|
21
|
-
# IF Loop 3 self-validation gate PASSES → Proceed to Loop 2
|
|
22
|
-
# ↓
|
|
23
|
-
# Loop 2 (Consensus Validators)
|
|
24
|
-
# ↓
|
|
25
|
-
# Product Owner Decision
|
|
26
|
-
#
|
|
27
|
-
# Dependency Enforcement:
|
|
28
|
-
# - Loop 3 agents self-validate via confidence scores
|
|
29
|
-
# - Gate check determines if Loop 2 validators should be engaged
|
|
30
|
-
# - Loop 2 agents WAIT for gate pass signal before starting work
|
|
31
|
-
# - Product Owner BLOCKS until all Loop 2 agents signal completion
|
|
32
|
-
# - Uses Redis BLPOP for zero-token waiting
|
|
33
|
-
#
|
|
34
|
-
# Quorum Configuration:
|
|
35
|
-
# - Absolute: --min-quorum-loop3 3 (requires at least 3 agents)
|
|
36
|
-
# - Percentage: --min-quorum-loop3 85% (requires 85% of agents)
|
|
37
|
-
# - Decimal: --min-quorum-loop3 0.66 (requires 66% of agents)
|
|
38
|
-
# - Default: 0.66 (2/3 majority) if not specified
|
|
39
|
-
#
|
|
40
|
-
# Agent Requirements:
|
|
41
|
-
# Loop 3 (Implementers):
|
|
42
|
-
# 1. Complete work
|
|
43
|
-
# 2. Signal done: redis-cli lpush "swarm:${TASK_ID}:${AGENT_ID}:done" "complete"
|
|
44
|
-
# 3. Report confidence: invoke-waiting-mode.sh report --confidence <0.0-1.0>
|
|
45
|
-
# 4. Enter waiting: invoke-waiting-mode.sh enter (for potential iteration)
|
|
46
|
-
#
|
|
47
|
-
# Loop 2 (Validators):
|
|
48
|
-
# 1. WAIT for gate pass: redis-cli blpop "swarm:${TASK_ID}:gate-passed" 0
|
|
49
|
-
# 2. Retrieve Loop 3 results for review
|
|
50
|
-
# 3. Perform validation
|
|
51
|
-
# 4. Signal done: redis-cli lpush "swarm:${TASK_ID}:${AGENT_ID}:done" "complete"
|
|
52
|
-
# 5. Report consensus: invoke-waiting-mode.sh report --confidence <0.0-1.0>
|
|
53
|
-
# 6. Enter waiting: invoke-waiting-mode.sh enter (for potential iteration)
|
|
54
|
-
##############################################################################
|
|
55
|
-
|
|
56
|
-
set -euo pipefail
|
|
57
|
-
|
|
58
|
-
# Configuration
|
|
59
|
-
TASK_ID=""
|
|
60
|
-
MODE="standard"
|
|
61
|
-
LOOP3_AGENTS=""
|
|
62
|
-
LOOP2_AGENTS=""
|
|
63
|
-
PRODUCT_OWNER=""
|
|
64
|
-
MAX_ITERATIONS=10
|
|
65
|
-
TIMEOUT=3600 # 1 hour timeout for agent completion
|
|
66
|
-
RETRY_COUNT=3
|
|
67
|
-
RETRY_DELAY=5000 # Base delay in milliseconds
|
|
68
|
-
MIN_QUORUM_LOOP3="" # Minimum agents required for Loop 3 (absolute or percentage)
|
|
69
|
-
MIN_QUORUM_LOOP2="" # Minimum agents required for Loop 2 (absolute or percentage)
|
|
70
|
-
ORCHESTRATOR_PID=$$
|
|
71
|
-
SHUTDOWN_MONITOR_PID=""
|
|
72
|
-
SHUTDOWN_REQUESTED=0
|
|
73
|
-
LOOP3_HEARTBEAT_MONITOR_PID=""
|
|
74
|
-
LOOP2_HEARTBEAT_MONITOR_PID=""
|
|
75
|
-
|
|
76
|
-
# Thresholds by mode
|
|
77
|
-
declare -A GATE_THRESHOLD=(
|
|
78
|
-
[mvp]=0.70
|
|
79
|
-
[standard]=0.75
|
|
80
|
-
[enterprise]=0.75
|
|
81
|
-
)
|
|
82
|
-
|
|
83
|
-
declare -A CONSENSUS_THRESHOLD=(
|
|
84
|
-
[mvp]=0.80
|
|
85
|
-
[standard]=0.90
|
|
86
|
-
[enterprise]=0.95
|
|
87
|
-
)
|
|
88
|
-
|
|
89
|
-
# Parse arguments
|
|
90
|
-
while [[ $# -gt 0 ]]; do
|
|
91
|
-
case $1 in
|
|
92
|
-
--task-id)
|
|
93
|
-
TASK_ID="$2"
|
|
94
|
-
shift 2
|
|
95
|
-
;;
|
|
96
|
-
--mode)
|
|
97
|
-
MODE="$2"
|
|
98
|
-
shift 2
|
|
99
|
-
;;
|
|
100
|
-
--loop3-agents)
|
|
101
|
-
LOOP3_AGENTS="$2"
|
|
102
|
-
shift 2
|
|
103
|
-
;;
|
|
104
|
-
--loop2-agents)
|
|
105
|
-
LOOP2_AGENTS="$2"
|
|
106
|
-
shift 2
|
|
107
|
-
;;
|
|
108
|
-
--product-owner)
|
|
109
|
-
PRODUCT_OWNER="$2"
|
|
110
|
-
shift 2
|
|
111
|
-
;;
|
|
112
|
-
--max-iterations)
|
|
113
|
-
MAX_ITERATIONS="$2"
|
|
114
|
-
shift 2
|
|
115
|
-
;;
|
|
116
|
-
--retry-count)
|
|
117
|
-
RETRY_COUNT="$2"
|
|
118
|
-
shift 2
|
|
119
|
-
;;
|
|
120
|
-
--retry-delay)
|
|
121
|
-
RETRY_DELAY="$2"
|
|
122
|
-
shift 2
|
|
123
|
-
;;
|
|
124
|
-
--timeout)
|
|
125
|
-
TIMEOUT="$2"
|
|
126
|
-
shift 2
|
|
127
|
-
;;
|
|
128
|
-
--min-quorum-loop3)
|
|
129
|
-
MIN_QUORUM_LOOP3="$2"
|
|
130
|
-
shift 2
|
|
131
|
-
;;
|
|
132
|
-
--min-quorum-loop2)
|
|
133
|
-
MIN_QUORUM_LOOP2="$2"
|
|
134
|
-
shift 2
|
|
135
|
-
;;
|
|
136
|
-
*)
|
|
137
|
-
echo "Unknown option: $1"
|
|
138
|
-
exit 1
|
|
139
|
-
;;
|
|
140
|
-
esac
|
|
141
|
-
done
|
|
142
|
-
|
|
143
|
-
# Validation
|
|
144
|
-
if [ -z "$TASK_ID" ] || [ -z "$LOOP3_AGENTS" ] || [ -z "$LOOP2_AGENTS" ] || [ -z "$PRODUCT_OWNER" ]; then
|
|
145
|
-
echo "Error: Required parameters missing"
|
|
146
|
-
echo "Usage: $0 --task-id <id> --mode <mode> --loop3-agents <agents> --loop2-agents <agents> --product-owner <agent>"
|
|
147
|
-
exit 1
|
|
148
|
-
fi
|
|
149
|
-
|
|
150
|
-
GATE=${GATE_THRESHOLD[$MODE]}
|
|
151
|
-
CONSENSUS=${CONSENSUS_THRESHOLD[$MODE]}
|
|
152
|
-
|
|
153
|
-
# Set default quorum values if not specified (66% = 2/3 majority)
|
|
154
|
-
MIN_QUORUM_LOOP3=${MIN_QUORUM_LOOP3:-0.66}
|
|
155
|
-
MIN_QUORUM_LOOP2=${MIN_QUORUM_LOOP2:-0.66}
|
|
156
|
-
|
|
157
|
-
##############################################################################
|
|
158
|
-
# Shutdown Handling Functions
|
|
159
|
-
##############################################################################
|
|
160
|
-
#######################################
# Graceful shutdown: stop background monitors, mark the swarm as cancelled,
# delete this task's Redis keys and marker files, then exit.
# Globals:
#   SHUTDOWN_REQUESTED (written, set to 1)
#   SHUTDOWN_MONITOR_PID, LOOP3_HEARTBEAT_MONITOR_PID,
#   LOOP2_HEARTBEAT_MONITOR_PID, TASK_ID, SWARM_ID (read)
# Arguments:
#   $1 - exit code to terminate with (default 130)
#   $2 - human-readable shutdown reason (default "user_interrupt")
# Outputs:
#   Progress messages on stdout.
# Returns:
#   Never returns; exits the shell with $1.
#######################################
cleanup_and_exit() {
  local exit_code="${1:-130}"
  local reason="${2:-user_interrupt}"
  local keys_deleted

  # Set shutdown flag to stop any ongoing operations
  SHUTDOWN_REQUESTED=1

  echo ""
  echo "=============================================="
  echo "🛑 Orchestrator shutting down gracefully..."
  echo "=============================================="
  echo "Reason: $reason"
  echo "Exit Code: $exit_code"

  # Kill shutdown monitor if running
  if [ -n "${SHUTDOWN_MONITOR_PID:-}" ] && kill -0 "$SHUTDOWN_MONITOR_PID" 2>/dev/null; then
    kill "$SHUTDOWN_MONITOR_PID" 2>/dev/null || true
    wait "$SHUTDOWN_MONITOR_PID" 2>/dev/null || true
  fi

  # Stop heartbeat monitors if running
  if [ -n "${LOOP3_HEARTBEAT_MONITOR_PID:-}" ]; then
    echo "Stopping Loop 3 heartbeat monitor..."
    stop_heartbeat_monitor "$TASK_ID" "loop3" "$LOOP3_HEARTBEAT_MONITOR_PID"
  fi
  if [ -n "${LOOP2_HEARTBEAT_MONITOR_PID:-}" ]; then
    echo "Stopping Loop 2 heartbeat monitor..."
    stop_heartbeat_monitor "$TASK_ID" "loop2" "$LOOP2_HEARTBEAT_MONITOR_PID"
  fi

  # Mark swarm as cancelled if initialized
  if [ -n "$TASK_ID" ] && [ -n "${SWARM_ID:-}" ]; then
    echo "Marking swarm as cancelled..."
    ./.claude/skills/redis-coordination/complete-swarm.sh \
      --swarm-id "$SWARM_ID" \
      --final-metric "status=cancelled" \
      --final-metric "shutdown_reason=$reason" 2>/dev/null || echo " ⚠️ Failed to mark swarm as cancelled"
  fi

  if [ -n "$TASK_ID" ]; then
    # Clean up Redis keys
    echo "Cleaning up Redis keys..."
    keys_deleted=$(redis-cli --scan --pattern "swarm:${TASK_ID}:*" | xargs -r redis-cli DEL 2>/dev/null || echo "0")
    # When no key matches, `xargs -r` runs nothing and prints nothing;
    # report that as 0 instead of an empty count.
    echo " Deleted ${keys_deleted:-0} Redis keys"

    # Clean up heartbeat monitor marker files. TASK_ID is quoted so only the
    # trailing `*` is treated as a glob.
    rm -f /tmp/heartbeat-monitor-"${TASK_ID}"-*.active 2>/dev/null || true
  fi

  echo "=============================================="
  echo "Shutdown complete"
  echo "=============================================="

  exit "$exit_code"
}

# Trap SIGTERM and SIGINT for graceful shutdown
# (exit codes follow the 128+signal convention: 130 = SIGINT, 143 = SIGTERM)
trap 'echo "[TRAP] Caught SIGINT" >&2; cleanup_and_exit 130 "SIGINT_received"' SIGINT
trap 'echo "[TRAP] Caught SIGTERM" >&2; cleanup_and_exit 143 "SIGTERM_received"' SIGTERM
|
|
219
|
-
|
|
220
|
-
##############################################################################
|
|
221
|
-
# Start Shutdown Monitor (Background Process)
|
|
222
|
-
##############################################################################
|
|
223
|
-
#######################################
# Launch a background subshell that blocks on the Redis list
# "swarm:<task_id>:shutdown" and, once an entry arrives, sends SIGTERM to
# the orchestrator process.
# Globals:
#   ORCHESTRATOR_PID (read), SHUTDOWN_MONITOR_PID (written)
# Arguments:
#   $1 - task id used to build the shutdown key
# Outputs:
#   "Shutdown monitor started (PID: ...)" on stdout.
#######################################
start_shutdown_monitor() {
  local task_id="$1"

  (
    # Block on shutdown channel (zero-token waiting); timeout 0 = wait forever
    shutdown_key="swarm:${task_id}:shutdown"
    blpop_reply=$(redis-cli BLPOP "$shutdown_key" 0 2>/dev/null || echo "")

    # Nothing received (e.g. redis-cli failed): nothing to do.
    [ -n "$blpop_reply" ] || exit 0

    # The payload is taken from the last line of the reply.
    payload=$(tail -1 <<< "$blpop_reply")
    shutdown_reason=$(jq -r '.reason // "external_shutdown"' 2>/dev/null <<< "$payload" || echo "external_shutdown")

    echo ""
    echo "🛑 Shutdown signal received from Redis channel: $shutdown_reason"
    echo " Sending SIGTERM to orchestrator PID: $ORCHESTRATOR_PID"

    # Send SIGTERM to main orchestrator process
    if ! kill -TERM "$ORCHESTRATOR_PID" 2>/dev/null; then
      echo " ❌ Failed to send SIGTERM (process may have already exited)"
      exit 0
    fi
    echo " ✅ SIGTERM sent successfully"
  ) &

  SHUTDOWN_MONITOR_PID=$!
  echo "Shutdown monitor started (PID: $SHUTDOWN_MONITOR_PID)"
}
|
|
253
|
-
|
|
254
|
-
##############################################################################
|
|
255
|
-
# Quorum Calculation Function
|
|
256
|
-
##############################################################################
|
|
257
|
-
function calculate_quorum() {
|
|
258
|
-
local quorum_spec="$1"
|
|
259
|
-
local total_agents="$2"
|
|
260
|
-
|
|
261
|
-
# If no quorum specified, require all agents
|
|
262
|
-
if [ -z "$quorum_spec" ]; then
|
|
263
|
-
echo "$total_agents"
|
|
264
|
-
return 0
|
|
265
|
-
fi
|
|
266
|
-
|
|
267
|
-
# Check if percentage format (e.g., "85%")
|
|
268
|
-
if [[ "$quorum_spec" =~ %$ ]]; then
|
|
269
|
-
# Extract percentage value (remove % suffix)
|
|
270
|
-
local pct="${quorum_spec%\%}"
|
|
271
|
-
# Calculate: ceil(total_agents * pct / 100)
|
|
272
|
-
echo "scale=0; ($total_agents * $pct + 50) / 100" | bc
|
|
273
|
-
# Check if decimal (0.0-1.0), treat as fraction
|
|
274
|
-
elif [[ "$quorum_spec" =~ ^0?\.[0-9]+$ ]]; then
|
|
275
|
-
# Calculate: ceil(total_agents * fraction)
|
|
276
|
-
echo "scale=0; ($quorum_spec * $total_agents + 0.5) / 1" | bc
|
|
277
|
-
else
|
|
278
|
-
# Absolute number - validate it doesn't exceed total
|
|
279
|
-
if [ "$quorum_spec" -gt "$total_agents" ]; then
|
|
280
|
-
echo "Error: Quorum ($quorum_spec) exceeds total agents ($total_agents)" >&2
|
|
281
|
-
return 1
|
|
282
|
-
fi
|
|
283
|
-
echo "$quorum_spec"
|
|
284
|
-
fi
|
|
285
|
-
}
|
|
286
|
-
|
|
287
|
-
##############################################################################
|
|
288
|
-
# Dead Letter Queue (DLQ) Functions
|
|
289
|
-
##############################################################################
|
|
290
|
-
#######################################
# Record a permanently failed agent in its dead-letter queue in Redis.
# Globals:
#   TASK_ID (read); DLQ_KEY, DLQ_ENTRY (written)
# Arguments:
#   $1 - agent id
#   $2 - failure reason
#   $3 - number of retries attempted (stored as a JSON number)
# Outputs:
#   One summary line on stdout.
#######################################
write_to_dlq() {
  local agent="$1"
  local reason="$2"
  local retry_count="$3"
  local epoch_now

  DLQ_KEY="swarm:${TASK_ID}:dlq:${agent}"

  # Build the entry with jq so the values are JSON-escaped.
  epoch_now="$(date +%s)"
  DLQ_ENTRY=$(jq -n \
    --arg reason "$reason" \
    --arg retries "$retry_count" \
    --arg ts "$epoch_now" \
    '{reason: $reason, retry_count: ($retries | tonumber), timestamp: ($ts | tonumber)}')

  # -x makes redis-cli read the final argument (the entry) from stdin.
  redis-cli -x LPUSH "$DLQ_KEY" >/dev/null <<< "$DLQ_ENTRY"
  redis-cli EXPIRE "$DLQ_KEY" 604800 >/dev/null # 7 days TTL

  echo " ❌ $agent → DLQ (reason: $reason, retries: $retry_count)"
}
|
|
307
|
-
|
|
308
|
-
##############################################################################
|
|
309
|
-
# Exponential Backoff Retry Function
|
|
310
|
-
##############################################################################
|
|
311
|
-
#######################################
# Sleep for an exponentially growing delay before the next retry, waking
# every 0.5s to notice a shutdown request.
# Delay (ms) = base_delay * 2^attempt.
# Globals:
#   SHUTDOWN_REQUESTED (read)
# Arguments:
#   $1 - agent id (used in log messages only)
#   $2 - current attempt number
#   $3 - maximum number of retries (used in log messages only)
#   $4 - base delay in milliseconds
# Outputs:
#   Progress line on stdout; shutdown notices on stderr.
# Returns:
#   0 in every case (delay elapsed, shutdown seen, or wait interrupted).
#######################################
retry_with_backoff() {
  local agent="$1"
  local attempt="$2"
  local max_retries="$3"
  local base_delay="$4"
  local backoff_ms now_utc backoff_sec slept left chunk

  # Check for shutdown before sleeping
  if [ "$SHUTDOWN_REQUESTED" -eq 1 ]; then
    echo " [SHUTDOWN] Skipping backoff delay for $agent" >&2
    return 0
  fi

  backoff_ms=$(bc <<< "$base_delay * (2 ^ $attempt)")
  now_utc=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

  echo " [$now_utc] [Retry $attempt/$max_retries] Waiting ${backoff_ms}ms before retry for $agent..."

  # Interruptible sleep: sleep in small slices, checking for shutdown between them
  backoff_sec=$(bc <<< "scale=3; $backoff_ms / 1000")
  slept=0
  while (( $(bc -l <<< "$slept < $backoff_sec") )); do
    # Next slice is 0.5s, or whatever is left if that is shorter
    left=$(bc <<< "$backoff_sec - $slept")
    chunk=$(bc <<< "if ($left < 0.5) $left else 0.5")

    # Background sleep + wait lets a trapped signal interrupt the wait.
    sleep "$chunk" &
    wait $! 2>/dev/null || return 0 # If wait is interrupted (SIGTERM), return immediately

    slept=$(bc <<< "$slept + $chunk")

    # Check for shutdown after each sleep increment
    if [ "$SHUTDOWN_REQUESTED" -eq 1 ]; then
      echo " [SHUTDOWN] Interrupted backoff delay for $agent" >&2
      return 0
    fi
  done
}
|
|
349
|
-
|
|
350
|
-
##############################################################################
|
|
351
|
-
# Heartbeat Monitoring Functions
|
|
352
|
-
##############################################################################
|
|
353
|
-
declare -A MISSED_HEARTBEATS # Missed-heartbeat counter, keyed by agent id

#######################################
# Report whether an agent currently has a heartbeat value stored in Redis.
# Globals:
#   HB_KEY, HB_DATA (written)
# Arguments:
#   $1 - agent id
#   $2 - task id
# Returns:
#   0 if the heartbeat key holds a value (alive);
#   1 if it is empty, "(nil)", or redis-cli failed (dead).
#######################################
check_agent_heartbeat() {
  local agent="$1"
  local task_id="$2"

  HB_KEY="swarm:${task_id}:${agent}:heartbeat"
  HB_DATA=$(redis-cli GET "$HB_KEY" 2>/dev/null || echo "")

  case "$HB_DATA" in
    "" | "(nil)")
      return 1 # Dead
      ;;
    *)
      return 0 # Alive
      ;;
  esac
}
|
|
368
|
-
|
|
369
|
-
#######################################
# Run one heartbeat sweep over a set of agents. An agent that misses two
# consecutive sweeps is reported as hung, together with whether its loop can
# still reach quorum.
#
# Loop membership is decided from LOOP3_AGENTS / LOOP2_AGENTS, which this
# script parses as comma-separated lists; space-separated lists are accepted
# too. An agent belonging to neither list only gets the "appears hung"
# warning.
#
# NOTE(review): start_heartbeat_monitor calls this from a background
# subshell, so the *_COMPLETED_AGENTS / *_FAILED_AGENTS arrays read here look
# like a snapshot taken when the monitor was started - confirm that is
# intended.
#
# Globals:
#   MISSED_HEARTBEATS (read/written)
#   LOOP3_AGENTS, LOOP2_AGENTS, LOOP3_FAILED_AGENTS, LOOP2_FAILED_AGENTS,
#   LOOP3_COMPLETED_AGENTS, LOOP2_COMPLETED_AGENTS, LOOP3_TOTAL, LOOP2_TOTAL,
#   MIN_QUORUM_LOOP3, MIN_QUORUM_LOOP2 (read)
# Arguments:
#   $1  - task id
#   $2  - loop label used in log lines (e.g. "loop3")
#   $3+ - agent ids to check
# Outputs:
#   Warnings on stderr.
#######################################
check_heartbeats_loop() {
  local task_id="$1"
  local loop_name="$2"
  shift 2
  local agents=("$@")
  local agent timestamp completed required

  for agent in "${agents[@]}"; do
    # Skip agents already marked as failed. Inside [[ ]] the [@] expansion is
    # joined with single spaces regardless of IFS.
    if [[ " ${LOOP3_FAILED_AGENTS[@]} ${LOOP2_FAILED_AGENTS[@]} " == *" ${agent} "* ]]; then
      continue
    fi

    if check_agent_heartbeat "$agent" "$task_id"; then
      MISSED_HEARTBEATS["$agent"]=0 # Reset counter
      continue
    fi

    MISSED_HEARTBEATS["$agent"]=$((${MISSED_HEARTBEATS["$agent"]:-0} + 1))
    if [ "${MISSED_HEARTBEATS["$agent"]}" -lt 2 ]; then
      continue
    fi

    timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
    echo " [$timestamp] [$loop_name] ⚠️ $agent appears hung (no heartbeat for 60s)" >&2

    # Determine which loop this agent belongs to and check quorum.
    # Commas are normalised to spaces so both list styles match on whole names.
    if [[ " ${LOOP3_AGENTS//,/ } " == *" ${agent} "* ]]; then
      completed=${#LOOP3_COMPLETED_AGENTS[@]}
      required=$(calculate_quorum "$MIN_QUORUM_LOOP3" "$LOOP3_TOTAL")
    elif [[ " ${LOOP2_AGENTS//,/ } " == *" ${agent} "* ]]; then
      completed=${#LOOP2_COMPLETED_AGENTS[@]}
      required=$(calculate_quorum "$MIN_QUORUM_LOOP2" "$LOOP2_TOTAL")
    else
      continue
    fi

    if [ "$completed" -ge "$required" ]; then
      echo " [$timestamp] [$loop_name] ℹ️ Continuing with quorum (${completed}/${required} agents)" >&2
    else
      echo " [$timestamp] [$loop_name] ⚠️ Cannot meet quorum without $agent (${completed}/${required})" >&2
    fi
  done
}
|
|
410
|
-
|
|
411
|
-
function start_heartbeat_monitor() {
|
|
412
|
-
local task_id="$1"
|
|
413
|
-
local loop_name="$2"
|
|
414
|
-
shift 2
|
|
415
|
-
local agents=("$@")
|
|
416
|
-
|
|
417
|
-
# Create marker file for this monitor
|
|
418
|
-
local monitor_marker="/tmp/heartbeat-monitor-${task_id}-${loop_name}.active"
|
|
419
|
-
touch "$monitor_marker"
|
|
420
|
-
|
|
421
|
-
(
|
|
422
|
-
while [ -f "$monitor_marker" ]; do
|
|
423
|
-
# Check for shutdown
|
|
424
|
-
if [ "$SHUTDOWN_REQUESTED" -eq 1 ]; then
|
|
425
|
-
break
|
|
426
|
-
fi
|
|
427
|
-
|
|
428
|
-
check_heartbeats_loop "$task_id" "$loop_name" "${agents[@]}"
|
|
429
|
-
sleep 30
|
|
430
|
-
done
|
|
431
|
-
) &
|
|
432
|
-
|
|
433
|
-
echo "$!" # Return PID
|
|
434
|
-
}
|
|
435
|
-
|
|
436
|
-
#######################################
# Stop a heartbeat monitor: remove its marker file so the loop ends, then
# terminate the monitor process if it is still alive.
# Arguments:
#   $1 - task id
#   $2 - loop label (e.g. "loop3")
#   $3 - PID of the monitor process (may be empty)
# Returns:
#   0 in every case.
#######################################
stop_heartbeat_monitor() {
  local task_id="$1"
  local loop_name="$2"
  local monitor_pid="$3"
  local marker_path="/tmp/heartbeat-monitor-${task_id}-${loop_name}.active"

  # Remove marker file to stop the monitor loop
  rm -f "$marker_path"

  # Nothing more to do without a PID, or if the process is already gone.
  [ -n "$monitor_pid" ] || return 0
  kill -0 "$monitor_pid" 2>/dev/null || return 0

  # Kill and reap the monitor; both steps are best-effort.
  kill "$monitor_pid" 2>/dev/null || true
  wait "$monitor_pid" 2>/dev/null || true
}
|
|
450
|
-
|
|
451
|
-
##############################################################################
|
|
452
|
-
# Get Agent-Specific Timeout
|
|
453
|
-
##############################################################################
|
|
454
|
-
#######################################
# Look up the timeout for one agent by running the get-agent-timeout.sh
# helper located next to this script; if the helper fails, the global
# TIMEOUT is used instead.
# Globals:
#   TIMEOUT (read); SCRIPT_DIR, AGENT_TIMEOUT (written)
# Arguments:
#   $1 - agent id
#   $2 - task id
# Outputs:
#   The timeout value on stdout.
#######################################
get_agent_timeout() {
  local agent="$1"
  local task_id="$2"
  local helper

  # Resolve the helper relative to this script's own directory
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  helper="$SCRIPT_DIR/get-agent-timeout.sh"

  AGENT_TIMEOUT=$("$helper" --task-id "$task_id" --agent-id "$agent" 2>/dev/null || echo "$TIMEOUT")

  echo "$AGENT_TIMEOUT"
}
|
|
464
|
-
|
|
465
|
-
##############################################################################
|
|
466
|
-
# BLPOP with Retry Logic
|
|
467
|
-
##############################################################################
|
|
468
|
-
#######################################
# Wait for an entry on a Redis list with BLPOP, retrying with exponential
# backoff. After the last failed attempt the agent is written to the
# dead-letter queue.
# Globals:
#   SHUTDOWN_REQUESTED, TASK_ID (read); ATTEMPT, RESULT (written)
# Arguments:
#   $1 - agent id (for logging and the DLQ entry)
#   $2 - Redis list key to pop from
#   $3 - BLPOP timeout passed to redis-cli
#   $4 - number of attempts
#   $5 - base backoff delay in milliseconds
# Outputs:
#   The raw BLPOP reply on stdout on success; diagnostics on stderr.
#   The write_to_dlq summary line goes to stdout on final failure.
# Returns:
#   0 when a reply was received; 1 on shutdown or when all attempts failed.
#######################################
blpop_with_retry() {
  local agent="$1"
  local done_key="$2"
  local timeout="$3"
  local retry_count="$4"
  local retry_delay="$5"
  local now_utc

  for ATTEMPT in $(seq 1 $retry_count); do
    # Check for shutdown before attempting BLPOP
    if [ "$SHUTDOWN_REQUESTED" -eq 1 ]; then
      echo " [SHUTDOWN] Aborting BLPOP for $agent" >&2
      return 1
    fi

    # Use Redis's native BLPOP timeout instead of shell timeout command
    # This allows SIGTERM to properly interrupt the process
    RESULT=$(redis-cli blpop "$done_key" "$timeout" 2>/dev/null || echo "")
    if [ -n "$RESULT" ]; then
      echo "$RESULT"
      return 0 # Success
    fi

    # Check for shutdown after BLPOP timeout
    if [ "$SHUTDOWN_REQUESTED" -eq 1 ]; then
      echo " [SHUTDOWN] Aborting retry for $agent" >&2
      return 1
    fi

    # Log to stderr so the message stays visible when the caller captures stdout
    now_utc=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
    echo " [$now_utc] ⚠️ BLPOP attempt $ATTEMPT/$retry_count failed for $agent" >&2

    if [ $ATTEMPT -lt $retry_count ]; then
      # METRICS: Increment retry counter
      redis-cli INCR "swarm:${TASK_ID}:metrics:retry_count" >/dev/null

      retry_with_backoff "$agent" "$ATTEMPT" "$retry_count" "$retry_delay" >&2
    else
      # Final failure - write to DLQ
      echo " [$now_utc] ❌ FINAL FAILURE: $agent after $retry_count attempts" >&2
      write_to_dlq "$agent" "timeout_after_retries" "$retry_count"
      return 1
    fi
  done

  return 1
}
|
|
516
|
-
|
|
517
|
-
echo "=== CFN Loop Orchestration ==="
echo "Task ID: $TASK_ID"
echo "Mode: $MODE (Gate: $GATE, Consensus: $CONSENSUS)"
echo "Max Iterations: $MAX_ITERATIONS"
echo ""

# Initialize swarm using general Redis coordination primitive
SWARM_ID="swarm-${TASK_ID}"
ALL_AGENTS="${LOOP3_AGENTS},${LOOP2_AGENTS},${PRODUCT_OWNER}"

# Build CFN-specific metadata. jq does the JSON escaping, so a quote or
# backslash in any value cannot produce an invalid document (a raw heredoc
# with interpolated variables would).
CFN_METADATA=$(jq -n \
  --arg mode "$MODE" \
  --arg loop3_agents "$LOOP3_AGENTS" \
  --arg loop2_agents "$LOOP2_AGENTS" \
  --arg product_owner "$PRODUCT_OWNER" \
  '{
    mode: $mode,
    loop3_agents: $loop3_agents,
    loop2_agents: $loop2_agents,
    product_owner: $product_owner,
    workflow_type: "cfn_loop"
  }')

# Use general init-swarm primitive
./.claude/skills/redis-coordination/init-swarm.sh \
  --swarm-id "$SWARM_ID" \
  --agents "$ALL_AGENTS" \
  --task-id "$TASK_ID" \
  --topology "hierarchical" \
  --metadata "$CFN_METADATA" > /dev/null

# Start shutdown monitor in background
start_shutdown_monitor "$TASK_ID"

echo ""
|
|
551
|
-
|
|
552
|
-
# Iteration loop
|
|
553
|
-
for ITERATION in $(seq 1 $MAX_ITERATIONS); do
|
|
554
|
-
echo "=== Iteration $ITERATION/$MAX_ITERATIONS ==="
|
|
555
|
-
|
|
556
|
-
# METRICS: Iteration start timestamp
|
|
557
|
-
ITERATION_START=$(date +%s%N | cut -b1-13) # milliseconds
|
|
558
|
-
redis-cli LPUSH "swarm:${TASK_ID}:metrics:iteration_start" "$ITERATION_START" >/dev/null
|
|
559
|
-
|
|
560
|
-
# Step 1: Spawn Loop 3 agents via CLI
|
|
561
|
-
echo "[Loop 3] Spawning implementers via CLI..."
|
|
562
|
-
IFS=',' read -ra AGENTS <<< "$LOOP3_AGENTS"
|
|
563
|
-
|
|
564
|
-
for AGENT in "${AGENTS[@]}"; do
|
|
565
|
-
echo " Spawning: npx cfn-spawn agent $AGENT --task-id $TASK_ID --iteration $ITERATION"
|
|
566
|
-
|
|
567
|
-
# Spawn agent in background via CLI (using cfn-spawn pattern)
|
|
568
|
-
npx cfn-spawn agent "$AGENT" \
|
|
569
|
-
--task-id "$TASK_ID" \
|
|
570
|
-
--iteration "$ITERATION" \
|
|
571
|
-
--context "Loop 3 implementation" \
|
|
572
|
-
--mode "$MODE" &
|
|
573
|
-
|
|
574
|
-
AGENT_PID=$!
|
|
575
|
-
echo " ✅ Spawned $AGENT (PID: $AGENT_PID)"
|
|
576
|
-
done
|
|
577
|
-
|
|
578
|
-
echo ""
|
|
579
|
-
|
|
580
|
-
# Step 2: Wait for Loop 3 agents to complete
|
|
581
|
-
echo "[Loop 3] Waiting for implementers to complete..."
|
|
582
|
-
|
|
583
|
-
LOOP3_TOTAL=${#AGENTS[@]}
|
|
584
|
-
LOOP3_REQUIRED=$(calculate_quorum "$MIN_QUORUM_LOOP3" "$LOOP3_TOTAL")
|
|
585
|
-
LOOP3_COMPLETED_AGENTS=()
|
|
586
|
-
LOOP3_FAILED_AGENTS=()
|
|
587
|
-
|
|
588
|
-
echo "[Loop 3] Quorum: $LOOP3_REQUIRED/$LOOP3_TOTAL agents required"
|
|
589
|
-
|
|
590
|
-
# Start Loop 3 heartbeat monitor
|
|
591
|
-
echo "[Loop 3] Starting heartbeat monitor (checking every 30s)..."
|
|
592
|
-
LOOP3_HEARTBEAT_MONITOR_PID=$(start_heartbeat_monitor "$TASK_ID" "loop3" "${AGENTS[@]}")
|
|
593
|
-
|
|
594
|
-
for AGENT in "${AGENTS[@]}"; do
|
|
595
|
-
DONE_KEY="swarm:${TASK_ID}:${AGENT}:done"
|
|
596
|
-
|
|
597
|
-
# Get agent-specific timeout
|
|
598
|
-
AGENT_TIMEOUT=$(get_agent_timeout "$AGENT" "$TASK_ID")
|
|
599
|
-
echo " Waiting for $AGENT (timeout: ${AGENT_TIMEOUT}s)..."
|
|
600
|
-
|
|
601
|
-
# METRICS: Agent latency start
|
|
602
|
-
AGENT_START=$(date +%s%N | cut -b1-13)
|
|
603
|
-
|
|
604
|
-
# BLPOP with retry logic using agent-specific timeout
|
|
605
|
-
if RESULT=$(blpop_with_retry "$AGENT" "$DONE_KEY" "$AGENT_TIMEOUT" "$RETRY_COUNT" "$RETRY_DELAY"); then
|
|
606
|
-
# METRICS: Agent latency end
|
|
607
|
-
AGENT_END=$(date +%s%N | cut -b1-13)
|
|
608
|
-
LATENCY=$((AGENT_END - AGENT_START))
|
|
609
|
-
|
|
610
|
-
# Store latency metric with agent label and loop context
|
|
611
|
-
METRIC=$(jq -nc \
|
|
612
|
-
--arg agent "$AGENT" \
|
|
613
|
-
--arg latency "$LATENCY" \
|
|
614
|
-
--arg loop "loop3" \
|
|
615
|
-
--arg iteration "$ITERATION" \
|
|
616
|
-
'{agent: $agent, latency_ms: ($latency | tonumber), loop: $loop, iteration: ($iteration | tonumber)}')
|
|
617
|
-
echo "$METRIC" | redis-cli -x LPUSH "swarm:${TASK_ID}:metrics:agent_latency" >/dev/null
|
|
618
|
-
|
|
619
|
-
echo " ✅ $AGENT complete (${LATENCY}ms)"
|
|
620
|
-
LOOP3_COMPLETED_AGENTS+=("$AGENT")
|
|
621
|
-
else
|
|
622
|
-
echo " ❌ $AGENT failed after $RETRY_COUNT retry attempts"
|
|
623
|
-
LOOP3_FAILED_AGENTS+=("$AGENT")
|
|
624
|
-
|
|
625
|
-
# METRICS: Increment timeout counter
|
|
626
|
-
redis-cli INCR "swarm:${TASK_ID}:metrics:timeout_count" >/dev/null
|
|
627
|
-
fi
|
|
628
|
-
done
|
|
629
|
-
|
|
630
|
-
# Stop Loop 3 heartbeat monitor
|
|
631
|
-
echo "[Loop 3] Stopping heartbeat monitor..."
|
|
632
|
-
stop_heartbeat_monitor "$TASK_ID" "loop3" "$LOOP3_HEARTBEAT_MONITOR_PID"
|
|
633
|
-
LOOP3_HEARTBEAT_MONITOR_PID=""
|
|
634
|
-
|
|
635
|
-
# Validate quorum
|
|
636
|
-
if [ ${#LOOP3_COMPLETED_AGENTS[@]} -ge "$LOOP3_REQUIRED" ]; then
|
|
637
|
-
echo "[Loop 3] ✅ Quorum met: ${#LOOP3_COMPLETED_AGENTS[@]}/$LOOP3_REQUIRED agents completed"
|
|
638
|
-
if [ ${#LOOP3_FAILED_AGENTS[@]} -gt 0 ]; then
|
|
639
|
-
echo "[Loop 3] ⚠️ Failed agents (continuing with quorum): ${LOOP3_FAILED_AGENTS[*]}"
|
|
640
|
-
|
|
641
|
-
# METRICS: Increment quorum fallback counter
|
|
642
|
-
redis-cli INCR "swarm:${TASK_ID}:metrics:quorum_fallback" >/dev/null
|
|
643
|
-
fi
|
|
644
|
-
else
|
|
645
|
-
echo "[Loop 3] ❌ Quorum FAILED: ${#LOOP3_COMPLETED_AGENTS[@]} < $LOOP3_REQUIRED"
|
|
646
|
-
echo "[Loop 3] Failed agents: ${LOOP3_FAILED_AGENTS[*]}"
|
|
647
|
-
exit 1
|
|
648
|
-
fi
|
|
649
|
-
echo ""
|
|
650
|
-
|
|
651
|
-
# Step 2: Collect Loop 3 confidence scores (only from completed agents)
|
|
652
|
-
echo "[Loop 3] Collecting confidence scores from ${#LOOP3_COMPLETED_AGENTS[@]} agents..."
|
|
653
|
-
LOOP3_COMPLETED_IDS=$(IFS=','; echo "${LOOP3_COMPLETED_AGENTS[*]}")
|
|
654
|
-
LOOP3_CONSENSUS=$(./.claude/skills/redis-coordination/invoke-waiting-mode.sh collect \
|
|
655
|
-
--task-id "$TASK_ID" \
|
|
656
|
-
--agent-ids "$LOOP3_COMPLETED_IDS" | tail -1)
|
|
657
|
-
|
|
658
|
-
echo "[Loop 3] Average confidence: $LOOP3_CONSENSUS (from ${#LOOP3_COMPLETED_AGENTS[@]}/${LOOP3_TOTAL} agents)"
|
|
659
|
-
|
|
660
|
-
# METRICS: Store Loop 3 consensus score
|
|
661
|
-
LOOP3_METRIC=$(jq -nc \
|
|
662
|
-
--arg consensus "$LOOP3_CONSENSUS" \
|
|
663
|
-
--arg iteration "$ITERATION" \
|
|
664
|
-
'{consensus: ($consensus | tonumber), iteration: ($iteration | tonumber)}')
|
|
665
|
-
echo "$LOOP3_METRIC" | redis-cli -x LPUSH "swarm:${TASK_ID}:metrics:loop3_consensus" >/dev/null
|
|
666
|
-
|
|
667
|
-
# Gate check
|
|
668
|
-
if (( $(echo "$LOOP3_CONSENSUS < $GATE" | bc -l) )); then
|
|
669
|
-
echo "❌ Gate FAILED ($LOOP3_CONSENSUS < $GATE)"
|
|
670
|
-
echo "Decision: RELAUNCH iteration $((ITERATION + 1))"
|
|
671
|
-
|
|
672
|
-
# METRICS: Increment gate failure counter
|
|
673
|
-
redis-cli INCR "swarm:${TASK_ID}:metrics:gate_failures" >/dev/null
|
|
674
|
-
|
|
675
|
-
# Wake Loop 3 agents for next iteration with MEDIUM priority (priority=30)
|
|
676
|
-
IFS=',' read -ra AGENTS <<< "$LOOP3_AGENTS"
|
|
677
|
-
for AGENT in "${AGENTS[@]}"; do
|
|
678
|
-
./.claude/skills/redis-coordination/invoke-waiting-mode.sh wake \
|
|
679
|
-
--task-id "$TASK_ID" \
|
|
680
|
-
--agent-id "$AGENT" \
|
|
681
|
-
--priority 30 \
|
|
682
|
-
--reason "gate_failed" \
|
|
683
|
-
--iteration $((ITERATION + 1)) \
|
|
684
|
-
--feedback "Improve confidence from $LOOP3_CONSENSUS to >$GATE"
|
|
685
|
-
done
|
|
686
|
-
|
|
687
|
-
continue # Next iteration
|
|
688
|
-
fi
|
|
689
|
-
|
|
690
|
-
echo "✅ Gate PASSED ($LOOP3_CONSENSUS >= $GATE)"
|
|
691
|
-
echo ""
|
|
692
|
-
|
|
693
|
-
# Signal Loop 2 validators that gate has passed (they can start work)
|
|
694
|
-
GATE_PASS_KEY="swarm:${TASK_ID}:gate-passed"
|
|
695
|
-
redis-cli lpush "$GATE_PASS_KEY" "{\"iteration\": $ITERATION, \"loop3_confidence\": $LOOP3_CONSENSUS}" > /dev/null
|
|
696
|
-
echo "[Loop 3] Gate pass signal sent to Loop 2 validators"
|
|
697
|
-
echo ""
|
|
698
|
-
|
|
699
|
-
# Step 3: Spawn Loop 2 validators via CLI
|
|
700
|
-
echo "[Loop 2] Spawning validators via CLI..."
|
|
701
|
-
IFS=',' read -ra VALIDATORS <<< "$LOOP2_AGENTS"
|
|
702
|
-
|
|
703
|
-
for VALIDATOR in "${VALIDATORS[@]}"; do
|
|
704
|
-
echo " Spawning: npx cfn-spawn agent $VALIDATOR --task-id $TASK_ID --iteration $ITERATION"
|
|
705
|
-
|
|
706
|
-
# Spawn validator in background via CLI (using cfn-spawn pattern)
|
|
707
|
-
npx cfn-spawn agent "$VALIDATOR" \
|
|
708
|
-
--task-id "$TASK_ID" \
|
|
709
|
-
--iteration "$ITERATION" \
|
|
710
|
-
--context "Loop 2 validation" \
|
|
711
|
-
--mode "$MODE" &
|
|
712
|
-
|
|
713
|
-
VALIDATOR_PID=$!
|
|
714
|
-
echo " ✅ Spawned $VALIDATOR (PID: $VALIDATOR_PID)"
|
|
715
|
-
done
|
|
716
|
-
|
|
717
|
-
echo ""
|
|
718
|
-
|
|
719
|
-
# Step 4: Wait for Loop 2 validators to complete
|
|
720
|
-
echo "[Loop 2] Waiting for validators to complete..."
|
|
721
|
-
|
|
722
|
-
LOOP2_TOTAL=${#VALIDATORS[@]}
|
|
723
|
-
LOOP2_REQUIRED=$(calculate_quorum "$MIN_QUORUM_LOOP2" "$LOOP2_TOTAL")
|
|
724
|
-
LOOP2_COMPLETED_AGENTS=()
|
|
725
|
-
LOOP2_FAILED_AGENTS=()
|
|
726
|
-
|
|
727
|
-
echo "[Loop 2] Quorum: $LOOP2_REQUIRED/$LOOP2_TOTAL agents required"
|
|
728
|
-
|
|
729
|
-
# Start Loop 2 heartbeat monitor
|
|
730
|
-
echo "[Loop 2] Starting heartbeat monitor (checking every 30s)..."
|
|
731
|
-
LOOP2_HEARTBEAT_MONITOR_PID=$(start_heartbeat_monitor "$TASK_ID" "loop2" "${VALIDATORS[@]}")
|
|
732
|
-
|
|
733
|
-
for VALIDATOR in "${VALIDATORS[@]}"; do
|
|
734
|
-
DONE_KEY="swarm:${TASK_ID}:${VALIDATOR}:done"
|
|
735
|
-
|
|
736
|
-
# Get agent-specific timeout
|
|
737
|
-
AGENT_TIMEOUT=$(get_agent_timeout "$VALIDATOR" "$TASK_ID")
|
|
738
|
-
echo " Waiting for $VALIDATOR (timeout: ${AGENT_TIMEOUT}s)..."
|
|
739
|
-
|
|
740
|
-
# METRICS: Agent latency start
|
|
741
|
-
AGENT_START=$(date +%s%N | cut -b1-13)
|
|
742
|
-
|
|
743
|
-
# BLPOP with retry logic using agent-specific timeout
|
|
744
|
-
if RESULT=$(blpop_with_retry "$VALIDATOR" "$DONE_KEY" "$AGENT_TIMEOUT" "$RETRY_COUNT" "$RETRY_DELAY"); then
|
|
745
|
-
# METRICS: Agent latency end
|
|
746
|
-
AGENT_END=$(date +%s%N | cut -b1-13)
|
|
747
|
-
LATENCY=$((AGENT_END - AGENT_START))
|
|
748
|
-
|
|
749
|
-
# Store latency metric with agent label and loop context
|
|
750
|
-
METRIC=$(jq -nc \
|
|
751
|
-
--arg agent "$VALIDATOR" \
|
|
752
|
-
--arg latency "$LATENCY" \
|
|
753
|
-
--arg loop "loop2" \
|
|
754
|
-
--arg iteration "$ITERATION" \
|
|
755
|
-
'{agent: $agent, latency_ms: ($latency | tonumber), loop: $loop, iteration: ($iteration | tonumber)}')
|
|
756
|
-
echo "$METRIC" | redis-cli -x LPUSH "swarm:${TASK_ID}:metrics:agent_latency" >/dev/null
|
|
757
|
-
|
|
758
|
-
echo " ✅ $VALIDATOR complete (${LATENCY}ms)"
|
|
759
|
-
LOOP2_COMPLETED_AGENTS+=("$VALIDATOR")
|
|
760
|
-
else
|
|
761
|
-
echo " ❌ $VALIDATOR failed after $RETRY_COUNT retry attempts"
|
|
762
|
-
LOOP2_FAILED_AGENTS+=("$VALIDATOR")
|
|
763
|
-
|
|
764
|
-
# METRICS: Increment timeout counter
|
|
765
|
-
redis-cli INCR "swarm:${TASK_ID}:metrics:timeout_count" >/dev/null
|
|
766
|
-
fi
|
|
767
|
-
done
|
|
768
|
-
|
|
769
|
-
# Stop Loop 2 heartbeat monitor
|
|
770
|
-
echo "[Loop 2] Stopping heartbeat monitor..."
|
|
771
|
-
stop_heartbeat_monitor "$TASK_ID" "loop2" "$LOOP2_HEARTBEAT_MONITOR_PID"
|
|
772
|
-
LOOP2_HEARTBEAT_MONITOR_PID=""
|
|
773
|
-
|
|
774
|
-
# Validate quorum
|
|
775
|
-
if [ ${#LOOP2_COMPLETED_AGENTS[@]} -ge "$LOOP2_REQUIRED" ]; then
|
|
776
|
-
echo "[Loop 2] ✅ Quorum met: ${#LOOP2_COMPLETED_AGENTS[@]}/$LOOP2_REQUIRED agents completed"
|
|
777
|
-
if [ ${#LOOP2_FAILED_AGENTS[@]} -gt 0 ]; then
|
|
778
|
-
echo "[Loop 2] ⚠️ Failed agents (continuing with quorum): ${LOOP2_FAILED_AGENTS[*]}"
|
|
779
|
-
|
|
780
|
-
# METRICS: Increment quorum fallback counter
|
|
781
|
-
redis-cli INCR "swarm:${TASK_ID}:metrics:quorum_fallback" >/dev/null
|
|
782
|
-
fi
|
|
783
|
-
else
|
|
784
|
-
echo "[Loop 2] ❌ Quorum FAILED: ${#LOOP2_COMPLETED_AGENTS[@]} < $LOOP2_REQUIRED"
|
|
785
|
-
echo "[Loop 2] Failed agents: ${LOOP2_FAILED_AGENTS[*]}"
|
|
786
|
-
exit 1
|
|
787
|
-
fi
|
|
788
|
-
echo ""
|
|
789
|
-
|
|
790
|
-
# Step 4: Collect Loop 2 consensus scores (only from completed agents)
|
|
791
|
-
echo "[Loop 2] Collecting consensus scores from ${#LOOP2_COMPLETED_AGENTS[@]} agents..."
|
|
792
|
-
LOOP2_COMPLETED_IDS=$(IFS=','; echo "${LOOP2_COMPLETED_AGENTS[*]}")
|
|
793
|
-
LOOP2_CONSENSUS=$(./.claude/skills/redis-coordination/invoke-waiting-mode.sh collect \
|
|
794
|
-
--task-id "$TASK_ID" \
|
|
795
|
-
--agent-ids "$LOOP2_COMPLETED_IDS" | tail -1)
|
|
796
|
-
|
|
797
|
-
echo "[Loop 2] Average consensus: $LOOP2_CONSENSUS (from ${#LOOP2_COMPLETED_AGENTS[@]}/${LOOP2_TOTAL} agents)"
|
|
798
|
-
|
|
799
|
-
# METRICS: Store Loop 2 consensus score
|
|
800
|
-
LOOP2_METRIC=$(jq -nc \
|
|
801
|
-
--arg consensus "$LOOP2_CONSENSUS" \
|
|
802
|
-
--arg iteration "$ITERATION" \
|
|
803
|
-
'{consensus: ($consensus | tonumber), iteration: ($iteration | tonumber)}')
|
|
804
|
-
echo "$LOOP2_METRIC" | redis-cli -x LPUSH "swarm:${TASK_ID}:metrics:loop2_consensus" >/dev/null
|
|
805
|
-
|
|
806
|
-
# Consensus check
|
|
807
|
-
if (( $(echo "$LOOP2_CONSENSUS >= $CONSENSUS" | bc -l) )); then
|
|
808
|
-
echo "✅ CONSENSUS REACHED ($LOOP2_CONSENSUS >= $CONSENSUS)"
|
|
809
|
-
echo ""
|
|
810
|
-
|
|
811
|
-
# Wake Product Owner with CRITICAL priority (priority=5)
|
|
812
|
-
echo "[Coordinator] Waking Product Owner with CRITICAL priority..."
|
|
813
|
-
./.claude/skills/redis-coordination/invoke-waiting-mode.sh wake \
|
|
814
|
-
--task-id "$TASK_ID" \
|
|
815
|
-
--agent-id "$PRODUCT_OWNER" \
|
|
816
|
-
--priority 5 \
|
|
817
|
-
--reason "consensus_ready" \
|
|
818
|
-
--iteration "$ITERATION" \
|
|
819
|
-
--feedback "Loop 2 consensus: $LOOP2_CONSENSUS"
|
|
820
|
-
|
|
821
|
-
# Wait for Product Owner decision
|
|
822
|
-
echo "[Product Owner] Waiting for GOAP decision..."
|
|
823
|
-
DECISION_KEY="swarm:${TASK_ID}:${PRODUCT_OWNER}:decision"
|
|
824
|
-
|
|
825
|
-
# Get agent-specific timeout for Product Owner
|
|
826
|
-
PO_TIMEOUT=$(get_agent_timeout "$PRODUCT_OWNER" "$TASK_ID")
|
|
827
|
-
echo "[Product Owner] Using timeout: ${PO_TIMEOUT}s"
|
|
828
|
-
|
|
829
|
-
# BLPOP with retry logic for decision using agent-specific timeout
|
|
830
|
-
if ! DECISION_RESULT=$(blpop_with_retry "$PRODUCT_OWNER" "$DECISION_KEY" "$PO_TIMEOUT" "$RETRY_COUNT" "$RETRY_DELAY"); then
|
|
831
|
-
echo "❌ ERROR: Product Owner failed after $RETRY_COUNT retry attempts"
|
|
832
|
-
exit 1
|
|
833
|
-
fi
|
|
834
|
-
|
|
835
|
-
# Extract decision from BLPOP result (format: key value)
|
|
836
|
-
DECISION=$(echo "$DECISION_RESULT" | tail -1)
|
|
837
|
-
|
|
838
|
-
DECISION_TYPE=$(echo "$DECISION" | jq -r '.decision')
|
|
839
|
-
|
|
840
|
-
echo "[Product Owner] Decision: $DECISION_TYPE"
|
|
841
|
-
|
|
842
|
-
if [ "$DECISION_TYPE" = "PROCEED" ]; then
|
|
843
|
-
echo ""
|
|
844
|
-
echo "🎉 CFN Loop Complete!"
|
|
845
|
-
echo "Final Consensus: $LOOP2_CONSENSUS (Iteration $ITERATION)"
|
|
846
|
-
|
|
847
|
-
# METRICS: Iteration end timestamp and duration
|
|
848
|
-
ITERATION_END=$(date +%s%N | cut -b1-13)
|
|
849
|
-
ITERATION_DURATION=$((ITERATION_END - ITERATION_START))
|
|
850
|
-
|
|
851
|
-
# Store final iteration duration metric
|
|
852
|
-
DURATION_METRIC=$(jq -nc \
|
|
853
|
-
--arg duration "$ITERATION_DURATION" \
|
|
854
|
-
--arg iteration "$ITERATION" \
|
|
855
|
-
'{duration_ms: ($duration | tonumber), iteration: ($iteration | tonumber)}')
|
|
856
|
-
echo "$DURATION_METRIC" | redis-cli -x LPUSH "swarm:${TASK_ID}:metrics:iteration_duration" >/dev/null
|
|
857
|
-
|
|
858
|
-
# Wake all agents with completion signal - CRITICAL priority (priority=5)
|
|
859
|
-
echo "[Coordinator] Waking all agents with CRITICAL priority for completion..."
|
|
860
|
-
IFS=',' read -ra ALL_AGENTS <<< "$LOOP3_AGENTS,$LOOP2_AGENTS"
|
|
861
|
-
for AGENT in "${ALL_AGENTS[@]}"; do
|
|
862
|
-
./.claude/skills/redis-coordination/invoke-waiting-mode.sh wake \
|
|
863
|
-
--task-id "$TASK_ID" \
|
|
864
|
-
--agent-id "$AGENT" \
|
|
865
|
-
--priority 5 \
|
|
866
|
-
--reason "cfn_complete" \
|
|
867
|
-
--iteration "$ITERATION"
|
|
868
|
-
done
|
|
869
|
-
|
|
870
|
-
# Use general complete-swarm primitive
|
|
871
|
-
./.claude/skills/redis-coordination/complete-swarm.sh \
|
|
872
|
-
--swarm-id "$SWARM_ID" \
|
|
873
|
-
--final-metric "final_consensus=$LOOP2_CONSENSUS" \
|
|
874
|
-
--final-metric "total_iterations=$ITERATION" > /dev/null
|
|
875
|
-
|
|
876
|
-
exit 0
|
|
877
|
-
fi
|
|
878
|
-
|
|
879
|
-
else
|
|
880
|
-
echo "⚠️ CONSENSUS NOT REACHED ($LOOP2_CONSENSUS < $CONSENSUS)"
|
|
881
|
-
echo "Decision: RELAUNCH iteration $((ITERATION + 1))"
|
|
882
|
-
echo ""
|
|
883
|
-
fi
|
|
884
|
-
|
|
885
|
-
# METRICS: Iteration end timestamp and duration (for relaunch scenario)
|
|
886
|
-
ITERATION_END=$(date +%s%N | cut -b1-13)
|
|
887
|
-
ITERATION_DURATION=$((ITERATION_END - ITERATION_START))
|
|
888
|
-
|
|
889
|
-
# Store iteration duration metric
|
|
890
|
-
DURATION_METRIC=$(jq -nc \
|
|
891
|
-
--arg duration "$ITERATION_DURATION" \
|
|
892
|
-
--arg iteration "$ITERATION" \
|
|
893
|
-
'{duration_ms: ($duration | tonumber), iteration: ($iteration | tonumber)}')
|
|
894
|
-
echo "$DURATION_METRIC" | redis-cli -x LPUSH "swarm:${TASK_ID}:metrics:iteration_duration" >/dev/null
|
|
895
|
-
|
|
896
|
-
# Relaunch next iteration
|
|
897
|
-
if [ $ITERATION -eq $MAX_ITERATIONS ]; then
|
|
898
|
-
echo "❌ Maximum iterations ($MAX_ITERATIONS) reached without consensus"
|
|
899
|
-
exit 1
|
|
900
|
-
fi
|
|
901
|
-
|
|
902
|
-
# Wake agents for next iteration with role-based priorities
|
|
903
|
-
echo "[Coordinator] Waking agents for iteration $((ITERATION + 1)) with priorities..."
|
|
904
|
-
|
|
905
|
-
# Wake Loop 3 implementers with MEDIUM priority (priority=30)
|
|
906
|
-
IFS=',' read -ra LOOP3_ARRAY <<< "$LOOP3_AGENTS"
|
|
907
|
-
for AGENT in "${LOOP3_ARRAY[@]}"; do
|
|
908
|
-
./.claude/skills/redis-coordination/invoke-waiting-mode.sh wake \
|
|
909
|
-
--task-id "$TASK_ID" \
|
|
910
|
-
--agent-id "$AGENT" \
|
|
911
|
-
--priority 30 \
|
|
912
|
-
--reason "cfn_loop_iteration" \
|
|
913
|
-
--iteration $((ITERATION + 1)) \
|
|
914
|
-
--feedback "Improve consensus from $LOOP2_CONSENSUS to >=$CONSENSUS"
|
|
915
|
-
done
|
|
916
|
-
|
|
917
|
-
# Wake Loop 2 validators with HIGH priority (priority=10)
|
|
918
|
-
IFS=',' read -ra LOOP2_ARRAY <<< "$LOOP2_AGENTS"
|
|
919
|
-
for AGENT in "${LOOP2_ARRAY[@]}"; do
|
|
920
|
-
./.claude/skills/redis-coordination/invoke-waiting-mode.sh wake \
|
|
921
|
-
--task-id "$TASK_ID" \
|
|
922
|
-
--agent-id "$AGENT" \
|
|
923
|
-
--priority 10 \
|
|
924
|
-
--reason "cfn_loop_iteration" \
|
|
925
|
-
--iteration $((ITERATION + 1)) \
|
|
926
|
-
--feedback "Improve consensus from $LOOP2_CONSENSUS to >=$CONSENSUS"
|
|
927
|
-
done
|
|
928
|
-
|
|
929
|
-
echo ""
|
|
930
|
-
done
|
|
931
|
-
|
|
932
|
-
echo "❌ CFN Loop failed after $MAX_ITERATIONS iterations"
|
|
933
|
-
exit 1
|