claude-flow-novice 2.14.0 → 2.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/.claude/commands/cfn/README.md +177 -129
  2. package/.claude/commands/cfn/cfn-loop-cli.md +279 -0
  3. package/.claude/commands/cfn/cfn-loop-document.md +20 -1
  4. package/.claude/commands/cfn/cfn-loop-frontend.md +17 -2
  5. package/.claude/commands/cfn/cfn-loop-task.md +460 -0
  6. package/.claude/commands/cfn/context-curate.md +27 -38
  7. package/.claude/commands/cfn/context-inject.md +14 -25
  8. package/.claude/commands/cfn/context-query.md +40 -45
  9. package/.claude/commands/cfn/context-reflect.md +40 -38
  10. package/.claude/commands/cfn/context-stats.md +13 -38
  11. package/.claude/commands/cfn/launch-web-dashboard.md +0 -295
  12. package/.claude/commands/cfn/list-agents-rebuild.md +18 -18
  13. package/.claude/commands/cfn/write-plan.md +246 -75
  14. package/.claude/root-claude-distribute/CFN-CLAUDE.md +126 -233
  15. package/.claude/skills/cfn-backlog-management/add-backlog-item.sh +25 -27
  16. package/.claude/skills/cfn-changelog-management/add-changelog-entry.sh +21 -19
  17. package/.claude/skills/cfn-changelog-management/bulk-import.sh +268 -0
  18. package/.claude/skills/cfn-changelog-management/lib/README.md +212 -0
  19. package/.claude/skills/cfn-changelog-management/lib/validation.sh +72 -0
  20. package/.claude/skills/cfn-product-owner-decision/execute-decision.sh +36 -11
  21. package/claude-assets/agents/cfn-dev-team/analysts/root-cause-analyst.md +259 -0
  22. package/claude-assets/agents/cfn-dev-team/reviewers/reviewer.md +45 -6
  23. package/claude-assets/agents/cfn-dev-team/testers/tester.md +59 -0
  24. package/claude-assets/agents/cfn-dev-team/utility/agent-builder.md +4 -4
  25. package/claude-assets/commands/cfn/auto-compact.md +80 -0
  26. package/claude-assets/commands/cfn/cfn-loop-epic.md +478 -0
  27. package/claude-assets/commands/cfn/cfn-loop-single.md +256 -0
  28. package/claude-assets/commands/cfn/cfn-loop-sprints.md +396 -0
  29. package/claude-assets/commands/cfn/cfn-loop.md +518 -0
  30. package/claude-assets/commands/cfn/claude-md.md +64 -0
  31. package/claude-assets/commands/cfn/claude-soul.md +22 -0
  32. package/claude-assets/commands/cfn/cost-savings-off.md +35 -0
  33. package/claude-assets/commands/cfn/cost-savings-on.md +35 -0
  34. package/claude-assets/commands/cfn/cost-savings-status.md +34 -0
  35. package/claude-assets/commands/cfn/custom-routing-activate.md +55 -0
  36. package/claude-assets/commands/cfn/custom-routing-deactivate.md +46 -0
  37. package/claude-assets/commands/cfn/dependency-recommendations.md +171 -0
  38. package/claude-assets/commands/cfn/fullstack.md +179 -0
  39. package/claude-assets/commands/cfn/github.md +221 -0
  40. package/claude-assets/commands/cfn/hooks.md +38 -0
  41. package/claude-assets/commands/cfn/metrics-summary.md +58 -0
  42. package/claude-assets/commands/cfn/neural.md +39 -0
  43. package/claude-assets/commands/cfn/parse-epic.md +357 -0
  44. package/claude-assets/commands/cfn/performance.md +41 -0
  45. package/claude-assets/commands/cfn/sparc.md +46 -0
  46. package/claude-assets/commands/cfn/suggest-improvements.md +95 -0
  47. package/claude-assets/commands/cfn/suggest-templates.md +147 -0
  48. package/claude-assets/commands/cfn/swarm.md +24 -0
  49. package/claude-assets/root-claude-distribute/CFN-CLAUDE.md +126 -233
  50. package/claude-assets/skills/cfn-backlog-management/add-backlog-item.sh +25 -27
  51. package/claude-assets/skills/cfn-changelog-management/add-changelog-entry.sh +21 -19
  52. package/claude-assets/skills/cfn-changelog-management/bulk-import.sh +268 -0
  53. package/claude-assets/skills/cfn-changelog-management/lib/README.md +212 -0
  54. package/claude-assets/skills/cfn-changelog-management/lib/validation.sh +72 -0
  55. package/claude-assets/skills/cfn-product-owner-decision/execute-decision.sh +36 -11
  56. package/claude-assets/skills/cfn-task-config-init/SKILL.md +204 -0
  57. package/claude-assets/skills/cfn-task-config-init/initialize-config.sh +264 -0
  58. package/dist/cli/config-manager.js +109 -91
  59. package/dist/cli/config-manager.js.map +1 -1
  60. package/package.json +2 -2
  61. package/.claude/commands/README.md +0 -205
  62. package/.claude/skills/cfn-loop-orchestration/orchestrate.sh.backup +0 -840
  63. package/.claude/skills/cfn-redis-coordination/invoke-waiting-mode.sh.backup-p7 +0 -423
  64. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.backup +0 -38
  65. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.backup-1761167675 +0 -1672
  66. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.backup-p5 +0 -1604
  67. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.backup-phase1 +0 -1550
  68. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.backup-phase2 +0 -1621
  69. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.backup-phase3 +0 -1621
  70. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.broken +0 -1627
  71. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.corrupted +0 -80
  72. package/.claude/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.deprecated +0 -1864
  73. package/.claude/skills/cfn-redis-coordination/tests/test_coordination_primitives.sh.deprecated +0 -20
  74. package/claude-assets/skills/cfn-loop-orchestration/orchestrate.sh.backup +0 -840
  75. package/claude-assets/skills/cfn-loop2-output-processing/execute-and-extract.sh.backup +0 -36
  76. package/claude-assets/skills/cfn-redis-coordination/invoke-waiting-mode.sh.backup-p7 +0 -423
  77. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.backup +0 -38
  78. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.backup-1761167675 +0 -1672
  79. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.backup-p5 +0 -1604
  80. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.backup-phase1 +0 -1550
  81. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.backup-phase2 +0 -1621
  82. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.backup-phase3 +0 -1621
  83. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.broken +0 -1627
  84. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.corrupted +0 -80
  85. package/claude-assets/skills/cfn-redis-coordination/orchestrate-cfn-loop.sh.deprecated +0 -1864
  86. package/claude-assets/skills/cfn-redis-coordination/tests/test_coordination_primitives.sh.deprecated +0 -20
@@ -1,1621 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- ##############################################################################
4
- # CFN Loop Orchestration v2.0.0
5
- # Manages multi-loop CFN execution with dependency tracking and consensus
6
- #
7
- # Usage:
8
- # ./orchestrate-cfn-loop.sh --task-id <id> \
9
- # --mode <mvp|standard|enterprise> \
10
- # --loop3-agents <agent1,agent2,...> \
11
- # --loop2-agents <agent1,agent2,...> \
12
- # --product-owner <agent-id> \
13
- # [--max-iterations <n>] \
14
- # [--min-quorum-loop3 <n|n%|0.n>] \
15
- # [--min-quorum-loop2 <n|n%|0.n>] \
16
- # [--epic-context <json>] \
17
- # [--phase-context <json>] \
18
- # [--success-criteria <json>]
19
- #
20
- # CFN Loop Structure (CORRECTED):
21
- # Loop 3 (Primary Swarm - Self Validation)
22
- # ↓
23
- # IF Loop 3 self-validation gate FAILS → RELAUNCH Loop 3 (skip Loop 2)
24
- # IF Loop 3 self-validation gate PASSES → Proceed to Loop 2
25
- # ↓
26
- # Loop 2 (Consensus Validators)
27
- # ↓
28
- # Product Owner Decision
29
- #
30
- # Dependency Enforcement:
31
- # - Loop 3 agents self-validate via confidence scores
32
- # - Gate check determines if Loop 2 validators should be engaged
33
- # - Loop 2 agents WAIT for gate pass signal before starting work
34
- # - Product Owner BLOCKS until all Loop 2 agents signal completion
35
- # - Uses Redis BLPOP for zero-token waiting
36
- #
37
- # Quorum Configuration:
38
- # - Absolute: --min-quorum-loop3 3 (requires at least 3 agents)
39
- # - Percentage: --min-quorum-loop3 85% (requires 85% of agents)
40
- # - Decimal: --min-quorum-loop3 0.66 (requires 66% of agents)
41
- # - Default: 0.66 (2/3 majority) if not specified
42
- #
43
- # Agent Requirements:
44
- # Loop 3 (Implementers):
45
- # 1. Complete work
46
- # 2. Signal done: redis-cli lpush "swarm:${TASK_ID}:${AGENT_ID}:done" "complete"
47
- # 3. Report confidence: invoke-waiting-mode.sh report --confidence <0.0-1.0>
48
- # 4. Enter waiting: invoke-waiting-mode.sh enter (for potential iteration)
49
- #
50
- # Loop 2 (Validators):
51
- # 1. WAIT for gate pass: redis-cli blpop "swarm:${TASK_ID}:gate-passed" 0
52
- # 2. Retrieve Loop 3 results for review
53
- # 3. Perform validation
54
- # 4. Signal done: redis-cli lpush "swarm:${TASK_ID}:${AGENT_ID}:done" "complete"
55
- # 5. Report consensus: invoke-waiting-mode.sh report --confidence <0.0-1.0>
56
- # 6. Enter waiting: invoke-waiting-mode.sh enter (for potential iteration)
57
- ##############################################################################
58
-
59
# Strict mode: abort on a failing command, on expansion of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# ---------------------------------------------------------------------------
# Run configuration (defaults)
# ---------------------------------------------------------------------------
TASK_ID=''
MODE='standard'
LOOP3_AGENTS=''
LOOP2_AGENTS=''
PRODUCT_OWNER=''
MAX_ITERATIONS=10
TIMEOUT=3600          # agent completion timeout: 3600 (60 minutes)
RETRY_COUNT=3
RETRY_DELAY=5000      # base retry delay, in milliseconds
MIN_QUORUM_LOOP3=''   # minimum agents for Loop 3 (absolute or percentage)
MIN_QUORUM_LOOP2=''   # minimum agents for Loop 2 (absolute or percentage)

# Process bookkeeping
ORCHESTRATOR_PID=$$
SHUTDOWN_MONITOR_PID=''
SHUTDOWN_REQUESTED=0
LOOP3_HEARTBEAT_MONITOR_PID=''
LOOP2_HEARTBEAT_MONITOR_PID=''

# Optional epic context (for agent system prompts)
EPIC_CONTEXT=''
PHASE_CONTEXT=''
SUCCESS_CRITERIA=''
EXPECTED_FILES=''     # BUG #12 FIX: explicit file verification
PHASE_ID=''           # BUG #16 FIX: phase identifier for timeout configuration

# Per-mode thresholds, keyed by the value of MODE.
declare -A GATE_THRESHOLD=([mvp]=0.70 [standard]=0.75 [enterprise]=0.75)
declare -A CONSENSUS_THRESHOLD=([mvp]=0.80 [standard]=0.90 [enterprise]=0.95)
98
-
99
- # Parse arguments
100
# Parse command-line options.
# Every recognised option takes exactly one value. The first `case` rejects
# unknown options and options that were given without a value, so the second
# `case` can read "$2" safely even under `set -u`.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --task-id | --mode | --loop3-agents | --loop2-agents | --product-owner | \
      --max-iterations | --retry-count | --retry-delay | --timeout | \
      --min-quorum-loop3 | --min-quorum-loop2 | --epic-context | \
      --phase-context | --success-criteria | --expected-files | --phase-id)
      if [[ $# -lt 2 ]]; then
        echo "Error: Option $1 requires a value" >&2
        exit 1
      fi
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac

  case "$1" in
    --task-id) TASK_ID="$2" ;;
    --mode) MODE="$2" ;;
    --loop3-agents) LOOP3_AGENTS="$2" ;;
    --loop2-agents) LOOP2_AGENTS="$2" ;;
    --product-owner) PRODUCT_OWNER="$2" ;;
    --max-iterations) MAX_ITERATIONS="$2" ;;
    --retry-count) RETRY_COUNT="$2" ;;
    --retry-delay) RETRY_DELAY="$2" ;;
    --timeout) TIMEOUT="$2" ;;
    --min-quorum-loop3) MIN_QUORUM_LOOP3="$2" ;;
    --min-quorum-loop2) MIN_QUORUM_LOOP2="$2" ;;
    --epic-context) EPIC_CONTEXT="$2" ;;
    --phase-context) PHASE_CONTEXT="$2" ;;
    --success-criteria) SUCCESS_CRITERIA="$2" ;;
    --expected-files) EXPECTED_FILES="$2" ;;
    --phase-id) PHASE_ID="$2" ;;
  esac
  shift 2
done

# Validation: the task id, both agent lists and the product owner are mandatory.
if [[ -z "$TASK_ID" || -z "$LOOP3_AGENTS" || -z "$LOOP2_AGENTS" || -z "$PRODUCT_OWNER" ]]; then
  echo "Error: Required parameters missing" >&2
  echo "Usage: $0 --task-id <id> --mode <mode> --loop3-agents <agents> --loop2-agents <agents> --product-owner <agent>" >&2
  exit 1
fi

# Reject a mode that has no entry in the threshold tables; otherwise the
# lookups below would abort with an unhelpful "unbound variable" error.
if [[ -z "$MODE" || -z "${GATE_THRESHOLD[$MODE]:-}" || -z "${CONSENSUS_THRESHOLD[$MODE]:-}" ]]; then
  echo "Error: Invalid mode '$MODE' (expected one of: ${!GATE_THRESHOLD[*]})" >&2
  exit 1
fi

GATE=${GATE_THRESHOLD[$MODE]}
CONSENSUS=${CONSENSUS_THRESHOLD[$MODE]}

# Set default quorum values if not specified (66% = 2/3 majority)
MIN_QUORUM_LOOP3=${MIN_QUORUM_LOOP3:-0.66}
MIN_QUORUM_LOOP2=${MIN_QUORUM_LOOP2:-0.66}
186
-
187
- ##############################################################################
188
- # Shutdown Handling Functions
189
- ##############################################################################
190
#######################################
# Tear down background monitors and swarm state, then exit the script.
# Globals:
#   SHUTDOWN_REQUESTED (written), SHUTDOWN_MONITOR_PID,
#   LOOP3_HEARTBEAT_MONITOR_PID, LOOP2_HEARTBEAT_MONITOR_PID,
#   TASK_ID, SWARM_ID (all read)
# Arguments:
#   $1 - exit code (default 130)
#   $2 - reason string, echoed and recorded on the swarm (default
#        "user_interrupt")
# Outputs:
#   Progress messages on stdout.
# Returns:
#   Never returns; exits with $1.
#######################################
function cleanup_and_exit() {
  local exit_code="${1:-130}"
  local reason="${2:-user_interrupt}"
  local keys_deleted

  # Set shutdown flag to stop any ongoing operations
  SHUTDOWN_REQUESTED=1

  echo ""
  echo "=============================================="
  echo "🛑 Orchestrator shutting down gracefully..."
  echo "=============================================="
  echo "Reason: $reason"
  echo "Exit Code: $exit_code"

  # Kill shutdown monitor if running
  if [ -n "${SHUTDOWN_MONITOR_PID:-}" ] && kill -0 "$SHUTDOWN_MONITOR_PID" 2>/dev/null; then
    kill "$SHUTDOWN_MONITOR_PID" 2>/dev/null || true
    wait "$SHUTDOWN_MONITOR_PID" 2>/dev/null || true
  fi

  # Stop heartbeat monitors if running
  if [ -n "${LOOP3_HEARTBEAT_MONITOR_PID:-}" ]; then
    echo "Stopping Loop 3 heartbeat monitor..."
    stop_heartbeat_monitor "${TASK_ID:-}" "loop3" "$LOOP3_HEARTBEAT_MONITOR_PID"
  fi
  if [ -n "${LOOP2_HEARTBEAT_MONITOR_PID:-}" ]; then
    echo "Stopping Loop 2 heartbeat monitor..."
    stop_heartbeat_monitor "${TASK_ID:-}" "loop2" "$LOOP2_HEARTBEAT_MONITOR_PID"
  fi

  # Mark swarm as cancelled if initialized
  if [ -n "${TASK_ID:-}" ] && [ -n "${SWARM_ID:-}" ]; then
    echo "Marking swarm as cancelled..."
    ./.claude/skills/redis-coordination/complete-swarm.sh \
      --swarm-id "$SWARM_ID" \
      --final-metric "status=cancelled" \
      --final-metric "shutdown_reason=$reason" 2>/dev/null || echo " ⚠️ Failed to mark swarm as cancelled"
  fi

  # Clean up Redis keys
  if [ -n "${TASK_ID:-}" ]; then
    echo "Cleaning up Redis keys..."
    # Assignment is kept separate from `local` so the pipeline's status is
    # not masked; an empty result (no keys matched, or redis unavailable)
    # is reported as 0.
    keys_deleted=$(redis-cli --scan --pattern "swarm:${TASK_ID}:*" | xargs -r redis-cli DEL 2>/dev/null) || keys_deleted=""
    echo " Deleted ${keys_deleted:-0} Redis keys"
  fi

  # Clean up heartbeat monitor marker files. The task id is quoted so that
  # only the trailing `*` is subject to globbing.
  rm -f -- "/tmp/heartbeat-monitor-${TASK_ID:-}-"*.active 2>/dev/null || true

  echo "=============================================="
  echo "Shutdown complete"
  echo "=============================================="

  exit "$exit_code"
}
245
-
246
# Graceful shutdown hooks: log the signal on stderr, then run
# cleanup_and_exit with exit code 130 for INT and 143 for TERM.
trap 'echo "[TRAP] Caught SIGINT" >&2; cleanup_and_exit 130 "SIGINT_received"' INT
trap 'echo "[TRAP] Caught SIGTERM" >&2; cleanup_and_exit 143 "SIGTERM_received"' TERM
249
-
250
- ##############################################################################
251
- # Start Shutdown Monitor (Background Process)
252
- ##############################################################################
253
#######################################
# Launch a background subshell that blocks on the Redis list
# "swarm:<task_id>:shutdown" and, when an entry arrives, sends SIGTERM to
# the orchestrator process.
# Globals:
#   ORCHESTRATOR_PID (read), SHUTDOWN_MONITOR_PID (written)
# Arguments:
#   $1 - task id
# Outputs:
#   One status line on stdout with the monitor's PID.
#######################################
function start_shutdown_monitor() {
  local task_id="$1"

  (
    # BLPOP with timeout 0 blocks until something is pushed.
    channel="swarm:${task_id}:shutdown"
    popped=$(redis-cli BLPOP "$channel" 0 2>/dev/null || echo "")

    # Nothing received (e.g. redis-cli failed): the monitor simply ends.
    [ -n "$popped" ] || exit 0

    # The payload is taken from the last line of the BLPOP reply; its
    # .reason field is optional.
    payload=$(tail -1 <<<"$popped")
    why=$(jq -r '.reason // "external_shutdown"' <<<"$payload" 2>/dev/null || echo "external_shutdown")

    echo ""
    echo "🛑 Shutdown signal received from Redis channel: $why"
    echo " Sending SIGTERM to orchestrator PID: $ORCHESTRATOR_PID"

    if ! kill -TERM "$ORCHESTRATOR_PID" 2>/dev/null; then
      echo " ❌ Failed to send SIGTERM (process may have already exited)"
      exit 0
    fi
    echo " ✅ SIGTERM sent successfully"
  ) &

  SHUTDOWN_MONITOR_PID=$!
  echo "Shutdown monitor started (PID: $SHUTDOWN_MONITOR_PID)"
}
283
-
284
- ##############################################################################
285
- # Feedback Accumulation Function (PHASE 1 - BUG #23 FIX)
286
- ##############################################################################
287
- # Accumulates feedback across iterations to enable learning
288
- # Usage: accumulate_feedback <task_id> <iteration> <source> <feedback_message>
289
#######################################
# Append one feedback record to the JSON array stored in Redis under
# "swarm:<task_id>:feedback:history", so feedback accumulates across
# iterations.
# Arguments:
#   $1 - task id
#   $2 - iteration number (converted with jq `tonumber`)
#   $3 - feedback source
#   $4 - feedback message
# Outputs:
#   One confirmation line on stdout.
#######################################
function accumulate_feedback() {
  local task_id="$1"
  local iteration="$2"
  local source="$3"
  local feedback_message="$4"

  local feedback_key="swarm:${task_id}:feedback:history"

  # Retrieve existing feedback history
  local feedback_history
  feedback_history=$(redis-cli GET "$feedback_key" 2>/dev/null)

  # Normalise anything that is not a JSON array (empty reply, "(nil)", or a
  # corrupted value) to an empty array so `--argjson` below cannot fail on it.
  if [ -z "$feedback_history" ] || [ "$feedback_history" = "(nil)" ] \
    || ! jq -e 'type == "array"' <<<"$feedback_history" >/dev/null 2>&1; then
    feedback_history="[]"
  fi

  # Append new feedback with metadata
  local new_feedback
  new_feedback=$(jq -nc \
    --argjson history "$feedback_history" \
    --arg iteration "$iteration" \
    --arg source "$source" \
    --arg feedback "$feedback_message" \
    --arg timestamp "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
    '$history + [{
      iteration: ($iteration | tonumber),
      source: $source,
      feedback: $feedback,
      timestamp: $timestamp
    }]')

  # Store accumulated history with a 24 h TTL (86400 s).
  # `redis-cli -x` appends stdin as the LAST argument, so the value must be
  # the final positional parameter of the command: SETEX key seconds value.
  # (`SET key EX 86400` + stdin would be sent as `SET key EX 86400 <value>`,
  # which Redis rejects.)
  printf '%s' "$new_feedback" | redis-cli -x SETEX "$feedback_key" 86400 >/dev/null

  echo "[Feedback] ✅ Accumulated feedback for iteration $iteration (source: $source)"
}
325
-
326
- ##############################################################################
327
- # Quorum Calculation Function
328
- ##############################################################################
329
#######################################
# Convert a quorum specification into a required agent count.
# Accepted specifications:
#   ""            -> all agents
#   "85%", "66.6%"-> ceil(total * pct / 100)
#   "0.66", ".5"  -> ceil(total * fraction)
#   "1.0"         -> all agents
#   "3"           -> absolute count (must not exceed total)
# The result is rounded UP so the returned count never represents less than
# the requested share. Uses integer arithmetic only.
# Arguments:
#   $1 - quorum specification
#   $2 - total number of agents (non-negative integer)
# Outputs:
#   Required agent count on stdout; error text on stderr.
# Returns:
#   0 on success, 1 on an invalid specification or a quorum above the total.
#######################################
function calculate_quorum() {
  local quorum_spec="$1"
  local total_agents="$2"
  local pct_re='^([0-9]*)(\.([0-9]+))?%$'
  local frac_re='^0?\.([0-9]+)$'
  local whole frac total numerator denominator required

  if [[ ! "$total_agents" =~ ^[0-9]+$ ]]; then
    echo "Error: Total agents must be a non-negative integer (got '$total_agents')" >&2
    return 1
  fi
  total=$((10#$total_agents)) # force base 10 so "08" is not read as octal

  # If no quorum specified, require all agents
  if [ -z "$quorum_spec" ]; then
    echo "$total_agents"
    return 0
  fi

  if [[ "$quorum_spec" =~ $pct_re ]] && [[ -n "${BASH_REMATCH[1]}${BASH_REMATCH[3]}" ]]; then
    # Percentage: "PP.ppp%" becomes PPppp / (100 * 10^len(ppp)).
    whole="${BASH_REMATCH[1]}"
    frac="${BASH_REMATCH[3]}"
    numerator=$((10#${whole}${frac}))
    denominator=$((100 * 10 ** ${#frac}))
  elif [[ "$quorum_spec" =~ $frac_re ]]; then
    # Fraction: "0.ddd" becomes ddd / 10^len(ddd).
    frac="${BASH_REMATCH[1]}"
    numerator=$((10#$frac))
    denominator=$((10 ** ${#frac}))
  elif [[ "$quorum_spec" =~ ^1\.0+$ ]]; then
    numerator=1
    denominator=1
  elif [[ "$quorum_spec" =~ ^[0-9]+$ ]]; then
    # Absolute number - validate it doesn't exceed total
    if ((10#$quorum_spec > total)); then
      echo "Error: Quorum ($quorum_spec) exceeds total agents ($total_agents)" >&2
      return 1
    fi
    echo "$quorum_spec"
    return 0
  else
    echo "Error: Invalid quorum specification '$quorum_spec'" >&2
    return 1
  fi

  # Integer ceiling division: ceil(a / b) == (a + b - 1) / b for b > 0.
  required=$(((total * numerator + denominator - 1) / denominator))
  if ((required > total)); then
    echo "Error: Quorum ($quorum_spec) exceeds total agents ($total_agents)" >&2
    return 1
  fi
  echo "$required"
}
358
-
359
- ##############################################################################
360
- # Dead Letter Queue (DLQ) Functions
361
- ##############################################################################
362
#######################################
# Record a failed agent in its dead-letter list
# "swarm:<TASK_ID>:dlq:<agent>" and give that key a 7-day TTL.
# Globals:
#   TASK_ID (read); DLQ_KEY, DLQ_ENTRY (written)
# Arguments:
#   $1 - agent name
#   $2 - failure reason
#   $3 - retry count (stored as a number)
# Outputs:
#   One summary line on stdout.
#######################################
function write_to_dlq() {
  local agent="$1" reason="$2" retry_count="$3"
  local now
  now="$(date +%s)"

  DLQ_KEY="swarm:${TASK_ID}:dlq:${agent}"
  DLQ_ENTRY=$(
    jq -n \
      --arg reason "$reason" \
      --arg retries "$retry_count" \
      --arg ts "$now" \
      '{reason: $reason, retry_count: ($retries | tonumber), timestamp: ($ts | tonumber)}'
  )

  # -x makes redis-cli take the final LPUSH argument from stdin.
  redis-cli -x LPUSH "$DLQ_KEY" <<<"$DLQ_ENTRY" >/dev/null
  redis-cli EXPIRE "$DLQ_KEY" 604800 >/dev/null # 7 days TTL

  echo " ❌ $agent → DLQ (reason: $reason, retries: $retry_count)"
}
379
-
380
- ##############################################################################
381
- # Exponential Backoff Retry Function
382
- ##############################################################################
383
#######################################
# Sleep for base_delay * 2^attempt milliseconds before a retry, in slices of
# at most 0.5 s so a shutdown request can cut the wait short.
# Globals:
#   SHUTDOWN_REQUESTED (read)
# Arguments:
#   $1 - agent name (used in log lines only)
#   $2 - attempt number (the exponent)
#   $3 - maximum retries (used in log lines only)
#   $4 - base delay in milliseconds
# Returns:
#   Always 0, whether the wait completed or was interrupted.
#######################################
function retry_with_backoff() {
  local agent="$1"
  local attempt="$2"
  local max_retries="$3"
  local base_delay="$4"

  # Nothing to wait for once shutdown has been requested.
  if [[ "$SHUTDOWN_REQUESTED" -eq 1 ]]; then
    echo " [SHUTDOWN] Skipping backoff delay for $agent" >&2
    return 0
  fi

  # Exponential backoff: delay = base_delay * (2 ^ attempt)
  local wait_ms=$(bc <<<"$base_delay * (2 ^ $attempt)")
  local now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

  echo " [$now] [Retry $attempt/$max_retries] Waiting ${wait_ms}ms before retry for $agent..."

  local wait_sec=$(bc <<<"scale=3; $wait_ms / 1000")
  local slept=0

  until (($(bc -l <<<"$slept >= $wait_sec"))); do
    # Next slice: 0.5 s, or whatever is left if that is shorter.
    local left=$(bc <<<"$wait_sec - $slept")
    local nap=$(bc <<<"if ($left < 0.5) $left else 0.5")

    # Sleep in the background and `wait` on it; if the wait is interrupted
    # (e.g. by SIGTERM) return immediately.
    sleep "$nap" &
    wait $! 2>/dev/null || return 0

    slept=$(bc <<<"$slept + $nap")

    if [[ "$SHUTDOWN_REQUESTED" -eq 1 ]]; then
      echo " [SHUTDOWN] Interrupted backoff delay for $agent" >&2
      return 0
    fi
  done
}
421
-
422
- ##############################################################################
423
- # Heartbeat Monitoring Functions
424
- ##############################################################################
425
# Consecutive missed-heartbeat counters, keyed by agent name.
declare -A MISSED_HEARTBEATS

#######################################
# Report whether an agent currently has a heartbeat recorded in Redis.
# The value is read from field "heartbeat" of the hash
# "swarm:<task_id>:agent:<agent>-<iteration>".
# Globals:
#   HB_KEY, HB_DATA (written)
# Arguments:
#   $1 - agent name (without iteration suffix)
#   $2 - task id
#   $3 - iteration number
# Returns:
#   0 if a heartbeat value is present (alive), 1 otherwise (dead).
#######################################
function check_agent_heartbeat() {
  local agent="$1" task_id="$2" iteration="$3"

  HB_KEY="swarm:${task_id}:agent:${agent}-${iteration}"
  HB_DATA=$(redis-cli HGET "$HB_KEY" heartbeat 2>/dev/null || echo "")

  # Alive only when the reply is non-empty and not the literal "(nil)".
  [[ -n "$HB_DATA" && "$HB_DATA" != "(nil)" ]]
}
443
-
444
#######################################
# Run one heartbeat sweep over the given agents. An agent that misses two
# consecutive sweeps is reported as hung, together with whether its loop's
# quorum is already met by the agents that have completed.
# Globals:
#   MISSED_HEARTBEATS (read/written)
#   LOOP3_AGENTS, LOOP2_AGENTS (read; comma- or space-separated lists)
#   LOOP3_FAILED_AGENTS, LOOP2_FAILED_AGENTS,
#   LOOP3_COMPLETED_AGENTS, LOOP2_COMPLETED_AGENTS (read; arrays)
#   MIN_QUORUM_LOOP3, MIN_QUORUM_LOOP2, LOOP3_TOTAL, LOOP2_TOTAL (read)
# Arguments:
#   $1 - task id
#   $2 - loop name (used in log lines)
#   $3 - iteration number
#   $4.. - agent names to check
# Outputs:
#   Warnings and quorum notes on stderr.
#######################################
function check_heartbeats_loop() {
  local task_id="$1"
  local loop_name="$2"
  local iteration="$3"
  shift 3
  local agents=("$@")
  local agent timestamp completed required

  # Flatten the failed-agent arrays once; `:-` keeps this safe under `set -u`
  # when an array has not been created yet.
  local failed=" ${LOOP3_FAILED_AGENTS[*]:-} ${LOOP2_FAILED_AGENTS[*]:-} "

  # The agent lists are passed on the command line as comma-separated
  # strings. Normalise any whitespace to commas and wrap in commas so a
  # whole-name match works for either separator.
  local loop3_list=",${LOOP3_AGENTS:-},"
  local loop2_list=",${LOOP2_AGENTS:-},"
  loop3_list="${loop3_list//[[:space:]]/,}"
  loop2_list="${loop2_list//[[:space:]]/,}"

  for agent in "${agents[@]}"; do
    # Skip agents already marked as failed
    if [[ "$failed" == *" ${agent} "* ]]; then
      continue
    fi

    if check_agent_heartbeat "$agent" "$task_id" "$iteration"; then
      MISSED_HEARTBEATS["$agent"]=0 # Reset counter
      continue
    fi

    MISSED_HEARTBEATS["$agent"]=$((${MISSED_HEARTBEATS["$agent"]:-0} + 1))
    if [ "${MISSED_HEARTBEATS["$agent"]}" -lt 2 ]; then
      continue
    fi

    timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
    echo " [$timestamp] [$loop_name] ⚠️ $agent appears hung (no heartbeat for 60s)" >&2

    # Determine which loop this agent belongs to and check quorum
    if [[ "$loop3_list" == *",${agent},"* ]]; then
      # Safety check: skip if Loop 3 bookkeeping hasn't been initialized yet
      if [ -z "${LOOP3_COMPLETED_AGENTS+x}" ]; then
        continue
      fi
      completed=${#LOOP3_COMPLETED_AGENTS[@]}
      required=$(calculate_quorum "${MIN_QUORUM_LOOP3:-}" "${LOOP3_TOTAL:-}") || continue
    elif [[ "$loop2_list" == *",${agent},"* ]]; then
      # Safety check: Skip if Loop 2 hasn't been initialized yet
      if [ -z "${LOOP2_COMPLETED_AGENTS+x}" ]; then
        continue
      fi
      completed=${#LOOP2_COMPLETED_AGENTS[@]}
      required=$(calculate_quorum "${MIN_QUORUM_LOOP2:-}" "${LOOP2_TOTAL:-}") || continue
    else
      continue
    fi

    if [ "$completed" -ge "$required" ]; then
      echo " [$timestamp] [$loop_name] ℹ️ Continuing with quorum (${completed}/${required} agents)" >&2
    else
      echo " [$timestamp] [$loop_name] ⚠️ Cannot meet quorum without $agent (${completed}/${required})" >&2
    fi
  done
}
490
-
491
#######################################
# Start a background heartbeat sweep for one loop. The sweep repeats every
# 30 s for as long as the marker file
# /tmp/heartbeat-monitor-<task_id>-<loop_name>.active exists and no shutdown
# has been requested.
# Globals:
#   SHUTDOWN_REQUESTED (read)
# Arguments:
#   $1 - task id
#   $2 - loop name
#   $3 - iteration number
#   $4.. - agent names to monitor
# Outputs:
#   Nothing; the caller reads the background PID from $! after the call
#   ([BUG #7 FIX]).
#######################################
function start_heartbeat_monitor() {
  local task_id="$1"
  local loop_name="$2"
  local iteration="$3"
  shift 3
  local agents=("$@")
  local marker_path="/tmp/heartbeat-monitor-${task_id}-${loop_name}.active"

  # The marker doubles as the stop switch: removing it ends the sweep loop.
  touch "$marker_path"

  (
    while [[ -f "$marker_path" && "$SHUTDOWN_REQUESTED" -ne 1 ]]; do
      check_heartbeats_loop "$task_id" "$loop_name" "$iteration" "${agents[@]}"
      sleep 30
    done
  ) &
}
517
-
518
#######################################
# Stop a heartbeat monitor: delete its marker file, then terminate and reap
# the monitor process if it is still running.
# Arguments:
#   $1 - task id
#   $2 - loop name
#   $3 - monitor PID (may be empty)
# Returns:
#   Always 0.
#######################################
function stop_heartbeat_monitor() {
  local task_id="$1" loop_name="$2" monitor_pid="$3"

  # Deleting the marker ends the monitor's loop on its next check.
  rm -f "/tmp/heartbeat-monitor-${task_id}-${loop_name}.active"

  # Nothing further to do without a live PID.
  [ -n "$monitor_pid" ] || return 0
  kill -0 "$monitor_pid" 2>/dev/null || return 0

  kill "$monitor_pid" 2>/dev/null || true
  wait "$monitor_pid" 2>/dev/null || true
}
532
-
533
- ##############################################################################
534
- # Get Agent-Specific Timeout
535
- ##############################################################################
536
#######################################
# Print the timeout to use for an agent. The value comes from the
# get-agent-timeout.sh helper located next to this script; if the helper
# fails, the global default TIMEOUT is printed instead.
# Globals:
#   TIMEOUT (read); SCRIPT_DIR, AGENT_TIMEOUT (written)
# Arguments:
#   $1 - agent id
#   $2 - task id
# Outputs:
#   The timeout value on stdout.
#######################################
function get_agent_timeout() {
  local agent="$1" task_id="$2"
  local helper

  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  helper="$SCRIPT_DIR/get-agent-timeout.sh"

  # The helper's stderr is discarded; a non-zero exit selects the default.
  AGENT_TIMEOUT=$("$helper" --task-id "$task_id" --agent-id "$agent" 2>/dev/null || echo "$TIMEOUT")

  echo "$AGENT_TIMEOUT"
}
546
-
547
- ##############################################################################
548
- # Process-Based Completion Monitoring
549
- ##############################################################################
550
#######################################
# Watch an agent process from a background subshell and, once `wait`
# returns, push a completion marker onto the agent's done list if the agent
# has not already done so itself.
# Arguments:
#   $1 - agent id (used in log lines)
#   $2 - agent PID
#   $3 - task id (used for the error metric key)
#   $4 - Redis list key that receives the done signal
# Outputs:
#   Log lines on stderr.
#
# NOTE(review): `wait` is issued from inside a subshell, where the agent PID
# is presumably not a child of the waiting shell; in that case `wait`
# reports 127 instead of the agent's real status, and with errexit active
# the subshell may terminate at that line. Confirm how callers spawn the
# agent before relying on the exit-code branches below.
#######################################
function monitor_agent_process() {
  local agent_id="$1" agent_pid="$2" task_id="$3" done_key="$4"

  (
    wait "$agent_pid" 2>/dev/null
    status=$?

    # If anything is already queued on the done list, the agent signalled
    # by itself and there is nothing to add.
    queued=$(redis-cli LLEN "$done_key" 2>/dev/null || echo "0")
    if [ "$queued" -gt 0 ]; then
      exit 0
    fi

    # Process ended without signalling: synthesise the done entry.
    if [ $status -ne 0 ]; then
      echo " [Process Monitor] $agent_id exited with error (code $status) - auto-signaling failure" >&2
      redis-cli LPUSH "$done_key" "auto-completed-error:$status" >/dev/null

      # METRICS: Increment error counter
      redis-cli INCR "swarm:${task_id}:metrics:agent_errors" >/dev/null
    else
      echo " [Process Monitor] $agent_id exited successfully (code 0) - auto-signaling completion" >&2
      redis-cli LPUSH "$done_key" "auto-completed-success" >/dev/null
    fi
  ) &
}
582
-
583
- ##############################################################################
584
- # BLPOP with Retry Logic + Process Monitoring
585
- ##############################################################################
586
#######################################
# Wait for an agent's done signal with BLPOP, retrying with exponential
# backoff. After each timed-out attempt it checks whether the agent process
# has exited (picking up any auto-generated done entry) and whether the
# agent's heartbeat key exists, terminating the process if it is alive but
# has no heartbeat. After the final attempt the agent is written to the DLQ.
# Globals:
#   SHUTDOWN_REQUESTED, TASK_ID (read)
#   ATTEMPT, RESULT, HEARTBEAT_KEY, HEARTBEAT_EXISTS (written)
# Arguments:
#   $1 - agent name
#   $2 - Redis list key carrying the done signal
#   $3 - BLPOP timeout in seconds
#   $4 - number of attempts
#   $5 - base backoff delay in milliseconds
#   $6 - optional agent PID for process monitoring
# Outputs:
#   stdout: the BLPOP/LPOP reply on success, nothing otherwise (callers
#           capture it with command substitution).
#   stderr: all diagnostics, including the DLQ notice.
# Returns:
#   0 when a done signal was retrieved, 1 on shutdown or final failure.
#######################################
function blpop_with_retry() {
  local agent="$1"
  local done_key="$2"
  local timeout="$3"
  local retry_count="$4"
  local retry_delay="$5"
  local agent_pid="${6:-}" # Optional: PID for process monitoring
  local timestamp

  for ((ATTEMPT = 1; ATTEMPT <= retry_count; ATTEMPT++)); do
    # Check for shutdown before attempting BLPOP
    if [ "$SHUTDOWN_REQUESTED" -eq 1 ]; then
      echo " [SHUTDOWN] Aborting BLPOP for $agent" >&2
      return 1
    fi

    # Use Redis's native BLPOP timeout instead of shell timeout command
    # This allows SIGTERM to properly interrupt the process
    RESULT=$(redis-cli blpop "$done_key" "$timeout" 2>/dev/null || echo "")

    if [ -n "$RESULT" ]; then
      echo "$RESULT"
      return 0 # Success
    fi

    # BLPOP timeout - check if process is still alive
    if [ -n "$agent_pid" ]; then
      if ! kill -0 "$agent_pid" 2>/dev/null; then
        echo " [Process Check] Agent process $agent_pid no longer running" >&2

        # Process exited - check if done signal was auto-generated
        RESULT=$(redis-cli LPOP "$done_key" 2>/dev/null || echo "")
        if [ -n "$RESULT" ]; then
          echo " [Auto-Complete] Retrieved: $RESULT" >&2
          echo "$RESULT"
          return 0
        fi
      fi
    fi

    # Check for shutdown after BLPOP timeout
    if [ "$SHUTDOWN_REQUESTED" -eq 1 ]; then
      echo " [SHUTDOWN] Aborting retry for $agent" >&2
      return 1
    fi

    # Check heartbeat status
    HEARTBEAT_KEY="swarm:${TASK_ID}:${agent}:heartbeat"
    HEARTBEAT_EXISTS=$(redis-cli EXISTS "$HEARTBEAT_KEY" 2>/dev/null || echo "0")

    if [ "$HEARTBEAT_EXISTS" -eq 0 ]; then
      echo " ⚠️ No heartbeat from $agent - agent may be stuck or crashed" >&2

      # If we have PID and process is stuck, kill it
      if [ -n "$agent_pid" ] && kill -0 "$agent_pid" 2>/dev/null; then
        echo " [Timeout Kill] Terminating stuck process $agent_pid" >&2
        kill "$agent_pid" 2>/dev/null || true
        sleep 2

        # Force kill if still alive
        if kill -0 "$agent_pid" 2>/dev/null; then
          kill -9 "$agent_pid" 2>/dev/null || true
        fi

        # METRICS: Increment timeout counter
        redis-cli INCR "swarm:${TASK_ID}:metrics:agent_killed" >/dev/null
      fi
    fi

    # Log retry attempt (to stderr so it's visible during command substitution)
    timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
    echo " [$timestamp] ⚠️ BLPOP attempt $ATTEMPT/$retry_count failed for $agent" >&2

    if [ "$ATTEMPT" -lt "$retry_count" ]; then
      # METRICS: Increment retry counter
      redis-cli INCR "swarm:${TASK_ID}:metrics:retry_count" >/dev/null

      retry_with_backoff "$agent" "$ATTEMPT" "$retry_count" "$retry_delay" >&2
    else
      # Final failure - write to DLQ. Its notice is sent to stderr so that
      # stdout stays reserved for the done-signal payload.
      echo " [$timestamp] ❌ FINAL FAILURE: $agent after $retry_count attempts" >&2
      write_to_dlq "$agent" "timeout_after_retries" "$retry_count" >&2
      return 1
    fi
  done

  return 1
}
673
-
674
echo "=== CFN Loop Orchestration ==="
echo "Task ID: $TASK_ID"
echo "Mode: $MODE (Gate: $GATE, Consensus: $CONSENSUS)"
echo "Max Iterations: $MAX_ITERATIONS"
echo ""

# Initialize swarm using general Redis coordination primitive
SWARM_ID="swarm-${TASK_ID}"
ALL_AGENTS="${LOOP3_AGENTS},${LOOP2_AGENTS},${PRODUCT_OWNER}"

# LOG: Swarm initialization.
# The details document is built with jq so that quotes or backslashes in
# agent names cannot produce invalid JSON. Logging is best-effort, so a jq
# failure degrades to an empty object instead of aborting the run.
SWARM_INIT_DETAILS=$(jq -nc \
  --arg mode "$MODE" \
  --arg loop3 "$LOOP3_AGENTS" \
  --arg loop2 "$LOOP2_AGENTS" \
  --arg owner "$PRODUCT_OWNER" \
  --arg max_iterations "$MAX_ITERATIONS" \
  --arg gate "$GATE" \
  --arg consensus "$CONSENSUS" \
  '{
    mode: $mode,
    loop3_agents: $loop3,
    loop2_agents: $loop2,
    product_owner: $owner,
    max_iterations: ($max_iterations | tonumber),
    gate_threshold: ($gate | tonumber),
    consensus_threshold: ($consensus | tonumber)
  }' 2>/dev/null) || SWARM_INIT_DETAILS='{}'

./.claude/skills/redis-coordination/log-event.sh \
  --task-id "$TASK_ID" \
  --event-type "swarm_init" \
  --details "$SWARM_INIT_DETAILS" \
  --level "INFO" 2>/dev/null || true

# Build CFN-specific metadata (same keys as before, JSON-escaped by jq)
CFN_METADATA=$(jq -n \
  --arg mode "$MODE" \
  --arg loop3 "$LOOP3_AGENTS" \
  --arg loop2 "$LOOP2_AGENTS" \
  --arg owner "$PRODUCT_OWNER" \
  '{
    mode: $mode,
    loop3_agents: $loop3,
    loop2_agents: $loop2,
    product_owner: $owner,
    workflow_type: "cfn_loop"
  }')

# Use general init-swarm primitive
./.claude/skills/redis-coordination/init-swarm.sh \
  --swarm-id "$SWARM_ID" \
  --agents "$ALL_AGENTS" \
  --task-id "$TASK_ID" \
  --topology "hierarchical" \
  --metadata "$CFN_METADATA" >/dev/null

# Start shutdown monitor in background
start_shutdown_monitor "$TASK_ID"

# Store the optional context documents in Redis with a 7-day TTL (604800 s).
# Each value is passed to redis-cli as a single argv element and is never
# re-parsed by a shell, so it is stored verbatim; shell-style quote escaping
# would end up inside the stored text.
if [ -n "$EPIC_CONTEXT" ]; then
  echo "📋 Storing epic context in Redis..."
  redis-cli setex "swarm:${TASK_ID}:epic-context" 604800 "$EPIC_CONTEXT" >/dev/null
  echo " ✅ Epic context stored (TTL: 7 days)"
fi

if [ -n "$PHASE_CONTEXT" ]; then
  echo "📋 Storing phase context in Redis..."
  redis-cli setex "swarm:${TASK_ID}:phase-context" 604800 "$PHASE_CONTEXT" >/dev/null
  echo " ✅ Phase context stored (TTL: 7 days)"
fi

if [ -n "$SUCCESS_CRITERIA" ]; then
  echo "📋 Storing success criteria in Redis..."
  redis-cli setex "swarm:${TASK_ID}:success-criteria" 604800 "$SUCCESS_CRITERIA" >/dev/null
  echo " ✅ Success criteria stored (TTL: 7 days)"
fi

echo ""

# [BUG #15 FIX] REMOVED: Early Product Owner spawn at iteration 0
# Product Owner now only spawned after Loop 2 completes (see line 1283)
# This prevents timeout issues with waiting mode initialization
echo "[Product Owner] Will spawn after Loop 2 consensus (just-in-time pattern)"
echo ""
744
-
745
- # Iteration loop
746
- for ITERATION in $(seq 1 $MAX_ITERATIONS); do
747
echo "=== Iteration $ITERATION/$MAX_ITERATIONS ==="

# METRICS: Iteration start timestamp in milliseconds
# (the first 13 digits of the nanosecond epoch printed by date).
ITERATION_START=$(date +%s%N | cut -b1-13)
redis-cli LPUSH "swarm:${TASK_ID}:metrics:iteration_start" "$ITERATION_START" >/dev/null

# Step 1: Build detailed agent context from Redis (BUG #20 FIX - Option 2)
echo "[Loop 3] Building agent context from Redis..."

# Retrieve stored context.
# redis-cli reports a missing key as an empty reply (or "(nil)") with a zero
# exit status, so "|| echo {}" on its own never supplied the default. The
# reply is therefore normalised explicitly to an empty JSON object.
EPIC_CTX=$(redis-cli get "swarm:${TASK_ID}:epic-context" 2>/dev/null || true)
PHASE_CTX=$(redis-cli get "swarm:${TASK_ID}:phase-context" 2>/dev/null || true)
SUCCESS_CTX=$(redis-cli get "swarm:${TASK_ID}:success-criteria" 2>/dev/null || true)
case "$EPIC_CTX" in "" | "(nil)") EPIC_CTX="{}" ;; esac
case "$PHASE_CTX" in "" | "(nil)") PHASE_CTX="{}" ;; esac
case "$SUCCESS_CTX" in "" | "(nil)") SUCCESS_CTX="{}" ;; esac

# Extract key fields with jq.
# The "(not specified)" placeholders are applied after the pipeline: the exit
# status of "jq | sed" is sed's, so a trailing "|| echo ..." could never fire
# when jq simply produced no items.
EPIC_GOAL=$(jq -r '.epicGoal // "No epic goal specified"' <<<"$EPIC_CTX" || true)
[ -n "$EPIC_GOAL" ] || EPIC_GOAL="No epic goal specified"

IN_SCOPE=$(jq -r '.inScope[]? // empty' <<<"$EPIC_CTX" | sed 's/^/- /' || true)
[ -n "$IN_SCOPE" ] || IN_SCOPE="- (not specified)"

OUT_SCOPE=$(jq -r '.outOfScope[]? // empty' <<<"$EPIC_CTX" | sed 's/^/- /' || true)
[ -n "$OUT_SCOPE" ] || OUT_SCOPE="- (not specified)"

DELIVERABLES=$(jq -r '.deliverables[]? // empty' <<<"$PHASE_CTX" | sed 's/^/- /' || true)
[ -n "$DELIVERABLES" ] || DELIVERABLES="- (not specified)"

# DIRECTORY may legitimately be empty; the "Target Directory" line is then omitted.
DIRECTORY=$(jq -r '.directory // ""' <<<"$PHASE_CTX" || true)

ACCEPTANCE=$(jq -r '.acceptanceCriteria[]? // empty' <<<"$SUCCESS_CTX" | sed 's/^/- /' || true)
[ -n "$ACCEPTANCE" ] || ACCEPTANCE="- (not specified)"

# Build structured agent context
LOOP3_AGENT_CONTEXT="Loop 3 implementation for iteration $ITERATION

Epic Goal: $EPIC_GOAL

In Scope:
$IN_SCOPE

Out of Scope:
$OUT_SCOPE

Deliverables (CRITICAL - you MUST create these files):
$DELIVERABLES
$([ -n "$DIRECTORY" ] && echo "
Target Directory: $DIRECTORY")

Acceptance Criteria:
$ACCEPTANCE

IMPORTANT:
- Use Write tool to create each deliverable file
- Verify files created with 'ls -la \$DIRECTORY' after each Write
- All deliverables must exist for validation to pass
- Report confidence score based on actual file creation
"
794
-
795
# PHASE 1 (BUG #23): Inject feedback history for iterative learning.
# From the second iteration on, the accumulated feedback list is read from
# Redis, rendered one bullet per entry and placed in front of the context
# that was built for this iteration.
if [[ "$ITERATION" -gt 1 ]]; then
  FEEDBACK_HISTORY=$(redis-cli GET "swarm:${TASK_ID}:feedback:history" 2>/dev/null)

  # Normalize empty/nil to valid JSON array
  case "$FEEDBACK_HISTORY" in
    "" | "(nil)") FEEDBACK_HISTORY="[]" ;;
  esac

  if [[ "$FEEDBACK_HISTORY" != "[]" ]]; then
    # Format feedback for human readability
    FEEDBACK_SUMMARY=$(jq -r '.[] | "- Iteration \(.iteration) (\(.source)): \(.feedback)"' 2>/dev/null <<<"$FEEDBACK_HISTORY" || echo "")

    if [[ -n "$FEEDBACK_SUMMARY" ]]; then
      # Banner that precedes the regular context; it ends with one blank line.
      feedback_rule="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
      feedback_banner="Loop 3 implementation for iteration $ITERATION

$feedback_rule
PREVIOUS ITERATION FEEDBACK (LEARN FROM THIS)
$feedback_rule

$FEEDBACK_SUMMARY

CRITICAL: Address the feedback above. Do NOT repeat previous mistakes.

$feedback_rule

"
      # Prepend feedback to agent context
      LOOP3_AGENT_CONTEXT="${feedback_banner}${LOOP3_AGENT_CONTEXT}"

      feedback_item_count=$(jq '. | length' <<<"$FEEDBACK_HISTORY")
      echo " 📝 Injected feedback history (${feedback_item_count} items)"
    fi
  fi
fi

# Report the size of the final context (byte count as printed by wc -c).
context_size=$(wc -c <<<"$LOOP3_AGENT_CONTEXT")
echo " ✅ Agent context built (${context_size} characters)"
echo ""
829
-
830
# Step 2: Spawn Loop 3 agents via CLI
# This part splits the comma-separated LOOP3_AGENTS list, assigns every entry
# a unique ID of the form <agent-type>-<iteration>-<instance> and computes the
# quorum that must complete.
echo "[Loop 3] Spawning implementers via CLI..."
IFS=',' read -ra AGENTS <<< "$LOOP3_AGENTS"

# Track instance counts to generate unique agent IDs for duplicate agent types.
# Both maps are emptied explicitly: a bare "declare -A" on an array that
# already exists keeps its contents, so the counters used to carry over from
# the previous iteration and the instance part of the ID kept growing.
declare -A AGENT_INSTANCE_COUNTS=()
declare -A AGENT_IDS=() # Map from array index to unique agent ID

# Pre-calculate unique agent IDs
for i in "${!AGENTS[@]}"; do
  AGENT="${AGENTS[$i]}"

  # Increment instance counter for this agent type
  AGENT_INSTANCE_COUNTS["$AGENT"]=$(( ${AGENT_INSTANCE_COUNTS["$AGENT"]:-0} + 1 ))
  INSTANCE_NUM="${AGENT_INSTANCE_COUNTS["$AGENT"]}"

  # Generate unique agent ID: agent-type-iteration-instance
  UNIQUE_AGENT_ID="${AGENT}-${ITERATION}-${INSTANCE_NUM}"
  AGENT_IDS["$i"]="$UNIQUE_AGENT_ID"

  echo " [Instance Tracking] ${AGENT} #${INSTANCE_NUM} → ${UNIQUE_AGENT_ID}"
done

echo ""

# [PHASE 1 INTEGRATION] Loop 3 Skill-Based Output Processing (Parallel)
# Uses .claude/skills/loop3-output-processing/ for guaranteed confidence extraction
echo "[Loop 3] Using skill-based output processing (parallel execution)"

# Quorum bookkeeping; calculate_quorum is defined elsewhere in this script.
LOOP3_TOTAL=${#AGENTS[@]}
LOOP3_REQUIRED=$(calculate_quorum "$MIN_QUORUM_LOOP3" "$LOOP3_TOTAL")
LOOP3_COMPLETED_AGENTS=()
LOOP3_FAILED_AGENTS=()

echo "[Loop 3] Quorum: $LOOP3_REQUIRED/$LOOP3_TOTAL agents required"
echo ""
866
-
867
# Step 2a: Spawn all agents in parallel (background processes)
# For every Loop 3 agent a background subshell runs the loop3-output-processing
# skill and writes its JSON result to a per-agent temp file. The subshell PID
# and the file path are recorded so the wait step can collect them.
declare -A AGENT_PIDS
declare -A AGENT_OUTPUT_FILES

for i in "${!AGENTS[@]}"; do
  AGENT="${AGENTS[$i]}"
  UNIQUE_AGENT_ID="${AGENT_IDS[$i]}"

  # Get agent-specific timeout
  AGENT_TIMEOUT=$(get_agent_timeout "$AGENT" "$TASK_ID")

  # Create temp file for agent output
  OUTPUT_FILE="/tmp/loop3-${TASK_ID}-${UNIQUE_AGENT_ID}.json"
  AGENT_OUTPUT_FILES["$UNIQUE_AGENT_ID"]="$OUTPUT_FILE"

  echo " Spawning $AGENT (ID: $UNIQUE_AGENT_ID, timeout: ${AGENT_TIMEOUT}s)"

  # LOG: Loop 3 agent spawn (best effort)
  ./.claude/skills/redis-coordination/log-event.sh \
    --task-id "$TASK_ID" \
    --event-type "agent_spawn" \
    --loop "loop3" \
    --agent-id "$UNIQUE_AGENT_ID" \
    --iteration "$ITERATION" \
    --details "{\"agent_type\": \"$AGENT\", \"timeout\": $AGENT_TIMEOUT}" \
    --level "INFO" 2>/dev/null || true

  # Execute agent via Loop 3 skill in background
  (
    # Record start time (epoch milliseconds)
    START_TIME=$(date +%s%N | cut -b1-13)

    # Execute skill (BUG #20 FIX - inject detailed context)
    if SKILL_RESULT=$(./.claude/skills/loop3-output-processing/execute-and-extract.sh \
      --agent-type "$AGENT" \
      --task-id "$TASK_ID" \
      --agent-id "$UNIQUE_AGENT_ID" \
      --context "$LOOP3_AGENT_CONTEXT" \
      --iteration "$ITERATION" \
      --timeout "$AGENT_TIMEOUT" 2>&1); then

      # Record end time
      END_TIME=$(date +%s%N | cut -b1-13)
      LATENCY=$((END_TIME - START_TIME))

      # Add latency to result
      RESULT_WITH_LATENCY=$(echo "$SKILL_RESULT" | jq --arg latency "$LATENCY" '. + {latency_ms: ($latency | tonumber)}')

      # Save to temp file
      echo "$RESULT_WITH_LATENCY" > "$OUTPUT_FILE"

      # Store result in Redis
      echo "$RESULT_WITH_LATENCY" | redis-cli -x LPUSH "swarm:${TASK_ID}:${UNIQUE_AGENT_ID}:result" >/dev/null
      redis-cli LPUSH "swarm:${TASK_ID}:${UNIQUE_AGENT_ID}:done" "complete" >/dev/null

      exit 0
    else
      # Skill failed - save an error record for the wait step.
      # jq builds the record so that quotes, backslashes and newlines in the
      # captured output are escaped; plain string interpolation produced
      # invalid JSON for such output.
      jq -nc --arg output "$SKILL_RESULT" '{error: true, output: $output}' > "$OUTPUT_FILE"

      # Exit 0 so the wait step reads the record and takes its ".error"
      # branch. With a non-zero status the parent only sees a failed wait and
      # deletes the file unread, so the captured output was always lost. The
      # agent is still counted as failed because the record carries error=true.
      exit 0
    fi
  ) &

  AGENT_PIDS["$UNIQUE_AGENT_ID"]=$!
  echo " ✅ Spawned $UNIQUE_AGENT_ID (PID: ${AGENT_PIDS[$UNIQUE_AGENT_ID]})"
done

echo ""
echo "[Loop 3] All agents spawned, waiting for completion..."
echo ""
937
-
938
# Step 2b: Wait for all agents to complete
# Each background agent is reaped in spawn order and sorted into
# LOOP3_COMPLETED_AGENTS (unique IDs) or LOOP3_FAILED_AGENTS (agent types).
# An agent counts as completed only when its process exited with status 0, its
# output file exists and the JSON in it carries no error flag.
for i in "${!AGENTS[@]}"; do
  AGENT="${AGENTS[$i]}"
  UNIQUE_AGENT_ID="${AGENT_IDS[$i]}"
  AGENT_PID="${AGENT_PIDS[$UNIQUE_AGENT_ID]}"
  OUTPUT_FILE="${AGENT_OUTPUT_FILES[$UNIQUE_AGENT_ID]}"

  echo " Waiting for $UNIQUE_AGENT_ID (PID: $AGENT_PID)..."

  # Wait for specific agent process
  if wait "$AGENT_PID" 2>/dev/null; then
    # Success - read result from temp file
    if [ -f "$OUTPUT_FILE" ]; then
      SKILL_RESULT=$(cat "$OUTPUT_FILE")

      # Check if result has error flag. Anything other than the literal
      # "false" (including unparseable output) is treated as a failure.
      HAS_ERROR=$(echo "$SKILL_RESULT" | jq -r '.error // false')

      if [ "$HAS_ERROR" = "false" ]; then
        # Extract metrics
        CONFIDENCE=$(echo "$SKILL_RESULT" | jq -r '.confidence')
        FILES_CHANGED=$(echo "$SKILL_RESULT" | jq -r '.files_changed')
        CONFIDENCE_SOURCE=$(echo "$SKILL_RESULT" | jq -r '.confidence_source')
        LATENCY=$(echo "$SKILL_RESULT" | jq -r '.latency_ms')

        echo " ✅ $UNIQUE_AGENT_ID complete (${LATENCY}ms, confidence: $CONFIDENCE [$CONFIDENCE_SOURCE], files: $FILES_CHANGED)"

        # BUGFIX #21: Store confidence in Redis for consensus collection
        # The skill script extracts confidence but doesn't store it where invoke-waiting-mode.sh collect expects
        ./.claude/skills/redis-coordination/invoke-waiting-mode.sh report \
          --task-id "$TASK_ID" \
          --agent-id "$UNIQUE_AGENT_ID" \
          --confidence "$CONFIDENCE" \
          --iteration "$ITERATION" >/dev/null

        # LOG: Loop 3 agent completion (best effort)
        ./.claude/skills/redis-coordination/log-event.sh \
          --task-id "$TASK_ID" \
          --event-type "agent_complete" \
          --loop "loop3" \
          --agent-id "$UNIQUE_AGENT_ID" \
          --iteration "$ITERATION" \
          --details "{\"confidence\": $CONFIDENCE, \"confidence_source\": \"$CONFIDENCE_SOURCE\", \"files_changed\": $FILES_CHANGED, \"latency_ms\": $LATENCY}" \
          --level "INFO" 2>/dev/null || true

        # Store latency metric
        METRIC=$(jq -nc \
          --arg agent "$UNIQUE_AGENT_ID" \
          --arg latency "$LATENCY" \
          --arg loop "loop3" \
          --arg iteration "$ITERATION" \
          '{agent: $agent, latency_ms: ($latency | tonumber), loop: $loop, iteration: ($iteration | tonumber)}')
        echo "$METRIC" | redis-cli -x LPUSH "swarm:${TASK_ID}:metrics:agent_latency" >/dev/null

        LOOP3_COMPLETED_AGENTS+=("$UNIQUE_AGENT_ID")
      else
        ERROR_OUTPUT=$(echo "$SKILL_RESULT" | jq -r '.output')
        echo " ❌ $UNIQUE_AGENT_ID failed (skill execution error)"
        echo " Error: $ERROR_OUTPUT"

        # The agent output is free text and may contain quotes or newlines,
        # so the event details are built with jq instead of being pasted
        # into a JSON string. A fixed record is used if jq itself fails.
        FAILURE_DETAILS=$(jq -nc --arg output "$ERROR_OUTPUT" \
          '{error: "skill_execution_error", output: $output}') \
          || FAILURE_DETAILS='{"error": "skill_execution_error"}'

        # LOG: Loop 3 agent failure (best effort)
        ./.claude/skills/redis-coordination/log-event.sh \
          --task-id "$TASK_ID" \
          --event-type "agent_failure" \
          --loop "loop3" \
          --agent-id "$UNIQUE_AGENT_ID" \
          --iteration "$ITERATION" \
          --details "$FAILURE_DETAILS" \
          --level "ERROR" 2>/dev/null || true

        LOOP3_FAILED_AGENTS+=("$AGENT")
        redis-cli INCR "swarm:${TASK_ID}:metrics:agent_failure_count" >/dev/null
      fi

      # Cleanup temp file
      rm -f "$OUTPUT_FILE"
    else
      echo " ❌ $UNIQUE_AGENT_ID failed (no output file)"
      LOOP3_FAILED_AGENTS+=("$AGENT")
      redis-cli INCR "swarm:${TASK_ID}:metrics:agent_failure_count" >/dev/null
    fi
  else
    # Non-zero exit from the background subshell
    echo " ❌ $UNIQUE_AGENT_ID failed (process error)"
    LOOP3_FAILED_AGENTS+=("$AGENT")
    redis-cli INCR "swarm:${TASK_ID}:metrics:agent_failure_count" >/dev/null
    rm -f "$OUTPUT_FILE"
  fi

  echo ""
done
1028
-
1029
# Validate quorum: stop the whole run when fewer than LOOP3_REQUIRED agents
# completed; otherwise continue, noting any agents that failed.
loop3_done_count=${#LOOP3_COMPLETED_AGENTS[@]}
if ! [ "$loop3_done_count" -ge "$LOOP3_REQUIRED" ]; then
  echo "[Loop 3] ❌ Quorum FAILED: ${loop3_done_count} < $LOOP3_REQUIRED"
  echo "[Loop 3] Failed agents: ${LOOP3_FAILED_AGENTS[*]}"
  exit 1
fi

echo "[Loop 3] ✅ Quorum met: ${loop3_done_count}/$LOOP3_REQUIRED agents completed"
if (( ${#LOOP3_FAILED_AGENTS[@]} > 0 )); then
  echo "[Loop 3] ⚠️ Failed agents (continuing with quorum): ${LOOP3_FAILED_AGENTS[*]}"

  # METRICS: Increment quorum fallback counter
  redis-cli INCR "swarm:${TASK_ID}:metrics:quorum_fallback" >/dev/null
fi
echo ""

# Step 2: Collect Loop 3 confidence scores (only from completed agents)
echo "[Loop 3] Collecting confidence scores from ${loop3_done_count} agents..."

# Comma-join the completed agent IDs (empty string for an empty list).
printf -v LOOP3_COMPLETED_IDS '%s,' "${LOOP3_COMPLETED_AGENTS[@]}"
LOOP3_COMPLETED_IDS=${LOOP3_COMPLETED_IDS%,}

# The consensus value is the last line printed by the collect command.
LOOP3_CONSENSUS=$(
  ./.claude/skills/redis-coordination/invoke-waiting-mode.sh collect \
    --task-id "$TASK_ID" \
    --agent-ids "$LOOP3_COMPLETED_IDS" | tail -1
)

echo "[Loop 3] Average confidence: $LOOP3_CONSENSUS (from ${loop3_done_count}/${LOOP3_TOTAL} agents)"

# METRICS: Store Loop 3 consensus score
LOOP3_METRIC=$(
  jq -nc \
    --arg consensus "$LOOP3_CONSENSUS" \
    --arg iteration "$ITERATION" \
    '{consensus: ($consensus | tonumber), iteration: ($iteration | tonumber)}'
)
redis-cli -x LPUSH "swarm:${TASK_ID}:metrics:loop3_consensus" <<<"$LOOP3_METRIC" >/dev/null
1060
-
1061
# BUG #12 FIX: Deliverable Verification with explicit file checking
# When the validation skill prints FAILED, the iteration is abandoned before
# the gate check: the stored Loop 3 results are overwritten, feedback is
# recorded for the next iteration and the loop continues.
echo "[Deliverable Check] Verifying implementation artifacts..."

# Use enhanced validate-deliverables.sh skill
DELIVERABLE_ARGS="--task-id $TASK_ID"
if [[ -n "$EXPECTED_FILES" ]]; then
  DELIVERABLE_ARGS+=" --expected-files $EXPECTED_FILES"
  echo " Expected files: $EXPECTED_FILES"
fi

# The argument string is expanded unquoted so it splits into separate words.
# shellcheck disable=SC2086
DELIVERABLE_STATUS=$(./.claude/skills/product-owner-decision/validate-deliverables.sh $DELIVERABLE_ARGS)

if [[ "$DELIVERABLE_STATUS" == "FAILED" ]]; then
  # Retrieve missing files from Redis (if available)
  MISSING_FILES_JSON=$(redis-cli get "swarm:${TASK_ID}:missing-files" 2>/dev/null || echo "[]")
  MISSING_FILES_LIST=$(jq -r '.[]' <<<"$MISSING_FILES_JSON" | tr '\n' ', ' | sed 's/,$//')

  # Report the failure and prepare the matching feedback text in one place.
  if [[ -z "$MISSING_FILES_LIST" ]]; then
    echo "❌ DELIVERABLE VERIFICATION FAILED: No files created or modified"
    FEEDBACK="CRITICAL: You must create or modify files. No deliverables were produced in iteration $ITERATION."
  else
    echo "❌ DELIVERABLE VERIFICATION FAILED: Missing files"
    echo " Expected but not found: $MISSING_FILES_LIST"
    FEEDBACK="CRITICAL: Create these missing files: $MISSING_FILES_LIST

Use the Write tool for each file. Verify with 'ls -la' after each Write operation."
  fi

  echo " This prevents 'consensus on vapor' - validators approving nothing"
  echo ""
  echo "Decision: RELAUNCH iteration $((ITERATION + 1)) (skip Loop 2 validation)"
  echo ""

  # METRICS: Increment deliverable failure counter
  redis-cli INCR "swarm:${TASK_ID}:metrics:deliverable_failures" >/dev/null

  # Override all Loop 3 confidence scores to 0.0 (prevent gate pass)
  for AGENT in "${LOOP3_COMPLETED_AGENTS[@]}"; do
    result_key="swarm:${TASK_ID}:${AGENT}:result"
    redis-cli DEL "$result_key" >/dev/null
    redis-cli LPUSH "$result_key" "0.0" >/dev/null
    echo " [Override] ${AGENT} confidence: 1.0 → 0.0 (no deliverables)"
  done

  # Recalculate consensus (should be 0.0 now)
  LOOP3_CONSENSUS=$(
    ./.claude/skills/redis-coordination/invoke-waiting-mode.sh collect \
      --task-id "$TASK_ID" \
      --agent-ids "$LOOP3_COMPLETED_IDS" | tail -1
  )

  echo ""
  echo "[Loop 3] Recalculated confidence after override: $LOOP3_CONSENSUS"
  echo ""

  # PHASE 1 (BUG #23): Accumulate feedback across iterations for learning
  # BUGFIX #22: Store feedback in Redis for next iteration (agents will be re-spawned, not woken)
  # Per P3 agent lifecycle: agents exit cleanly, orchestrator spawns fresh agents
  accumulate_feedback "$TASK_ID" "$ITERATION" "deliverable_check" "$FEEDBACK"
  echo " Reason: no_deliverables"
  echo " Priority: 40 (HIGH)"

  continue # Next iteration (skip gate check and Loop 2)
fi

echo "[Deliverable Check] ✅ Deliverables verified - proceeding to gate check"
echo ""
1130
-
1131
# Gate check
# Loop 2 may start only when the Loop 3 consensus is at least GATE.
#
# The comparison fails closed. The earlier form tested
# "consensus < gate" through bc inside (( )), and an empty or non-numeric
# consensus (or a missing bc) made that test false, which was reported as
# "Gate PASSED". Both operands are now checked to be plain decimal numbers
# and the gate passes only when bc positively confirms consensus >= gate.
gate_number_re='^[0-9]*[.]?[0-9]+$'
gate_consensus_value=${LOOP3_CONSENSUS//[[:space:]]/}
gate_threshold_value=${GATE//[[:space:]]/}
gate_passed=0

if [[ "$gate_consensus_value" =~ $gate_number_re && "$gate_threshold_value" =~ $gate_number_re ]]; then
  if (( $(echo "$gate_consensus_value >= $gate_threshold_value" | bc -l) )); then
    gate_passed=1
  fi
else
  echo "⚠️ Gate inputs are not numeric (consensus: '$LOOP3_CONSENSUS', gate: '$GATE') - treating gate as failed" >&2
fi

if [[ "$gate_passed" -ne 1 ]]; then
  echo "❌ Gate FAILED ($LOOP3_CONSENSUS < $GATE)"
  echo "Decision: RELAUNCH iteration $((ITERATION + 1))"

  # LOG: Gate check failure (best effort)
  ./.claude/skills/redis-coordination/log-event.sh \
    --task-id "$TASK_ID" \
    --event-type "gate_check" \
    --iteration "$ITERATION" \
    --details "{\"consensus\": $LOOP3_CONSENSUS, \"threshold\": $GATE, \"result\": \"FAIL\", \"decision\": \"RELAUNCH\"}" \
    --level "WARN" 2>/dev/null || true

  # METRICS: Increment gate failure counter
  redis-cli INCR "swarm:${TASK_ID}:metrics:gate_failures" >/dev/null

  # PHASE 1 (BUG #23): Accumulate feedback across iterations for learning
  # BUGFIX #22: Store feedback in Redis for next iteration (agents will be re-spawned, not woken)
  # Per P3 agent lifecycle: agents exit cleanly, orchestrator spawns fresh agents
  FEEDBACK_MSG="Improve confidence from $LOOP3_CONSENSUS to >$GATE"
  accumulate_feedback "$TASK_ID" "$ITERATION" "gate_check" "$FEEDBACK_MSG"
  echo " Reason: gate_failed"
  echo " Priority: 30 (MEDIUM)"

  continue # Next iteration
fi

echo "✅ Gate PASSED ($LOOP3_CONSENSUS >= $GATE)"

# LOG: Gate check success (best effort)
./.claude/skills/redis-coordination/log-event.sh \
  --task-id "$TASK_ID" \
  --event-type "gate_check" \
  --iteration "$ITERATION" \
  --details "{\"consensus\": $LOOP3_CONSENSUS, \"threshold\": $GATE, \"result\": \"PASS\"}" \
  --level "INFO" 2>/dev/null || true

echo ""

# Signal Loop 2 validators that gate has passed (they can start work)
GATE_PASS_KEY="swarm:${TASK_ID}:gate-passed"
redis-cli lpush "$GATE_PASS_KEY" "{\"iteration\": $ITERATION, \"loop3_confidence\": $LOOP3_CONSENSUS}" > /dev/null
echo "[Loop 3] Gate pass signal sent to Loop 2 validators"
echo ""
1175
-
1176
# Step 3: Build Loop 2 validator context (BUG #20 FIX - inject same deliverables)
# Validators receive the epic goal, deliverables and acceptance criteria that
# were extracted for the implementers. The "Target Directory" line (preceded
# by a line break) is included only when DIRECTORY is non-empty.
LOOP2_VALIDATOR_CONTEXT="Loop 2 validation for iteration $ITERATION

Review Loop 3 implementation against these requirements:

Epic Goal: $EPIC_GOAL

Expected Deliverables:
$DELIVERABLES
${DIRECTORY:+
Target Directory: $DIRECTORY}

Acceptance Criteria:
$ACCEPTANCE

Your Validation Tasks:
- Verify all deliverable files exist in correct directory
- Check files contain actual implementation (not placeholders)
- Validate against acceptance criteria
- Provide structured feedback (critical/warnings/suggestions)
- Report confidence score based on deliverable completeness
"

printf '%s\n' "[Loop 2] Validator context built" ""
1201
-
1202
# Step 4: Spawn Loop 2 validators using skill-based output processing (parallel execution)
# This part splits the comma-separated LOOP2_AGENTS list, assigns every entry
# a unique ID of the form <validator-type>-<iteration>-<instance> and computes
# the quorum that must complete.
echo "[Loop 2] Using skill-based output processing (parallel execution)"
IFS=',' read -ra VALIDATORS <<< "$LOOP2_AGENTS"

# Track instance counts to generate unique validator IDs for duplicate validator types.
# Both maps are emptied explicitly: a bare "declare -A" on an array that
# already exists keeps its contents, so the counters used to carry over from
# the previous iteration and the instance part of the ID kept growing.
declare -A VALIDATOR_INSTANCE_COUNTS=()
declare -A VALIDATOR_IDS=() # Map from array index to unique validator ID

# Pre-calculate unique validator IDs
for i in "${!VALIDATORS[@]}"; do
  VALIDATOR="${VALIDATORS[$i]}"

  # Increment instance counter for this validator type
  VALIDATOR_INSTANCE_COUNTS["$VALIDATOR"]=$(( ${VALIDATOR_INSTANCE_COUNTS["$VALIDATOR"]:-0} + 1 ))
  INSTANCE_NUM="${VALIDATOR_INSTANCE_COUNTS["$VALIDATOR"]}"

  # Generate unique validator ID: validator-type-iteration-instance
  UNIQUE_VALIDATOR_ID="${VALIDATOR}-${ITERATION}-${INSTANCE_NUM}"
  VALIDATOR_IDS["$i"]="$UNIQUE_VALIDATOR_ID"

  echo " [Instance Tracking] ${VALIDATOR} #${INSTANCE_NUM} → ${UNIQUE_VALIDATOR_ID}"
done

echo ""

# Step 3a: Spawn all validators in parallel using skill
echo "[Loop 2] Spawning validators in parallel..."
declare -A VALIDATOR_PIDS # Map from validator ID to background PID
declare -A VALIDATOR_OUTPUT_FILES # Map from validator ID to temp output file

# Quorum bookkeeping; calculate_quorum is defined elsewhere in this script.
LOOP2_TOTAL=${#VALIDATORS[@]}
LOOP2_REQUIRED=$(calculate_quorum "$MIN_QUORUM_LOOP2" "$LOOP2_TOTAL")

echo "[Loop 2] Quorum: $LOOP2_REQUIRED/$LOOP2_TOTAL validators required"
echo ""
1237
-
1238
# Launch one background subshell per validator. Each subshell runs the
# loop2-output-processing skill, adds the measured latency to the skill's
# JSON, writes the result to the validator's temp file, mirrors it to Redis
# and pushes a completion marker.
for i in "${!VALIDATORS[@]}"; do
  VALIDATOR=${VALIDATORS[$i]}
  UNIQUE_VALIDATOR_ID=${VALIDATOR_IDS[$i]}

  # Get agent-specific timeout (use base validator type, not unique ID)
  AGENT_TIMEOUT=$(get_agent_timeout "$VALIDATOR" "$TASK_ID")

  # Create temp output file for this validator
  OUTPUT_FILE="/tmp/loop2-${TASK_ID}-${UNIQUE_VALIDATOR_ID}.json"
  VALIDATOR_OUTPUT_FILES["$UNIQUE_VALIDATOR_ID"]=$OUTPUT_FILE

  echo " Spawning: $VALIDATOR (ID: $UNIQUE_VALIDATOR_ID, timeout: ${AGENT_TIMEOUT}s)"

  # Execute skill in background - captures agent output and extracts structured data
  (
    # METRICS: Agent latency start (epoch milliseconds)
    started_ms=$(date +%s%N | cut -b1-13)

    # Execute skill to spawn validator and extract feedback (BUG #20 FIX - inject detailed context)
    skill_json=$(
      ./.claude/skills/loop2-output-processing/execute-and-extract.sh \
        --agent-type "$VALIDATOR" \
        --task-id "$TASK_ID" \
        --agent-id "$UNIQUE_VALIDATOR_ID" \
        --context "$LOOP2_VALIDATOR_CONTEXT" \
        --iteration "$ITERATION" \
        --timeout "$AGENT_TIMEOUT" 2>&1
    )

    # METRICS: Agent latency end
    finished_ms=$(date +%s%N | cut -b1-13)
    elapsed_ms=$((finished_ms - started_ms))

    # Inject latency into result JSON
    skill_json_timed=$(jq --arg latency "$elapsed_ms" '. + {latency_ms: ($latency | tonumber)}' <<<"$skill_json")

    # Write result to temp file
    printf '%s\n' "$skill_json_timed" > "$OUTPUT_FILE"

    # Also push to Redis for compatibility with existing tools
    redis-cli -x LPUSH "swarm:${TASK_ID}:${UNIQUE_VALIDATOR_ID}:result" <<<"$skill_json_timed" >/dev/null

    # Signal completion
    redis-cli LPUSH "swarm:${TASK_ID}:${UNIQUE_VALIDATOR_ID}:done" "complete" >/dev/null
  ) &

  # Track background PID
  VALIDATOR_PIDS["$UNIQUE_VALIDATOR_ID"]=$!
  echo " ✅ Spawned $UNIQUE_VALIDATOR_ID (PID: ${VALIDATOR_PIDS[$UNIQUE_VALIDATOR_ID]})"
done

echo ""
echo "[Loop 2] All validators spawned, waiting for completion..."
echo ""
1290
-
1291
# Step 3b: Wait for all validators to complete and collect results
# Every validator is reaped in spawn order. A validator counts as completed
# when its process exited with status 0 and left a non-empty output file that
# parses as JSON; otherwise its type is appended to LOOP2_FAILED_AGENTS.
LOOP2_COMPLETED_AGENTS=()
LOOP2_FAILED_AGENTS=()
declare -A LOOP2_CONFIDENCES # Map from validator ID to confidence score

for i in "${!VALIDATORS[@]}"; do
  VALIDATOR=${VALIDATORS[$i]}
  UNIQUE_VALIDATOR_ID=${VALIDATOR_IDS[$i]}
  VALIDATOR_PID=${VALIDATOR_PIDS[$UNIQUE_VALIDATOR_ID]}
  OUTPUT_FILE=${VALIDATOR_OUTPUT_FILES[$UNIQUE_VALIDATOR_ID]}

  echo " Waiting for $UNIQUE_VALIDATOR_ID (PID: $VALIDATOR_PID)..."

  # Classify the outcome first; a non-empty note marks a failure.
  validator_failure_note=""
  if ! wait "$VALIDATOR_PID" 2>/dev/null; then
    validator_failure_note=" ❌ $UNIQUE_VALIDATOR_ID failed (process exited with error)"
  elif [[ ! -f "$OUTPUT_FILE" || ! -s "$OUTPUT_FILE" ]]; then
    validator_failure_note=" ⚠️ $UNIQUE_VALIDATOR_ID completed but no output file found"
  else
    # Process completed successfully, read result from temp file
    SKILL_RESULT=$(<"$OUTPUT_FILE")

    # Validate JSON structure
    if ! jq empty 2>/dev/null <<<"$SKILL_RESULT"; then
      validator_failure_note=" ⚠️ $UNIQUE_VALIDATOR_ID returned invalid JSON, treating as failed"
    fi
  fi

  if [[ -n "$validator_failure_note" ]]; then
    echo "$validator_failure_note"
    LOOP2_FAILED_AGENTS+=("$VALIDATOR")

    # METRICS: Increment timeout counter
    redis-cli INCR "swarm:${TASK_ID}:metrics:timeout_count" >/dev/null
  else
    # Extract confidence score and companions, with defaults for absent fields
    CONFIDENCE=$(jq -r '.confidence // 0.0' <<<"$SKILL_RESULT")
    CONFIDENCE_SOURCE=$(jq -r '.confidence_source // "unknown"' <<<"$SKILL_RESULT")
    FEEDBACK=$(jq -r '.feedback // {}' <<<"$SKILL_RESULT")
    LATENCY=$(jq -r '.latency_ms // 0' <<<"$SKILL_RESULT")

    # Store confidence for consensus calculation
    LOOP2_CONFIDENCES["$UNIQUE_VALIDATOR_ID"]=$CONFIDENCE

    # Store latency metric
    METRIC=$(
      jq -nc \
        --arg agent "$UNIQUE_VALIDATOR_ID" \
        --arg latency "$LATENCY" \
        --arg loop "loop2" \
        --arg iteration "$ITERATION" \
        '{agent: $agent, latency_ms: ($latency | tonumber), loop: $loop, iteration: ($iteration | tonumber)}'
    )
    redis-cli -x LPUSH "swarm:${TASK_ID}:metrics:agent_latency" <<<"$METRIC" >/dev/null

    # Count feedback items
    CRITICAL_COUNT=$(jq -r '.critical | length' <<<"$FEEDBACK")
    WARNINGS_COUNT=$(jq -r '.warnings | length' <<<"$FEEDBACK")
    SUGGESTIONS_COUNT=$(jq -r '.suggestions | length' <<<"$FEEDBACK")

    echo " ✅ $UNIQUE_VALIDATOR_ID complete (${LATENCY}ms, confidence: $CONFIDENCE [$CONFIDENCE_SOURCE], feedback: ${CRITICAL_COUNT}C/${WARNINGS_COUNT}W/${SUGGESTIONS_COUNT}S)"

    LOOP2_COMPLETED_AGENTS+=("$UNIQUE_VALIDATOR_ID")
  fi

  # Cleanup temp file
  rm -f "$OUTPUT_FILE"
done

echo ""

# Validate quorum: stop the whole run when fewer than LOOP2_REQUIRED
# validators completed; otherwise continue, noting any that failed.
loop2_done_count=${#LOOP2_COMPLETED_AGENTS[@]}
if ! [ "$loop2_done_count" -ge "$LOOP2_REQUIRED" ]; then
  echo "[Loop 2] ❌ Quorum FAILED: ${loop2_done_count} < $LOOP2_REQUIRED"
  echo "[Loop 2] Failed validators: ${LOOP2_FAILED_AGENTS[*]}"
  exit 1
fi

echo "[Loop 2] ✅ Quorum met: ${loop2_done_count}/$LOOP2_REQUIRED validators completed"
if (( ${#LOOP2_FAILED_AGENTS[@]} > 0 )); then
  echo "[Loop 2] ⚠️ Failed validators (continuing with quorum): ${LOOP2_FAILED_AGENTS[*]}"

  # METRICS: Increment quorum fallback counter
  redis-cli INCR "swarm:${TASK_ID}:metrics:quorum_fallback" >/dev/null
fi
echo ""
1381
-
1382
# Step 3c: Calculate Loop 2 consensus from extracted confidence scores
# The consensus is the mean (two decimal places, computed by bc) of the
# confidence values recorded for the completed validators; entries that are
# empty or the literal "null" are left out of the mean.
echo "[Loop 2] Calculating consensus from ${#LOOP2_COMPLETED_AGENTS[@]} validators..."

# Running sum and number of usable scores
LOOP2_TOTAL_CONFIDENCE=0
LOOP2_CONFIDENCE_COUNT=0

for VALIDATOR_ID in "${LOOP2_COMPLETED_AGENTS[@]}"; do
  CONFIDENCE=${LOOP2_CONFIDENCES[$VALIDATOR_ID]}
  case "$CONFIDENCE" in
    "" | "null")
      continue
      ;;
  esac
  LOOP2_TOTAL_CONFIDENCE=$(bc -l <<<"$LOOP2_TOTAL_CONFIDENCE + $CONFIDENCE")
  LOOP2_CONFIDENCE_COUNT=$((LOOP2_CONFIDENCE_COUNT + 1))
done

if [[ "$LOOP2_CONFIDENCE_COUNT" -gt 0 ]]; then
  LOOP2_CONSENSUS=$(bc -l <<<"scale=2; $LOOP2_TOTAL_CONFIDENCE / $LOOP2_CONFIDENCE_COUNT")
else
  echo "⚠️ No valid confidence scores found, defaulting to 0.0"
  LOOP2_CONSENSUS=0.0
fi

echo "[Loop 2] Average consensus: $LOOP2_CONSENSUS (from ${LOOP2_CONFIDENCE_COUNT} validators)"

# METRICS: Store Loop 2 consensus score
LOOP2_METRIC=$(
  jq -nc \
    --arg consensus "$LOOP2_CONSENSUS" \
    --arg iteration "$ITERATION" \
    '{consensus: ($consensus | tonumber), iteration: ($iteration | tonumber)}'
)
redis-cli -x LPUSH "swarm:${TASK_ID}:metrics:loop2_consensus" <<<"$LOOP2_METRIC" >/dev/null

# Display consensus status (informational only at this point)
echo ""
consensus_reached=$(bc -l <<<"$LOOP2_CONSENSUS >= $CONSENSUS")
if (( consensus_reached )); then
  echo "✅ CONSENSUS REACHED ($LOOP2_CONSENSUS >= $CONSENSUS)"
else
  echo "⚠️ CONSENSUS NOT REACHED ($LOOP2_CONSENSUS < $CONSENSUS)"
fi
echo ""
1421
-
1422
- # [BUG #11 FIX] Product Owner decision via output parsing (not Redis wait)
1423
- echo "[Product Owner] Spawning Product Owner for strategic decision..."
1424
-
1425
- # BUG #19 FIX: Define PO_UNIQUE_ID BEFORE building context string
1426
- PO_UNIQUE_ID="${PRODUCT_OWNER}-${ITERATION}-decision"
1427
-
1428
- # Build Product Owner context
1429
- PO_CONTEXT="CFN Loop iteration $ITERATION complete.
1430
-
1431
- Loop 2 Consensus: $LOOP2_CONSENSUS (threshold: $CONSENSUS)
1432
- Task ID: $TASK_ID
1433
- Agent ID: $PO_UNIQUE_ID
1434
-
1435
- Make your strategic decision: PROCEED, ITERATE, or ABORT
1436
-
1437
- Decision Framework:
1438
- - PROCEED: Consensus >= $CONSENSUS AND deliverables verified
1439
- - ITERATE: Consensus < $CONSENSUS AND iteration < $MAX_ITERATIONS
1440
- - ABORT: Max iterations reached without consensus
1441
-
1442
- Output your decision clearly with reasoning."
1443
-
1444
- # Spawn Product Owner and capture output
1445
- PO_TIMEOUT=$(get_agent_timeout "$PRODUCT_OWNER" "$TASK_ID")
1446
- echo "[Product Owner] Spawning with timeout: ${PO_TIMEOUT}s"
1447
-
1448
- PO_OUTPUT=$(timeout "$PO_TIMEOUT" npx claude-flow-novice agent "$PRODUCT_OWNER" \
1449
- --task-id "$TASK_ID" \
1450
- --agent-id "$PO_UNIQUE_ID" \
1451
- --context "$PO_CONTEXT" 2>&1 || true)
1452
-
1453
# Fetch the head element of the decision list this agent is expected to have
# written to Redis (the original comment names
# execute-product-owner-decision.sh as the writer - TODO confirm).
echo "[Product Owner] Retrieving structured decision from Redis..."
DECISION=$(redis-cli lindex "swarm:${TASK_ID}:${PO_UNIQUE_ID}:decision" 0)

# An empty reply or the literal "(nil)" means no decision was stored; dump
# the agent's captured output to help diagnose, then stop.
case "$DECISION" in
  "" | "(nil)")
    echo "❌ ERROR: Could not retrieve Product Owner decision from Redis"
    echo "Expected key: swarm:${TASK_ID}:${PO_UNIQUE_ID}:decision"
    echo "Product Owner output:"
    echo "$PO_OUTPUT"
    exit 1
    ;;
esac
1464
-
1465
# Pull the individual fields out of the decision JSON, one jq call per
# field. The "// 0" alternative supplies 0 when in_scope_consensus is
# null/false or missing.
DECISION_TYPE=$(jq -r '.decision' <<<"$DECISION")
DECISION_REASONING=$(jq -r '.reasoning' <<<"$DECISION")
DECISION_CONFIDENCE=$(jq -r '.confidence' <<<"$DECISION")
IN_SCOPE_CONSENSUS=$(jq -r '.scope_analysis.in_scope_consensus // 0' <<<"$DECISION")
BACKLOG_COUNT=$(jq -r '.backlog_items | length' <<<"$DECISION")

# jq -r prints the string "null" for a missing key and nothing at all when
# parsing fails; either way the decision is unusable.
case "$DECISION_TYPE" in
  "" | null)
    echo "❌ ERROR: Invalid Product Owner decision JSON"
    echo "Received: $DECISION"
    exit 1
    ;;
esac

# Summary of the parsed decision (the format is reused for each label/value
# pair).
printf ' %s: %s\n' \
  "Decision Type" "$DECISION_TYPE" \
  "Confidence" "$DECISION_CONFIDENCE" \
  "In-Scope Consensus" "$IN_SCOPE_CONSENSUS" \
  "Backlog Items" "$BACKLOG_COUNT"
echo ""
1483
-
1484
# Record the raw decision JSON as a "po_decision" event. Logging is
# best-effort: the logger's stderr is discarded and its failure is ignored.
po_log_args=(
  --task-id "$TASK_ID"
  --event-type "po_decision"
  --agent-id "$PO_UNIQUE_ID"
  --iteration "$ITERATION"
  --details "$DECISION"
  --level "INFO"
)
./.claude/skills/redis-coordination/log-event.sh "${po_log_args[@]}" 2>/dev/null || true

echo "[Product Owner] Decision: $DECISION_TYPE"
echo ""
1495
-
1496
# Act on the Product Owner's decision.
#
# Reads: DECISION_TYPE, IN_SCOPE_CONSENSUS, BACKLOG_COUNT, LOOP2_CONSENSUS,
#        CONSENSUS, ITERATION, MAX_ITERATIONS, ITERATION_START, TASK_ID and
#        SWARM_ID (all assigned outside this statement).
# Outcomes:
#   PROCEED / DEFER_AND_PROCEED - run the deliverable check, record the
#       iteration duration, complete the swarm and exit 0.
#   ITERATE - record the iteration duration, then either exit 1 (iteration
#       cap reached) or store feedback and continue with the next loop pass.
#   ABORT or any other value - exit 1.
if [ "$DECISION_TYPE" = "PROCEED" ] || [ "$DECISION_TYPE" = "DEFER_AND_PROCEED" ]; then
  # DEFER_AND_PROCEED only adds an informational notice; it is otherwise
  # handled exactly like PROCEED.
  if [ "$DECISION_TYPE" = "DEFER_AND_PROCEED" ]; then
    echo "📋 Product Owner Decision: DEFER_AND_PROCEED"
    echo " In-scope work complete (consensus: $IN_SCOPE_CONSENSUS)"
    echo " Deferred $BACKLOG_COUNT out-of-scope items to backlog"
    echo ""
  fi

  # DELIVERABLE VERIFICATION (Sprint 8 - prevent "consensus on vapor").
  # Advisory only: a failure is reported and stored in Redis, but the PROCEED
  # path still completes unless the commented-out override below is enabled.
  echo "[Deliverable Verification] Checking success criteria..."

  # The check runs only when success criteria were stored for this task; the
  # criteria text itself is not inspected here.
  SUCCESS_CRITERIA_RAW=$(redis-cli GET "swarm:${TASK_ID}:success-criteria" 2>/dev/null)
  if [ -n "$SUCCESS_CRITERIA_RAW" ]; then
    TASK_DESC=$(redis-cli GET "swarm:${TASK_ID}:task" 2>/dev/null)

    # Keyword heuristic: any of these words marks the task as implementation
    # work that should have produced file changes.
    if echo "$TASK_DESC" | grep -qiE "create|build|implement|generate|file|component|module|test"; then
      echo "[Deliverable Verification] Task involves implementation - checking for file changes..."

      # Count uncommitted added/modified/untracked paths in the working tree
      # (everything currently pending, not only changes made during this run).
      # Each porcelain line starts with a two-column XY status, so an
      # unstaged edit appears as " M": match A or M in either column, plus
      # "??" for untracked paths. --porcelain keeps the format independent of
      # user git configuration.
      # "|| true": grep -c still prints 0 but exits 1 when nothing matches.
      FILES_CREATED=$(git status --porcelain 2>/dev/null | grep -cE '^(\?\?|[AM].|.[AM])' || true)

      if [ "$FILES_CREATED" -eq 0 ]; then
        echo "⚠️ DELIVERABLE VERIFICATION FAILED"
        echo " Task requires implementation but no files were created/modified"
        echo " Consensus reached on plans without actual deliverables"
        echo ""
        echo " Options:"
        echo " 1. Force ITERATE to create actual implementation"
        echo " 2. Override verification (--skip-deliverable-check flag)"
        echo " 3. Manual intervention to verify work was done"
        echo ""
        echo " Recommendation: Force ITERATE with explicit deliverable requirement"

        # Store verification failure (expires after 24 hours)
        redis-cli SET "swarm:${TASK_ID}:deliverable_verification" "failed" EX 86400 >/dev/null

        # Optional: Force ITERATE (commented for now - requires flag)
        # echo "[Forced Override] Changing PROCEED → ITERATE due to missing deliverables"
        # DECISION_TYPE="ITERATE"
        # DECISION_REASONING="No deliverables created despite implementation task"
      else
        echo "✅ Deliverable verification passed ($FILES_CREATED files created/modified)"
        redis-cli SET "swarm:${TASK_ID}:deliverable_verification" "passed:$FILES_CREATED" EX 86400 >/dev/null
      fi
    else
      echo "[Deliverable Verification] Task is analysis/planning - skipping file check"
    fi
  fi

  echo "🎉 CFN Loop Complete (Product Owner: PROCEED)"
  echo "Final Consensus: $LOOP2_CONSENSUS (Iteration $ITERATION)"

  # METRICS: iteration end timestamp and duration. The first 13 digits of
  # seconds+nanoseconds give epoch milliseconds.
  # NOTE(review): %N is a GNU date extension - confirm the target platforms.
  ITERATION_END=$(date +%s%N | cut -b1-13)
  ITERATION_DURATION=$((ITERATION_END - ITERATION_START))

  # Store final iteration duration metric (JSON pushed via redis-cli -x,
  # which reads the last argument from stdin).
  DURATION_METRIC=$(jq -nc \
    --arg duration "$ITERATION_DURATION" \
    --arg iteration "$ITERATION" \
    '{duration_ms: ($duration | tonumber), iteration: ($iteration | tonumber)}')
  echo "$DURATION_METRIC" | redis-cli -x LPUSH "swarm:${TASK_ID}:metrics:iteration_duration" >/dev/null

  # BUGFIX #22: Agents have already exited (P3 clean-exit pattern), no wake needed
  # The task is complete, agents were already cleaned up when they reported confidence
  echo "[Coordinator] Task complete (PROCEED decision)"
  echo " All agents have already exited cleanly per P3 lifecycle"

  # Use general complete-swarm primitive
  ./.claude/skills/redis-coordination/complete-swarm.sh \
    --swarm-id "$SWARM_ID" \
    --final-metric "final_consensus=$LOOP2_CONSENSUS" \
    --final-metric "total_iterations=$ITERATION" > /dev/null

  exit 0

elif [ "$DECISION_TYPE" = "ITERATE" ]; then
  echo "⚠️ Product Owner Decision: ITERATE (improve quality)"

  # METRICS: iteration end timestamp and duration (epoch milliseconds, same
  # derivation as in the PROCEED branch).
  ITERATION_END=$(date +%s%N | cut -b1-13)
  ITERATION_DURATION=$((ITERATION_END - ITERATION_START))

  # Store iteration duration metric
  DURATION_METRIC=$(jq -nc \
    --arg duration "$ITERATION_DURATION" \
    --arg iteration "$ITERATION" \
    '{duration_ms: ($duration | tonumber), iteration: ($iteration | tonumber)}')
  echo "$DURATION_METRIC" | redis-cli -x LPUSH "swarm:${TASK_ID}:metrics:iteration_duration" >/dev/null

  # Check max iterations. Operands are quoted so an empty value yields a
  # clean test error rather than a malformed expression, and -ge ensures a
  # counter that has passed the cap still stops the loop.
  if [ "$ITERATION" -ge "$MAX_ITERATIONS" ]; then
    echo "❌ Maximum iterations ($MAX_ITERATIONS) reached - cannot iterate further"
    echo " Product Owner wanted ITERATE but max iterations exhausted"
    exit 1
  fi

  # PHASE 1 (BUG #23): Accumulate feedback across iterations for learning
  # BUGFIX #22: Store feedback in Redis for next iteration (agents will be re-spawned, not woken)
  # Per P3 agent lifecycle: agents exit cleanly, orchestrator spawns fresh agents for next iteration
  echo "[Coordinator] Storing feedback for iteration $((ITERATION + 1))..."

  # accumulate_feedback is defined elsewhere in this script.
  FEEDBACK_MSG="Product Owner decision: ITERATE - Improve consensus from $LOOP2_CONSENSUS to >=$CONSENSUS"
  accumulate_feedback "$TASK_ID" "$ITERATION" "product_owner_iterate" "$FEEDBACK_MSG"

  echo " Reason: cfn_loop_iteration (Product Owner ITERATE decision)"
  echo " Priority: 30 (MEDIUM - Loop 3), 10 (HIGH - Loop 2)"
  echo ""

elif [ "$DECISION_TYPE" = "ABORT" ]; then
  echo "❌ Product Owner Decision: ABORT (scope too large or out of scope)"
  echo " Consensus: $LOOP2_CONSENSUS, Iteration: $ITERATION"
  exit 1

else
  # Any value outside the four known decisions is treated as a hard error.
  echo "❌ ERROR: Unknown Product Owner decision: $DECISION_TYPE"
  echo " Expected: PROCEED, DEFER_AND_PROCEED, ITERATE, or ABORT"
  exit 1
fi
1618
- done
1619
-
1620
# Runs once the iteration loop above finishes without an earlier exit:
# report the failure and return a non-zero status.
printf '%s\n' "❌ CFN Loop failed after $MAX_ITERATIONS iterations"
exit 1