shipwright-cli 1.7.0 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/.claude/agents/code-reviewer.md +90 -0
  2. package/.claude/agents/devops-engineer.md +142 -0
  3. package/.claude/agents/pipeline-agent.md +80 -0
  4. package/.claude/agents/shell-script-specialist.md +150 -0
  5. package/.claude/agents/test-specialist.md +196 -0
  6. package/.claude/hooks/post-tool-use.sh +38 -0
  7. package/.claude/hooks/pre-tool-use.sh +25 -0
  8. package/.claude/hooks/session-started.sh +37 -0
  9. package/README.md +212 -814
  10. package/claude-code/CLAUDE.md.shipwright +54 -0
  11. package/claude-code/hooks/notify-idle.sh +2 -2
  12. package/claude-code/hooks/session-start.sh +24 -0
  13. package/claude-code/hooks/task-completed.sh +6 -2
  14. package/claude-code/settings.json.template +12 -0
  15. package/dashboard/public/app.js +4422 -0
  16. package/dashboard/public/index.html +816 -0
  17. package/dashboard/public/styles.css +4755 -0
  18. package/dashboard/server.ts +4315 -0
  19. package/docs/KNOWN-ISSUES.md +18 -10
  20. package/docs/TIPS.md +38 -26
  21. package/docs/patterns/README.md +33 -23
  22. package/package.json +9 -5
  23. package/scripts/adapters/iterm2-adapter.sh +1 -1
  24. package/scripts/adapters/tmux-adapter.sh +52 -23
  25. package/scripts/adapters/wezterm-adapter.sh +26 -14
  26. package/scripts/lib/compat.sh +200 -0
  27. package/scripts/lib/helpers.sh +72 -0
  28. package/scripts/postinstall.mjs +72 -13
  29. package/scripts/{cct → sw} +109 -21
  30. package/scripts/sw-adversarial.sh +274 -0
  31. package/scripts/sw-architecture-enforcer.sh +330 -0
  32. package/scripts/sw-checkpoint.sh +390 -0
  33. package/scripts/{cct-cleanup.sh → sw-cleanup.sh} +3 -1
  34. package/scripts/sw-connect.sh +619 -0
  35. package/scripts/{cct-cost.sh → sw-cost.sh} +368 -34
  36. package/scripts/{cct-daemon.sh → sw-daemon.sh} +2217 -204
  37. package/scripts/sw-dashboard.sh +477 -0
  38. package/scripts/sw-developer-simulation.sh +252 -0
  39. package/scripts/sw-docs.sh +635 -0
  40. package/scripts/sw-doctor.sh +907 -0
  41. package/scripts/{cct-fix.sh → sw-fix.sh} +10 -6
  42. package/scripts/{cct-fleet.sh → sw-fleet.sh} +498 -22
  43. package/scripts/sw-github-checks.sh +521 -0
  44. package/scripts/sw-github-deploy.sh +533 -0
  45. package/scripts/sw-github-graphql.sh +972 -0
  46. package/scripts/sw-heartbeat.sh +293 -0
  47. package/scripts/sw-init.sh +522 -0
  48. package/scripts/sw-intelligence.sh +1196 -0
  49. package/scripts/sw-jira.sh +643 -0
  50. package/scripts/sw-launchd.sh +364 -0
  51. package/scripts/sw-linear.sh +648 -0
  52. package/scripts/{cct-logs.sh → sw-logs.sh} +72 -2
  53. package/scripts/{cct-loop.sh → sw-loop.sh} +534 -44
  54. package/scripts/{cct-memory.sh → sw-memory.sh} +321 -38
  55. package/scripts/sw-patrol-meta.sh +417 -0
  56. package/scripts/sw-pipeline-composer.sh +455 -0
  57. package/scripts/{cct-pipeline.sh → sw-pipeline.sh} +2319 -178
  58. package/scripts/sw-predictive.sh +820 -0
  59. package/scripts/{cct-prep.sh → sw-prep.sh} +339 -49
  60. package/scripts/{cct-ps.sh → sw-ps.sh} +6 -4
  61. package/scripts/{cct-reaper.sh → sw-reaper.sh} +6 -4
  62. package/scripts/sw-remote.sh +687 -0
  63. package/scripts/sw-self-optimize.sh +947 -0
  64. package/scripts/sw-session.sh +519 -0
  65. package/scripts/sw-setup.sh +234 -0
  66. package/scripts/sw-status.sh +605 -0
  67. package/scripts/{cct-templates.sh → sw-templates.sh} +9 -4
  68. package/scripts/sw-tmux.sh +591 -0
  69. package/scripts/sw-tracker-jira.sh +277 -0
  70. package/scripts/sw-tracker-linear.sh +292 -0
  71. package/scripts/sw-tracker.sh +409 -0
  72. package/scripts/{cct-upgrade.sh → sw-upgrade.sh} +103 -46
  73. package/scripts/{cct-worktree.sh → sw-worktree.sh} +3 -0
  74. package/templates/pipelines/autonomous.json +27 -5
  75. package/templates/pipelines/full.json +12 -0
  76. package/templates/pipelines/standard.json +12 -0
  77. package/tmux/{claude-teams-overlay.conf → shipwright-overlay.conf} +27 -9
  78. package/tmux/templates/accessibility.json +34 -0
  79. package/tmux/templates/api-design.json +35 -0
  80. package/tmux/templates/architecture.json +1 -0
  81. package/tmux/templates/bug-fix.json +9 -0
  82. package/tmux/templates/code-review.json +1 -0
  83. package/tmux/templates/compliance.json +36 -0
  84. package/tmux/templates/data-pipeline.json +36 -0
  85. package/tmux/templates/debt-paydown.json +34 -0
  86. package/tmux/templates/devops.json +1 -0
  87. package/tmux/templates/documentation.json +1 -0
  88. package/tmux/templates/exploration.json +1 -0
  89. package/tmux/templates/feature-dev.json +1 -0
  90. package/tmux/templates/full-stack.json +8 -0
  91. package/tmux/templates/i18n.json +34 -0
  92. package/tmux/templates/incident-response.json +36 -0
  93. package/tmux/templates/migration.json +1 -0
  94. package/tmux/templates/observability.json +35 -0
  95. package/tmux/templates/onboarding.json +33 -0
  96. package/tmux/templates/performance.json +35 -0
  97. package/tmux/templates/refactor.json +1 -0
  98. package/tmux/templates/release.json +35 -0
  99. package/tmux/templates/security-audit.json +8 -0
  100. package/tmux/templates/spike.json +34 -0
  101. package/tmux/templates/testing.json +1 -0
  102. package/tmux/tmux.conf +98 -9
  103. package/scripts/cct-doctor.sh +0 -328
  104. package/scripts/cct-init.sh +0 -282
  105. package/scripts/cct-session.sh +0 -284
  106. package/scripts/cct-status.sh +0 -169
@@ -4,8 +4,9 @@
4
4
  # ║ Polls for labeled issues · Spawns pipelines · Manages worktrees ║
5
5
  # ╚═══════════════════════════════════════════════════════════════════════════╝
6
6
  set -euo pipefail
7
+ trap 'echo "ERROR: $BASH_SOURCE:$LINENO exited with status $?" >&2' ERR
7
8
 
8
- VERSION="1.7.0"
9
+ VERSION="1.9.0"
9
10
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
10
11
  REPO_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
11
12
 
@@ -20,6 +21,28 @@ DIM='\033[2m'
20
21
  BOLD='\033[1m'
21
22
  RESET='\033[0m'
22
23
 
24
+ # ─── Cross-platform compatibility ──────────────────────────────────────────
25
+ # shellcheck source=lib/compat.sh
26
+ [[ -f "$SCRIPT_DIR/lib/compat.sh" ]] && source "$SCRIPT_DIR/lib/compat.sh"
27
+
28
+ # ─── Intelligence Engine (optional) ──────────────────────────────────────────
29
+ # shellcheck source=sw-intelligence.sh
30
+ [[ -f "$SCRIPT_DIR/sw-intelligence.sh" ]] && source "$SCRIPT_DIR/sw-intelligence.sh"
31
+ # shellcheck source=sw-pipeline-composer.sh
32
+ [[ -f "$SCRIPT_DIR/sw-pipeline-composer.sh" ]] && source "$SCRIPT_DIR/sw-pipeline-composer.sh"
33
+ # shellcheck source=sw-self-optimize.sh
34
+ [[ -f "$SCRIPT_DIR/sw-self-optimize.sh" ]] && source "$SCRIPT_DIR/sw-self-optimize.sh"
35
+ # shellcheck source=sw-predictive.sh
36
+ [[ -f "$SCRIPT_DIR/sw-predictive.sh" ]] && source "$SCRIPT_DIR/sw-predictive.sh"
37
+
38
+ # ─── GitHub API Modules (optional) ────────────────────────────────────────
39
+ # shellcheck source=sw-github-graphql.sh
40
+ [[ -f "$SCRIPT_DIR/sw-github-graphql.sh" ]] && source "$SCRIPT_DIR/sw-github-graphql.sh"
41
+ # shellcheck source=sw-github-checks.sh
42
+ [[ -f "$SCRIPT_DIR/sw-github-checks.sh" ]] && source "$SCRIPT_DIR/sw-github-checks.sh"
43
+ # shellcheck source=sw-github-deploy.sh
44
+ [[ -f "$SCRIPT_DIR/sw-github-deploy.sh" ]] && source "$SCRIPT_DIR/sw-github-deploy.sh"
45
+
23
46
  # ─── Output Helpers ─────────────────────────────────────────────────────────
24
47
  info() { echo -e "${CYAN}${BOLD}▸${RESET} $*"; }
25
48
  success() { echo -e "${GREEN}${BOLD}✓${RESET} $*"; }
@@ -49,7 +72,7 @@ format_duration() {
49
72
  }
50
73
 
51
74
  # ─── Structured Event Log ──────────────────────────────────────────────────
52
- EVENTS_FILE="${HOME}/.claude-teams/events.jsonl"
75
+ EVENTS_FILE="${HOME}/.shipwright/events.jsonl"
53
76
 
54
77
  emit_event() {
55
78
  local event_type="$1"
@@ -61,14 +84,68 @@ emit_event() {
61
84
  if [[ "$val" =~ ^-?[0-9]+\.?[0-9]*$ ]]; then
62
85
  json_fields="${json_fields},\"${key}\":${val}"
63
86
  else
64
- val="${val//\"/\\\"}"
65
- json_fields="${json_fields},\"${key}\":\"${val}\""
87
+ local escaped_val
88
+ escaped_val=$(printf '%s' "$val" | jq -Rs '.' 2>/dev/null || printf '"%s"' "${val//\"/\\\"}")
89
+ json_fields="${json_fields},\"${key}\":${escaped_val}"
66
90
  fi
67
91
  done
68
- mkdir -p "${HOME}/.claude-teams"
92
+ mkdir -p "${HOME}/.shipwright"
69
93
  echo "{\"ts\":\"$(now_iso)\",\"ts_epoch\":$(now_epoch),\"type\":\"${event_type}\"${json_fields}}" >> "$EVENTS_FILE"
70
94
  }
71
95
 
96
+ # ─── Event Log Rotation ─────────────────────────────────────────────────────
97
# Rotate the structured event log once it exceeds a size limit, then prune
# heartbeat files older than 24 hours.
#
# Globals:   EVENTS_FILE (read), HOME (read)
# Arguments: $1 - size limit in bytes (optional; default 50MB)
#            $2 - number of rotated generations to keep (optional; default 3)
# Outputs:   on rotation, one info line plus a daemon.log_rotated event
rotate_event_log() {
    local max_size="${1:-$((50 * 1024 * 1024))}"
    local max_rotations="${2:-3}"

    if [[ -f "$EVENTS_FILE" ]]; then
        local size
        size=$(wc -c < "$EVENTS_FILE" 2>/dev/null || echo 0)
        # BSD/macOS wc pads the count with spaces; strip them so the value is a
        # clean integer both for the comparison and for the emitted event field.
        size="${size//[[:space:]]/}"
        [[ "$size" =~ ^[0-9]+$ ]] || size=0

        if [[ "$size" -gt "$max_size" ]]; then
            # Shift generations upward (.2 → .3, .1 → .2); the oldest
            # generation is overwritten by the move into its slot.
            local gen
            for (( gen = max_rotations; gen > 1; gen-- )); do
                if [[ -f "${EVENTS_FILE}.$((gen - 1))" ]]; then
                    mv "${EVENTS_FILE}.$((gen - 1))" "${EVENTS_FILE}.${gen}"
                fi
            done
            mv "$EVENTS_FILE" "${EVENTS_FILE}.1"
            touch "$EVENTS_FILE"
            emit_event "daemon.log_rotated" "previous_size=$size"
            info "Rotated events.jsonl (was $(( size / 1048576 ))MB)"
        fi
    fi

    # Clean old heartbeat files (> 24h). Best effort: failures are ignored on
    # purpose so housekeeping can never take the daemon down.
    local heartbeat_dir="$HOME/.shipwright/heartbeats"
    if [[ -d "$heartbeat_dir" ]]; then
        find "$heartbeat_dir" -name "*.json" -mmin +1440 -delete 2>/dev/null || true
    fi
}
126
+
127
+ # ─── GitHub Context (loaded once at startup) ──────────────────────────────
128
+ DAEMON_GITHUB_CONTEXT=""
129
+
130
# Load repository context from the optional GitHub modules and export it as
# DAEMON_GITHUB_CONTEXT. Does nothing (and returns 0) when GitHub is disabled,
# the helper functions are not loaded, the repo cannot be detected, or the
# returned context is empty or not parseable JSON.
#
# Globals:   NO_GITHUB, GH_OWNER, GH_REPO (read)
#            DAEMON_GITHUB_CONTEXT (written and exported)
# Outputs:   one INFO line through daemon_log when context is loaded
daemon_github_context() {
    # Skip if no GitHub
    [[ "${NO_GITHUB:-false}" == "true" ]] && return 0
    type gh_repo_context >/dev/null 2>&1 || return 0
    type _gh_detect_repo >/dev/null 2>&1 || return 0

    _gh_detect_repo 2>/dev/null || return 0
    local owner="${GH_OWNER:-}" repo="${GH_REPO:-}"
    [[ -z "$owner" || -z "$repo" ]] && return 0

    local context
    context=$(gh_repo_context "$owner" "$repo" 2>/dev/null || echo "{}")
    [[ -z "$context" || "$context" == "{}" ]] && return 0

    # A single jq pass both validates the payload and builds the log summary;
    # context that jq cannot read is never exported.
    local summary
    summary=$(printf '%s' "$context" \
        | jq -r '"\(.contributor_count // 0) contributors, \(.security_alert_count // 0) security alerts"' \
            2>/dev/null) || return 0

    daemon_log INFO "GitHub context loaded: ${summary}"
    DAEMON_GITHUB_CONTEXT="$context"
    export DAEMON_GITHUB_CONTEXT
}
148
+
72
149
  # ─── GitHub API Retry with Backoff ────────────────────────────────────────
73
150
  # Retries gh commands up to 3 times with exponential backoff (1s, 3s, 9s).
74
151
  # Detects rate-limit (403/429) and transient errors. Returns the gh exit code.
@@ -106,7 +183,7 @@ gh_retry() {
106
183
  }
107
184
 
108
185
  # ─── Defaults ───────────────────────────────────────────────────────────────
109
- DAEMON_DIR="$HOME/.claude-teams"
186
+ DAEMON_DIR="$HOME/.shipwright"
110
187
  PID_FILE="$DAEMON_DIR/daemon.pid"
111
188
  SHUTDOWN_FLAG="$DAEMON_DIR/daemon.shutdown"
112
189
  STATE_FILE=""
@@ -153,8 +230,17 @@ PATROL_INTERVAL="${PATROL_INTERVAL:-3600}"
153
230
  PATROL_MAX_ISSUES="${PATROL_MAX_ISSUES:-5}"
154
231
  PATROL_LABEL="${PATROL_LABEL:-auto-patrol}"
155
232
  PATROL_DRY_RUN=false
233
+ PATROL_AUTO_WATCH=false
234
+ PATROL_FAILURES_THRESHOLD=3
235
+ PATROL_DORA_ENABLED=true
236
+ PATROL_UNTESTED_ENABLED=true
237
+ PATROL_RETRY_ENABLED=true
238
+ PATROL_RETRY_THRESHOLD=2
156
239
  LAST_PATROL_EPOCH=0
157
240
 
241
+ # Team dashboard coordination
242
+ DASHBOARD_URL="${DASHBOARD_URL:-http://localhost:8767}"
243
+
158
244
  # Runtime
159
245
  NO_GITHUB=false
160
246
  CONFIG_PATH=""
@@ -320,6 +406,12 @@ load_config() {
320
406
  PATROL_INTERVAL=$(jq -r '.patrol.interval // 3600' "$config_file")
321
407
  PATROL_MAX_ISSUES=$(jq -r '.patrol.max_issues // 5' "$config_file")
322
408
  PATROL_LABEL=$(jq -r '.patrol.label // "auto-patrol"' "$config_file")
409
+ PATROL_AUTO_WATCH=$(jq -r '.patrol.auto_watch // false' "$config_file")
410
+ PATROL_FAILURES_THRESHOLD=$(jq -r '.patrol.checks.recurring_failures.threshold // 3' "$config_file")
411
+ PATROL_DORA_ENABLED=$(jq -r '.patrol.checks.dora_degradation.enabled // true' "$config_file")
412
+ PATROL_UNTESTED_ENABLED=$(jq -r '.patrol.checks.untested_scripts.enabled // true' "$config_file")
413
+ PATROL_RETRY_ENABLED=$(jq -r '.patrol.checks.retry_exhaustion.enabled // true' "$config_file")
414
+ PATROL_RETRY_THRESHOLD=$(jq -r '.patrol.checks.retry_exhaustion.threshold // 2' "$config_file")
323
415
 
324
416
  # adaptive template selection
325
417
  AUTO_TEMPLATE=$(jq -r '.auto_template // false' "$config_file")
@@ -333,6 +425,18 @@ load_config() {
333
425
  SELF_OPTIMIZE=$(jq -r '.self_optimize // false' "$config_file")
334
426
  OPTIMIZE_INTERVAL=$(jq -r '.optimize_interval // 10' "$config_file")
335
427
 
428
+ # intelligence engine settings
429
+ INTELLIGENCE_ENABLED=$(jq -r '.intelligence.enabled // false' "$config_file")
430
+ INTELLIGENCE_CACHE_TTL=$(jq -r '.intelligence.cache_ttl_seconds // 3600' "$config_file")
431
+ COMPOSER_ENABLED=$(jq -r '.intelligence.composer_enabled // false' "$config_file")
432
+ OPTIMIZATION_ENABLED=$(jq -r '.intelligence.optimization_enabled // false' "$config_file")
433
+ PREDICTION_ENABLED=$(jq -r '.intelligence.prediction_enabled // false' "$config_file")
434
+ ANOMALY_THRESHOLD=$(jq -r '.intelligence.anomaly_threshold // 3.0' "$config_file")
435
+
436
+ # adaptive thresholds (intelligence-driven operational tuning)
437
+ ADAPTIVE_THRESHOLDS_ENABLED=$(jq -r '.intelligence.adaptive_enabled // false' "$config_file")
438
+ PRIORITY_STRATEGY=$(jq -r '.intelligence.priority_strategy // "quick-wins-first"' "$config_file")
439
+
336
440
  # gh_retry: enable retry wrapper on critical GitHub API calls
337
441
  GH_RETRY_ENABLED=$(jq -r '.gh_retry // true' "$config_file")
338
442
 
@@ -361,6 +465,23 @@ load_config() {
361
465
  WORKER_MEM_GB=$(jq -r '.worker_mem_gb // 4' "$config_file")
362
466
  EST_COST_PER_JOB=$(jq -r '.estimated_cost_per_job_usd // 5.0' "$config_file")
363
467
 
468
+ # heartbeat + checkpoint recovery
469
+ HEALTH_HEARTBEAT_TIMEOUT=$(jq -r '.health.heartbeat_timeout_s // 120' "$config_file")
470
+ CHECKPOINT_ENABLED=$(jq -r '.health.checkpoint_enabled // true' "$config_file")
471
+
472
+ # progress-based health monitoring (replaces static timeouts)
473
+ PROGRESS_MONITORING=$(jq -r '.health.progress_based // true' "$config_file")
474
+ PROGRESS_CHECKS_BEFORE_WARN=$(jq -r '.health.stale_checks_before_warn // 3' "$config_file")
475
+ PROGRESS_CHECKS_BEFORE_KILL=$(jq -r '.health.stale_checks_before_kill // 6' "$config_file")
476
+ PROGRESS_HARD_LIMIT_S=$(jq -r '.health.hard_limit_s // 10800' "$config_file") # 3hr absolute max
477
+
478
+ # team dashboard URL (for coordinated claiming)
479
+ local cfg_dashboard_url
480
+ cfg_dashboard_url=$(jq -r '.dashboard_url // ""' "$config_file")
481
+ if [[ -n "$cfg_dashboard_url" && "$cfg_dashboard_url" != "null" ]]; then
482
+ DASHBOARD_URL="$cfg_dashboard_url"
483
+ fi
484
+
364
485
  success "Config loaded"
365
486
  }
366
487
 
@@ -375,6 +496,546 @@ setup_dirs() {
375
496
  WORKTREE_DIR=".worktrees"
376
497
 
377
498
  mkdir -p "$LOG_DIR"
499
+ mkdir -p "$HOME/.shipwright/progress"
500
+ }
501
+
502
+ # ─── Adaptive Threshold Helpers ──────────────────────────────────────────────
503
+ # When intelligence.adaptive_enabled=true, operational thresholds are learned
504
+ # from historical data instead of using fixed defaults.
505
+ # Every function falls back to the current hardcoded value when no data exists.
506
+
507
+ ADAPTIVE_THRESHOLDS_ENABLED="${ADAPTIVE_THRESHOLDS_ENABLED:-false}"
508
+ PRIORITY_STRATEGY="${PRIORITY_STRATEGY:-quick-wins-first}"
509
+ EMPTY_QUEUE_CYCLES=0
510
+
511
# Choose the daemon poll interval (seconds) from the current queue state.
#
# With adaptive thresholds disabled this echoes POLL_INTERVAL unchanged.
# Otherwise:
#   queue and workers both idle for 5+ consecutive calls → 120
#   queue has items                                      → 30
#   anything else (e.g. only active jobs)                → 60
#
# Globals:   ADAPTIVE_THRESHOLDS_ENABLED, POLL_INTERVAL, HOME (read)
#            EMPTY_QUEUE_CYCLES (read and written)
# Arguments: $1 - queue depth
#            $2 - active job count
# Outputs:   the interval on stdout; poll_interval and empty_queue_cycles are
#            also persisted to daemon-tuning.json for dashboard visibility
get_adaptive_poll_interval() {
    local queue_depth="$1"
    local active_count="$2"

    if [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" != "true" ]]; then
        echo "$POLL_INTERVAL"
        return
    fi

    local tuning_file="$HOME/.shipwright/optimization/daemon-tuning.json"

    # The result is returned on stdout, so callers capture it with $(...).
    # That runs this function in a subshell where the increment below would be
    # thrown away and the idle streak could never reach 5. Re-seed the counter
    # from the value persisted by the previous call in that case.
    # NOTE(review): after a daemon restart the first call inherits the streak
    # stored by the previous run — confirm that is acceptable.
    if [[ "${BASH_SUBSHELL:-0}" -gt 0 && -s "$tuning_file" ]]; then
        local persisted_cycles
        persisted_cycles=$(jq -r '.empty_queue_cycles // 0' "$tuning_file" 2>/dev/null || echo "")
        if [[ "$persisted_cycles" =~ ^[0-9]+$ ]]; then
            EMPTY_QUEUE_CYCLES="$persisted_cycles"
        fi
    fi

    if [[ "$queue_depth" -eq 0 && "$active_count" -eq 0 ]]; then
        EMPTY_QUEUE_CYCLES=$(( ${EMPTY_QUEUE_CYCLES:-0} + 1 ))
    else
        EMPTY_QUEUE_CYCLES=0
    fi

    local interval
    if [[ "$EMPTY_QUEUE_CYCLES" -ge 5 ]]; then
        interval=120
    elif [[ "$queue_depth" -gt 0 ]]; then
        interval=30
    else
        interval=60
    fi

    # Persist current setting for dashboard visibility. Write to a temp file
    # first so a failed jq run never truncates the real file, and remove the
    # temp file when the write did not succeed.
    mkdir -p "$HOME/.shipwright/optimization"
    local tmp_tuning="${tuning_file}.tmp.$$"
    local persisted=false
    if [[ -s "$tuning_file" ]]; then
        if jq --argjson pi "$interval" --argjson eqc "$EMPTY_QUEUE_CYCLES" \
            '.poll_interval = $pi | .empty_queue_cycles = $eqc' \
            "$tuning_file" > "$tmp_tuning" 2>/dev/null; then
            persisted=true
        fi
    elif jq -n --argjson pi "$interval" --argjson eqc "$EMPTY_QUEUE_CYCLES" \
        '{poll_interval: $pi, empty_queue_cycles: $eqc}' > "$tmp_tuning" 2>/dev/null; then
        persisted=true
    fi
    if [[ "$persisted" == "true" ]]; then
        mv "$tmp_tuning" "$tuning_file"
    else
        rm -f "$tmp_tuning"
    fi

    echo "$interval"
}
553
+
554
# Estimate the cost of one job for a template.
#
# Returns the mean total_cost_usd of that template's last 10 sessions recorded
# in ~/.shipwright/costs.json. Falls back to EST_COST_PER_JOB when adaptive
# thresholds are off, the file is missing, or no usable average is available.
#
# Globals:   ADAPTIVE_THRESHOLDS_ENABLED, EST_COST_PER_JOB, HOME (read)
# Arguments: $1 - template name (default: autonomous)
# Outputs:   the estimate on stdout
get_adaptive_cost_estimate() {
    local template="${1:-autonomous}"
    local history="$HOME/.shipwright/costs.json"
    local learned=""

    if [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" && -f "$history" ]]; then
        local rolling
        rolling=$(jq -r --arg tpl "$template" '
            [.sessions // [] | .[] | select(.template == $tpl) | .total_cost_usd // 0] |
            .[-10:] | if length > 0 then (add / length) else null end
        ' "$history" 2>/dev/null || echo "")

        # Empty output, a JSON null and a zero average all mean "no data".
        case "$rolling" in
            "" | null | 0) ;;
            *) learned="$rolling" ;;
        esac
    fi

    echo "${learned:-$EST_COST_PER_JOB}"
}
581
+
582
# Heartbeat timeout (seconds) for a pipeline stage.
#
# With adaptive thresholds disabled this echoes HEALTH_HEARTBEAT_TIMEOUT
# (default 120). Otherwise it returns 150% of the stage's learned p90 duration
# from stage-durations.json with a floor of 60s, or a per-stage default when
# nothing has been learned yet.
#
# Globals:   ADAPTIVE_THRESHOLDS_ENABLED, HEALTH_HEARTBEAT_TIMEOUT, HOME (read)
# Arguments: $1 - stage name (default: unknown)
# Outputs:   the timeout on stdout
get_adaptive_heartbeat_timeout() {
    local stage="${1:-unknown}"
    local default_timeout="${HEALTH_HEARTBEAT_TIMEOUT:-120}"

    if [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" != "true" ]]; then
        echo "$default_timeout"
        return
    fi

    # Stage-specific defaults (used when no learned data)
    case "$stage" in
        build) default_timeout=300 ;;
        test) default_timeout=180 ;;
        review|compound_quality) default_timeout=180 ;;
        lint|format|intake|plan|design) default_timeout=60 ;;
    esac

    local durations_file="$HOME/.shipwright/optimization/stage-durations.json"
    local learned_duration=0
    if [[ -f "$durations_file" ]]; then
        # floor: a fractional p90 would make the integer test below fail with
        # an arithmetic error instead of being used.
        learned_duration=$(jq -r --arg s "$stage" \
            '(.stages[$s].p90_duration_s // 0) | floor' "$durations_file" 2>/dev/null || echo "0")
    fi
    # Anything that is not a plain non-negative integer counts as "no data".
    [[ "$learned_duration" =~ ^[0-9]+$ ]] || learned_duration=0

    if [[ "$learned_duration" -gt 0 ]]; then
        # 150% of p90 duration, floor of 60s
        local adaptive_timeout=$(( (learned_duration * 3) / 2 ))
        [[ "$adaptive_timeout" -lt 60 ]] && adaptive_timeout=60
        echo "$adaptive_timeout"
    else
        echo "$default_timeout"
    fi
}
619
+
620
# Stale-pipeline timeout (seconds) for a template.
#
# With adaptive thresholds disabled, or when nothing has been learned for the
# template, this echoes HEALTH_STALE_TIMEOUT (default 1800). Otherwise it
# returns 1.5x the learned p95 duration from pipeline-durations.json, clamped
# to the range 600–7200 seconds.
#
# Globals:   ADAPTIVE_THRESHOLDS_ENABLED, HEALTH_STALE_TIMEOUT, HOME (read)
# Arguments: $1 - template name (default: autonomous)
# Outputs:   the timeout on stdout
get_adaptive_stale_timeout() {
    local template="${1:-autonomous}"
    local fallback="${HEALTH_STALE_TIMEOUT:-1800}"

    if [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" != "true" ]]; then
        echo "$fallback"
        return
    fi

    local durations_file="$HOME/.shipwright/optimization/pipeline-durations.json"
    local p95_duration=0
    if [[ -f "$durations_file" ]]; then
        # floor: a fractional p95 would make the integer test below fail with
        # an arithmetic error instead of being used.
        p95_duration=$(jq -r --arg tpl "$template" \
            '(.templates[$tpl].p95_duration_s // 0) | floor' "$durations_file" 2>/dev/null || echo "0")
    fi
    # Anything that is not a plain non-negative integer counts as "no data".
    [[ "$p95_duration" =~ ^[0-9]+$ ]] || p95_duration=0

    if [[ "$p95_duration" -gt 0 ]]; then
        # 1.5x safety margin, clamped 600s-7200s
        local adaptive_timeout=$(( (p95_duration * 3) / 2 ))
        [[ "$adaptive_timeout" -lt 600 ]] && adaptive_timeout=600
        [[ "$adaptive_timeout" -gt 7200 ]] && adaptive_timeout=7200
        echo "$adaptive_timeout"
    else
        echo "$fallback"
    fi
}
649
+
650
# Record one pipeline run's duration for future threshold learning.
#
# Appends the run to the template's history in pipeline-durations.json (last
# 50 runs kept) and recomputes that template's p95_duration_s. This is
# best-effort telemetry: it always returns 0 so a failed write can never abort
# a caller running under `set -e`.
#
# Globals:   ADAPTIVE_THRESHOLDS_ENABLED, HOME (read)
# Arguments: $1 - template name
#            $2 - duration in whole seconds (non-integers are ignored)
#            $3 - result label stored with the run
record_pipeline_duration() {
    local template="$1" duration_s="$2" result="$3"

    [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]] || return 0
    [[ "$duration_s" =~ ^[0-9]+$ ]] || return 0
    # Force base 10 so a zero-padded value is still a valid JSON number.
    duration_s=$((10#$duration_s))

    local durations_file="$HOME/.shipwright/optimization/pipeline-durations.json"
    mkdir -p "$HOME/.shipwright/optimization"

    # -s rather than -f: jq turns a zero-byte file into zero-byte output, so an
    # empty file left by an interrupted write would otherwise stay empty.
    if [[ ! -s "$durations_file" ]]; then
        echo '{"templates":{}}' > "$durations_file"
    fi

    local tmp_dur="${durations_file}.tmp.$$"
    if jq --arg tpl "$template" --argjson dur "$duration_s" --arg res "$result" --arg ts "$(now_iso)" '
        .templates[$tpl] = (
            (.templates[$tpl] // {durations: [], p95_duration_s: 0}) |
            .durations = ((.durations + [{duration_s: $dur, result: $res, ts: $ts}]) | .[-50:]) |
            .p95_duration_s = (
                [.durations[].duration_s] | sort |
                if length > 0 then .[((length * 95 / 100) | floor)] else 0 end
            )
        )
    ' "$durations_file" > "$tmp_dur" 2>/dev/null; then
        mv "$tmp_dur" "$durations_file"
    else
        # Keep the previous file untouched and do not leave the temp file behind.
        rm -f "$tmp_dur"
    fi
    return 0
}
678
+
679
+ # ─── Progress-Based Health Monitoring ─────────────────────────────────────────
680
+ # Instead of killing jobs after a static timeout, we check for forward progress.
681
+ # Progress signals: stage transitions, iteration advances, git diff growth, new files.
682
+ # Graduated response: healthy → slowing → stalled → stuck → kill.
683
+
684
+ PROGRESS_DIR="$HOME/.shipwright/progress"
685
+
686
# Collect a progress snapshot for an active job.
#
# Sources, in order: the heartbeat file whose pid matches (stage, iteration),
# the worktree's .claude/pipeline-state.md (stage fallback), git in the
# worktree (inserted lines, changed + untracked files) and the last entry of
# the worktree's error-log.jsonl (error signature).
#
# Globals:   HOME (read)
# Arguments: $1 - issue number (accepted for interface compatibility; unused)
#            $2 - worktree path
#            $3 - pid of the job's pipeline process
# Outputs:   JSON object on stdout with stage, iteration, diff_lines,
#            files_changed, last_error and ts
daemon_collect_snapshot() {
    local issue_num="$1" worktree="$2" pid="$3"

    local stage="" iteration=0 diff_lines=0 files_changed=0 last_error=""

    # Get stage and iteration from heartbeat (fastest source)
    local heartbeat_dir="$HOME/.shipwright/heartbeats"
    if [[ -d "$heartbeat_dir" ]]; then
        local hb_file hb_pid
        for hb_file in "$heartbeat_dir"/*.json; do
            [[ -f "$hb_file" ]] || continue
            hb_pid=$(jq -r '.pid // 0' "$hb_file" 2>/dev/null || echo 0)
            if [[ "$hb_pid" == "$pid" ]]; then
                stage=$(jq -r '.stage // "unknown"' "$hb_file" 2>/dev/null || echo "unknown")
                iteration=$(jq -r '.iteration // 0' "$hb_file" 2>/dev/null || echo 0)
                break
            fi
        done
    fi

    # Fallback: read stage from pipeline-state.md in worktree
    if [[ -z "$stage" || "$stage" == "unknown" ]] && [[ -d "$worktree" ]]; then
        local state_file="$worktree/.claude/pipeline-state.md"
        if [[ -f "$state_file" ]]; then
            stage=$(grep -m1 '^current_stage:' "$state_file" 2>/dev/null \
                | sed 's/^current_stage: *//') || true
            [[ -n "$stage" ]] || stage="unknown"
        fi
    fi

    # Get git diff stats from worktree (how much code has been written).
    # .git is a directory in a normal checkout and a file in a linked worktree.
    if [[ -d "$worktree/.git" || -f "$worktree/.git" ]]; then
        local shortstat tracked untracked
        local insertion_re='([0-9]+) insertion'
        shortstat=$(cd "$worktree" && git diff --shortstat 2>/dev/null) || shortstat=""
        if [[ "$shortstat" =~ $insertion_re ]]; then
            diff_lines="${BASH_REMATCH[1]}"
        fi
        # No `|| echo 0` fallback here: with pipefail a failing git would make
        # the pipeline print wc's "0" AND the fallback "0", yielding a two-line
        # value that breaks the arithmetic below. Validate the value instead.
        tracked=$(cd "$worktree" && git diff --name-only 2>/dev/null | wc -l) || true
        # Also count untracked files the agent has created
        untracked=$(cd "$worktree" && git ls-files --others --exclude-standard 2>/dev/null | wc -l) || true
        tracked="${tracked//[[:space:]]/}"
        untracked="${untracked//[[:space:]]/}"
        [[ "$tracked" =~ ^[0-9]+$ ]] || tracked=0
        [[ "$untracked" =~ ^[0-9]+$ ]] || untracked=0
        files_changed=$((tracked + untracked))
    fi

    # Check last error from error log
    if [[ -d "$worktree" ]]; then
        local error_log="$worktree/.claude/pipeline-artifacts/error-log.jsonl"
        if [[ -f "$error_log" ]]; then
            last_error=$(tail -1 "$error_log" 2>/dev/null | jq -r '.signature // ""' 2>/dev/null || echo "")
        fi
    fi

    # --argjson needs valid JSON numbers; coerce anything else (for example a
    # non-numeric iteration in a heartbeat file) to 0 so a snapshot is always
    # produced.
    local field
    for field in iteration diff_lines files_changed; do
        [[ "${!field}" =~ ^[0-9]+$ ]] || printf -v "$field" '%s' 0
    done

    # Output JSON snapshot
    jq -n \
        --arg stage "$stage" \
        --argjson iteration "$iteration" \
        --argjson diff_lines "$diff_lines" \
        --argjson files_changed "$files_changed" \
        --arg last_error "$last_error" \
        --arg ts "$(now_iso)" \
        '{
            stage: $stage,
            iteration: $iteration,
            diff_lines: $diff_lines,
            files_changed: $files_changed,
            last_error: $last_error,
            ts: $ts
        }'
}
754
+
755
# Assess job progress by comparing the current snapshot to the previous one.
#
# Progress means: the stage changed (to something other than "unknown"), the
# iteration increased, more lines were inserted, or more files were touched.
# Calls without progress are counted, and the count maps to a verdict:
#   0 → healthy, 1+ → slowing, PROGRESS_CHECKS_BEFORE_WARN+ → stalled,
#   PROGRESS_CHECKS_BEFORE_KILL+ → stuck.
# Seeing the same non-empty error signature on 3 consecutive checks without
# progress also yields "stuck".
#
# Globals:   PROGRESS_DIR, PROGRESS_CHECKS_BEFORE_WARN (default 3),
#            PROGRESS_CHECKS_BEFORE_KILL (default 6) (read)
# Arguments: $1 - issue number
#            $2 - current snapshot as a JSON object with stage, iteration,
#                 diff_lines, files_changed, last_error and ts
# Outputs:   healthy | slowing | stalled | stuck on stdout; the per-issue
#            history (last 10 snapshots) is kept in PROGRESS_DIR
daemon_assess_progress() {
    local issue_num="$1" current_snapshot="$2"

    mkdir -p "$PROGRESS_DIR"
    local progress_file="$PROGRESS_DIR/issue-${issue_num}.json"

    # A truncated or otherwise unreadable history file would make every jq read
    # below fail (fatal under `set -e`). Discard it and start a fresh history.
    if [[ -f "$progress_file" ]] \
        && ! jq -e '.snapshots | type == "array" and length > 0' "$progress_file" >/dev/null 2>&1; then
        rm -f "$progress_file"
    fi

    # If no previous snapshot, store this one and return healthy
    if [[ ! -f "$progress_file" ]]; then
        jq -n \
            --argjson snap "$current_snapshot" \
            --arg issue "$issue_num" \
            '{
                issue: $issue,
                snapshots: [$snap],
                no_progress_count: 0,
                last_progress_at: $snap.ts,
                repeated_error_count: 0
            }' > "$progress_file"
        echo "healthy"
        return
    fi

    local prev_data
    prev_data=$(cat "$progress_file")

    # Get previous snapshot values
    local prev_stage prev_iteration prev_diff_lines prev_files prev_error
    local prev_no_progress prev_repeated_errors
    prev_stage=$(jq -r '.snapshots[-1].stage // "unknown"' <<<"$prev_data")
    prev_iteration=$(jq -r '.snapshots[-1].iteration // 0' <<<"$prev_data")
    prev_diff_lines=$(jq -r '.snapshots[-1].diff_lines // 0' <<<"$prev_data")
    prev_files=$(jq -r '.snapshots[-1].files_changed // 0' <<<"$prev_data")
    prev_error=$(jq -r '.snapshots[-1].last_error // ""' <<<"$prev_data")
    prev_no_progress=$(jq -r '.no_progress_count // 0' <<<"$prev_data")
    prev_repeated_errors=$(jq -r '.repeated_error_count // 0' <<<"$prev_data")

    # Get current values. The `//` defaults keep a missing field from becoming
    # the literal string "null", which the integer comparisons below would
    # treat as an unset variable name (an error under `set -u`).
    local cur_stage cur_iteration cur_diff cur_files cur_error
    cur_stage=$(jq -r '.stage // "unknown"' <<<"$current_snapshot")
    cur_iteration=$(jq -r '.iteration // 0' <<<"$current_snapshot")
    cur_diff=$(jq -r '.diff_lines // 0' <<<"$current_snapshot")
    cur_files=$(jq -r '.files_changed // 0' <<<"$current_snapshot")
    cur_error=$(jq -r '.last_error // ""' <<<"$current_snapshot")

    # Detect progress
    local has_progress=false

    # Stage advanced → clear progress
    if [[ "$cur_stage" != "$prev_stage" && "$cur_stage" != "unknown" ]]; then
        has_progress=true
        daemon_log INFO "Progress: issue #${issue_num} stage ${prev_stage} → ${cur_stage}"
    fi

    # Iteration increased → clear progress (agent is looping but advancing)
    if [[ "$cur_iteration" -gt "$prev_iteration" ]]; then
        has_progress=true
        daemon_log INFO "Progress: issue #${issue_num} iteration ${prev_iteration} → ${cur_iteration}"
    fi

    # Diff lines grew (agent is writing code)
    if [[ "$cur_diff" -gt "$prev_diff_lines" ]]; then
        has_progress=true
    fi

    # More files touched
    if [[ "$cur_files" -gt "$prev_files" ]]; then
        has_progress=true
    fi

    # Detect repeated errors (same error signature hitting again)
    local repeated_errors="$prev_repeated_errors"
    if [[ -n "$cur_error" && "$cur_error" == "$prev_error" ]]; then
        repeated_errors=$((repeated_errors + 1))
    elif [[ -n "$cur_error" && "$cur_error" != "$prev_error" ]]; then
        # Different error — reset counter (agent is making different mistakes, that's progress)
        repeated_errors=0
    fi

    # Update no_progress counter
    local no_progress_count
    if [[ "$has_progress" == "true" ]]; then
        no_progress_count=0
        repeated_errors=0
    else
        no_progress_count=$((prev_no_progress + 1))
    fi

    # Update progress file (keep last 10 snapshots). Write through a temp file
    # so a failed jq run leaves the history intact, and clean the temp file up.
    local tmp_progress="${progress_file}.tmp.$$"
    if jq \
        --argjson snap "$current_snapshot" \
        --argjson npc "$no_progress_count" \
        --argjson rec "$repeated_errors" \
        --arg ts "$(now_iso)" \
        '
        .snapshots = ((.snapshots + [$snap]) | .[-10:]) |
        .no_progress_count = $npc |
        .repeated_error_count = $rec |
        if $npc == 0 then .last_progress_at = $ts else . end
        ' "$progress_file" > "$tmp_progress" 2>/dev/null; then
        mv "$tmp_progress" "$progress_file"
    else
        rm -f "$tmp_progress"
    fi

    # Determine verdict
    local warn_threshold="${PROGRESS_CHECKS_BEFORE_WARN:-3}"
    local kill_threshold="${PROGRESS_CHECKS_BEFORE_KILL:-6}"

    # Stuck in same error loop — accelerate to kill
    if [[ "$repeated_errors" -ge 3 ]]; then
        echo "stuck"
        return
    fi

    if [[ "$no_progress_count" -ge "$kill_threshold" ]]; then
        echo "stuck"
    elif [[ "$no_progress_count" -ge "$warn_threshold" ]]; then
        echo "stalled"
    elif [[ "$no_progress_count" -ge 1 ]]; then
        echo "slowing"
    else
        echo "healthy"
    fi
}
878
+
879
# Clean up progress tracking for a completed/failed job.
#
# Removes the issue's progress history and any "<file>.tmp.<pid>" files left
# next to it by an interrupted or failed history update.
#
# Globals:   PROGRESS_DIR (read)
# Arguments: $1 - issue number
daemon_clear_progress() {
    local issue_num="$1"
    local progress_file="$PROGRESS_DIR/issue-${issue_num}.json"
    # The glob stays literal when nothing matches; rm -f ignores the miss.
    rm -f -- "$progress_file" "$progress_file".tmp.*
}
884
+
885
# Learn per-worker memory from the resident set size of running pipeline
# processes.
#
# Samples the current RSS of every live pid listed under .active_jobs in the
# state file, averages it, adds 25% headroom, rounds up to whole GB and clamps
# the result to 1–16 GB. The value is stored in WORKER_MEM_GB and persisted to
# daemon-tuning.json together with the raw sample.
#
# Globals:   ADAPTIVE_THRESHOLDS_ENABLED, STATE_FILE, HOME (read)
#            WORKER_MEM_GB (written when at least one process was sampled)
learn_worker_memory() {
    [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]] || return 0
    [[ -f "${STATE_FILE:-}" ]] || return 0

    local total_rss=0 process_count=0
    local pid rss_kb
    while IFS= read -r pid; do
        [[ "$pid" =~ ^[0-9]+$ ]] || continue
        kill -0 "$pid" 2>/dev/null || continue
        rss_kb=$(ps -o rss= -p "$pid" 2>/dev/null | tr -d ' ' || echo "0")
        [[ "$rss_kb" =~ ^[0-9]+$ ]] || rss_kb=0
        if [[ "$rss_kb" -gt 0 ]]; then
            total_rss=$((total_rss + rss_kb))
            process_count=$((process_count + 1))
        fi
    done < <(jq -r '.active_jobs[]? | .pid // empty' "$STATE_FILE" 2>/dev/null || true)

    [[ "$process_count" -gt 0 ]] || return 0

    # Stay in KB until the very end: dividing to whole GB first would drop up
    # to 1 GB per worker before the headroom is applied (a 1.9 GB average would
    # come out as 2 GB instead of 3 GB).
    local avg_rss_kb=$(( total_rss / process_count ))
    local padded_kb=$(( avg_rss_kb * 5 / 4 ))                     # 125% headroom
    local learned_mem_gb=$(( (padded_kb + 1048575) / 1048576 ))   # round up to GB
    [[ "$learned_mem_gb" -lt 1 ]] && learned_mem_gb=1
    [[ "$learned_mem_gb" -gt 16 ]] && learned_mem_gb=16

    local tuning_file="$HOME/.shipwright/optimization/daemon-tuning.json"
    mkdir -p "$HOME/.shipwright/optimization"
    local tmp_tuning="${tuning_file}.tmp.$$"
    local update='.learned_worker_mem_gb = $mem | .last_rss_total_kb = $rss | .last_rss_process_count = $cnt'
    local persisted=false
    if [[ -s "$tuning_file" ]]; then
        if jq --argjson mem "$learned_mem_gb" --argjson rss "$total_rss" --argjson cnt "$process_count" \
            "$update" "$tuning_file" > "$tmp_tuning" 2>/dev/null; then
            persisted=true
        fi
    # No usable file yet: apply the same update to null input so a new file
    # carries the same keys as an updated one.
    elif jq -n --argjson mem "$learned_mem_gb" --argjson rss "$total_rss" --argjson cnt "$process_count" \
        "$update" > "$tmp_tuning" 2>/dev/null; then
        persisted=true
    fi
    if [[ "$persisted" == "true" ]]; then
        mv "$tmp_tuning" "$tuning_file"
    else
        rm -f "$tmp_tuning"
    fi

    WORKER_MEM_GB="$learned_mem_gb"
}
934
+
935
# Append a scaling outcome to the tuning file so optimal parallelism can be learned.
#
# Adds {parallelism, result, ts} to .scaling_history in
# ~/.shipwright/optimization/daemon-tuning.json and keeps only the newest 50
# entries. Persisting is best effort: when jq or mv fails the temp file is
# removed and the function still returns 0, so callers that do not check the
# status are not handed a failure.
#
# Globals:   ADAPTIVE_THRESHOLDS_ENABLED, HOME (read)
# Arguments: $1 - parallelism level in effect (passed to jq as JSON)
#            $2 - outcome label, e.g. "success" or "failure"
# Returns:   0
record_scaling_outcome() {
    local parallelism="$1" result="$2"

    [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]] || return 0

    local tuning_dir="$HOME/.shipwright/optimization"
    local tuning_file="${tuning_dir}/daemon-tuning.json"
    local tmp_tuning="${tuning_file}.tmp.$$"
    mkdir -p "$tuning_dir"

    # With -n the filter sees null; `.scaling_history // []` then starts a
    # fresh history, so one filter covers both the create and update cases.
    local -a jq_source=(-n)
    [[ -f "$tuning_file" ]] && jq_source=("$tuning_file")

    if jq --argjson p "$parallelism" --arg r "$result" --arg ts "$(now_iso)" '
            .scaling_history = ((.scaling_history // []) + [{parallelism: $p, result: $r, ts: $ts}]) |
            .scaling_history |= .[-50:]
        ' "${jq_source[@]}" > "$tmp_tuning" 2>/dev/null; then
        mv "$tmp_tuning" "$tuning_file" || rm -f "$tmp_tuning"
    else
        rm -f "$tmp_tuning"
    fi
    return 0
}
957
+
958
# Report the historical success percentage observed at one parallelism level.
#
# Looks at .scaling_history in ~/.shipwright/optimization/daemon-tuning.json and
# prints the floor of (successes * 100 / samples) for entries whose
# parallelism equals $1. Prints 100 when there is no tuning file, no matching
# sample, or jq fails.
#
# Arguments: $1 - parallelism level to look up (passed to jq as JSON)
# Outputs:   integer percentage on stdout
get_success_rate_at_parallelism() {
    local target_parallelism="$1"
    local history_file="$HOME/.shipwright/optimization/daemon-tuning.json"
    local pct

    if [[ -f "$history_file" ]]; then
        pct=$(jq -r --argjson p "$target_parallelism" '
            [.scaling_history // [] | .[] | select(.parallelism == $p)] |
            if length > 0 then
                ([.[] | select(.result == "success")] | length) * 100 / length | floor
            else 100 end
        ' "$history_file" 2>/dev/null || echo "100")
    else
        pct="100"
    fi

    echo "${pct:-100}"
}
978
+
979
# Adapt the patrol issue limit to how often the previous limit was reached.
#
# - findings >= max_issues : raise the limit by 2, capped at 20
# - findings == 0          : lower the limit by 1, but never below 3
# - otherwise              : keep the limit
# The result is written to ~/.shipwright/optimization/daemon-tuning.json as
# patrol_max_issues together with last_patrol_findings and patrol_adapted_at.
# Persisting is best effort: on failure the temp file is removed and the
# function still returns 0.
#
# Globals:   ADAPTIVE_THRESHOLDS_ENABLED, HOME (read)
# Arguments: $1 - number of findings in the last patrol (non-negative integer)
#            $2 - limit that was in effect (non-negative integer)
# Returns:   0; a no-op when disabled or when either argument is not an integer
adapt_patrol_limits() {
    local findings="$1" max_issues="$2"

    [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]] || return 0
    # Both values feed arithmetic comparisons and --argjson below
    [[ "$findings" =~ ^[0-9]+$ && "$max_issues" =~ ^[0-9]+$ ]] || return 0

    local tuning_dir="$HOME/.shipwright/optimization"
    local tuning_file="${tuning_dir}/daemon-tuning.json"
    mkdir -p "$tuning_dir"

    local new_max="$max_issues"
    if [[ "$findings" -ge "$max_issues" ]]; then
        # Consistently hitting the limit — increase
        new_max=$((max_issues + 2))
        [[ "$new_max" -gt 20 ]] && new_max=20
    elif [[ "$findings" -eq 0 ]]; then
        # Finds nothing — reduce, with a floor of 3
        if [[ "$max_issues" -gt 3 ]]; then
            new_max=$((max_issues - 1))
        else
            new_max=3
        fi
    fi

    local tmp_tuning="${tuning_file}.tmp.$$"
    # With -n the filter runs against null, which jq's assignment turns into a
    # fresh object, so the same filter serves both "update" and "create".
    local -a jq_source=(-n)
    [[ -f "$tuning_file" ]] && jq_source=("$tuning_file")

    if jq --argjson pm "$new_max" --argjson lf "$findings" --arg ts "$(now_iso)" \
        '.patrol_max_issues = $pm | .last_patrol_findings = $lf | .patrol_adapted_at = $ts' \
        "${jq_source[@]}" > "$tmp_tuning" 2>/dev/null; then
        mv "$tmp_tuning" "$tuning_file" || rm -f "$tmp_tuning"
    else
        rm -f "$tmp_tuning"
    fi
    return 0
}
1015
+
1016
# Load the learned patrol issue limit from the tuning file.
#
# Reads .patrol_max_issues from ~/.shipwright/optimization/daemon-tuning.json
# and, when it is a positive integer, assigns it to PATROL_MAX_ISSUES.
#
# Globals: ADAPTIVE_THRESHOLDS_ENABLED, HOME (read); PATROL_MAX_ISSUES (written)
# Returns: 0; a no-op when disabled, when the file is missing, or when the
#          stored value is not a positive integer
load_adaptive_patrol_limits() {
    [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]] || return 0

    local tuning_file="$HOME/.shipwright/optimization/daemon-tuning.json"
    [[ -f "$tuning_file" ]] || return 0

    local adaptive_max_issues
    adaptive_max_issues=$(jq -r '.patrol_max_issues // 0' "$tuning_file" 2>/dev/null || echo "0")

    # The value comes from a file: only plain digits may reach the arithmetic
    # comparison, because [[ -gt ]] evaluates any other string as an expression
    # (a variable name would be dereferenced and then stored verbatim).
    [[ "$adaptive_max_issues" =~ ^[0-9]+$ ]] || return 0

    if [[ "$adaptive_max_issues" -gt 0 ]]; then
        PATROL_MAX_ISSUES="$adaptive_max_issues"
    fi
    return 0
}
1033
+
1034
# Extract dependency issue references from free-form issue text.
#
# Recognizes "depends on #N", "blocked by #N" and "after #N" in any letter
# case (issue text commonly starts a sentence with "Depends on ...") and
# prints each referenced "#N" once, one per line, sorted.
#
# Arguments: $1 - issue text to scan
# Outputs:   unique "#N" references on stdout; nothing when there are none
# Returns:   0, also when nothing matches
extract_issue_dependencies() {
    local text="$1"

    # printf rather than echo: the text is arbitrary and may start with "-n"/"-e".
    # `|| true` keeps a no-match grep from failing the whole pipeline.
    printf '%s\n' "$text" \
        | grep -oiE '(depends on|blocked by|after) #[0-9]+' \
        | grep -oE '#[0-9]+' \
        | sort -u || true
}
379
1040
 
380
1041
  # ─── Logging ─────────────────────────────────────────────────────────────────
@@ -387,6 +1048,18 @@ daemon_log() {
387
1048
  ts=$(now_iso)
388
1049
  echo "[$ts] [$level] $msg" >> "$LOG_FILE"
389
1050
 
1051
+ # Rotate daemon.log if over 20MB (checked every ~100 writes)
1052
+ if [[ $(( RANDOM % 100 )) -eq 0 ]] && [[ -f "$LOG_FILE" ]]; then
1053
+ local log_size
1054
+ log_size=$(wc -c < "$LOG_FILE" 2>/dev/null || echo 0)
1055
+ if [[ "$log_size" -gt 20971520 ]]; then
1056
+ [[ -f "${LOG_FILE}.2" ]] && mv "${LOG_FILE}.2" "${LOG_FILE}.3"
1057
+ [[ -f "${LOG_FILE}.1" ]] && mv "${LOG_FILE}.1" "${LOG_FILE}.2"
1058
+ mv "$LOG_FILE" "${LOG_FILE}.1"
1059
+ touch "$LOG_FILE"
1060
+ fi
1061
+ fi
1062
+
390
1063
  # Also print to stdout
391
1064
  case "$level" in
392
1065
  INFO) info "$msg" ;;
@@ -431,6 +1104,40 @@ notify() {
431
1104
  fi
432
1105
  }
433
1106
 
1107
+ # ─── GitHub Rate-Limit Circuit Breaker ─────────────────────────────────────
1108
+ # Tracks consecutive GitHub API failures. If we hit too many failures in a row,
1109
+ # we back off exponentially to avoid hammering a rate-limited API.
1110
+
1111
# Circuit-breaker state shared by gh_rate_limited / gh_record_success / gh_record_failure.
GH_CONSECUTIVE_FAILURES=0   # GitHub API failures seen in a row
GH_BACKOFF_UNTIL=0          # epoch seconds — skip gh calls until this time

# Tell the caller whether GitHub API calls should currently be skipped.
# Globals: GH_BACKOFF_UNTIL (read)
# Returns: 0 while the back-off deadline lies in the future, 1 otherwise
gh_rate_limited() {
    local current_epoch
    current_epoch=$(now_epoch)

    [[ "$GH_BACKOFF_UNTIL" -gt "$current_epoch" ]] && return 0
    return 1
}
1123
+
1124
# Reset the GitHub circuit breaker after a successful API call.
# Globals: GH_BACKOFF_UNTIL, GH_CONSECUTIVE_FAILURES (written)
gh_record_success() {
    # Lift any pending back-off and forget the failure streak
    GH_BACKOFF_UNTIL=0
    GH_CONSECUTIVE_FAILURES=0
}
1128
+
1129
# Record one GitHub API failure and, from the third consecutive failure on,
# open the circuit breaker with an exponentially growing back-off.
#
# Back-off sequence: 30s, 60s, 120s, 240s, then 300s (5 min cap) for every
# further failure.
#
# Globals: GH_CONSECUTIVE_FAILURES (read/written), GH_BACKOFF_UNTIL (written)
gh_record_failure() {
    GH_CONSECUTIVE_FAILURES=$((GH_CONSECUTIVE_FAILURES + 1))
    [[ "$GH_CONSECUTIVE_FAILURES" -ge 3 ]] || return 0

    # Clamp the exponent before shifting: an unbounded shift overflows 64-bit
    # arithmetic during a long outage and yields a zero or negative back-off,
    # which would slip past the cap below and disable the breaker.
    # 30 << 4 = 480 already exceeds the cap, so 4 is a sufficient ceiling.
    local exponent=$((GH_CONSECUTIVE_FAILURES - 3))
    [[ "$exponent" -gt 4 ]] && exponent=4

    local backoff_secs=$((30 << exponent))
    [[ "$backoff_secs" -gt 300 ]] && backoff_secs=300

    GH_BACKOFF_UNTIL=$(( $(now_epoch) + backoff_secs ))
    daemon_log WARN "GitHub rate-limit circuit breaker: backing off ${backoff_secs}s after ${GH_CONSECUTIVE_FAILURES} failures"
    emit_event "daemon.rate_limit" "failures=$GH_CONSECUTIVE_FAILURES" "backoff_s=$backoff_secs"
}
1140
+
434
1141
  # ─── Pre-flight Checks ──────────────────────────────────────────────────────
435
1142
 
436
1143
  preflight_checks() {
@@ -490,10 +1197,10 @@ preflight_checks() {
490
1197
  fi
491
1198
 
492
1199
  # 4. Pipeline script
493
- if [[ -x "$SCRIPT_DIR/cct-pipeline.sh" ]]; then
494
- echo -e " ${GREEN}✓${RESET} cct-pipeline.sh available"
1200
+ if [[ -x "$SCRIPT_DIR/sw-pipeline.sh" ]]; then
1201
+ echo -e " ${GREEN}✓${RESET} sw-pipeline.sh available"
495
1202
  else
496
- echo -e " ${RED}✗${RESET} cct-pipeline.sh not found at $SCRIPT_DIR"
1203
+ echo -e " ${RED}✗${RESET} sw-pipeline.sh not found at $SCRIPT_DIR"
497
1204
  errors=$((errors + 1))
498
1205
  fi
499
1206
 
@@ -518,17 +1225,59 @@ preflight_checks() {
518
1225
 
519
1226
  # ─── State Management ───────────────────────────────────────────────────────
520
1227
 
1228
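+ # State file lock FD. NOTE(review): locked_state_update and the other locked helpers in this section redirect the lock file on a hard-coded FD 200 rather than reading this variable — confirm which one is intended.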
1229
+ STATE_LOCK_FD=7
1230
+
521
1231
  # Atomic write: write to tmp file, then mv (prevents corruption on crash)
522
1232
  atomic_write_state() {
523
1233
  local content="$1"
524
- local tmp_file="${STATE_FILE}.tmp.$$"
525
- echo "$content" > "$tmp_file"
526
- mv "$tmp_file" "$STATE_FILE"
1234
+ local tmp_file
1235
+ tmp_file=$(mktemp "${STATE_FILE}.tmp.XXXXXX") || {
1236
+ daemon_log ERROR "Failed to create temp file for state write"
1237
+ return 1
1238
+ }
1239
+ echo "$content" > "$tmp_file" || {
1240
+ daemon_log ERROR "Failed to write state to temp file"
1241
+ rm -f "$tmp_file"
1242
+ return 1
1243
+ }
1244
+ mv "$tmp_file" "$STATE_FILE" || {
1245
+ daemon_log ERROR "Failed to move temp state file into place"
1246
+ rm -f "$tmp_file"
1247
+ return 1
1248
+ }
1249
+ }
1250
+
1251
# Locked read-modify-write of the daemon state file (prevents TOCTOU races
# between concurrent updaters).
#
# Usage: locked_state_update '.queued += [42]'
#        locked_state_update --argjson num 42 '.queued += [$num]'
# Every argument is handed to jq in the given order, followed by $STATE_FILE,
# so jq options may precede the filter. The result replaces the state file via
# atomic_write_state while an exclusive lock on "${STATE_FILE}.lock" is held.
# When flock is not installed the update runs without a lock.
#
# Returns: 0 on success; non-zero when the lock cannot be taken within 5s,
#          jq fails or prints nothing, or the write fails. The state file is
#          left untouched in every failure case.
locked_state_update() {
    local jq_expr="$1"
    shift
    local lock_file="${STATE_FILE}.lock"
    (
        if command -v flock &>/dev/null; then
            if ! flock -w 5 200 2>/dev/null; then
                daemon_log ERROR "locked_state_update: lock acquisition timed out — aborting"
                exit 1
            fi
        fi

        # Capture stdout only: anything jq prints on stderr must never end up
        # inside the state file.
        local new_state jq_err
        if ! new_state=$(jq "$jq_expr" "$@" "$STATE_FILE" 2>/dev/null); then
            # Re-run just to obtain the diagnostic for the log
            jq_err=$(jq "$jq_expr" "$@" "$STATE_FILE" 2>&1 >/dev/null | head -1) || true
            daemon_log ERROR "locked_state_update: jq failed — ${jq_err}"
            exit 1
        fi

        # jq exits 0 with no output when the state file is empty; writing that
        # back would keep the state blank.
        if [[ -z "$new_state" ]]; then
            daemon_log ERROR "locked_state_update: jq produced no output — refusing to overwrite state"
            exit 1
        fi

        if ! atomic_write_state "$new_state"; then
            daemon_log ERROR "locked_state_update: atomic_write_state failed"
            exit 1
        fi
    ) 200>"$lock_file"
}
528
1276
 
529
1277
  init_state() {
530
1278
  if [[ ! -f "$STATE_FILE" ]]; then
531
- jq -n \
1279
+ local init_json
1280
+ init_json=$(jq -n \
532
1281
  --arg pid "$$" \
533
1282
  --arg started "$(now_iso)" \
534
1283
  --argjson interval "$POLL_INTERVAL" \
@@ -550,25 +1299,32 @@ init_state() {
550
1299
  queued: [],
551
1300
  completed: [],
552
1301
  retry_counts: {},
553
- priority_lane_active: []
554
- }' > "$STATE_FILE"
1302
+ priority_lane_active: [],
1303
+ titles: {}
1304
+ }')
1305
+ local lock_file="${STATE_FILE}.lock"
1306
+ (
1307
+ if command -v flock &>/dev/null; then
1308
+ flock -w 5 200 2>/dev/null || {
1309
+ daemon_log ERROR "init_state: lock acquisition timed out"
1310
+ return 1
1311
+ }
1312
+ fi
1313
+ atomic_write_state "$init_json"
1314
+ ) 200>"$lock_file"
555
1315
  else
556
1316
  # Update PID and start time in existing state
557
- local tmp
558
- tmp=$(jq \
1317
+ locked_state_update \
559
1318
  --arg pid "$$" \
560
1319
  --arg started "$(now_iso)" \
561
- '.pid = ($pid | tonumber) | .started_at = $started' \
562
- "$STATE_FILE")
563
- atomic_write_state "$tmp"
1320
+ '.pid = ($pid | tonumber) | .started_at = $started'
564
1321
  fi
565
1322
  }
566
1323
 
567
1324
  update_state_field() {
568
1325
  local field="$1" value="$2"
569
- local tmp
570
- tmp=$(jq --arg val "$value" ".${field} = \$val" "$STATE_FILE")
571
- atomic_write_state "$tmp"
1326
+ locked_state_update --arg field "$field" --arg val "$value" \
1327
+ '.[$field] = $val'
572
1328
  }
573
1329
 
574
1330
  # ─── Inflight Check ─────────────────────────────────────────────────────────
@@ -611,15 +1367,36 @@ get_active_count() {
611
1367
  jq -r '.active_jobs | length' "$STATE_FILE" 2>/dev/null || echo 0
612
1368
  }
613
1369
 
1370
# Race-safe active job count: takes the state lock before reading.
#
# Prints the length of .active_jobs from $STATE_FILE. Whenever the count cannot
# be determined reliably (lock timeout, jq failure, empty or non-numeric
# result) MAX_PARALLEL is printed instead, so callers treat the daemon as
# being at capacity rather than over-spawning. Prints 0 only when no state
# file exists.
#
# Globals: STATE_FILE, MAX_PARALLEL (read)
# Outputs: a non-negative integer on stdout
locked_get_active_count() {
    if [[ ! -f "$STATE_FILE" ]]; then
        echo 0
        return
    fi

    local lock_file="${STATE_FILE}.lock"
    local count
    count=$(
        (
            if command -v flock &>/dev/null; then
                flock -w 5 200 2>/dev/null || {
                    # Anything daemon_log prints on stdout would otherwise be
                    # captured as part of the count — send it to stderr.
                    daemon_log WARN "locked_get_active_count: lock timeout — returning MAX_PARALLEL as safe default" >&2
                    echo "$MAX_PARALLEL"
                    exit 0
                }
            fi
            jq -r '.active_jobs | length' "$STATE_FILE" 2>/dev/null || echo "$MAX_PARALLEL"
        ) 200>"$lock_file"
    )

    # Fail safe: anything other than a plain integer means "at capacity"
    [[ "$count" =~ ^[0-9]+$ ]] || count="$MAX_PARALLEL"
    echo "$count"
}
1393
+
614
1394
  # ─── Queue Management ───────────────────────────────────────────────────────
615
1395
 
616
1396
  enqueue_issue() {
617
1397
  local issue_num="$1"
618
- local tmp
619
- tmp=$(jq --argjson num "$issue_num" \
620
- '.queued += [$num] | .queued |= unique' \
621
- "$STATE_FILE")
622
- atomic_write_state "$tmp"
1398
+ locked_state_update --argjson num "$issue_num" \
1399
+ '.queued += [$num] | .queued |= unique'
623
1400
  daemon_log INFO "Queued issue #${issue_num} (at capacity)"
624
1401
  }
625
1402
 
@@ -631,10 +1408,8 @@ dequeue_next() {
631
1408
  local next
632
1409
  next=$(jq -r '.queued[0] // empty' "$STATE_FILE" 2>/dev/null || true)
633
1410
  if [[ -n "$next" ]]; then
634
- # Remove from queue
635
- local tmp
636
- tmp=$(jq '.queued = .queued[1:]' "$STATE_FILE")
637
- atomic_write_state "$tmp"
1411
+ # Remove from queue (locked to prevent race with enqueue)
1412
+ locked_state_update '.queued = .queued[1:]'
638
1413
  echo "$next"
639
1414
  fi
640
1415
  }
@@ -667,11 +1442,8 @@ get_priority_active_count() {
667
1442
 
668
1443
  track_priority_job() {
669
1444
  local issue_num="$1"
670
- local tmp
671
- tmp=$(jq --argjson num "$issue_num" \
672
- '.priority_lane_active = ((.priority_lane_active // []) + [$num] | unique)' \
673
- "$STATE_FILE")
674
- atomic_write_state "$tmp"
1445
+ locked_state_update --argjson num "$issue_num" \
1446
+ '.priority_lane_active = ((.priority_lane_active // []) + [$num] | unique)'
675
1447
  }
676
1448
 
677
1449
  untrack_priority_job() {
@@ -679,11 +1451,63 @@ untrack_priority_job() {
679
1451
  if [[ ! -f "$STATE_FILE" ]]; then
680
1452
  return
681
1453
  fi
682
- local tmp
683
- tmp=$(jq --argjson num "$issue_num" \
684
- '.priority_lane_active = [(.priority_lane_active // [])[] | select(. != $num)]' \
685
- "$STATE_FILE")
686
- atomic_write_state "$tmp"
1454
+ locked_state_update --argjson num "$issue_num" \
1455
+ '.priority_lane_active = [(.priority_lane_active // [])[] | select(. != $num)]'
1456
+ }
1457
+
1458
+ # ─── Distributed Issue Claiming ───────────────────────────────────────────
1459
+
1460
# Try to claim an issue for this machine so no other daemon works on it.
#
# 1. Ask the dashboard (POST ${DASHBOARD_URL}/api/claim); an explicit
#    approved=true / approved=false answer is final.
# 2. Without a usable dashboard answer, fall back to GitHub labels: refuse if
#    the issue already carries a "claimed:*" label, otherwise add
#    "claimed:<machine>".
# The function writes nothing to stdout, so it is safe to call where stdout
# is captured.
#
# Globals:   NO_GITHUB, DASHBOARD_URL (read)
# Arguments: $1 - issue number
#            $2 - name of the claiming machine
# Returns:   0 when the claim is granted (always in no-github mode),
#            1 when someone else holds the claim or the label cannot be added
claim_issue() {
    local issue_num="$1"
    local machine_name="$2"

    [[ "$NO_GITHUB" == "true" ]] && return 0  # No claiming in no-github mode

    # Try dashboard-coordinated claim first
    local payload resp
    payload=$(jq -n --argjson issue "$issue_num" --arg machine "$machine_name" \
        '{issue: $issue, machine: $machine}') || payload=""
    resp=$(curl -s --max-time 5 -X POST "${DASHBOARD_URL}/api/claim" \
        -H "Content-Type: application/json" \
        -d "$payload" 2>/dev/null || echo "")

    if [[ -n "$resp" ]]; then
        if echo "$resp" | jq -e '.approved == true' &>/dev/null; then
            return 0
        fi
        if echo "$resp" | jq -e '.approved == false' &>/dev/null; then
            local claimed_by
            claimed_by=$(echo "$resp" | jq -r '.claimed_by // "another machine"')
            daemon_log INFO "Issue #${issue_num} claimed by ${claimed_by} (via dashboard)"
            return 1
        fi
    fi

    # Fallback: direct GitHub label check (no usable dashboard answer)
    daemon_log WARN "Dashboard unreachable — falling back to direct GitHub label claim"
    local existing_claim
    existing_claim=$(gh issue view "$issue_num" --json labels --jq \
        '[.labels[].name | select(startswith("claimed:"))] | .[0] // ""' 2>/dev/null || true)

    if [[ -n "$existing_claim" ]]; then
        daemon_log INFO "Issue #${issue_num} already claimed: ${existing_claim}"
        return 1
    fi

    # gh prints the issue URL on success; discard it to keep stdout clean.
    # NOTE(review): check-then-label is not atomic across machines.
    gh issue edit "$issue_num" --add-label "claimed:${machine_name}" >/dev/null 2>&1 || return 1
    return 0
}
1496
+
1497
# Release this machine's claim on an issue (best effort, idempotent).
#
# Notifies the dashboard (POST ${DASHBOARD_URL}/api/claim/release) and also
# removes the "claimed:<machine>" label directly as a backup. Failures of
# either step are ignored, and neither the dashboard response body nor the
# URL printed by gh is allowed to reach stdout.
#
# Globals:   NO_GITHUB, DASHBOARD_URL (read)
# Arguments: $1 - issue number
#            $2 - name of the machine holding the claim
# Returns:   0
release_claim() {
    local issue_num="$1"
    local machine_name="$2"

    [[ "$NO_GITHUB" == "true" ]] && return 0

    # Try dashboard-coordinated release first
    local payload
    payload=$(jq -n --argjson issue "$issue_num" --arg machine "$machine_name" \
        '{issue: $issue, machine: $machine}') || payload=""
    curl -s --max-time 5 -X POST "${DASHBOARD_URL}/api/claim/release" \
        -H "Content-Type: application/json" \
        -d "$payload" >/dev/null 2>&1 || true

    # Also remove the label directly as backup (idempotent)
    gh issue edit "$issue_num" --remove-label "claimed:${machine_name}" >/dev/null 2>&1 || true
}
688
1512
 
689
1513
  # ─── Org-Wide Repo Management ─────────────────────────────────────────────
@@ -718,6 +1542,38 @@ daemon_spawn_pipeline() {
718
1542
 
719
1543
  daemon_log INFO "Spawning pipeline for issue #${issue_num}: ${issue_title}"
720
1544
 
1545
+ # Extract goal text from issue (title + first line of body)
1546
+ local issue_goal="$issue_title"
1547
+ if [[ "$NO_GITHUB" != "true" ]]; then
1548
+ local issue_body_first
1549
+ issue_body_first=$(gh issue view "$issue_num" --json body --jq '.body' 2>/dev/null | head -3 | tr '\n' ' ' | cut -c1-200 || true)
1550
+ if [[ -n "$issue_body_first" ]]; then
1551
+ issue_goal="${issue_title}: ${issue_body_first}"
1552
+ fi
1553
+ fi
1554
+
1555
+ # ── Predictive risk assessment (if enabled) ──
1556
+ if [[ "${PREDICTION_ENABLED:-false}" == "true" ]] && type predict_pipeline_risk &>/dev/null 2>&1; then
1557
+ local issue_json_for_pred=""
1558
+ if [[ "$NO_GITHUB" != "true" ]]; then
1559
+ issue_json_for_pred=$(gh issue view "$issue_num" --json number,title,body,labels 2>/dev/null || echo "")
1560
+ fi
1561
+ if [[ -n "$issue_json_for_pred" ]]; then
1562
+ local risk_result
1563
+ risk_result=$(predict_pipeline_risk "$issue_json_for_pred" "" 2>/dev/null || echo "")
1564
+ if [[ -n "$risk_result" ]]; then
1565
+ local overall_risk
1566
+ overall_risk=$(echo "$risk_result" | jq -r '.overall_risk // 50' 2>/dev/null || echo "50")
1567
+ if [[ "$overall_risk" -gt 80 ]]; then
1568
+ daemon_log WARN "HIGH RISK (${overall_risk}%) predicted for issue #${issue_num} — upgrading model"
1569
+ export CLAUDE_MODEL="opus"
1570
+ elif [[ "$overall_risk" -lt 30 ]]; then
1571
+ daemon_log INFO "LOW RISK (${overall_risk}%) predicted for issue #${issue_num}"
1572
+ fi
1573
+ fi
1574
+ fi
1575
+ fi
1576
+
721
1577
  # Check disk space before spawning
722
1578
  local free_space_kb
723
1579
  free_space_kb=$(df -k "." 2>/dev/null | tail -1 | awk '{print $4}')
@@ -747,13 +1603,23 @@ daemon_spawn_pipeline() {
747
1603
  # Standard mode: use git worktree
748
1604
  work_dir="${WORKTREE_DIR}/daemon-issue-${issue_num}"
749
1605
 
750
- # Clean up stale worktree if it exists
751
- if [[ -d "$work_dir" ]]; then
752
- git worktree remove "$work_dir" --force 2>/dev/null || true
753
- fi
754
- git branch -D "$branch_name" 2>/dev/null || true
1606
+ # Serialize worktree operations with a lock file (run in subshell to auto-close FD)
1607
+ mkdir -p "$WORKTREE_DIR"
1608
+ local wt_ok=0
1609
+ (
1610
+ flock -w 30 200 2>/dev/null || true
1611
+
1612
+ # Clean up stale worktree if it exists
1613
+ if [[ -d "$work_dir" ]]; then
1614
+ git worktree remove "$work_dir" --force 2>/dev/null || true
1615
+ fi
1616
+ git branch -D "$branch_name" 2>/dev/null || true
1617
+
1618
+ git worktree add "$work_dir" -b "$branch_name" "$BASE_BRANCH" 2>/dev/null
1619
+ ) 200>"${WORKTREE_DIR}/.worktree.lock"
1620
+ wt_ok=$?
755
1621
 
756
- if ! git worktree add "$work_dir" -b "$branch_name" "$BASE_BRANCH" 2>/dev/null; then
1622
+ if [[ $wt_ok -ne 0 ]]; then
757
1623
  daemon_log ERROR "Failed to create worktree for issue #${issue_num}"
758
1624
  return 1
759
1625
  fi
@@ -773,17 +1639,19 @@ daemon_spawn_pipeline() {
773
1639
  fi
774
1640
 
775
1641
  # Run pipeline in work directory (background)
1642
+ echo -e "\n\n===== Pipeline run $(date -u +%Y-%m-%dT%H:%M:%SZ) =====" >> "$LOG_DIR/issue-${issue_num}.log" 2>/dev/null || true
776
1643
  (
777
1644
  cd "$work_dir"
778
- "$SCRIPT_DIR/cct-pipeline.sh" "${pipeline_args[@]}"
779
- ) > "$LOG_DIR/issue-${issue_num}.log" 2>&1 &
1645
+ "$SCRIPT_DIR/sw-pipeline.sh" "${pipeline_args[@]}"
1646
+ ) >> "$LOG_DIR/issue-${issue_num}.log" 2>&1 200>&- &
780
1647
  local pid=$!
781
1648
 
782
1649
  daemon_log INFO "Pipeline started for issue #${issue_num} (PID: ${pid})"
783
1650
 
784
- # Track the job (include repo for org mode)
785
- daemon_track_job "$issue_num" "$pid" "$work_dir" "$issue_title" "$repo_full_name"
1651
+ # Track the job (include repo and goal for org mode)
1652
+ daemon_track_job "$issue_num" "$pid" "$work_dir" "$issue_title" "$repo_full_name" "$issue_goal"
786
1653
  emit_event "daemon.spawn" "issue=$issue_num" "pid=$pid" "repo=${repo_full_name:-local}"
1654
+ "$SCRIPT_DIR/sw-tracker.sh" notify "spawn" "$issue_num" 2>/dev/null || true
787
1655
 
788
1656
  # Comment on the issue
789
1657
  if [[ "$NO_GITHUB" != "true" ]]; then
@@ -791,9 +1659,9 @@ daemon_spawn_pipeline() {
791
1659
  if [[ -n "$repo_full_name" ]]; then
792
1660
  gh_args+=("--repo" "$repo_full_name")
793
1661
  fi
794
- gh issue comment "$issue_num" "${gh_args[@]}" --body "## 🤖 Pipeline Started
1662
+ gh issue comment "$issue_num" ${gh_args[@]+"${gh_args[@]}"} --body "## 🤖 Pipeline Started
795
1663
 
796
- **Daemon** picked up this issue and started an autonomous pipeline.
1664
+ **Delivering:** ${issue_title}
797
1665
 
798
1666
  | Field | Value |
799
1667
  |-------|-------|
@@ -802,32 +1670,31 @@ daemon_spawn_pipeline() {
802
1670
  | Repo | \`${repo_full_name:-local}\` |
803
1671
  | Started | $(now_iso) |
804
1672
 
805
- _Progress updates will be posted as the pipeline advances._" 2>/dev/null || true
1673
+ _Progress updates will appear below as the pipeline advances through each stage._" 2>/dev/null || true
806
1674
  fi
807
1675
  }
808
1676
 
809
1677
  # ─── Track Job ───────────────────────────────────────────────────────────────
810
1678
 
811
1679
  daemon_track_job() {
812
- local issue_num="$1" pid="$2" worktree="$3" title="${4:-}" repo="${5:-}"
813
- local tmp
814
- tmp=$(jq \
1680
+ local issue_num="$1" pid="$2" worktree="$3" title="${4:-}" repo="${5:-}" goal="${6:-}"
1681
+ locked_state_update \
815
1682
  --argjson num "$issue_num" \
816
1683
  --argjson pid "$pid" \
817
1684
  --arg wt "$worktree" \
818
1685
  --arg title "$title" \
819
1686
  --arg started "$(now_iso)" \
820
1687
  --arg repo "$repo" \
1688
+ --arg goal "$goal" \
821
1689
  '.active_jobs += [{
822
1690
  issue: $num,
823
1691
  pid: $pid,
824
1692
  worktree: $wt,
825
1693
  title: $title,
826
1694
  started_at: $started,
827
- repo: $repo
828
- }]' \
829
- "$STATE_FILE")
830
- atomic_write_state "$tmp"
1695
+ repo: $repo,
1696
+ goal: $goal
1697
+ }]'
831
1698
  }
832
1699
 
833
1700
  # ─── Reap Completed Jobs ────────────────────────────────────────────────────
@@ -843,11 +1710,17 @@ daemon_reap_completed() {
843
1710
  return
844
1711
  fi
845
1712
 
1713
+ local _retry_spawned_for=""
1714
+
846
1715
  while IFS= read -r job; do
847
1716
  local issue_num pid worktree
848
- issue_num=$(echo "$job" | jq -r '.issue')
849
- pid=$(echo "$job" | jq -r '.pid')
850
- worktree=$(echo "$job" | jq -r '.worktree')
1717
+ issue_num=$(echo "$job" | jq -r '.issue // empty')
1718
+ pid=$(echo "$job" | jq -r '.pid // empty')
1719
+ worktree=$(echo "$job" | jq -r '.worktree // empty')
1720
+
1721
+ # Skip malformed entries (corrupted state file)
1722
+ [[ -z "$issue_num" || ! "$issue_num" =~ ^[0-9]+$ ]] && continue
1723
+ [[ -z "$pid" || ! "$pid" =~ ^[0-9]+$ ]] && continue
851
1724
 
852
1725
  # Check if process is still running
853
1726
  if kill -0 "$pid" 2>/dev/null; then
@@ -855,13 +1728,30 @@ daemon_reap_completed() {
855
1728
  fi
856
1729
 
857
1730
  # Process is dead — determine exit code
1731
+ # Note: wait returns 127 if process was already reaped (e.g., by init)
1732
+ # In that case, check pipeline log for success/failure indicators
858
1733
  local exit_code=0
859
1734
  wait "$pid" 2>/dev/null || exit_code=$?
1735
+ if [[ "$exit_code" -eq 127 ]]; then
1736
+ # Process already reaped — check log file for real outcome
1737
+ local issue_log="$LOG_DIR/issue-${issue_num}.log"
1738
+ if [[ -f "$issue_log" ]]; then
1739
+ if grep -q "Pipeline completed successfully" "$issue_log" 2>/dev/null; then
1740
+ exit_code=0
1741
+ elif grep -q "Pipeline failed\|ERROR.*stage.*failed\|exited with status" "$issue_log" 2>/dev/null; then
1742
+ exit_code=1
1743
+ else
1744
+ daemon_log WARN "Could not determine exit code for issue #${issue_num} (PID ${pid} already reaped) — marking as failure"
1745
+ exit_code=1
1746
+ fi
1747
+ else
1748
+ exit_code=1
1749
+ fi
1750
+ fi
860
1751
 
861
- local started_at duration_str=""
1752
+ local started_at duration_str="" start_epoch=0 end_epoch=0
862
1753
  started_at=$(echo "$job" | jq -r '.started_at // empty')
863
1754
  if [[ -n "$started_at" ]]; then
864
- local start_epoch end_epoch
865
1755
  # macOS date -j for parsing ISO dates (TZ=UTC to parse Z-suffix correctly)
866
1756
  start_epoch=$(TZ=UTC date -j -f "%Y-%m-%dT%H:%M:%SZ" "$started_at" +%s 2>/dev/null || date -d "$started_at" +%s 2>/dev/null || echo "0")
867
1757
  end_epoch=$(now_epoch)
@@ -882,31 +1772,52 @@ daemon_reap_completed() {
882
1772
  daemon_on_failure "$issue_num" "$exit_code" "$duration_str"
883
1773
  fi
884
1774
 
885
- # Remove from active_jobs and priority lane tracking
886
- local tmp
887
- tmp=$(jq --argjson num "$issue_num" \
888
- '.active_jobs = [.active_jobs[] | select(.issue != $num)]' \
889
- "$STATE_FILE")
890
- atomic_write_state "$tmp"
891
- untrack_priority_job "$issue_num"
892
-
893
- # Clean up worktree (skip for org-mode clones they persist)
894
- local job_repo
895
- job_repo=$(echo "$job" | jq -r '.repo // ""')
896
- if [[ -z "$job_repo" ]] && [[ -d "$worktree" ]]; then
897
- git worktree remove "$worktree" --force 2>/dev/null || true
898
- daemon_log INFO "Cleaned worktree: $worktree"
899
- git branch -D "daemon/issue-${issue_num}" 2>/dev/null || true
900
- elif [[ -n "$job_repo" ]]; then
901
- daemon_log INFO "Org-mode: preserving clone for ${job_repo}"
902
- fi
903
-
904
- # Dequeue next issue if available
905
- local next_issue
906
- next_issue=$(dequeue_next)
907
- if [[ -n "$next_issue" ]]; then
908
- daemon_log INFO "Dequeuing issue #${next_issue}"
909
- daemon_spawn_pipeline "$next_issue"
1775
+ # Clean up progress tracking for this job
1776
+ daemon_clear_progress "$issue_num"
1777
+
1778
+ # Release claim lock (label-based coordination)
1779
+ local reap_machine_name
1780
+ reap_machine_name=$(jq -r '.machines[] | select(.role == "primary") | .name' "$HOME/.shipwright/machines.json" 2>/dev/null || hostname -s)
1781
+ release_claim "$issue_num" "$reap_machine_name"
1782
+
1783
+ # Skip cleanup if a retry was just spawned for this issue
1784
+ if [[ "$_retry_spawned_for" == "$issue_num" ]]; then
1785
+ daemon_log INFO "Retry spawned for issue #${issue_num} skipping worktree cleanup"
1786
+ else
1787
+ # Remove from active_jobs and priority lane tracking (locked)
1788
+ locked_state_update --argjson num "$issue_num" \
1789
+ '.active_jobs = [.active_jobs[] | select(.issue != $num)]'
1790
+ untrack_priority_job "$issue_num"
1791
+
1792
+ # Clean up worktree (skip for org-mode clones — they persist)
1793
+ local job_repo
1794
+ job_repo=$(echo "$job" | jq -r '.repo // ""')
1795
+ if [[ -z "$job_repo" ]] && [[ -d "$worktree" ]]; then
1796
+ git worktree remove "$worktree" --force 2>/dev/null || true
1797
+ daemon_log INFO "Cleaned worktree: $worktree"
1798
+ git branch -D "daemon/issue-${issue_num}" 2>/dev/null || true
1799
+ elif [[ -n "$job_repo" ]]; then
1800
+ daemon_log INFO "Org-mode: preserving clone for ${job_repo}"
1801
+ fi
1802
+ fi
1803
+
1804
+ # Dequeue next issue if available AND we have capacity
1805
+ # NOTE: locked_get_active_count prevents TOCTOU race with the
1806
+ # active_jobs removal above. A tiny window remains between
1807
+ # the count read and dequeue_next's own lock acquisition, but
1808
+ # dequeue_next is itself locked, so the worst case is a
1809
+ # missed dequeue that the next poll cycle will pick up.
1810
+ local current_active
1811
+ current_active=$(locked_get_active_count)
1812
+ if [[ "$current_active" -lt "$MAX_PARALLEL" ]]; then
1813
+ local next_issue
1814
+ next_issue=$(dequeue_next)
1815
+ if [[ -n "$next_issue" ]]; then
1816
+ local next_title
1817
+ next_title=$(jq -r --arg n "$next_issue" '.titles[$n] // ""' "$STATE_FILE" 2>/dev/null || true)
1818
+ daemon_log INFO "Dequeuing issue #${next_issue}: ${next_title}"
1819
+ daemon_spawn_pipeline "$next_issue" "$next_title"
1820
+ fi
910
1821
  fi
911
1822
  done <<< "$jobs"
912
1823
  }
@@ -918,9 +1829,23 @@ daemon_on_success() {
918
1829
 
919
1830
  daemon_log SUCCESS "Pipeline completed for issue #${issue_num} (${duration:-unknown})"
920
1831
 
921
- # Record in completed list
922
- local tmp
923
- tmp=$(jq \
1832
+ # Record pipeline duration for adaptive threshold learning
1833
+ if [[ -n "$duration" && "$duration" != "unknown" ]]; then
1834
+ # Parse duration string back to seconds (e.g. "5m 30s" → 330)
1835
+ local dur_secs=0
1836
+ local _h _m _s
1837
+ _h=$(echo "$duration" | grep -oE '[0-9]+h' | grep -oE '[0-9]+' || true)
1838
+ _m=$(echo "$duration" | grep -oE '[0-9]+m' | grep -oE '[0-9]+' || true)
1839
+ _s=$(echo "$duration" | grep -oE '[0-9]+s' | grep -oE '[0-9]+' || true)
1840
+ dur_secs=$(( ${_h:-0} * 3600 + ${_m:-0} * 60 + ${_s:-0} ))
1841
+ if [[ "$dur_secs" -gt 0 ]]; then
1842
+ record_pipeline_duration "$PIPELINE_TEMPLATE" "$dur_secs" "success"
1843
+ record_scaling_outcome "$MAX_PARALLEL" "success"
1844
+ fi
1845
+ fi
1846
+
1847
+ # Record in completed list + clear retry count for this issue
1848
+ locked_state_update \
924
1849
  --argjson num "$issue_num" \
925
1850
  --arg result "success" \
926
1851
  --arg dur "${duration:-unknown}" \
@@ -930,9 +1855,8 @@ daemon_on_success() {
930
1855
  result: $result,
931
1856
  duration: $dur,
932
1857
  completed_at: $completed_at
933
- }]' \
934
- "$STATE_FILE")
935
- atomic_write_state "$tmp"
1858
+ }] | .completed = .completed[-500:]
1859
+ | del(.retry_counts[($num | tostring)])'
936
1860
 
937
1861
  if [[ "$NO_GITHUB" != "true" ]]; then
938
1862
  # Remove watch label, add success label
@@ -960,6 +1884,7 @@ Check the associated PR for the implementation." 2>/dev/null || true
960
1884
 
961
1885
  notify "Pipeline Complete — Issue #${issue_num}" \
962
1886
  "Duration: ${duration:-unknown}" "success"
1887
+ "$SCRIPT_DIR/sw-tracker.sh" notify "completed" "$issue_num" 2>/dev/null || true
963
1888
  }
964
1889
 
965
1890
  # ─── Failure Handler ────────────────────────────────────────────────────────
@@ -969,9 +1894,22 @@ daemon_on_failure() {
969
1894
 
970
1895
  daemon_log ERROR "Pipeline failed for issue #${issue_num} (exit: ${exit_code}, ${duration:-unknown})"
971
1896
 
1897
+ # Record pipeline duration for adaptive threshold learning
1898
+ if [[ -n "$duration" && "$duration" != "unknown" ]]; then
1899
+ local dur_secs=0
1900
+ local _h _m _s
1901
+ _h=$(echo "$duration" | grep -oE '[0-9]+h' | grep -oE '[0-9]+' || true)
1902
+ _m=$(echo "$duration" | grep -oE '[0-9]+m' | grep -oE '[0-9]+' || true)
1903
+ _s=$(echo "$duration" | grep -oE '[0-9]+s' | grep -oE '[0-9]+' || true)
1904
+ dur_secs=$(( ${_h:-0} * 3600 + ${_m:-0} * 60 + ${_s:-0} ))
1905
+ if [[ "$dur_secs" -gt 0 ]]; then
1906
+ record_pipeline_duration "$PIPELINE_TEMPLATE" "$dur_secs" "failure"
1907
+ record_scaling_outcome "$MAX_PARALLEL" "failure"
1908
+ fi
1909
+ fi
1910
+
972
1911
  # Record in completed list
973
- local tmp
974
- tmp=$(jq \
1912
+ locked_state_update \
975
1913
  --argjson num "$issue_num" \
976
1914
  --arg result "failed" \
977
1915
  --argjson code "$exit_code" \
@@ -983,9 +1921,7 @@ daemon_on_failure() {
983
1921
  exit_code: $code,
984
1922
  duration: $dur,
985
1923
  completed_at: $completed_at
986
- }]' \
987
- "$STATE_FILE")
988
- atomic_write_state "$tmp"
1924
+ }] | .completed = .completed[-500:]'
989
1925
 
990
1926
  # ── Auto-retry with strategy escalation ──
991
1927
  if [[ "${RETRY_ESCALATION:-true}" == "true" ]]; then
@@ -996,15 +1932,32 @@ daemon_on_failure() {
996
1932
  if [[ "$retry_count" -lt "${MAX_RETRIES:-2}" ]]; then
997
1933
  retry_count=$((retry_count + 1))
998
1934
 
999
- # Update retry count in state
1000
- local tmp_state
1001
- tmp_state=$(jq --arg num "$issue_num" --argjson count "$retry_count" \
1002
- '.retry_counts[$num] = $count' "$STATE_FILE")
1003
- atomic_write_state "$tmp_state"
1935
+ # Update retry count in state (locked to prevent race)
1936
+ locked_state_update \
1937
+ --arg num "$issue_num" --argjson count "$retry_count" \
1938
+ '.retry_counts[$num] = $count'
1004
1939
 
1005
1940
  daemon_log WARN "Auto-retry #${retry_count}/${MAX_RETRIES:-2} for issue #${issue_num}"
1006
1941
  emit_event "daemon.retry" "issue=$issue_num" "retry=$retry_count" "max=${MAX_RETRIES:-2}"
1007
1942
 
1943
+ # Check for checkpoint to enable resume-from-checkpoint
1944
+ local checkpoint_args=()
1945
+ if [[ "${CHECKPOINT_ENABLED:-true}" == "true" ]]; then
1946
+ # Try to find worktree for this issue to check for checkpoints
1947
+ local issue_worktree="${REPO_DIR}/.worktrees/daemon-issue-${issue_num}"
1948
+ if [[ -d "$issue_worktree/.claude/pipeline-artifacts/checkpoints" ]]; then
1949
+ local latest_checkpoint=""
1950
+ for cp_file in "$issue_worktree/.claude/pipeline-artifacts/checkpoints"/*-checkpoint.json; do
1951
+ [[ -f "$cp_file" ]] && latest_checkpoint="$cp_file"
1952
+ done
1953
+ if [[ -n "$latest_checkpoint" ]]; then
1954
+ daemon_log INFO "Found checkpoint: $latest_checkpoint"
1955
+ emit_event "daemon.recovery" "issue=$issue_num" "checkpoint=$latest_checkpoint"
1956
+ checkpoint_args+=("--resume")
1957
+ fi
1958
+ fi
1959
+ fi
1960
+
1008
1961
  # Build escalated pipeline args
1009
1962
  local retry_template="$PIPELINE_TEMPLATE"
1010
1963
  local retry_model="${MODEL:-opus}"
@@ -1038,12 +1991,18 @@ Pipeline failed — retrying with escalated strategy.
1038
1991
  _Escalation: $(if [[ "$retry_count" -eq 1 ]]; then echo "upgraded model + increased iterations"; else echo "full template + compound quality"; fi)_" 2>/dev/null || true
1039
1992
  fi
1040
1993
 
1994
+ # Backoff before retry: 30s * retry_count (30s, 60s, ...)
1995
+ local backoff_secs=$((30 * retry_count))
1996
+ daemon_log INFO "Waiting ${backoff_secs}s before retry #${retry_count}"
1997
+ sleep "$backoff_secs"
1998
+
1041
1999
  # Re-spawn with escalated strategy
1042
2000
  local orig_template="$PIPELINE_TEMPLATE"
1043
2001
  local orig_model="$MODEL"
1044
2002
  PIPELINE_TEMPLATE="$retry_template"
1045
2003
  MODEL="$retry_model"
1046
2004
  daemon_spawn_pipeline "$issue_num" "retry-${retry_count}"
2005
+ _retry_spawned_for="$issue_num"
1047
2006
  PIPELINE_TEMPLATE="$orig_template"
1048
2007
  MODEL="$orig_model"
1049
2008
  return
@@ -1099,18 +2058,66 @@ _Re-add the \`${WATCH_LABEL}\` label to retry._" 2>/dev/null || true
1099
2058
 
1100
2059
  notify "Pipeline Failed — Issue #${issue_num}" \
1101
2060
  "Exit code: ${exit_code}, Duration: ${duration:-unknown}" "error"
2061
+ "$SCRIPT_DIR/sw-tracker.sh" notify "failed" "$issue_num" "Exit code: ${exit_code}, Duration: ${duration:-unknown}" 2>/dev/null || true
1102
2062
  }
1103
2063
 
1104
2064
  # ─── Intelligent Triage ──────────────────────────────────────────────────────
1105
2065
 
1106
2066
  # Score an issue from 0-100 based on multiple signals for intelligent prioritization.
1107
2067
  # Combines priority labels, age, complexity, dependencies, type, and memory signals.
2068
+ # When intelligence engine is enabled, uses semantic AI analysis for richer scoring.
1108
2069
  triage_score_issue() {
1109
2070
  local issue_json="$1"
1110
2071
  local issue_num issue_title issue_body labels_csv created_at
1111
2072
  issue_num=$(echo "$issue_json" | jq -r '.number')
1112
2073
  issue_title=$(echo "$issue_json" | jq -r '.title // ""')
1113
2074
  issue_body=$(echo "$issue_json" | jq -r '.body // ""')
2075
+
2076
+ # ── Intelligence-powered triage (if enabled) ──
2077
+ if [[ "${INTELLIGENCE_ENABLED:-false}" == "true" ]] && type intelligence_analyze_issue &>/dev/null 2>&1; then
2078
+ daemon_log INFO "Intelligence: using AI triage (intelligence enabled)"
2079
+ local analysis
2080
+ analysis=$(intelligence_analyze_issue "$issue_json" 2>/dev/null || echo "")
2081
+ if [[ -n "$analysis" && "$analysis" != "{}" && "$analysis" != "null" ]]; then
2082
+ # Extract complexity (1-10) and convert to score (0-100)
2083
+ local ai_complexity ai_risk ai_success_prob
2084
+ ai_complexity=$(echo "$analysis" | jq -r '.complexity // 0' 2>/dev/null || echo "0")
2085
+ ai_risk=$(echo "$analysis" | jq -r '.risk_level // "medium"' 2>/dev/null || echo "medium")
2086
+ ai_success_prob=$(echo "$analysis" | jq -r '.success_probability // 50' 2>/dev/null || echo "50")
2087
+
2088
+ # Store analysis for downstream use (composer, predictions)
2089
+ export INTELLIGENCE_ANALYSIS="$analysis"
2090
+ export INTELLIGENCE_COMPLEXITY="$ai_complexity"
2091
+
2092
+ # Convert AI analysis to triage score:
2093
+ # Higher success probability + lower complexity = higher score (process sooner)
2094
+ local ai_score
2095
+ ai_score=$(( ai_success_prob - (ai_complexity * 3) ))
2096
+ # Risk adjustment
2097
+ case "$ai_risk" in
2098
+ critical) ai_score=$((ai_score + 15)) ;; # Critical = process urgently
2099
+ high) ai_score=$((ai_score + 10)) ;;
2100
+ low) ai_score=$((ai_score - 5)) ;;
2101
+ esac
2102
+ # Clamp
2103
+ [[ "$ai_score" -lt 0 ]] && ai_score=0
2104
+ [[ "$ai_score" -gt 100 ]] && ai_score=100
2105
+
2106
+ emit_event "intelligence.triage" \
2107
+ "issue=$issue_num" \
2108
+ "complexity=$ai_complexity" \
2109
+ "risk=$ai_risk" \
2110
+ "success_prob=$ai_success_prob" \
2111
+ "score=$ai_score"
2112
+
2113
+ echo "$ai_score"
2114
+ return
2115
+ fi
2116
+ # Fall through to heuristic scoring if intelligence call failed
2117
+ daemon_log INFO "Intelligence: AI triage failed, falling back to heuristic scoring"
2118
+ else
2119
+ daemon_log INFO "Intelligence: using heuristic triage (intelligence disabled, enable with intelligence.enabled=true)"
2120
+ fi
1114
2121
  labels_csv=$(echo "$issue_json" | jq -r '[.labels[].name] | join(",")')
1115
2122
  created_at=$(echo "$issue_json" | jq -r '.createdAt // ""')
1116
2123
 
@@ -1211,9 +2218,9 @@ triage_score_issue() {
1211
2218
 
1212
2219
  # ── 6. Memory bonus (0-10 points / -5 for prior failures) ──
1213
2220
  local memory_score=0
1214
- if [[ -x "$SCRIPT_DIR/cct-memory.sh" ]]; then
2221
+ if [[ -x "$SCRIPT_DIR/sw-memory.sh" ]]; then
1215
2222
  local memory_result
1216
- memory_result=$("$SCRIPT_DIR/cct-memory.sh" search --issue "$issue_num" --json 2>/dev/null || true)
2223
+ memory_result=$("$SCRIPT_DIR/sw-memory.sh" search --issue "$issue_num" --json 2>/dev/null || true)
1217
2224
  if [[ -n "$memory_result" ]]; then
1218
2225
  local prior_result
1219
2226
  prior_result=$(echo "$memory_result" | jq -r '.last_result // ""' 2>/dev/null || true)
@@ -1245,6 +2252,7 @@ triage_score_issue() {
1245
2252
  }
1246
2253
 
1247
2254
  # Auto-select pipeline template based on issue labels
2255
+ # When intelligence/composer is enabled, composes a custom pipeline instead of static selection.
1248
2256
  select_pipeline_template() {
1249
2257
  local labels="$1"
1250
2258
  local score="${2:-50}"
@@ -1255,7 +2263,57 @@ select_pipeline_template() {
1255
2263
  return
1256
2264
  fi
1257
2265
 
1258
- # ── Label-based overrides (highest priority) ──
2266
+ # ── Intelligence-composed pipeline (if enabled) ──
2267
+ if [[ "${COMPOSER_ENABLED:-false}" == "true" ]] && type composer_create_pipeline &>/dev/null 2>&1; then
2268
+ daemon_log INFO "Intelligence: using AI pipeline composition (composer enabled)"
2269
+ local analysis="${INTELLIGENCE_ANALYSIS:-{}}"
2270
+ local repo_context=""
2271
+ if [[ -f "${REPO_DIR:-}/.claude/pipeline-state.md" ]]; then
2272
+ repo_context="has_pipeline_state"
2273
+ fi
2274
+ local budget_json="{}"
2275
+ if [[ -x "$SCRIPT_DIR/sw-cost.sh" ]]; then
2276
+ local remaining
2277
+ remaining=$(bash "$SCRIPT_DIR/sw-cost.sh" remaining-budget 2>/dev/null || echo "")
2278
+ if [[ -n "$remaining" ]]; then
2279
+ budget_json="{\"remaining_usd\": $remaining}"
2280
+ fi
2281
+ fi
2282
+ local composed_path
2283
+ composed_path=$(composer_create_pipeline "$analysis" "$repo_context" "$budget_json" 2>/dev/null || echo "")
2284
+ if [[ -n "$composed_path" && -f "$composed_path" ]]; then
2285
+ emit_event "daemon.composed_pipeline" "labels=$labels" "score=$score"
2286
+ echo "composed"
2287
+ return
2288
+ fi
2289
+ # Fall through to static selection if composition failed
2290
+ daemon_log INFO "Intelligence: AI pipeline composition failed, falling back to static template selection"
2291
+ else
2292
+ daemon_log INFO "Intelligence: using static template selection (composer disabled, enable with intelligence.composer_enabled=true)"
2293
+ fi
2294
+
2295
+ # ── Branch protection escalation (highest priority) ──
2296
+ if type gh_branch_protection &>/dev/null 2>&1 && [[ "${NO_GITHUB:-false}" != "true" ]]; then
2297
+ if type _gh_detect_repo &>/dev/null 2>&1; then
2298
+ _gh_detect_repo 2>/dev/null || true
2299
+ fi
2300
+ local gh_owner="${GH_OWNER:-}" gh_repo="${GH_REPO:-}"
2301
+ if [[ -n "$gh_owner" && -n "$gh_repo" ]]; then
2302
+ local protection
2303
+ protection=$(gh_branch_protection "$gh_owner" "$gh_repo" "${BASE_BRANCH:-main}" 2>/dev/null || echo '{"protected": false}')
2304
+ local strict_protection
2305
+ strict_protection=$(echo "$protection" | jq -r '.enforce_admins.enabled // false' 2>/dev/null || echo "false")
2306
+ local required_reviews
2307
+ required_reviews=$(echo "$protection" | jq -r '.required_pull_request_reviews.required_approving_review_count // 0' 2>/dev/null || echo "0")
2308
+ if [[ "$strict_protection" == "true" ]] || [[ "${required_reviews:-0}" -gt 1 ]]; then
2309
+ daemon_log INFO "Branch has strict protection — escalating to enterprise template"
2310
+ echo "enterprise"
2311
+ return
2312
+ fi
2313
+ fi
2314
+ fi
2315
+
2316
+ # ── Label-based overrides ──
1259
2317
  if echo "$labels" | grep -qi "hotfix\|incident"; then
1260
2318
  echo "hotfix"
1261
2319
  return
@@ -1363,6 +2421,16 @@ daemon_triage_show() {
1363
2421
  echo ""
1364
2422
  }
1365
2423
 
2424
+ # ─── Patrol Self-Labeling ─────────────────────────────────────────────────
2425
# Build the comma-separated label list attached to a patrol-created issue.
# Globals:   PATROL_LABEL (read), PATROL_AUTO_WATCH (read), WATCH_LABEL (read)
# Arguments: $1 - label naming the patrol check (e.g. "security")
# Outputs:   "<PATROL_LABEL>,<check label>[,<WATCH_LABEL>]" on stdout
patrol_build_labels() {
  local check_label="$1"
  local labels="${PATROL_LABEL},${check_label}"

  # The watch label is appended only when auto-watch is switched on AND a
  # watch label is configured. The toggle is defaulted the same way
  # WATCH_LABEL is, so an unset PATROL_AUTO_WATCH simply means "off" instead
  # of raising an unbound-variable error under `set -u`.
  if [[ "${PATROL_AUTO_WATCH:-false}" == "true" && -n "${WATCH_LABEL:-}" ]]; then
    labels="${labels},${WATCH_LABEL}"
  fi

  printf '%s\n' "$labels"
}
2433
+
1366
2434
  # ─── Proactive Patrol Mode ───────────────────────────────────────────────────
1367
2435
 
1368
2436
  daemon_patrol() {
@@ -1413,7 +2481,7 @@ daemon_patrol() {
1413
2481
  fi
1414
2482
 
1415
2483
  findings=$((findings + 1))
1416
- emit_event "patrol.finding" "type=security" "severity=$severity" "package=$name"
2484
+ emit_event "patrol.finding" "check=security" "severity=$severity" "package=$name"
1417
2485
 
1418
2486
  # Check if issue already exists
1419
2487
  if [[ "$NO_GITHUB" != "true" ]] && [[ "$dry_run" != "true" ]]; then
@@ -1434,9 +2502,9 @@ daemon_patrol() {
1434
2502
  | Date | $(now_iso) |
1435
2503
 
1436
2504
  Auto-detected by \`shipwright daemon patrol\`." \
1437
- --label "security" --label "$PATROL_LABEL" 2>/dev/null || true
2505
+ --label "$(patrol_build_labels "security")" 2>/dev/null || true
1438
2506
  issues_created=$((issues_created + 1))
1439
- emit_event "patrol.issue_created" "type=security" "package=$name"
2507
+ emit_event "patrol.issue_created" "check=security" "package=$name"
1440
2508
  fi
1441
2509
  else
1442
2510
  echo -e " ${RED}●${RESET} ${BOLD}${severity}${RESET}: ${title} in ${CYAN}${name}${RESET}"
@@ -1467,6 +2535,39 @@ Auto-detected by \`shipwright daemon patrol\`." \
1467
2535
  fi
1468
2536
  fi
1469
2537
 
2538
+ # Enrich with GitHub security alerts
2539
+ if type gh_security_alerts &>/dev/null 2>&1 && [[ "${NO_GITHUB:-false}" != "true" ]]; then
2540
+ if type _gh_detect_repo &>/dev/null 2>&1; then
2541
+ _gh_detect_repo 2>/dev/null || true
2542
+ fi
2543
+ local gh_owner="${GH_OWNER:-}" gh_repo="${GH_REPO:-}"
2544
+ if [[ -n "$gh_owner" && -n "$gh_repo" ]]; then
2545
+ local gh_alerts
2546
+ gh_alerts=$(gh_security_alerts "$gh_owner" "$gh_repo" 2>/dev/null || echo "[]")
2547
+ local gh_alert_count
2548
+ gh_alert_count=$(echo "$gh_alerts" | jq 'length' 2>/dev/null || echo "0")
2549
+ if [[ "${gh_alert_count:-0}" -gt 0 ]]; then
2550
+ daemon_log WARN "Patrol: $gh_alert_count GitHub security alert(s) found"
2551
+ findings=$((findings + gh_alert_count))
2552
+ fi
2553
+ fi
2554
+ fi
2555
+
2556
+ # Enrich with GitHub Dependabot alerts
2557
+ if type gh_dependabot_alerts &>/dev/null 2>&1 && [[ "${NO_GITHUB:-false}" != "true" ]]; then
2558
+ local gh_owner="${GH_OWNER:-}" gh_repo="${GH_REPO:-}"
2559
+ if [[ -n "$gh_owner" && -n "$gh_repo" ]]; then
2560
+ local dep_alerts
2561
+ dep_alerts=$(gh_dependabot_alerts "$gh_owner" "$gh_repo" 2>/dev/null || echo "[]")
2562
+ local dep_alert_count
2563
+ dep_alert_count=$(echo "$dep_alerts" | jq 'length' 2>/dev/null || echo "0")
2564
+ if [[ "${dep_alert_count:-0}" -gt 0 ]]; then
2565
+ daemon_log WARN "Patrol: $dep_alert_count Dependabot alert(s) found"
2566
+ findings=$((findings + dep_alert_count))
2567
+ fi
2568
+ fi
2569
+ fi
2570
+
1470
2571
  total_findings=$((total_findings + findings))
1471
2572
  if [[ "$findings" -gt 0 ]]; then
1472
2573
  daemon_log INFO "Patrol: found ${findings} security vulnerability(ies)"
@@ -1499,7 +2600,7 @@ Auto-detected by \`shipwright daemon patrol\`." \
1499
2600
  if [[ "$diff" -ge 2 ]]; then
1500
2601
  findings=$((findings + 1))
1501
2602
  stale_packages="${stale_packages}\n- \`${name}\`: ${current} → ${latest} (${diff} major versions behind)"
1502
- emit_event "patrol.finding" "type=stale_dependency" "package=$name" "current=$current" "latest=$latest"
2603
+ emit_event "patrol.finding" "check=stale_dependency" "package=$name" "current=$current" "latest=$latest"
1503
2604
 
1504
2605
  if [[ "$dry_run" == "true" ]] || [[ "$NO_GITHUB" == "true" ]]; then
1505
2606
  echo -e " ${YELLOW}●${RESET} ${CYAN}${name}${RESET}: ${current} → ${latest} (${diff} major versions behind)"
@@ -1522,9 +2623,9 @@ The following packages are 2+ major versions behind:
1522
2623
  $(echo -e "$stale_packages")
1523
2624
 
1524
2625
  Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
1525
- --label "dependencies" --label "$PATROL_LABEL" 2>/dev/null || true
2626
+ --label "$(patrol_build_labels "dependencies")" 2>/dev/null || true
1526
2627
  issues_created=$((issues_created + 1))
1527
- emit_event "patrol.issue_created" "type=stale_dependency" "count=$findings"
2628
+ emit_event "patrol.issue_created" "check=stale_dependency" "count=$findings"
1528
2629
  fi
1529
2630
  fi
1530
2631
  fi
@@ -1586,9 +2687,9 @@ $(echo -e "$dead_files")
1586
2687
  > **Note:** Some files may be entry points or dynamically loaded. Verify before removing.
1587
2688
 
1588
2689
  Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
1589
- --label "tech-debt" --label "$PATROL_LABEL" 2>/dev/null || true
2690
+ --label "$(patrol_build_labels "tech-debt")" 2>/dev/null || true
1590
2691
  issues_created=$((issues_created + 1))
1591
- emit_event "patrol.issue_created" "type=dead_code" "count=$findings"
2692
+ emit_event "patrol.issue_created" "check=dead_code" "count=$findings"
1592
2693
  fi
1593
2694
  fi
1594
2695
 
@@ -1649,9 +2750,9 @@ These files have < 50% line coverage:
1649
2750
  $(echo -e "$low_cov_files")
1650
2751
 
1651
2752
  Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
1652
- --label "testing" --label "$PATROL_LABEL" 2>/dev/null || true
2753
+ --label "$(patrol_build_labels "testing")" 2>/dev/null || true
1653
2754
  issues_created=$((issues_created + 1))
1654
- emit_event "patrol.issue_created" "type=coverage" "count=$findings"
2755
+ emit_event "patrol.issue_created" "check=coverage" "count=$findings"
1655
2756
  fi
1656
2757
  fi
1657
2758
 
@@ -1694,9 +2795,49 @@ Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
1694
2795
  tag_epoch=$(git log -1 --format=%ct "$latest_tag" 2>/dev/null || echo "0")
1695
2796
  if [[ "$tag_epoch" -gt "$changelog_epoch" ]] && [[ "$changelog_epoch" -gt 0 ]]; then
1696
2797
  findings=$((findings + 1))
1697
- stale_docs="${stale_docs}\n- \`CHANGELOG.md\`: not updated since tag \`${latest_tag}\`"
2798
+ stale_docs="${stale_docs}\n- \`CHANGELOG.md\`: not updated since tag \`${latest_tag}\`"
2799
+ if [[ "$dry_run" == "true" ]] || [[ "$NO_GITHUB" == "true" ]]; then
2800
+ echo -e " ${YELLOW}●${RESET} CHANGELOG.md not updated since ${latest_tag}"
2801
+ fi
2802
+ fi
2803
+ fi
2804
+ fi
2805
+
2806
+ # Check CLAUDE.md staleness (same pattern as README)
2807
+ if [[ -f ".claude/CLAUDE.md" ]]; then
2808
+ local claudemd_epoch claudemd_src_epoch
2809
+ claudemd_src_epoch=$(git log -1 --format=%ct -- "*.ts" "*.js" "*.py" "*.go" "*.rs" "*.sh" 2>/dev/null || echo "0")
2810
+ claudemd_epoch=$(git log -1 --format=%ct -- ".claude/CLAUDE.md" 2>/dev/null || echo "0")
2811
+ if [[ "$claudemd_src_epoch" -gt 0 ]] && [[ "$claudemd_epoch" -gt 0 ]]; then
2812
+ local claude_drift=$((claudemd_src_epoch - claudemd_epoch))
2813
+ if [[ "$claude_drift" -gt 2592000 ]]; then
2814
+ findings=$((findings + 1))
2815
+ local claude_days_behind=$((claude_drift / 86400))
2816
+ stale_docs="${stale_docs}\n- \`.claude/CLAUDE.md\`: ${claude_days_behind} days behind source code"
1698
2817
  if [[ "$dry_run" == "true" ]] || [[ "$NO_GITHUB" == "true" ]]; then
1699
- echo -e " ${YELLOW}●${RESET} CHANGELOG.md not updated since ${latest_tag}"
2818
+ echo -e " ${YELLOW}●${RESET} CLAUDE.md is ${claude_days_behind} days behind source code"
2819
+ fi
2820
+ fi
2821
+ fi
2822
+ fi
2823
+
2824
+ # Check AUTO section freshness (if sw-docs.sh available)
2825
+ if [[ -x "$SCRIPT_DIR/sw-docs.sh" ]]; then
2826
+ local docs_stale=false
2827
+ bash "$SCRIPT_DIR/sw-docs.sh" check >/dev/null 2>&1 || docs_stale=true
2828
+ if [[ "$docs_stale" == "true" ]]; then
2829
+ findings=$((findings + 1))
2830
+ stale_docs="${stale_docs}\n- AUTO sections: some documentation sections are stale"
2831
+ if [[ "$dry_run" == "true" ]] || [[ "$NO_GITHUB" == "true" ]]; then
2832
+ echo -e " ${YELLOW}●${RESET} AUTO documentation sections are stale"
2833
+ fi
2834
+ # Auto-sync if not dry run
2835
+ if [[ "$dry_run" != "true" ]] && [[ "$NO_GITHUB" != "true" ]]; then
2836
+ daemon_log INFO "Auto-syncing stale documentation sections"
2837
+ bash "$SCRIPT_DIR/sw-docs.sh" sync 2>/dev/null || true
2838
+ if ! git diff --quiet -- '*.md' 2>/dev/null; then
2839
+ git add -A '*.md' 2>/dev/null || true
2840
+ git commit -m "docs: auto-sync stale documentation sections" 2>/dev/null || true
1700
2841
  fi
1701
2842
  fi
1702
2843
  fi
@@ -1715,9 +2856,9 @@ The following docs may need updating:
1715
2856
  $(echo -e "$stale_docs")
1716
2857
 
1717
2858
  Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
1718
- --label "documentation" --label "$PATROL_LABEL" 2>/dev/null || true
2859
+ --label "$(patrol_build_labels "documentation")" 2>/dev/null || true
1719
2860
  issues_created=$((issues_created + 1))
1720
- emit_event "patrol.issue_created" "type=documentation" "count=$findings"
2861
+ emit_event "patrol.issue_created" "check=documentation" "count=$findings"
1721
2862
  fi
1722
2863
  fi
1723
2864
 
@@ -1754,7 +2895,7 @@ Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
1754
2895
  if [[ "$recent_test_dur" -gt "$threshold" ]]; then
1755
2896
  total_findings=$((total_findings + 1))
1756
2897
  local pct_slower=$(( (recent_test_dur - baseline_dur) * 100 / baseline_dur ))
1757
- emit_event "patrol.finding" "type=performance" "baseline=${baseline_dur}s" "current=${recent_test_dur}s" "regression=${pct_slower}%"
2898
+ emit_event "patrol.finding" "check=performance" "baseline=${baseline_dur}s" "current=${recent_test_dur}s" "regression=${pct_slower}%"
1758
2899
 
1759
2900
  if [[ "$dry_run" == "true" ]] || [[ "$NO_GITHUB" == "true" ]]; then
1760
2901
  echo -e " ${RED}●${RESET} Test suite ${pct_slower}% slower than baseline (${baseline_dur}s → ${recent_test_dur}s)"
@@ -1774,9 +2915,9 @@ Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
1774
2915
  | Regression | ${pct_slower}% |
1775
2916
 
1776
2917
  Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
1777
- --label "performance" --label "$PATROL_LABEL" 2>/dev/null || true
2918
+ --label "$(patrol_build_labels "performance")" 2>/dev/null || true
1778
2919
  issues_created=$((issues_created + 1))
1779
- emit_event "patrol.issue_created" "type=performance"
2920
+ emit_event "patrol.issue_created" "check=performance"
1780
2921
  fi
1781
2922
  fi
1782
2923
 
@@ -1792,31 +2933,557 @@ Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
1792
2933
  daemon_log INFO "Patrol: performance baseline updated (${recent_test_dur}s)"
1793
2934
  }
1794
2935
 
1795
- # ── Run all patrol checks ──
2936
+ # ── 7. Recurring Failure Patterns ──
2937
# Patrol check: report failure patterns that the memory script flags as
# actionable, one GitHub issue (or one console line) per pattern.
# Globals read:    PATROL_FAILURES_THRESHOLD, SCRIPT_DIR, NO_GITHUB, dry_run,
#                  PATROL_LABEL, PATROL_MAX_ISSUES, RED/RESET/BOLD/CYAN
# Globals written: issues_created, total_findings
# Outputs:         in dry-run / NO_GITHUB mode, one line per pattern on stdout
patrol_recurring_failures() {
  # A threshold of 0 (or an unset threshold) disables this check.
  if [[ "${PATROL_FAILURES_THRESHOLD:-0}" -le 0 ]]; then return; fi
  daemon_log INFO "Patrol: checking recurring failure patterns"
  local findings=0

  local memory_script="$SCRIPT_DIR/sw-memory.sh"
  if [[ ! -f "$memory_script" ]]; then
    daemon_log INFO "Patrol: memory script not found — skipping recurring failures"
    return
  fi

  # Get actionable failures from memory.
  # Note: sw-memory.sh runs its CLI router on source, so the source's stdout
  # is sent to /dev/null and only the function's output is captured. The
  # subshell keeps whatever the sourced script does out of this shell.
  # `|| true` keeps the captured text but stops a non-zero status from the
  # lookup from aborting the caller if errexit is active.
  local failures_json
  failures_json=$(
    (
      source "$memory_script" > /dev/null 2>&1 || true
      if command -v memory_get_actionable_failures &>/dev/null; then
        memory_get_actionable_failures "$PATROL_FAILURES_THRESHOLD"
      else
        echo "[]"
      fi
    )
  ) || true

  local count
  count=$(echo "$failures_json" | jq 'length' 2>/dev/null || echo "0")
  if [[ "${count:-0}" -eq 0 ]]; then
    daemon_log INFO "Patrol: no recurring failures above threshold ($PATROL_FAILURES_THRESHOLD)"
    return
  fi

  local failure pattern stage seen_count last_seen root_cause short_pattern existing
  while IFS= read -r failure; do
    pattern=$(echo "$failure" | jq -r '.pattern // "unknown"')
    stage=$(echo "$failure" | jq -r '.stage // "unknown"')
    seen_count=$(echo "$failure" | jq -r '.seen_count // 0')
    last_seen=$(echo "$failure" | jq -r '.last_seen // "unknown"')
    root_cause=$(echo "$failure" | jq -r '.root_cause // "Not yet identified"')

    # Title/event form of the pattern: first line only, at most 60 characters.
    # (Truncating per line would leave a multi-line pattern multi-line, which
    # is unusable as an issue title or a single event field.)
    short_pattern=${pattern%%$'\n'*}
    short_pattern=${short_pattern:0:60}

    findings=$((findings + 1))
    emit_event "patrol.finding" "check=recurring_failure" "pattern=$short_pattern" "seen_count=$seen_count"

    if [[ "$NO_GITHUB" != "true" ]] && [[ "$dry_run" != "true" ]]; then
      # Deduplicate against existing patrol issues with the same title text.
      existing=$(gh issue list --label "$PATROL_LABEL" --label "recurring-failure" \
        --search "Fix recurring: ${short_pattern}" --json number -q 'length' 2>/dev/null || echo "0")
      if [[ "${existing:-0}" -eq 0 ]] && [[ "$issues_created" -lt "$PATROL_MAX_ISSUES" ]]; then
        # Count the issue (and emit the event) only when gh actually created
        # it, so a failed call does not eat into PATROL_MAX_ISSUES.
        if gh issue create \
          --title "Fix recurring: ${short_pattern}" \
          --body "## Recurring Failure Pattern

| Field | Value |
|-------|-------|
| Stage | \`${stage}\` |
| Pattern | \`${pattern}\` |
| Seen count | **${seen_count}** |
| Last seen | ${last_seen} |
| Root cause | ${root_cause} |
| Found by | Shipwright patrol |
| Date | $(now_iso) |

### Suggested Actions
- Investigate the root cause in the \`${stage}\` stage
- Check if recent changes introduced the failure
- Add a targeted test to prevent regression

Auto-detected by \`shipwright daemon patrol\`." \
          --label "$(patrol_build_labels "recurring-failure")" 2>/dev/null; then
          issues_created=$((issues_created + 1))
          emit_event "patrol.issue_created" "check=recurring_failure" "pattern=$short_pattern"
        else
          daemon_log WARN "Patrol: failed to create issue for recurring failure: ${short_pattern}"
        fi
      fi
    else
      echo -e " ${RED}●${RESET} ${BOLD}recurring${RESET}: ${short_pattern} (${seen_count}x in ${CYAN}${stage}${RESET})"
    fi
  # Process substitution (not a pipe) keeps the loop in this shell so the
  # counters updated above survive the loop.
  done < <(echo "$failures_json" | jq -c '.[]' 2>/dev/null)

  total_findings=$((total_findings + findings))
  daemon_log INFO "Patrol: found ${findings} recurring failure pattern(s)"
}
3024
+
3025
+ # ── 8. DORA Metric Degradation ──
3026
+ patrol_dora_degradation() {
3027
+ if [[ "$PATROL_DORA_ENABLED" != "true" ]]; then return; fi
3028
+ daemon_log INFO "Patrol: checking DORA metric degradation"
3029
+
3030
+ if [[ ! -f "$EVENTS_FILE" ]]; then
3031
+ daemon_log INFO "Patrol: no events file — skipping DORA check"
3032
+ return
3033
+ fi
3034
+
3035
+ local now_e
3036
+ now_e=$(now_epoch)
3037
+
3038
+ # Current 7-day window
3039
+ local current_start=$((now_e - 604800))
3040
+ # Previous 7-day window
3041
+ local prev_start=$((now_e - 1209600))
3042
+ local prev_end=$current_start
3043
+
3044
+ # Get events for both windows
3045
+ local current_events prev_events
3046
+ current_events=$(jq -s --argjson start "$current_start" \
3047
+ '[.[] | select(.ts_epoch >= $start)]' "$EVENTS_FILE" 2>/dev/null || echo "[]")
3048
+ prev_events=$(jq -s --argjson start "$prev_start" --argjson end "$prev_end" \
3049
+ '[.[] | select(.ts_epoch >= $start and .ts_epoch < $end)]' "$EVENTS_FILE" 2>/dev/null || echo "[]")
3050
+
3051
+ # Helper: calculate DORA metrics from an event set
3052
+ calc_dora() {
3053
+ local events="$1"
3054
+ local total successes failures
3055
+ total=$(echo "$events" | jq '[.[] | select(.type == "pipeline.completed")] | length' 2>/dev/null || echo "0")
3056
+ successes=$(echo "$events" | jq '[.[] | select(.type == "pipeline.completed" and .result == "success")] | length' 2>/dev/null || echo "0")
3057
+ failures=$(echo "$events" | jq '[.[] | select(.type == "pipeline.completed" and .result == "failure")] | length' 2>/dev/null || echo "0")
3058
+
3059
+ local deploy_freq="0"
3060
+ [[ "$total" -gt 0 ]] && deploy_freq=$(echo "$successes 7" | awk '{printf "%.1f", $1 / ($2 / 7)}')
3061
+
3062
+ local cfr="0"
3063
+ [[ "$total" -gt 0 ]] && cfr=$(echo "$failures $total" | awk '{printf "%.1f", ($1 / $2) * 100}')
3064
+
3065
+ local cycle_time="0"
3066
+ cycle_time=$(echo "$events" | jq '[.[] | select(.type == "pipeline.completed" and .result == "success") | .duration_s] | sort | if length > 0 then .[length/2 | floor] else 0 end' 2>/dev/null || echo "0")
3067
+
3068
+ echo "{\"deploy_freq\":$deploy_freq,\"cfr\":$cfr,\"cycle_time\":$cycle_time,\"total\":$total}"
3069
+ }
3070
+
3071
+ local current_metrics prev_metrics
3072
+ current_metrics=$(calc_dora "$current_events")
3073
+ prev_metrics=$(calc_dora "$prev_events")
3074
+
3075
+ local prev_total
3076
+ prev_total=$(echo "$prev_metrics" | jq '.total' 2>/dev/null || echo "0")
3077
+ local current_total
3078
+ current_total=$(echo "$current_metrics" | jq '.total' 2>/dev/null || echo "0")
3079
+
3080
+ # Need data in both windows to compare
3081
+ if [[ "${prev_total:-0}" -lt 3 ]] || [[ "${current_total:-0}" -lt 3 ]]; then
3082
+ daemon_log INFO "Patrol: insufficient data for DORA comparison (prev=$prev_total, current=$current_total)"
3083
+ return
3084
+ fi
3085
+
3086
+ # Grade each metric using dora_grade (defined in daemon_metrics, redefined here inline)
3087
+ local_dora_grade() {
3088
+ local metric="$1" value="$2"
3089
+ case "$metric" in
3090
+ deploy_freq)
3091
+ if awk "BEGIN{exit !($value >= 7)}" 2>/dev/null; then echo "Elite"; return; fi
3092
+ if awk "BEGIN{exit !($value >= 1)}" 2>/dev/null; then echo "High"; return; fi
3093
+ if awk "BEGIN{exit !($value >= 0.25)}" 2>/dev/null; then echo "Medium"; return; fi
3094
+ echo "Low" ;;
3095
+ cfr)
3096
+ if awk "BEGIN{exit !($value < 5)}" 2>/dev/null; then echo "Elite"; return; fi
3097
+ if awk "BEGIN{exit !($value < 10)}" 2>/dev/null; then echo "High"; return; fi
3098
+ if awk "BEGIN{exit !($value < 15)}" 2>/dev/null; then echo "Medium"; return; fi
3099
+ echo "Low" ;;
3100
+ cycle_time)
3101
+ [[ "$value" -lt 3600 ]] && echo "Elite" && return
3102
+ [[ "$value" -lt 86400 ]] && echo "High" && return
3103
+ [[ "$value" -lt 604800 ]] && echo "Medium" && return
3104
+ echo "Low" ;;
3105
+ esac
3106
+ }
3107
+
3108
+ grade_rank() {
3109
+ case "$1" in
3110
+ Elite) echo 4 ;; High) echo 3 ;; Medium) echo 2 ;; Low) echo 1 ;; *) echo 0 ;;
3111
+ esac
3112
+ }
3113
+
3114
+ local degraded_metrics=""
3115
+ local degradation_details=""
3116
+
3117
+ # Check deploy frequency
3118
+ local prev_df curr_df
3119
+ prev_df=$(echo "$prev_metrics" | jq -r '.deploy_freq')
3120
+ curr_df=$(echo "$current_metrics" | jq -r '.deploy_freq')
3121
+ local prev_df_grade curr_df_grade
3122
+ prev_df_grade=$(local_dora_grade deploy_freq "$prev_df")
3123
+ curr_df_grade=$(local_dora_grade deploy_freq "$curr_df")
3124
+ if [[ "$(grade_rank "$curr_df_grade")" -lt "$(grade_rank "$prev_df_grade")" ]]; then
3125
+ degraded_metrics="${degraded_metrics}deploy_freq "
3126
+ degradation_details="${degradation_details}\n| Deploy Frequency | ${prev_df_grade} (${prev_df}/wk) | ${curr_df_grade} (${curr_df}/wk) | Check for blocked PRs, increase automation |"
3127
+ fi
3128
+
3129
+ # Check CFR
3130
+ local prev_cfr curr_cfr
3131
+ prev_cfr=$(echo "$prev_metrics" | jq -r '.cfr')
3132
+ curr_cfr=$(echo "$current_metrics" | jq -r '.cfr')
3133
+ local prev_cfr_grade curr_cfr_grade
3134
+ prev_cfr_grade=$(local_dora_grade cfr "$prev_cfr")
3135
+ curr_cfr_grade=$(local_dora_grade cfr "$curr_cfr")
3136
+ if [[ "$(grade_rank "$curr_cfr_grade")" -lt "$(grade_rank "$prev_cfr_grade")" ]]; then
3137
+ degraded_metrics="${degraded_metrics}cfr "
3138
+ degradation_details="${degradation_details}\n| Change Failure Rate | ${prev_cfr_grade} (${prev_cfr}%) | ${curr_cfr_grade} (${curr_cfr}%) | Investigate recent failures, improve test coverage |"
3139
+ fi
3140
+
3141
+ # Check Cycle Time
3142
+ local prev_ct curr_ct
3143
+ prev_ct=$(echo "$prev_metrics" | jq -r '.cycle_time')
3144
+ curr_ct=$(echo "$current_metrics" | jq -r '.cycle_time')
3145
+ local prev_ct_grade curr_ct_grade
3146
+ prev_ct_grade=$(local_dora_grade cycle_time "$prev_ct")
3147
+ curr_ct_grade=$(local_dora_grade cycle_time "$curr_ct")
3148
+ if [[ "$(grade_rank "$curr_ct_grade")" -lt "$(grade_rank "$prev_ct_grade")" ]]; then
3149
+ degraded_metrics="${degraded_metrics}cycle_time "
3150
+ degradation_details="${degradation_details}\n| Cycle Time | ${prev_ct_grade} (${prev_ct}s) | ${curr_ct_grade} (${curr_ct}s) | Profile slow stages, check for new slow tests |"
3151
+ fi
3152
+
3153
+ if [[ -z "$degraded_metrics" ]]; then
3154
+ daemon_log INFO "Patrol: no DORA degradation detected"
3155
+ return
3156
+ fi
3157
+
3158
+ local findings=0
3159
+ findings=1
3160
+ total_findings=$((total_findings + findings))
3161
+ emit_event "patrol.finding" "check=dora_regression" "metrics=$degraded_metrics"
3162
+
3163
+ if [[ "$NO_GITHUB" != "true" ]] && [[ "$dry_run" != "true" ]]; then
3164
+ local trimmed
3165
+ trimmed=$(echo "$degraded_metrics" | sed 's/ *$//' | tr ' ' ',')
3166
+ local existing
3167
+ existing=$(gh issue list --label "$PATROL_LABEL" --label "dora-regression" \
3168
+ --search "DORA regression" --json number -q 'length' 2>/dev/null || echo "0")
3169
+ if [[ "${existing:-0}" -eq 0 ]] && [[ "$issues_created" -lt "$PATROL_MAX_ISSUES" ]]; then
3170
+ gh issue create \
3171
+ --title "DORA regression: ${trimmed}" \
3172
+ --body "## DORA Metric Degradation
3173
+
3174
+ | Metric | Previous (7d) | Current (7d) | Suggested Action |
3175
+ |--------|---------------|--------------|------------------|$(echo -e "$degradation_details")
3176
+
3177
+ > Compared: previous 7-day window vs current 7-day window.
3178
+
3179
+ Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
3180
+ --label "$(patrol_build_labels "dora-regression")" 2>/dev/null || true
3181
+ issues_created=$((issues_created + 1))
3182
+ emit_event "patrol.issue_created" "check=dora_regression" "metrics=$trimmed"
3183
+ fi
3184
+ else
3185
+ local trimmed
3186
+ trimmed=$(echo "$degraded_metrics" | sed 's/ *$//')
3187
+ echo -e " ${RED}●${RESET} ${BOLD}DORA regression${RESET}: ${trimmed}"
3188
+ fi
3189
+
3190
+ daemon_log INFO "Patrol: DORA degradation detected in: ${degraded_metrics}"
3191
+ }
3192
+
3193
+ # ── 9. Untested Scripts ──
3194
# Patrol check: report sw-*.sh scripts that have no sw-<name>-test.sh sibling.
#
# Reads:   PATROL_UNTESTED_ENABLED, SCRIPT_DIR, NO_GITHUB, dry_run,
#          PATROL_LABEL, PATROL_MAX_ISSUES, YELLOW, CYAN, RESET
# Updates: total_findings (by the number of untested scripts found) and
#          issues_created (only when the GitHub issue was actually created).
# Calls:   daemon_log, emit_event, now_iso, patrol_build_labels, gh
# Output:  in dry-run / NO_GITHUB mode, one line per reported script on stdout.
#
# Only the 10 most-referenced untested scripts are listed and emitted as
# events; the findings counter still reflects every untested script.
patrol_untested_scripts() {
    if [[ "$PATROL_UNTESTED_ENABLED" != "true" ]]; then return; fi
    daemon_log INFO "Patrol: checking for untested scripts"
    local findings=0
    local untested_list=""

    local scripts_dir="$SCRIPT_DIR"
    if [[ ! -d "$scripts_dir" ]]; then
        daemon_log INFO "Patrol: scripts directory not found — skipping"
        return
    fi

    # Collect "usage|basename|lines" records for every script without a test.
    local untested_entries=""
    local script basename name usage_count line_count
    while IFS= read -r script; do
        basename=${script##*/}
        # Skip test scripts themselves
        [[ "$basename" == *-test.sh ]] && continue
        # Skip the main CLI router
        [[ "$basename" == "sw" ]] && continue

        # Extract the name part (sw-NAME.sh -> NAME)
        name=${basename#sw-}
        name=${name%.sh}

        if [[ ! -f "$scripts_dir/sw-${name}-test.sh" ]]; then
            # Count the other scripts that mention this one. Fixed-string
            # matching (-F) keeps the "." in file names from acting as a regex
            # wildcard; the leading "/" anchors the exclusion to the file name.
            # grep -c exits 1 when the count is 0, hence the "|| true".
            usage_count=$(grep -lF -- "sw-${name}" "$scripts_dir"/sw-*.sh 2>/dev/null \
                | grep -cvF -- "/${basename}" || true)
            usage_count=${usage_count:-0}

            line_count=$(wc -l < "$script" 2>/dev/null | tr -d ' ')

            untested_entries="${untested_entries}${usage_count}|${basename}|${line_count}\n"
            findings=$((findings + 1))
        fi
    done < <(find "$scripts_dir" -maxdepth 1 -name "sw-*.sh" -type f 2>/dev/null | sort)

    if [[ "$findings" -eq 0 ]]; then
        daemon_log INFO "Patrol: all scripts have test files"
        return
    fi

    # Sort by usage count descending and keep the top 10. awk consumes all of
    # its input, so sort never sees a closed pipe (which would fail the
    # pipeline under pipefail, unlike an early-exiting head).
    local sorted_entries
    sorted_entries=$(printf '%b' "$untested_entries" | sort -t'|' -k1,1 -rn | awk 'NR <= 10')

    while IFS='|' read -r usage_count basename line_count; do
        [[ -z "$basename" ]] && continue
        untested_list="${untested_list}\n- \`${basename}\` (${line_count} lines, referenced by ${usage_count} scripts)"
        emit_event "patrol.finding" "check=untested_script" "script=$basename" "lines=$line_count" "usage=$usage_count"

        if [[ "$dry_run" == "true" ]] || [[ "$NO_GITHUB" == "true" ]]; then
            echo -e " ${YELLOW}●${RESET} ${CYAN}${basename}${RESET} (${line_count} lines, ${usage_count} refs)"
        fi
    done <<< "$sorted_entries"

    total_findings=$((total_findings + findings))

    if [[ "$NO_GITHUB" != "true" ]] && [[ "$dry_run" != "true" ]]; then
        local existing
        existing=$(gh issue list --label "$PATROL_LABEL" --label "test-coverage" \
            --search "Add tests for untested scripts" --json number -q 'length' 2>/dev/null || echo "0")
        if [[ "${existing:-0}" -eq 0 ]] && [[ "$issues_created" -lt "$PATROL_MAX_ISSUES" ]]; then
            # Issue creation stays best-effort, but a failed call must not be
            # counted against PATROL_MAX_ISSUES or reported as a created issue.
            if gh issue create \
                --title "Add tests for ${findings} untested script(s)" \
                --body "## Untested Scripts

The following scripts have no corresponding test file (\`sw-*-test.sh\`):
$(printf '%b' "$untested_list")

### How to Add Tests
Each test file should follow the pattern in existing test scripts (e.g., \`sw-daemon-test.sh\`):
- Mock environment with TEMP_DIR
- PASS/FAIL counters
- \`run_test\` harness
- Register in \`package.json\` test script

Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
                --label "$(patrol_build_labels "test-coverage")" 2>/dev/null; then
                issues_created=$((issues_created + 1))
                emit_event "patrol.issue_created" "check=untested_scripts" "count=$findings"
            else
                daemon_log WARN "Patrol: failed to create untested-scripts issue"
            fi
        fi
    fi

    daemon_log INFO "Patrol: found ${findings} untested script(s)"
}
3284
+
3285
+ # ── 10. Retry Exhaustion Patterns ──
3286
# Patrol check: flag a pattern of daemon.retry_exhausted events in the last 7 days.
#
# Reads:   PATROL_RETRY_ENABLED, EVENTS_FILE, PATROL_RETRY_THRESHOLD, NO_GITHUB,
#          dry_run, PATROL_LABEL, PATROL_MAX_ISSUES, RED, BOLD, RESET
# Updates: total_findings (+1 when the threshold is reached) and
#          issues_created (only when the GitHub issue was actually created).
# Calls:   daemon_log, emit_event, now_epoch, now_iso, patrol_build_labels, gh, jq
# Output:  in dry-run / NO_GITHUB mode, a one-line summary on stdout.
patrol_retry_exhaustion() {
    if [[ "$PATROL_RETRY_ENABLED" != "true" ]]; then return; fi
    daemon_log INFO "Patrol: checking retry exhaustion patterns"
    local findings=0

    if [[ ! -f "$EVENTS_FILE" ]]; then
        daemon_log INFO "Patrol: no events file — skipping retry check"
        return
    fi

    # 604800 seconds = 7 days
    local seven_days_ago
    seven_days_ago=$(($(now_epoch) - 604800))

    # Find retry_exhausted events in last 7 days (events without ts_epoch are
    # treated as epoch 0 and therefore excluded).
    local exhausted_events
    exhausted_events=$(jq -s --argjson since "$seven_days_ago" \
        '[.[] | select(.type == "daemon.retry_exhausted" and (.ts_epoch // 0) >= $since)]' \
        "$EVENTS_FILE" 2>/dev/null || echo "[]")

    local exhausted_count
    exhausted_count=$(echo "$exhausted_events" | jq 'length' 2>/dev/null || echo "0")
    # Guard the numeric comparison below against empty or non-numeric output.
    [[ "$exhausted_count" =~ ^[0-9]+$ ]] || exhausted_count=0

    if [[ "$exhausted_count" -lt "$PATROL_RETRY_THRESHOLD" ]]; then
        daemon_log INFO "Patrol: retry exhaustions ($exhausted_count) below threshold ($PATROL_RETRY_THRESHOLD)"
        return
    fi

    findings=1
    total_findings=$((total_findings + findings))

    # Get unique issue patterns. tostring is applied after unique so ordering
    # is unchanged, and join no longer depends on jq accepting non-string
    # elements (numeric issue ids).
    local issue_list
    issue_list=$(echo "$exhausted_events" \
        | jq -r '[.[] | .issue // "unknown"] | unique | map(tostring) | join(", ")' 2>/dev/null \
        || echo "unknown")

    local first_ts last_ts
    first_ts=$(echo "$exhausted_events" | jq -r '[.[] | .ts] | sort | first // "unknown"' 2>/dev/null || echo "unknown")
    last_ts=$(echo "$exhausted_events" | jq -r '[.[] | .ts] | sort | last // "unknown"' 2>/dev/null || echo "unknown")

    emit_event "patrol.finding" "check=retry_exhaustion" "count=$exhausted_count" "issues=$issue_list"

    if [[ "$NO_GITHUB" != "true" ]] && [[ "$dry_run" != "true" ]]; then
        local existing
        existing=$(gh issue list --label "$PATROL_LABEL" --label "reliability" \
            --search "Retry exhaustion pattern" --json number -q 'length' 2>/dev/null || echo "0")
        if [[ "${existing:-0}" -eq 0 ]] && [[ "$issues_created" -lt "$PATROL_MAX_ISSUES" ]]; then
            # Issue creation stays best-effort, but a failed call must not be
            # counted against PATROL_MAX_ISSUES or reported as a created issue.
            if gh issue create \
                --title "Retry exhaustion pattern (${exhausted_count} in 7 days)" \
                --body "## Retry Exhaustion Pattern

| Field | Value |
|-------|-------|
| Exhaustions (7d) | **${exhausted_count}** |
| Threshold | ${PATROL_RETRY_THRESHOLD} |
| Affected issues | ${issue_list} |
| First occurrence | ${first_ts} |
| Latest occurrence | ${last_ts} |

### Investigation Steps
1. Check the affected issues for common patterns
2. Review pipeline logs for root cause
3. Consider if max_retries needs adjustment
4. Investigate if an external dependency is flaky

Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
                --label "$(patrol_build_labels "reliability")" 2>/dev/null; then
                issues_created=$((issues_created + 1))
                emit_event "patrol.issue_created" "check=retry_exhaustion" "count=$exhausted_count"
            else
                daemon_log WARN "Patrol: failed to create retry-exhaustion issue"
            fi
        fi
    else
        echo -e " ${RED}●${RESET} ${BOLD}retry exhaustion${RESET}: ${exhausted_count} exhaustions in 7 days (issues: ${issue_list})"
    fi

    daemon_log INFO "Patrol: found retry exhaustion pattern (${exhausted_count} in 7 days)"
}
3360
+
3361
+ # ── Stage 1: Run all grep-based patrol checks (fast pre-filter) ──
3362
+ local patrol_findings_summary=""
3363
+ local pre_check_findings=0
3364
+
1796
3365
  echo -e " ${BOLD}Security Audit${RESET}"
3366
+ pre_check_findings=$total_findings
1797
3367
  patrol_security_audit
3368
+ if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
3369
+ patrol_findings_summary="${patrol_findings_summary}security: $((total_findings - pre_check_findings)) finding(s); "
3370
+ fi
1798
3371
  echo ""
1799
3372
 
1800
3373
  echo -e " ${BOLD}Stale Dependencies${RESET}"
3374
+ pre_check_findings=$total_findings
1801
3375
  patrol_stale_dependencies
3376
+ if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
3377
+ patrol_findings_summary="${patrol_findings_summary}stale_deps: $((total_findings - pre_check_findings)) finding(s); "
3378
+ fi
1802
3379
  echo ""
1803
3380
 
1804
3381
  echo -e " ${BOLD}Dead Code Detection${RESET}"
3382
+ pre_check_findings=$total_findings
1805
3383
  patrol_dead_code
3384
+ if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
3385
+ patrol_findings_summary="${patrol_findings_summary}dead_code: $((total_findings - pre_check_findings)) finding(s); "
3386
+ fi
1806
3387
  echo ""
1807
3388
 
1808
3389
  echo -e " ${BOLD}Test Coverage Gaps${RESET}"
3390
+ pre_check_findings=$total_findings
1809
3391
  patrol_coverage_gaps
3392
+ if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
3393
+ patrol_findings_summary="${patrol_findings_summary}coverage: $((total_findings - pre_check_findings)) finding(s); "
3394
+ fi
1810
3395
  echo ""
1811
3396
 
1812
3397
  echo -e " ${BOLD}Documentation Staleness${RESET}"
3398
+ pre_check_findings=$total_findings
1813
3399
  patrol_doc_staleness
3400
+ if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
3401
+ patrol_findings_summary="${patrol_findings_summary}docs: $((total_findings - pre_check_findings)) finding(s); "
3402
+ fi
1814
3403
  echo ""
1815
3404
 
1816
3405
  echo -e " ${BOLD}Performance Baseline${RESET}"
3406
+ pre_check_findings=$total_findings
1817
3407
  patrol_performance_baseline
3408
+ if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
3409
+ patrol_findings_summary="${patrol_findings_summary}performance: $((total_findings - pre_check_findings)) finding(s); "
3410
+ fi
3411
+ echo ""
3412
+
3413
+ echo -e " ${BOLD}Recurring Failures${RESET}"
3414
+ pre_check_findings=$total_findings
3415
+ patrol_recurring_failures
3416
+ if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
3417
+ patrol_findings_summary="${patrol_findings_summary}recurring_failures: $((total_findings - pre_check_findings)) finding(s); "
3418
+ fi
3419
+ echo ""
3420
+
3421
+ echo -e " ${BOLD}DORA Degradation${RESET}"
3422
+ pre_check_findings=$total_findings
3423
+ patrol_dora_degradation
3424
+ if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
3425
+ patrol_findings_summary="${patrol_findings_summary}dora: $((total_findings - pre_check_findings)) finding(s); "
3426
+ fi
3427
+ echo ""
3428
+
3429
+ echo -e " ${BOLD}Untested Scripts${RESET}"
3430
+ pre_check_findings=$total_findings
3431
+ patrol_untested_scripts
3432
+ if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
3433
+ patrol_findings_summary="${patrol_findings_summary}untested: $((total_findings - pre_check_findings)) finding(s); "
3434
+ fi
3435
+ echo ""
3436
+
3437
+ echo -e " ${BOLD}Retry Exhaustion${RESET}"
3438
+ pre_check_findings=$total_findings
3439
+ patrol_retry_exhaustion
3440
+ if [[ "$total_findings" -gt "$pre_check_findings" ]]; then
3441
+ patrol_findings_summary="${patrol_findings_summary}retry_exhaustion: $((total_findings - pre_check_findings)) finding(s); "
3442
+ fi
1818
3443
  echo ""
1819
3444
 
3445
+ # ── Stage 2: AI-Powered Confirmation (if enabled) ──
3446
+ if [[ "${PREDICTION_ENABLED:-false}" == "true" ]] && type patrol_ai_analyze &>/dev/null 2>&1; then
3447
+ daemon_log INFO "Intelligence: using AI patrol analysis (prediction enabled)"
3448
+ echo -e " ${BOLD}AI Deep Analysis${RESET}"
3449
+ # Sample recent source files for AI analysis
3450
+ local sample_files=""
3451
+ local git_log_recent=""
3452
+ sample_files=$(git diff --name-only HEAD~5 2>/dev/null | head -10 | tr '\n' ',' || echo "")
3453
+ git_log_recent=$(git log --oneline -10 2>/dev/null || echo "")
3454
+ # Include grep-based findings summary as context for AI confirmation
3455
+ if [[ -n "$patrol_findings_summary" ]]; then
3456
+ git_log_recent="${git_log_recent}
3457
+
3458
+ Patrol pre-filter findings to confirm: ${patrol_findings_summary}"
3459
+ daemon_log INFO "Patrol: passing ${total_findings} grep findings to AI for confirmation"
3460
+ fi
3461
+ if [[ -n "$sample_files" ]]; then
3462
+ local ai_findings
3463
+ ai_findings=$(patrol_ai_analyze "$sample_files" "$git_log_recent" 2>/dev/null || echo "[]")
3464
+ if [[ -n "$ai_findings" && "$ai_findings" != "[]" ]]; then
3465
+ local ai_count
3466
+ ai_count=$(echo "$ai_findings" | jq 'length' 2>/dev/null || echo "0")
3467
+ ai_count=${ai_count:-0}
3468
+ total_findings=$((total_findings + ai_count))
3469
+ echo -e " ${CYAN}●${RESET} AI confirmed findings + found ${ai_count} additional issue(s)"
3470
+ emit_event "patrol.ai_analysis" "findings=$ai_count" "grep_findings=${patrol_findings_summary:-none}"
3471
+ else
3472
+ echo -e " ${GREEN}●${RESET} AI analysis: grep findings confirmed, no additional issues"
3473
+ fi
3474
+ fi
3475
+ echo ""
3476
+ else
3477
+ daemon_log INFO "Intelligence: using grep-only patrol (prediction disabled, enable with intelligence.prediction_enabled=true)"
3478
+ fi
3479
+
3480
+ # ── Meta Self-Improvement Patrol ──
3481
+ if [[ -f "$SCRIPT_DIR/sw-patrol-meta.sh" ]]; then
3482
+ # shellcheck source=sw-patrol-meta.sh
3483
+ source "$SCRIPT_DIR/sw-patrol-meta.sh"
3484
+ patrol_meta_run
3485
+ fi
3486
+
1820
3487
  # ── Summary ──
1821
3488
  emit_event "patrol.completed" "findings=$total_findings" "issues_created=$issues_created" "dry_run=$dry_run"
1822
3489
 
@@ -1829,6 +3496,9 @@ Auto-detected by \`shipwright daemon patrol\` on $(now_iso)." \
1829
3496
  echo ""
1830
3497
 
1831
3498
  daemon_log INFO "Patrol complete: ${total_findings} findings, ${issues_created} issues created"
3499
+
3500
+ # Adapt patrol limits based on hit rate
3501
+ adapt_patrol_limits "$total_findings" "$PATROL_MAX_ISSUES"
1832
3502
  }
1833
3503
 
1834
3504
  # ─── Poll Issues ─────────────────────────────────────────────────────────────
@@ -1839,6 +3509,18 @@ daemon_poll_issues() {
1839
3509
  return
1840
3510
  fi
1841
3511
 
3512
+ # Check for pause flag (set by dashboard or disk_low alert)
3513
+ if [[ -f "$HOME/.shipwright/daemon-pause.flag" ]]; then
3514
+ daemon_log INFO "Daemon paused — skipping poll"
3515
+ return
3516
+ fi
3517
+
3518
+ # Circuit breaker: skip poll if in backoff window
3519
+ if gh_rate_limited; then
3520
+ daemon_log INFO "Polling skipped (rate-limit backoff until $(epoch_to_iso "$GH_BACKOFF_UNTIL"))"
3521
+ return
3522
+ fi
3523
+
1842
3524
  local issues_json
1843
3525
 
1844
3526
  # Select gh command wrapper: gh_retry for critical poll calls when enabled
@@ -1865,6 +3547,7 @@ daemon_poll_issues() {
1865
3547
  fi
1866
3548
  fi
1867
3549
  daemon_log WARN "GitHub API error (org search) — backing off ${BACKOFF_SECS}s"
3550
+ gh_record_failure
1868
3551
  sleep "$BACKOFF_SECS"
1869
3552
  return
1870
3553
  }
@@ -1891,6 +3574,7 @@ daemon_poll_issues() {
1891
3574
  fi
1892
3575
  fi
1893
3576
  daemon_log WARN "GitHub API error — backing off ${BACKOFF_SECS}s"
3577
+ gh_record_failure
1894
3578
  sleep "$BACKOFF_SECS"
1895
3579
  return
1896
3580
  }
@@ -1898,6 +3582,7 @@ daemon_poll_issues() {
1898
3582
 
1899
3583
  # Reset backoff on success
1900
3584
  BACKOFF_SECS=0
3585
+ gh_record_success
1901
3586
 
1902
3587
  local issue_count
1903
3588
  issue_count=$(echo "$issues_json" | jq 'length' 2>/dev/null || echo 0)
@@ -1913,6 +3598,7 @@ daemon_poll_issues() {
1913
3598
 
1914
3599
  # Score each issue using intelligent triage and sort by descending score
1915
3600
  local scored_issues=()
3601
+ local dep_graph="" # "issue:dep1,dep2" entries for dependency ordering
1916
3602
  while IFS= read -r issue; do
1917
3603
  local num score
1918
3604
  num=$(echo "$issue" | jq -r '.number')
@@ -1923,14 +3609,85 @@ daemon_poll_issues() {
1923
3609
  repo_name=$(echo "$issue" | jq -r '.repository.nameWithOwner // ""')
1924
3610
  fi
1925
3611
  scored_issues+=("${score}|${num}|${repo_name}")
3612
+
3613
+ # Issue dependency detection (adaptive: extract "depends on #X", "blocked by #X")
3614
+ if [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]]; then
3615
+ local issue_text
3616
+ issue_text=$(echo "$issue" | jq -r '(.title // "") + " " + (.body // "")')
3617
+ local deps
3618
+ deps=$(extract_issue_dependencies "$issue_text")
3619
+ if [[ -n "$deps" ]]; then
3620
+ local dep_nums
3621
+ dep_nums=$(echo "$deps" | tr -d '#' | tr '\n' ',' | sed 's/,$//')
3622
+ dep_graph="${dep_graph}${num}:${dep_nums}\n"
3623
+ daemon_log INFO "Issue #${num} depends on: ${deps//$'\n'/, }"
3624
+ fi
3625
+ fi
1926
3626
  done < <(echo "$issues_json" | jq -c '.[]')
1927
3627
 
1928
- # Sort by score descending
3628
+ # Sort by score — strategy determines ascending vs descending
1929
3629
  local sorted_order
1930
- sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1 -rn)
3630
+ if [[ "${PRIORITY_STRATEGY:-quick-wins-first}" == "complex-first" ]]; then
3631
+ # Complex-first: lower score (more complex) first
3632
+ sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1 -n)
3633
+ else
3634
+ # Quick-wins-first (default): higher score (simpler) first
3635
+ sorted_order=$(printf '%s\n' "${scored_issues[@]}" | sort -t'|' -k1 -rn)
3636
+ fi
3637
+
3638
+ # Dependency-aware reordering: move dependencies before dependents
3639
+ if [[ -n "$dep_graph" && "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]]; then
3640
+ local reordered=""
3641
+ local scheduled=""
3642
+ # Multiple passes to resolve transitive dependencies (max 3)
3643
+ local pass=0
3644
+ while [[ $pass -lt 3 ]]; do
3645
+ local changed=false
3646
+ local new_order=""
3647
+ while IFS='|' read -r s_score s_num s_repo; do
3648
+ [[ -z "$s_num" ]] && continue
3649
+ # Check if this issue has unscheduled dependencies
3650
+ local issue_deps
3651
+ issue_deps=$(echo -e "$dep_graph" | grep "^${s_num}:" | head -1 | cut -d: -f2 || true)
3652
+ if [[ -n "$issue_deps" ]]; then
3653
+ # Check if all deps are scheduled (or not in our issue set)
3654
+ local all_deps_ready=true
3655
+ local IFS_SAVE="$IFS"
3656
+ IFS=','
3657
+ for dep in $issue_deps; do
3658
+ dep="${dep## }"
3659
+ dep="${dep%% }"
3660
+ # Is this dep in our scored set and not yet scheduled?
3661
+ if echo "$sorted_order" | grep -q "|${dep}|" && ! echo "$scheduled" | grep -q "|${dep}|"; then
3662
+ all_deps_ready=false
3663
+ break
3664
+ fi
3665
+ done
3666
+ IFS="$IFS_SAVE"
3667
+ if [[ "$all_deps_ready" == "false" ]]; then
3668
+ # Defer this issue — append at end
3669
+ new_order="${new_order}${s_score}|${s_num}|${s_repo}\n"
3670
+ changed=true
3671
+ continue
3672
+ fi
3673
+ fi
3674
+ reordered="${reordered}${s_score}|${s_num}|${s_repo}\n"
3675
+ scheduled="${scheduled}|${s_num}|"
3676
+ done <<< "$sorted_order"
3677
+ # Append deferred issues
3678
+ reordered="${reordered}${new_order}"
3679
+ sorted_order=$(echo -e "$reordered" | grep -v '^$')
3680
+ reordered=""
3681
+ scheduled=""
3682
+ if [[ "$changed" == "false" ]]; then
3683
+ break
3684
+ fi
3685
+ pass=$((pass + 1))
3686
+ done
3687
+ fi
1931
3688
 
1932
3689
  local active_count
1933
- active_count=$(get_active_count)
3690
+ active_count=$(locked_get_active_count)
1934
3691
 
1935
3692
  # Process each issue in triage order (process substitution keeps state in current shell)
1936
3693
  while IFS='|' read -r score issue_num repo_name; do
@@ -1940,11 +3697,27 @@ daemon_poll_issues() {
1940
3697
  issue_title=$(echo "$issues_json" | jq -r --argjson n "$issue_num" '.[] | select(.number == $n) | .title')
1941
3698
  labels_csv=$(echo "$issues_json" | jq -r --argjson n "$issue_num" '.[] | select(.number == $n) | [.labels[].name] | join(",")')
1942
3699
 
3700
+ # Cache title in state for dashboard visibility
3701
+ if [[ -n "$issue_title" ]]; then
3702
+ locked_state_update --arg num "$issue_num" --arg title "$issue_title" \
3703
+ '.titles[$num] = $title'
3704
+ fi
3705
+
1943
3706
  # Skip if already inflight
1944
3707
  if daemon_is_inflight "$issue_num"; then
1945
3708
  continue
1946
3709
  fi
1947
3710
 
3711
+ # Distributed claim (skip if no machines registered)
3712
+ if [[ -f "$HOME/.shipwright/machines.json" ]]; then
3713
+ local machine_name
3714
+ machine_name=$(jq -r '.machines[] | select(.role == "primary") | .name' "$HOME/.shipwright/machines.json" 2>/dev/null || hostname -s)
3715
+ if ! claim_issue "$issue_num" "$machine_name"; then
3716
+ daemon_log INFO "Issue #${issue_num} claimed by another machine — skipping"
3717
+ continue
3718
+ fi
3719
+ fi
3720
+
1948
3721
  # Priority lane: bypass queue for critical issues
1949
3722
  if [[ "$PRIORITY_LANE" == "true" ]]; then
1950
3723
  local priority_active
@@ -1967,7 +3740,7 @@ daemon_poll_issues() {
1967
3740
  fi
1968
3741
 
1969
3742
  # Check capacity
1970
- active_count=$(get_active_count)
3743
+ active_count=$(locked_get_active_count)
1971
3744
  if [[ "$active_count" -ge "$MAX_PARALLEL" ]]; then
1972
3745
  enqueue_issue "$issue_num"
1973
3746
  continue
@@ -1993,33 +3766,95 @@ daemon_poll_issues() {
1993
3766
 
1994
3767
  daemon_health_check() {
1995
3768
  local findings=0
1996
-
1997
- # Stale jobs: kill processes running > timeout
1998
- local stale_timeout="${HEALTH_STALE_TIMEOUT:-1800}" # default 30min
1999
3769
  local now_e
2000
3770
  now_e=$(now_epoch)
2001
3771
 
2002
3772
  if [[ -f "$STATE_FILE" ]]; then
3773
+ # ── Progress-Based Health Monitoring ──
3774
+ # Instead of killing after a static timeout, check for forward progress.
3775
+ # Only kill when the agent is truly stuck (no stage change, no new code,
3776
+ # same error repeating). A hard wall-clock limit remains as absolute safety net.
3777
+
3778
+ local hard_limit="${PROGRESS_HARD_LIMIT_S:-10800}"
3779
+ local use_progress="${PROGRESS_MONITORING:-true}"
3780
+
2003
3781
  while IFS= read -r job; do
2004
- local pid started_at issue_num
3782
+ local pid started_at issue_num worktree
2005
3783
  pid=$(echo "$job" | jq -r '.pid')
2006
3784
  started_at=$(echo "$job" | jq -r '.started_at // empty')
2007
3785
  issue_num=$(echo "$job" | jq -r '.issue')
3786
+ worktree=$(echo "$job" | jq -r '.worktree // ""')
3787
+
3788
+ # Skip dead processes
3789
+ if ! kill -0 "$pid" 2>/dev/null; then
3790
+ continue
3791
+ fi
2008
3792
 
3793
+ local elapsed=0
2009
3794
  if [[ -n "$started_at" ]]; then
2010
3795
  local start_e
2011
3796
  start_e=$(TZ=UTC date -j -f "%Y-%m-%dT%H:%M:%SZ" "$started_at" +%s 2>/dev/null || date -d "$started_at" +%s 2>/dev/null || echo "0")
2012
- local elapsed=$(( now_e - start_e ))
2013
- if [[ "$elapsed" -gt "$stale_timeout" ]] && kill -0 "$pid" 2>/dev/null; then
2014
- daemon_log WARN "Stale job detected: issue #${issue_num} (${elapsed}s, PID $pid) — killing"
3797
+ elapsed=$(( now_e - start_e ))
3798
+ fi
3799
+
3800
+ # Hard wall-clock limit — absolute safety net (default 3h)
3801
+ if [[ "$elapsed" -gt "$hard_limit" ]]; then
3802
+ daemon_log WARN "Hard limit exceeded: issue #${issue_num} (${elapsed}s > ${hard_limit}s, PID $pid) — killing"
3803
+ emit_event "daemon.hard_limit" "issue=$issue_num" "elapsed_s=$elapsed" "limit_s=$hard_limit" "pid=$pid"
3804
+ kill "$pid" 2>/dev/null || true
3805
+ daemon_clear_progress "$issue_num"
3806
+ findings=$((findings + 1))
3807
+ continue
3808
+ fi
3809
+
3810
+ # Progress-based detection (when enabled)
3811
+ if [[ "$use_progress" == "true" && -n "$worktree" ]]; then
3812
+ local snapshot verdict
3813
+ snapshot=$(daemon_collect_snapshot "$issue_num" "$worktree" "$pid" 2>/dev/null || echo '{}')
3814
+
3815
+ if [[ "$snapshot" != "{}" ]]; then
3816
+ verdict=$(daemon_assess_progress "$issue_num" "$snapshot" 2>/dev/null || echo "healthy")
3817
+
3818
+ case "$verdict" in
3819
+ healthy)
3820
+ # All good — agent is making progress
3821
+ ;;
3822
+ slowing)
3823
+ daemon_log INFO "Issue #${issue_num} slowing (no progress for 1-2 checks, ${elapsed}s elapsed)"
3824
+ ;;
3825
+ stalled)
3826
+ local no_progress_count
3827
+ no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
3828
+ daemon_log WARN "Issue #${issue_num} stalled: no progress for ${no_progress_count} checks (${elapsed}s elapsed, PID $pid)"
3829
+ emit_event "daemon.stalled" "issue=$issue_num" "no_progress=$no_progress_count" "elapsed_s=$elapsed" "pid=$pid"
3830
+ ;;
3831
+ stuck)
3832
+ local no_progress_count repeated_errors cur_stage
3833
+ no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
3834
+ repeated_errors=$(jq -r '.repeated_error_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
3835
+ cur_stage=$(echo "$snapshot" | jq -r '.stage // "unknown"')
3836
+ daemon_log WARN "Issue #${issue_num} STUCK: no progress for ${no_progress_count} checks, ${repeated_errors} repeated errors (stage=${cur_stage}, ${elapsed}s, PID $pid) — killing"
3837
+ emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid"
3838
+ kill "$pid" 2>/dev/null || true
3839
+ daemon_clear_progress "$issue_num"
3840
+ findings=$((findings + 1))
3841
+ ;;
3842
+ esac
3843
+ fi
3844
+ else
3845
+ # Fallback: legacy time-based detection when progress monitoring is off
3846
+ local stale_timeout
3847
+ stale_timeout=$(get_adaptive_stale_timeout "$PIPELINE_TEMPLATE")
3848
+ if [[ "$elapsed" -gt "$stale_timeout" ]]; then
3849
+ daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid) — killing"
2015
3850
  kill "$pid" 2>/dev/null || true
2016
3851
  findings=$((findings + 1))
2017
3852
  fi
2018
3853
  fi
2019
- done < <(jq -c '.active_jobs[]' "$STATE_FILE" 2>/dev/null)
3854
+ done < <(jq -c '.active_jobs[]' "$STATE_FILE" 2>/dev/null || true)
2020
3855
  fi
2021
3856
 
2022
- # Disk space warning
3857
+ # Disk space warning (check both repo dir and ~/.shipwright)
2023
3858
  local free_kb
2024
3859
  free_kb=$(df -k "." 2>/dev/null | tail -1 | awk '{print $4}')
2025
3860
  if [[ -n "$free_kb" ]] && [[ "$free_kb" -lt 1048576 ]] 2>/dev/null; then
@@ -2027,6 +3862,17 @@ daemon_health_check() {
2027
3862
  findings=$((findings + 1))
2028
3863
  fi
2029
3864
 
3865
+ # Critical disk space on ~/.shipwright — pause spawning
3866
+ local sw_free_kb
3867
+ sw_free_kb=$(df -k "$HOME/.shipwright" 2>/dev/null | tail -1 | awk '{print $4}')
3868
+ if [[ -n "$sw_free_kb" ]] && [[ "$sw_free_kb" -lt 512000 ]] 2>/dev/null; then
3869
+ daemon_log WARN "Critical disk space on ~/.shipwright: $(( sw_free_kb / 1024 ))MB — pausing spawns"
3870
+ emit_event "daemon.disk_low" "free_mb=$(( sw_free_kb / 1024 ))"
3871
+ mkdir -p "$HOME/.shipwright"
3872
+ echo '{"paused":true,"reason":"disk_low"}' > "$HOME/.shipwright/daemon-pause.flag"
3873
+ findings=$((findings + 1))
3874
+ fi
3875
+
2030
3876
  # Events file size warning
2031
3877
  if [[ -f "$EVENTS_FILE" ]]; then
2032
3878
  local events_size
@@ -2096,6 +3942,13 @@ daemon_auto_scale() {
2096
3942
 
2097
3943
  local prev_max="$MAX_PARALLEL"
2098
3944
 
3945
+ # ── Learn worker memory from actual RSS (adaptive) ──
3946
+ learn_worker_memory
3947
+
3948
+ # ── Adaptive cost estimate per template ──
3949
+ local effective_cost_per_job
3950
+ effective_cost_per_job=$(get_adaptive_cost_estimate "$PIPELINE_TEMPLATE")
3951
+
2099
3952
  # ── CPU cores ──
2100
3953
  local cpu_cores=2
2101
3954
  if [[ "$(uname -s)" == "Darwin" ]]; then
@@ -2106,10 +3959,9 @@ daemon_auto_scale() {
2106
3959
  local max_by_cpu=$(( (cpu_cores * 3) / 4 )) # 75% utilization cap
2107
3960
  [[ "$max_by_cpu" -lt 1 ]] && max_by_cpu=1
2108
3961
 
2109
- # ── Load average check (back off if system is stressed) ──
3962
+ # ── Load average check gradual scaling curve (replaces 90% cliff) ──
2110
3963
  local load_avg
2111
3964
  load_avg=$(uptime | awk -F'load averages?: ' '{print $2}' | awk -F'[, ]+' '{print $1}' 2>/dev/null || echo "0")
2112
- # Validate numeric
2113
3965
  if [[ ! "$load_avg" =~ ^[0-9]+\.?[0-9]*$ ]]; then
2114
3966
  load_avg="0"
2115
3967
  fi
@@ -2117,17 +3969,28 @@ daemon_auto_scale() {
2117
3969
  if [[ "$cpu_cores" -gt 0 ]]; then
2118
3970
  load_ratio=$(awk -v load="$load_avg" -v cores="$cpu_cores" 'BEGIN { printf "%.0f", (load / cores) * 100 }')
2119
3971
  fi
2120
- if [[ "$load_ratio" -gt 90 ]]; then
2121
- # System under heavy load scale down to min
3972
+ # Gradual load scaling curve (replaces binary 90% cliff)
3973
+ if [[ "$load_ratio" -gt 95 ]]; then
3974
+ # 95%+: minimum workers only
2122
3975
  max_by_cpu="$MIN_WORKERS"
2123
- daemon_log WARN "Auto-scale: high load (${load_avg}/${cpu_cores} cores) — constraining to ${max_by_cpu}"
3976
+ daemon_log WARN "Auto-scale: critical load (${load_ratio}%) — minimum workers only"
3977
+ elif [[ "$load_ratio" -gt 85 ]]; then
3978
+ # 85-95%: reduce by 50%
3979
+ max_by_cpu=$(( max_by_cpu / 2 ))
3980
+ [[ "$max_by_cpu" -lt "$MIN_WORKERS" ]] && max_by_cpu="$MIN_WORKERS"
3981
+ daemon_log WARN "Auto-scale: high load (${load_ratio}%) — reducing capacity 50%"
3982
+ elif [[ "$load_ratio" -gt 70 ]]; then
3983
+ # 70-85%: reduce by 25%
3984
+ max_by_cpu=$(( (max_by_cpu * 3) / 4 ))
3985
+ [[ "$max_by_cpu" -lt "$MIN_WORKERS" ]] && max_by_cpu="$MIN_WORKERS"
3986
+ daemon_log INFO "Auto-scale: moderate load (${load_ratio}%) — reducing capacity 25%"
2124
3987
  fi
3988
+ # 0-70%: full capacity (no change)
2125
3989
 
2126
3990
  # ── Available memory ──
2127
3991
  local avail_mem_gb=8
2128
3992
  if [[ "$(uname -s)" == "Darwin" ]]; then
2129
3993
  local page_size free_pages inactive_pages purgeable_pages speculative_pages
2130
- # Page size is in format: "(page size of 16384 bytes)"
2131
3994
  page_size=$(vm_stat | awk '/page size of/ {for(i=1;i<=NF;i++) if($i ~ /^[0-9]+$/) print $i}')
2132
3995
  page_size="${page_size:-16384}"
2133
3996
  free_pages=$(vm_stat | awk '/^Pages free:/ {gsub(/\./, "", $NF); print $NF}')
@@ -2138,7 +4001,6 @@ daemon_auto_scale() {
2138
4001
  inactive_pages="${inactive_pages:-0}"
2139
4002
  purgeable_pages=$(vm_stat | awk '/^Pages purgeable:/ {gsub(/\./, "", $NF); print $NF}')
2140
4003
  purgeable_pages="${purgeable_pages:-0}"
2141
- # Available ≈ free + speculative + inactive + purgeable
2142
4004
  local avail_pages=$(( free_pages + speculative_pages + inactive_pages + purgeable_pages ))
2143
4005
  if [[ "$avail_pages" -gt 0 && "$page_size" -gt 0 ]]; then
2144
4006
  local free_bytes=$(( avail_pages * page_size ))
@@ -2153,13 +4015,13 @@ daemon_auto_scale() {
2153
4015
  local max_by_mem=$(( avail_mem_gb / WORKER_MEM_GB ))
2154
4016
  [[ "$max_by_mem" -lt 1 ]] && max_by_mem=1
2155
4017
 
2156
- # ── Budget remaining ──
4018
+ # ── Budget remaining (adaptive cost estimate) ──
2157
4019
  local max_by_budget="$MAX_WORKERS"
2158
4020
  local remaining_usd
2159
- remaining_usd=$("$SCRIPT_DIR/cct-cost.sh" remaining-budget 2>/dev/null || echo "unlimited")
4021
+ remaining_usd=$("$SCRIPT_DIR/sw-cost.sh" remaining-budget 2>/dev/null || echo "unlimited")
2160
4022
  if [[ "$remaining_usd" != "unlimited" && -n "$remaining_usd" ]]; then
2161
- if awk -v r="$remaining_usd" -v c="$EST_COST_PER_JOB" 'BEGIN { exit !(r > 0 && c > 0) }'; then
2162
- max_by_budget=$(awk -v r="$remaining_usd" -v c="$EST_COST_PER_JOB" 'BEGIN { printf "%.0f", r / c }')
4023
+ if awk -v r="$remaining_usd" -v c="$effective_cost_per_job" 'BEGIN { exit !(r > 0 && c > 0) }'; then
4024
+ max_by_budget=$(awk -v r="$remaining_usd" -v c="$effective_cost_per_job" 'BEGIN { printf "%.0f", r / c }')
2163
4025
  [[ "$max_by_budget" -lt 0 ]] && max_by_budget=0
2164
4026
  else
2165
4027
  max_by_budget=0
@@ -2192,10 +4054,31 @@ daemon_auto_scale() {
2192
4054
  # Clamp to min_workers
2193
4055
  [[ "$computed" -lt "$MIN_WORKERS" ]] && computed="$MIN_WORKERS"
2194
4056
 
4057
+ # ── Gradual scaling: change by at most 1 at a time (adaptive) ──
4058
+ if [[ "${ADAPTIVE_THRESHOLDS_ENABLED:-false}" == "true" ]]; then
4059
+ if [[ "$computed" -gt "$prev_max" ]]; then
4060
+ # Check success rate at target parallelism before scaling up
4061
+ local target_rate
4062
+ target_rate=$(get_success_rate_at_parallelism "$((prev_max + 1))")
4063
+ if [[ "$target_rate" -lt 50 ]]; then
4064
+ # Poor success rate at higher parallelism — hold steady
4065
+ computed="$prev_max"
4066
+ daemon_log INFO "Auto-scale: holding at ${prev_max} (success rate ${target_rate}% at $((prev_max + 1)))"
4067
+ else
4068
+ # Scale up by 1, not jump to target
4069
+ computed=$((prev_max + 1))
4070
+ fi
4071
+ elif [[ "$computed" -lt "$prev_max" ]]; then
4072
+ # Scale down by 1, not drop to minimum
4073
+ computed=$((prev_max - 1))
4074
+ [[ "$computed" -lt "$MIN_WORKERS" ]] && computed="$MIN_WORKERS"
4075
+ fi
4076
+ fi
4077
+
2195
4078
  MAX_PARALLEL="$computed"
2196
4079
 
2197
4080
  if [[ "$MAX_PARALLEL" -ne "$prev_max" ]]; then
2198
- daemon_log INFO "Auto-scale: ${prev_max} → ${MAX_PARALLEL} (cpu=${max_by_cpu} mem=${max_by_mem} budget=${max_by_budget} queue=${max_by_queue})"
4081
+ daemon_log INFO "Auto-scale: ${prev_max} → ${MAX_PARALLEL} (cpu=${max_by_cpu} mem=${max_by_mem} budget=${max_by_budget} queue=${max_by_queue} load=${load_ratio}%)"
2199
4082
  emit_event "daemon.scale" \
2200
4083
  "from=$prev_max" \
2201
4084
  "to=$MAX_PARALLEL" \
@@ -2205,7 +4088,8 @@ daemon_auto_scale() {
2205
4088
  "max_by_queue=$max_by_queue" \
2206
4089
  "cpu_cores=$cpu_cores" \
2207
4090
  "avail_mem_gb=$avail_mem_gb" \
2208
- "remaining_usd=$remaining_usd"
4091
+ "remaining_usd=$remaining_usd" \
4092
+ "load_ratio=$load_ratio"
2209
4093
  fi
2210
4094
  }
2211
4095
 
@@ -2213,7 +4097,7 @@ daemon_auto_scale() {
2213
4097
  # Checks for fleet-reload.flag and reloads MAX_PARALLEL from fleet-managed config
2214
4098
 
2215
4099
  daemon_reload_config() {
2216
- local reload_flag="$HOME/.claude-teams/fleet-reload.flag"
4100
+ local reload_flag="$HOME/.shipwright/fleet-reload.flag"
2217
4101
  if [[ ! -f "$reload_flag" ]]; then
2218
4102
  return
2219
4103
  fi
@@ -2245,6 +4129,15 @@ daemon_self_optimize() {
2245
4129
  return
2246
4130
  fi
2247
4131
 
4132
+ # ── Intelligence-powered optimization (if enabled) ──
4133
+ if [[ "${OPTIMIZATION_ENABLED:-false}" == "true" ]] && type optimize_full_analysis &>/dev/null 2>&1; then
4134
+ daemon_log INFO "Running intelligence-powered optimization"
4135
+ optimize_full_analysis 2>/dev/null || {
4136
+ daemon_log WARN "Intelligence optimization failed — falling back to DORA-based tuning"
4137
+ }
4138
+ # Still run DORA-based tuning below as a complement
4139
+ fi
4140
+
2248
4141
  daemon_log INFO "Running self-optimization check"
2249
4142
 
2250
4143
  # Read DORA metrics from recent events (last 7 days)
@@ -2339,13 +4232,10 @@ daemon_self_optimize() {
2339
4232
  local adj_str
2340
4233
  adj_str=$(printf '%s; ' "${adjustments[@]}")
2341
4234
 
2342
- local tmp_state
2343
- tmp_state=$(jq \
4235
+ locked_state_update \
2344
4236
  --arg adj "$adj_str" \
2345
4237
  --arg ts "$(now_iso)" \
2346
- '.last_optimization = {timestamp: $ts, adjustments: $adj}' \
2347
- "$STATE_FILE")
2348
- atomic_write_state "$tmp_state"
4238
+ '.last_optimization = {timestamp: $ts, adjustments: $adj}'
2349
4239
 
2350
4240
  # ── Persist adjustments to daemon-config.json (survives restart) ──
2351
4241
  local config_file="${CONFIG_PATH:-.claude/daemon-config.json}"
@@ -2427,24 +4317,59 @@ daemon_cleanup_stale() {
2427
4317
  done < <(find "$artifacts_dir" -mindepth 1 -maxdepth 1 -type d 2>/dev/null)
2428
4318
  fi
2429
4319
 
2430
- # ── 3. Prune completed/failed state entries older than age_days ──
4320
+ # ── 3. Clean orphaned daemon/* branches (no matching worktree or active job) ──
4321
+ if command -v git &>/dev/null; then
4322
+ while IFS= read -r branch; do
4323
+ [[ -z "$branch" ]] && continue
4324
+ branch="${branch## }" # trim leading spaces
4325
+ # Only clean daemon-created branches
4326
+ [[ "$branch" == daemon/issue-* ]] || continue
4327
+ # Extract issue number
4328
+ local branch_issue_num="${branch#daemon/issue-}"
4329
+ # Skip if there's an active job for this issue
4330
+ if daemon_is_inflight "$branch_issue_num" 2>/dev/null; then
4331
+ continue
4332
+ fi
4333
+ daemon_log INFO "Removing orphaned branch: ${branch}"
4334
+ git branch -D "$branch" 2>/dev/null || true
4335
+ cleaned=$((cleaned + 1))
4336
+ done < <(git branch --list 'daemon/issue-*' 2>/dev/null)
4337
+ fi
4338
+
4339
+ # ── 4. Prune completed/failed state entries older than age_days ──
2431
4340
  if [[ -f "$STATE_FILE" ]]; then
2432
4341
  local cutoff_iso
2433
4342
  cutoff_iso=$(epoch_to_iso $((now_e - age_secs)))
2434
- local before_count after_count
4343
+ local before_count
2435
4344
  before_count=$(jq '.completed | length' "$STATE_FILE" 2>/dev/null || echo 0)
2436
- local tmp_state
2437
- tmp_state=$(jq --arg cutoff "$cutoff_iso" \
2438
- '.completed = [.completed[] | select(.completed_at > $cutoff)]' \
2439
- "$STATE_FILE" 2>/dev/null) || true
2440
- if [[ -n "$tmp_state" ]]; then
2441
- atomic_write_state "$tmp_state"
2442
- after_count=$(jq '.completed | length' "$STATE_FILE" 2>/dev/null || echo 0)
2443
- local pruned=$((before_count - after_count))
2444
- if [[ "$pruned" -gt 0 ]]; then
2445
- daemon_log INFO "Pruned ${pruned} old completed state entries"
2446
- cleaned=$((cleaned + pruned))
4345
+ locked_state_update --arg cutoff "$cutoff_iso" \
4346
+ '.completed = [.completed[] | select(.completed_at > $cutoff)]' 2>/dev/null || true
4347
+ local after_count
4348
+ after_count=$(jq '.completed | length' "$STATE_FILE" 2>/dev/null || echo 0)
4349
+ local pruned=$((before_count - after_count))
4350
+ if [[ "$pruned" -gt 0 ]]; then
4351
+ daemon_log INFO "Pruned ${pruned} old completed state entries"
4352
+ cleaned=$((cleaned + pruned))
4353
+ fi
4354
+ fi
4355
+
4356
+ # ── 5. Prune stale retry_counts (issues no longer in flight or queued) ──
4357
+ if [[ -f "$STATE_FILE" ]]; then
4358
+ local retry_keys
4359
+ retry_keys=$(jq -r '.retry_counts // {} | keys[]' "$STATE_FILE" 2>/dev/null || true)
4360
+ local stale_keys=()
4361
+ while IFS= read -r key; do
4362
+ [[ -z "$key" ]] && continue
4363
+ if ! daemon_is_inflight "$key" 2>/dev/null; then
4364
+ stale_keys+=("$key")
2447
4365
  fi
4366
+ done <<< "$retry_keys"
4367
+ if [[ ${#stale_keys[@]} -gt 0 ]]; then
4368
+ for sk in "${stale_keys[@]}"; do
4369
+ locked_state_update --arg k "$sk" 'del(.retry_counts[$k])' 2>/dev/null || continue
4370
+ done
4371
+ daemon_log INFO "Pruned ${#stale_keys[@]} stale retry count(s)"
4372
+ cleaned=$((cleaned + ${#stale_keys[@]}))
2448
4373
  fi
2449
4374
  fi
2450
4375
 
@@ -2465,56 +4390,69 @@ daemon_poll_loop() {
2465
4390
  daemon_log INFO "Watching for label: ${CYAN}${WATCH_LABEL}${RESET}"
2466
4391
 
2467
4392
  while [[ ! -f "$SHUTDOWN_FLAG" ]]; do
2468
- daemon_poll_issues
2469
- daemon_reap_completed
2470
- daemon_health_check
4393
+ # All poll loop calls are error-guarded to prevent set -e from killing the daemon.
4394
+ # The || operator disables set -e for the entire call chain, so transient failures
4395
+ # (GitHub API timeouts, jq errors, intelligence failures) are logged and skipped.
4396
+ daemon_poll_issues || daemon_log WARN "daemon_poll_issues failed — continuing"
4397
+ daemon_reap_completed || daemon_log WARN "daemon_reap_completed failed — continuing"
4398
+ daemon_health_check || daemon_log WARN "daemon_health_check failed — continuing"
2471
4399
 
2472
4400
  # Increment cycle counter (must be before all modulo checks)
2473
4401
  POLL_CYCLE_COUNT=$((POLL_CYCLE_COUNT + 1))
2474
4402
 
2475
4403
  # Fleet config reload every 3 cycles
2476
4404
  if [[ $((POLL_CYCLE_COUNT % 3)) -eq 0 ]]; then
2477
- daemon_reload_config
4405
+ daemon_reload_config || daemon_log WARN "daemon_reload_config failed — continuing"
2478
4406
  fi
2479
4407
 
2480
4408
  # Check degradation every 5 poll cycles
2481
4409
  if [[ $((POLL_CYCLE_COUNT % 5)) -eq 0 ]]; then
2482
- daemon_check_degradation
4410
+ daemon_check_degradation || daemon_log WARN "daemon_check_degradation failed — continuing"
2483
4411
  fi
2484
4412
 
2485
4413
  # Auto-scale every N cycles (default: 5)
2486
4414
  if [[ $((POLL_CYCLE_COUNT % ${AUTO_SCALE_INTERVAL:-5})) -eq 0 ]]; then
2487
- daemon_auto_scale
4415
+ daemon_auto_scale || daemon_log WARN "daemon_auto_scale failed — continuing"
2488
4416
  fi
2489
4417
 
2490
4418
  # Self-optimize every N cycles (default: 10)
2491
4419
  if [[ $((POLL_CYCLE_COUNT % ${OPTIMIZE_INTERVAL:-10})) -eq 0 ]]; then
2492
- daemon_self_optimize
4420
+ daemon_self_optimize || daemon_log WARN "daemon_self_optimize failed — continuing"
2493
4421
  fi
2494
4422
 
2495
4423
  # Stale state reaper every N cycles (default: 10)
2496
4424
  if [[ $((POLL_CYCLE_COUNT % ${STALE_REAPER_INTERVAL:-10})) -eq 0 ]]; then
2497
- daemon_cleanup_stale
4425
+ daemon_cleanup_stale || daemon_log WARN "daemon_cleanup_stale failed — continuing"
2498
4426
  fi
2499
4427
 
2500
- # Proactive patrol during quiet periods
4428
+ # Rotate event log every 10 cycles (~10 min with 60s interval)
4429
+ if [[ $((POLL_CYCLE_COUNT % 10)) -eq 0 ]]; then
4430
+ rotate_event_log || true
4431
+ fi
4432
+
4433
+ # Proactive patrol during quiet periods (with adaptive limits)
2501
4434
  local issue_count_now active_count_now
2502
4435
  issue_count_now=$(jq -r '.queued | length' "$STATE_FILE" 2>/dev/null || echo 0)
2503
- active_count_now=$(get_active_count)
4436
+ active_count_now=$(get_active_count || echo 0)
2504
4437
  if [[ "$issue_count_now" -eq 0 ]] && [[ "$active_count_now" -eq 0 ]]; then
2505
4438
  local now_e
2506
- now_e=$(now_epoch)
4439
+ now_e=$(now_epoch || date +%s)
2507
4440
  if [[ $((now_e - LAST_PATROL_EPOCH)) -ge "$PATROL_INTERVAL" ]]; then
4441
+ load_adaptive_patrol_limits || true
2508
4442
  daemon_log INFO "No active work — running patrol"
2509
- daemon_patrol --once
4443
+ daemon_patrol --once || daemon_log WARN "daemon_patrol failed — continuing"
2510
4444
  LAST_PATROL_EPOCH=$now_e
2511
4445
  fi
2512
4446
  fi
2513
4447
 
4448
+ # ── Adaptive poll interval: adjust sleep based on queue state ──
4449
+ local effective_interval
4450
+ effective_interval=$(get_adaptive_poll_interval "$issue_count_now" "$active_count_now" || echo "${POLL_INTERVAL:-30}")
4451
+
2514
4452
  # Sleep in 1s intervals so we can catch shutdown quickly
2515
4453
  local i=0
2516
- while [[ $i -lt $POLL_INTERVAL ]] && [[ ! -f "$SHUTDOWN_FLAG" ]]; do
2517
- sleep 1
4454
+ while [[ $i -lt $effective_interval ]] && [[ ! -f "$SHUTDOWN_FLAG" ]]; do
4455
+ sleep 1 || true # Guard against signal interruption under set -e
2518
4456
  i=$((i + 1))
2519
4457
  done
2520
4458
  done
@@ -2525,7 +4463,39 @@ daemon_poll_loop() {
2525
4463
  # ─── Graceful Shutdown Handler ───────────────────────────────────────────────
2526
4464
 
2527
4465
  cleanup_on_exit() {
2528
- daemon_log INFO "Cleaning up..."
4466
+ local exit_code=$?
4467
+ local last_cmd="${BASH_COMMAND:-unknown}"
4468
+ daemon_log INFO "Cleaning up... (exit_code=${exit_code}, last_command=${last_cmd})"
4469
+
4470
+ # Kill all active pipeline child processes
4471
+ if [[ -f "$STATE_FILE" ]]; then
4472
+ local child_pids
4473
+ child_pids=$(jq -r '.active_jobs[].pid // empty' "$STATE_FILE" 2>/dev/null || true)
4474
+ if [[ -n "$child_pids" ]]; then
4475
+ local killed=0
4476
+ while IFS= read -r cpid; do
4477
+ [[ -z "$cpid" ]] && continue
4478
+ if kill -0 "$cpid" 2>/dev/null; then
4479
+ daemon_log INFO "Killing pipeline process PID ${cpid}"
4480
+ kill "$cpid" 2>/dev/null || true
4481
+ killed=$((killed + 1))
4482
+ fi
4483
+ done <<< "$child_pids"
4484
+ if [[ $killed -gt 0 ]]; then
4485
+ daemon_log INFO "Sent SIGTERM to ${killed} pipeline process(es) — waiting 5s"
4486
+ sleep 5
4487
+ # Force-kill any that didn't exit
4488
+ while IFS= read -r cpid; do
4489
+ [[ -z "$cpid" ]] && continue
4490
+ if kill -0 "$cpid" 2>/dev/null; then
4491
+ daemon_log WARN "Force-killing pipeline PID ${cpid}"
4492
+ kill -9 "$cpid" 2>/dev/null || true
4493
+ fi
4494
+ done <<< "$child_pids"
4495
+ fi
4496
+ fi
4497
+ fi
4498
+
2529
4499
  rm -f "$PID_FILE" "$SHUTDOWN_FLAG"
2530
4500
  daemon_log INFO "Daemon stopped"
2531
4501
  emit_event "daemon.stopped" "pid=$$"
@@ -2544,15 +4514,19 @@ daemon_start() {
2544
4514
  local existing_pid
2545
4515
  existing_pid=$(cat "$PID_FILE" 2>/dev/null || true)
2546
4516
  if [[ -n "$existing_pid" ]] && kill -0 "$existing_pid" 2>/dev/null; then
4517
+ exec 9>&- # Release FD before exiting
2547
4518
  error "Daemon already running (PID: ${existing_pid})"
2548
4519
  info "Use ${CYAN}shipwright daemon stop${RESET} to stop it first"
2549
4520
  exit 1
2550
4521
  else
2551
4522
  warn "Stale PID file found — removing"
2552
4523
  rm -f "$PID_FILE"
4524
+ exec 9>&- # Release old FD
2553
4525
  exec 9>"$PID_FILE"
2554
4526
  fi
2555
4527
  fi
4528
+ # Release FD 9 — we only needed it for the startup race check
4529
+ exec 9>&-
2556
4530
 
2557
4531
  # Load config
2558
4532
  load_config
@@ -2569,10 +4543,10 @@ daemon_start() {
2569
4543
  exit 1
2570
4544
  fi
2571
4545
 
2572
- info "Starting daemon in detached tmux session: ${CYAN}cct-daemon${RESET}"
4546
+ info "Starting daemon in detached tmux session: ${CYAN}sw-daemon${RESET}"
2573
4547
 
2574
4548
  # Build the command to run in tmux
2575
- local cmd_args=("$SCRIPT_DIR/cct-daemon.sh" "start")
4549
+ local cmd_args=("$SCRIPT_DIR/sw-daemon.sh" "start")
2576
4550
  if [[ -n "$CONFIG_PATH" ]]; then
2577
4551
  cmd_args+=("--config" "$CONFIG_PATH")
2578
4552
  fi
@@ -2580,14 +4554,16 @@ daemon_start() {
2580
4554
  cmd_args+=("--no-github")
2581
4555
  fi
2582
4556
 
2583
- tmux new-session -d -s "cct-daemon" "${cmd_args[*]}" 2>/dev/null || {
4557
+ # Export current PATH so detached session finds claude, gh, etc.
4558
+ local tmux_cmd="export PATH='${PATH}'; ${cmd_args[*]}"
4559
+ tmux new-session -d -s "sw-daemon" "$tmux_cmd" 2>/dev/null || {
2584
4560
  # Session may already exist — try killing and recreating
2585
- tmux kill-session -t "cct-daemon" 2>/dev/null || true
2586
- tmux new-session -d -s "cct-daemon" "${cmd_args[*]}"
4561
+ tmux kill-session -t "sw-daemon" 2>/dev/null || true
4562
+ tmux new-session -d -s "sw-daemon" "$tmux_cmd"
2587
4563
  }
2588
4564
 
2589
- success "Daemon started in tmux session ${CYAN}cct-daemon${RESET}"
2590
- info "Attach with: ${DIM}tmux attach -t cct-daemon${RESET}"
4565
+ success "Daemon started in tmux session ${CYAN}sw-daemon${RESET}"
4566
+ info "Attach with: ${DIM}tmux attach -t sw-daemon${RESET}"
2591
4567
  info "View logs: ${DIM}shipwright daemon logs --follow${RESET}"
2592
4568
  return 0
2593
4569
  fi
@@ -2595,8 +4571,10 @@ daemon_start() {
2595
4571
  # Foreground mode
2596
4572
  info "Starting daemon (PID: $$)"
2597
4573
 
2598
- # Write PID file
2599
- echo "$$" > "$PID_FILE"
4574
+ # Write PID file atomically
4575
+ local pid_tmp="${PID_FILE}.tmp.$$"
4576
+ echo "$$" > "$pid_tmp"
4577
+ mv "$pid_tmp" "$PID_FILE"
2600
4578
 
2601
4579
  # Remove stale shutdown flag
2602
4580
  rm -f "$SHUTDOWN_FLAG"
@@ -2606,10 +4584,26 @@ daemon_start() {
2606
4584
 
2607
4585
  # Trap signals for graceful shutdown
2608
4586
  trap cleanup_on_exit EXIT
2609
- trap 'touch "$SHUTDOWN_FLAG"' SIGINT SIGTERM
4587
+ trap '{ echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] [WARN] SIGINT/SIGTERM received — initiating shutdown" >> "$LOG_FILE" 2>/dev/null; } || true; touch "$SHUTDOWN_FLAG"' SIGINT SIGTERM
4588
+ # Ignore SIGHUP — tmux sends this on attach/detach and we must survive it
4589
+ trap '' SIGHUP
4590
+ # Ignore SIGPIPE — broken pipes in command substitutions must not kill the daemon
4591
+ trap '' SIGPIPE
4592
+
4593
+ # Override global ERR trap to log to daemon log file (not stderr, which is lost when tmux dies)
4594
+ trap '{ echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] [ERROR] ERR trap: line=$LINENO exit=$? cmd=$BASH_COMMAND" >> "$LOG_FILE" 2>/dev/null; } || true' ERR
2610
4595
 
2611
4596
  # Reap any orphaned jobs from previous runs
2612
- daemon_reap_completed
4597
+ daemon_reap_completed || daemon_log WARN "Failed to reap orphaned jobs — continuing"
4598
+
4599
+ # Clean up stale temp files from previous crashes
4600
+ find "$(dirname "$STATE_FILE")" -name "*.tmp.*" -mmin +5 -delete 2>/dev/null || true
4601
+
4602
+ # Rotate event log on startup
4603
+ rotate_event_log || true
4604
+
4605
+ # Load GitHub context (repo metadata, security alerts, etc.)
4606
+ daemon_github_context || daemon_log WARN "Failed to load GitHub context — continuing without it"
2613
4607
 
2614
4608
  daemon_log INFO "Daemon started successfully"
2615
4609
  daemon_log INFO "Config: poll_interval=${POLL_INTERVAL}s, max_parallel=${MAX_PARALLEL}, label=${WATCH_LABEL}"
@@ -2673,7 +4667,7 @@ daemon_stop() {
2673
4667
  rm -f "$PID_FILE" "$SHUTDOWN_FLAG"
2674
4668
 
2675
4669
  # Also kill tmux session if it exists
2676
- tmux kill-session -t "cct-daemon" 2>/dev/null || true
4670
+ tmux kill-session -t "sw-daemon" 2>/dev/null || true
2677
4671
 
2678
4672
  success "Daemon stopped"
2679
4673
  }
@@ -2819,7 +4813,14 @@ daemon_init() {
2819
4813
  "patrol": {
2820
4814
  "interval": 3600,
2821
4815
  "max_issues": 5,
2822
- "label": "auto-patrol"
4816
+ "label": "auto-patrol",
4817
+ "auto_watch": false,
4818
+ "checks": {
4819
+ "recurring_failures": { "enabled": true, "threshold": 3 },
4820
+ "dora_degradation": { "enabled": true },
4821
+ "untested_scripts": { "enabled": true },
4822
+ "retry_exhaustion": { "enabled": true, "threshold": 2 }
4823
+ }
2823
4824
  },
2824
4825
  "auto_template": false,
2825
4826
  "template_map": {
@@ -2841,7 +4842,19 @@ daemon_init() {
2841
4842
  "max_workers": 8,
2842
4843
  "min_workers": 1,
2843
4844
  "worker_mem_gb": 4,
2844
- "estimated_cost_per_job_usd": 5.0
4845
+ "estimated_cost_per_job_usd": 5.0,
4846
+ "intelligence": {
4847
+ "enabled": true,
4848
+ "cache_ttl_seconds": 3600,
4849
+ "composer_enabled": true,
4850
+ "optimization_enabled": true,
4851
+ "prediction_enabled": true,
4852
+ "adversarial_enabled": false,
4853
+ "simulation_enabled": false,
4854
+ "architecture_enabled": false,
4855
+ "ab_test_ratio": 0.2,
4856
+ "anomaly_threshold": 3.0
4857
+ }
2845
4858
  }
2846
4859
  CONFIGEOF
2847
4860
 
@@ -3175,7 +5188,7 @@ case "$SUBCOMMAND" in
3175
5188
  daemon_patrol "$@"
3176
5189
  ;;
3177
5190
  test)
3178
- exec "$SCRIPT_DIR/cct-daemon-test.sh" "$@"
5191
+ exec "$SCRIPT_DIR/sw-daemon-test.sh" "$@"
3179
5192
  ;;
3180
5193
  help|--help|-h)
3181
5194
  show_help