shipwright-cli 1.10.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +221 -55
- package/completions/_shipwright +264 -32
- package/completions/shipwright.bash +118 -26
- package/completions/shipwright.fish +80 -2
- package/dashboard/server.ts +208 -0
- package/docs/strategy/01-market-research.md +619 -0
- package/docs/strategy/02-mission-and-brand.md +587 -0
- package/docs/strategy/03-gtm-and-roadmap.md +759 -0
- package/docs/strategy/QUICK-START.txt +289 -0
- package/docs/strategy/README.md +172 -0
- package/docs/tmux-research/TMUX-ARCHITECTURE.md +567 -0
- package/docs/tmux-research/TMUX-AUDIT.md +925 -0
- package/docs/tmux-research/TMUX-BEST-PRACTICES-2025-2026.md +829 -0
- package/docs/tmux-research/TMUX-QUICK-REFERENCE.md +543 -0
- package/docs/tmux-research/TMUX-RESEARCH-INDEX.md +438 -0
- package/package.json +4 -2
- package/scripts/lib/helpers.sh +7 -0
- package/scripts/sw +323 -2
- package/scripts/sw-activity.sh +500 -0
- package/scripts/sw-adaptive.sh +925 -0
- package/scripts/sw-adversarial.sh +1 -1
- package/scripts/sw-architecture-enforcer.sh +1 -1
- package/scripts/sw-auth.sh +613 -0
- package/scripts/sw-autonomous.sh +754 -0
- package/scripts/sw-changelog.sh +704 -0
- package/scripts/sw-checkpoint.sh +1 -1
- package/scripts/sw-ci.sh +602 -0
- package/scripts/sw-cleanup.sh +1 -1
- package/scripts/sw-code-review.sh +698 -0
- package/scripts/sw-connect.sh +1 -1
- package/scripts/sw-context.sh +605 -0
- package/scripts/sw-cost.sh +44 -3
- package/scripts/sw-daemon.sh +568 -138
- package/scripts/sw-dashboard.sh +1 -1
- package/scripts/sw-db.sh +1380 -0
- package/scripts/sw-decompose.sh +539 -0
- package/scripts/sw-deps.sh +551 -0
- package/scripts/sw-developer-simulation.sh +1 -1
- package/scripts/sw-discovery.sh +412 -0
- package/scripts/sw-docs-agent.sh +539 -0
- package/scripts/sw-docs.sh +1 -1
- package/scripts/sw-doctor.sh +107 -1
- package/scripts/sw-dora.sh +615 -0
- package/scripts/sw-durable.sh +710 -0
- package/scripts/sw-e2e-orchestrator.sh +535 -0
- package/scripts/sw-eventbus.sh +393 -0
- package/scripts/sw-feedback.sh +479 -0
- package/scripts/sw-fix.sh +1 -1
- package/scripts/sw-fleet-discover.sh +567 -0
- package/scripts/sw-fleet-viz.sh +404 -0
- package/scripts/sw-fleet.sh +8 -1
- package/scripts/sw-github-app.sh +596 -0
- package/scripts/sw-github-checks.sh +4 -4
- package/scripts/sw-github-deploy.sh +1 -1
- package/scripts/sw-github-graphql.sh +1 -1
- package/scripts/sw-guild.sh +569 -0
- package/scripts/sw-heartbeat.sh +1 -1
- package/scripts/sw-hygiene.sh +559 -0
- package/scripts/sw-incident.sh +656 -0
- package/scripts/sw-init.sh +237 -24
- package/scripts/sw-instrument.sh +699 -0
- package/scripts/sw-intelligence.sh +1 -1
- package/scripts/sw-jira.sh +1 -1
- package/scripts/sw-launchd.sh +363 -28
- package/scripts/sw-linear.sh +1 -1
- package/scripts/sw-logs.sh +1 -1
- package/scripts/sw-loop.sh +267 -21
- package/scripts/sw-memory.sh +18 -1
- package/scripts/sw-mission-control.sh +487 -0
- package/scripts/sw-model-router.sh +545 -0
- package/scripts/sw-otel.sh +596 -0
- package/scripts/sw-oversight.sh +764 -0
- package/scripts/sw-pipeline-composer.sh +1 -1
- package/scripts/sw-pipeline-vitals.sh +1 -1
- package/scripts/sw-pipeline.sh +947 -35
- package/scripts/sw-pm.sh +758 -0
- package/scripts/sw-pr-lifecycle.sh +522 -0
- package/scripts/sw-predictive.sh +8 -1
- package/scripts/sw-prep.sh +1 -1
- package/scripts/sw-ps.sh +1 -1
- package/scripts/sw-public-dashboard.sh +798 -0
- package/scripts/sw-quality.sh +595 -0
- package/scripts/sw-reaper.sh +1 -1
- package/scripts/sw-recruit.sh +2248 -0
- package/scripts/sw-regression.sh +642 -0
- package/scripts/sw-release-manager.sh +736 -0
- package/scripts/sw-release.sh +706 -0
- package/scripts/sw-remote.sh +1 -1
- package/scripts/sw-replay.sh +520 -0
- package/scripts/sw-retro.sh +691 -0
- package/scripts/sw-scale.sh +444 -0
- package/scripts/sw-security-audit.sh +505 -0
- package/scripts/sw-self-optimize.sh +1 -1
- package/scripts/sw-session.sh +1 -1
- package/scripts/sw-setup.sh +263 -127
- package/scripts/sw-standup.sh +712 -0
- package/scripts/sw-status.sh +44 -2
- package/scripts/sw-strategic.sh +806 -0
- package/scripts/sw-stream.sh +450 -0
- package/scripts/sw-swarm.sh +620 -0
- package/scripts/sw-team-stages.sh +511 -0
- package/scripts/sw-templates.sh +4 -4
- package/scripts/sw-testgen.sh +566 -0
- package/scripts/sw-tmux-pipeline.sh +554 -0
- package/scripts/sw-tmux-role-color.sh +58 -0
- package/scripts/sw-tmux-status.sh +128 -0
- package/scripts/sw-tmux.sh +1 -1
- package/scripts/sw-trace.sh +485 -0
- package/scripts/sw-tracker-github.sh +188 -0
- package/scripts/sw-tracker-jira.sh +172 -0
- package/scripts/sw-tracker-linear.sh +251 -0
- package/scripts/sw-tracker.sh +117 -2
- package/scripts/sw-triage.sh +627 -0
- package/scripts/sw-upgrade.sh +1 -1
- package/scripts/sw-ux.sh +677 -0
- package/scripts/sw-webhook.sh +627 -0
- package/scripts/sw-widgets.sh +530 -0
- package/scripts/sw-worktree.sh +1 -1
- package/templates/pipelines/autonomous.json +2 -2
- package/tmux/shipwright-overlay.conf +35 -17
- package/tmux/tmux.conf +23 -21
package/scripts/sw-daemon.sh
CHANGED
|
@@ -6,7 +6,10 @@
|
|
|
6
6
|
set -euo pipefail
|
|
7
7
|
trap 'echo "ERROR: $BASH_SOURCE:$LINENO exited with status $?" >&2' ERR
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
# Allow spawning Claude CLI from within a Claude Code session (daemon, fleet, etc.)
|
|
10
|
+
unset CLAUDECODE 2>/dev/null || true
|
|
11
|
+
|
|
12
|
+
VERSION="2.1.0"
|
|
10
13
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
11
14
|
REPO_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
12
15
|
|
|
@@ -37,6 +40,10 @@ RESET='\033[0m'
|
|
|
37
40
|
# shellcheck source=sw-pipeline-vitals.sh
|
|
38
41
|
[[ -f "$SCRIPT_DIR/sw-pipeline-vitals.sh" ]] && source "$SCRIPT_DIR/sw-pipeline-vitals.sh"
|
|
39
42
|
|
|
43
|
+
# ─── SQLite Persistence (optional) ──────────────────────────────────────────
|
|
44
|
+
# shellcheck source=sw-db.sh
|
|
45
|
+
[[ -f "$SCRIPT_DIR/sw-db.sh" ]] && source "$SCRIPT_DIR/sw-db.sh"
|
|
46
|
+
|
|
40
47
|
# ─── GitHub API Modules (optional) ────────────────────────────────────────
|
|
41
48
|
# shellcheck source=sw-github-graphql.sh
|
|
42
49
|
[[ -f "$SCRIPT_DIR/sw-github-graphql.sh" ]] && source "$SCRIPT_DIR/sw-github-graphql.sh"
|
|
@@ -478,9 +485,11 @@ load_config() {
|
|
|
478
485
|
|
|
479
486
|
# progress-based health monitoring (replaces static timeouts)
|
|
480
487
|
PROGRESS_MONITORING=$(jq -r '.health.progress_based // true' "$config_file")
|
|
481
|
-
PROGRESS_CHECKS_BEFORE_WARN=$(jq -r '.health.stale_checks_before_warn //
|
|
482
|
-
PROGRESS_CHECKS_BEFORE_KILL=$(jq -r '.health.stale_checks_before_kill //
|
|
483
|
-
PROGRESS_HARD_LIMIT_S=$(jq -r '.health.hard_limit_s //
|
|
488
|
+
PROGRESS_CHECKS_BEFORE_WARN=$(jq -r '.health.stale_checks_before_warn // 20' "$config_file")
|
|
489
|
+
PROGRESS_CHECKS_BEFORE_KILL=$(jq -r '.health.stale_checks_before_kill // 120' "$config_file")
|
|
490
|
+
PROGRESS_HARD_LIMIT_S=$(jq -r '.health.hard_limit_s // 0' "$config_file") # 0 = disabled (no hard kill)
|
|
491
|
+
NUDGE_ENABLED=$(jq -r '.health.nudge_enabled // true' "$config_file")
|
|
492
|
+
NUDGE_AFTER_CHECKS=$(jq -r '.health.nudge_after_checks // 40' "$config_file")
|
|
484
493
|
|
|
485
494
|
# team dashboard URL (for coordinated claiming)
|
|
486
495
|
local cfg_dashboard_url
|
|
@@ -502,11 +511,13 @@ load_config() {
|
|
|
502
511
|
|
|
503
512
|
setup_dirs() {
|
|
504
513
|
mkdir -p "$DAEMON_DIR"
|
|
514
|
+
mkdir -p "$HOME/.shipwright"
|
|
505
515
|
|
|
506
516
|
STATE_FILE="$DAEMON_DIR/daemon-state.json"
|
|
507
517
|
LOG_FILE="$DAEMON_DIR/daemon.log"
|
|
508
518
|
LOG_DIR="$DAEMON_DIR/logs"
|
|
509
519
|
WORKTREE_DIR=".worktrees"
|
|
520
|
+
PAUSE_FLAG="${HOME}/.shipwright/daemon-pause.flag"
|
|
510
521
|
|
|
511
522
|
mkdir -p "$LOG_DIR"
|
|
512
523
|
mkdir -p "$HOME/.shipwright/progress"
|
|
@@ -836,6 +847,31 @@ daemon_assess_progress() {
|
|
|
836
847
|
has_progress=true
|
|
837
848
|
fi
|
|
838
849
|
|
|
850
|
+
# Claude subprocess is alive and consuming CPU — agent is thinking/working
|
|
851
|
+
# During build stage, Claude can spend 10+ minutes thinking before any
|
|
852
|
+
# visible git changes appear. Detect this as progress.
|
|
853
|
+
if [[ "$has_progress" != "true" ]]; then
|
|
854
|
+
local _pid_for_check
|
|
855
|
+
_pid_for_check=$(echo "$current_snapshot" | jq -r '.pid // empty' 2>/dev/null || true)
|
|
856
|
+
if [[ -z "$_pid_for_check" ]]; then
|
|
857
|
+
# Fallback: get PID from active_jobs
|
|
858
|
+
_pid_for_check=$(jq -r --argjson num "$issue_num" \
|
|
859
|
+
'.active_jobs[] | select(.issue == ($num | tonumber)) | .pid' "$STATE_FILE" 2>/dev/null | head -1 || true)
|
|
860
|
+
fi
|
|
861
|
+
if [[ -n "$_pid_for_check" ]]; then
|
|
862
|
+
# Check if any child process (claude) is alive and using CPU
|
|
863
|
+
local child_cpu=0
|
|
864
|
+
child_cpu=$(ps -o pid=,pcpu= -p "$_pid_for_check" 2>/dev/null | awk '{sum+=$2} END{printf "%d", sum+0}' || echo "0")
|
|
865
|
+
if [[ "$child_cpu" -eq 0 ]]; then
|
|
866
|
+
# Check children of the pipeline process
|
|
867
|
+
child_cpu=$(pgrep -P "$_pid_for_check" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
|
|
868
|
+
fi
|
|
869
|
+
if [[ "${child_cpu:-0}" -gt 0 ]]; then
|
|
870
|
+
has_progress=true
|
|
871
|
+
fi
|
|
872
|
+
fi
|
|
873
|
+
fi
|
|
874
|
+
|
|
839
875
|
# Detect repeated errors (same error signature hitting again)
|
|
840
876
|
local repeated_errors="$prev_repeated_errors"
|
|
841
877
|
if [[ -n "$cur_error" && "$cur_error" == "$prev_error" ]]; then
|
|
@@ -1208,6 +1244,74 @@ gh_record_failure() {
|
|
|
1208
1244
|
fi
|
|
1209
1245
|
}
|
|
1210
1246
|
|
|
1247
|
+
# ─── Runtime Auth Check ──────────────────────────────────────────────────────
|
|
1248
|
+
|
|
1249
|
+
LAST_AUTH_CHECK_EPOCH=0
|
|
1250
|
+
AUTH_CHECK_INTERVAL=300 # 5 minutes
|
|
1251
|
+
|
|
1252
|
+
# Write the daemon pause flag with an auth-failure reason.
# Arguments: $1 - reason string stored in the flag's "reason" field
# Globals:   PAUSE_FLAG (read)
# The temp file is created next to $PAUSE_FLAG (not in $TMPDIR) so the final
# mv is a rename within one filesystem and readers never see a partial file.
_daemon_write_auth_pause_flag() {
  local reason="$1"
  local pause_json _tmp_pause
  pause_json=$(jq -n --arg reason "$reason" --arg ts "$(now_iso)" \
    '{reason: $reason, timestamp: $ts}')
  _tmp_pause=$(mktemp "${PAUSE_FLAG}.XXXXXX")
  echo "$pause_json" > "$_tmp_pause"
  mv "$_tmp_pause" "$PAUSE_FLAG"
}

# Periodic runtime check that the gh and claude CLIs are still authenticated.
# Globals:   LAST_AUTH_CHECK_EPOCH (read/written), AUTH_CHECK_INTERVAL (read),
#            NO_GITHUB (read), PAUSE_FLAG (written on failure)
# Returns:   0 if the check was throttled or both checks passed;
#            1 after logging, writing the pause flag and emitting
#            "daemon.auto_pause" when either check fails.
daemon_preflight_auth_check() {
  local now_e
  now_e=$(now_epoch)
  # Throttle: run the (slow) checks at most once per AUTH_CHECK_INTERVAL seconds.
  if [[ $((now_e - LAST_AUTH_CHECK_EPOCH)) -lt "$AUTH_CHECK_INTERVAL" ]]; then
    return 0
  fi
  LAST_AUTH_CHECK_EPOCH="$now_e"

  # gh auth check (skipped entirely when NO_GITHUB=true)
  if [[ "${NO_GITHUB:-false}" != "true" ]]; then
    if ! gh auth status >/dev/null 2>&1; then
      daemon_log ERROR "GitHub auth check failed — auto-pausing daemon"
      _daemon_write_auth_pause_flag "gh_auth_failure"
      emit_event "daemon.auto_pause" "reason=gh_auth_failure"
      return 1
    fi
  fi

  # claude auth check with 15s timeout (macOS has no timeout command)
  local claude_auth_ok=false
  local _auth_tmp
  _auth_tmp=$(mktemp "${TMPDIR:-/tmp}/sw-auth.XXXXXX")
  # Backgrounded directly (no wrapper subshell) so $! is the probe itself and
  # the kill below reaches it when the timeout expires.
  claude --print -p "ok" --max-turns 1 > "$_auth_tmp" 2>/dev/null &
  local _auth_pid=$!
  local _auth_waited=0
  while kill -0 "$_auth_pid" 2>/dev/null && [[ "$_auth_waited" -lt 15 ]]; do
    sleep 1
    _auth_waited=$((_auth_waited + 1))
  done
  # Still alive after the timeout: terminate it. Either way reap the job so
  # it does not linger as a zombie; its exit status is intentionally ignored.
  if kill -0 "$_auth_pid" 2>/dev/null; then
    kill "$_auth_pid" 2>/dev/null || true
  fi
  wait "$_auth_pid" 2>/dev/null || true

  # Any output at all from the probe is treated as proof of working auth.
  if [[ -s "$_auth_tmp" ]]; then
    claude_auth_ok=true
  fi
  rm -f "$_auth_tmp"

  if [[ "$claude_auth_ok" != "true" ]]; then
    daemon_log ERROR "Claude auth check failed — auto-pausing daemon"
    _daemon_write_auth_pause_flag "claude_auth_failure"
    emit_event "daemon.auto_pause" "reason=claude_auth_failure"
    return 1
  fi

  return 0
}
|
|
1314
|
+
|
|
1211
1315
|
# ─── Pre-flight Checks ──────────────────────────────────────────────────────
|
|
1212
1316
|
|
|
1213
1317
|
preflight_checks() {
|
|
@@ -1369,6 +1473,7 @@ init_state() {
|
|
|
1369
1473
|
queued: [],
|
|
1370
1474
|
completed: [],
|
|
1371
1475
|
retry_counts: {},
|
|
1476
|
+
failure_history: [],
|
|
1372
1477
|
priority_lane_active: [],
|
|
1373
1478
|
titles: {}
|
|
1374
1479
|
}')
|
|
@@ -1609,9 +1714,24 @@ daemon_spawn_pipeline() {
|
|
|
1609
1714
|
local issue_num="$1"
|
|
1610
1715
|
local issue_title="${2:-}"
|
|
1611
1716
|
local repo_full_name="${3:-}" # owner/repo (org mode only)
|
|
1717
|
+
shift 3 2>/dev/null || true
|
|
1718
|
+
local extra_pipeline_args=("$@") # Optional extra args passed to sw-pipeline.sh
|
|
1612
1719
|
|
|
1613
1720
|
daemon_log INFO "Spawning pipeline for issue #${issue_num}: ${issue_title}"
|
|
1614
1721
|
|
|
1722
|
+
# ── Issue decomposition (if decomposer available) ──
|
|
1723
|
+
local decompose_script="${SCRIPT_DIR}/sw-decompose.sh"
|
|
1724
|
+
if [[ -x "$decompose_script" && "$NO_GITHUB" != "true" ]]; then
|
|
1725
|
+
local decompose_result=""
|
|
1726
|
+
decompose_result=$("$decompose_script" auto "$issue_num" 2>/dev/null) || true
|
|
1727
|
+
if [[ "$decompose_result" == *"decomposed"* ]]; then
|
|
1728
|
+
daemon_log INFO "Issue #${issue_num} decomposed into subtasks — skipping pipeline"
|
|
1729
|
+
# Remove the shipwright label so decomposed parent doesn't re-queue
|
|
1730
|
+
gh issue edit "$issue_num" --remove-label "shipwright" 2>/dev/null || true
|
|
1731
|
+
return 0
|
|
1732
|
+
fi
|
|
1733
|
+
fi
|
|
1734
|
+
|
|
1615
1735
|
# Extract goal text from issue (title + first line of body)
|
|
1616
1736
|
local issue_goal="$issue_title"
|
|
1617
1737
|
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
@@ -1727,11 +1847,18 @@ daemon_spawn_pipeline() {
|
|
|
1727
1847
|
pipeline_args+=("--fast-test-cmd" "$FAST_TEST_CMD_CFG")
|
|
1728
1848
|
fi
|
|
1729
1849
|
|
|
1850
|
+
# Append any extra pipeline args (from retry escalation, etc.)
|
|
1851
|
+
if [[ ${#extra_pipeline_args[@]} -gt 0 ]]; then
|
|
1852
|
+
pipeline_args+=("${extra_pipeline_args[@]}")
|
|
1853
|
+
fi
|
|
1854
|
+
|
|
1730
1855
|
# Run pipeline in work directory (background)
|
|
1856
|
+
# Ignore SIGHUP so tmux attach/detach and process group changes don't kill the pipeline
|
|
1731
1857
|
echo -e "\n\n===== Pipeline run $(date -u +%Y-%m-%dT%H:%M:%SZ) =====" >> "$LOG_DIR/issue-${issue_num}.log" 2>/dev/null || true
|
|
1732
1858
|
(
|
|
1859
|
+
trap '' HUP
|
|
1733
1860
|
cd "$work_dir"
|
|
1734
|
-
"$SCRIPT_DIR/sw-pipeline.sh" "${pipeline_args[@]}"
|
|
1861
|
+
exec "$SCRIPT_DIR/sw-pipeline.sh" "${pipeline_args[@]}"
|
|
1735
1862
|
) >> "$LOG_DIR/issue-${issue_num}.log" 2>&1 200>&- &
|
|
1736
1863
|
local pid=$!
|
|
1737
1864
|
|
|
@@ -1767,6 +1894,14 @@ _Progress updates will appear below as the pipeline advances through each stage.
|
|
|
1767
1894
|
|
|
1768
1895
|
daemon_track_job() {
|
|
1769
1896
|
local issue_num="$1" pid="$2" worktree="$3" title="${4:-}" repo="${5:-}" goal="${6:-}"
|
|
1897
|
+
|
|
1898
|
+
# Write to SQLite (non-blocking, best-effort)
|
|
1899
|
+
if type db_save_job &>/dev/null; then
|
|
1900
|
+
local job_id="daemon-${issue_num}-$(now_epoch)"
|
|
1901
|
+
db_save_job "$job_id" "$issue_num" "$title" "$pid" "$worktree" "" "${PIPELINE_TEMPLATE:-autonomous}" "$goal" 2>/dev/null || true
|
|
1902
|
+
fi
|
|
1903
|
+
|
|
1904
|
+
# Always write to JSON state file (primary for now)
|
|
1770
1905
|
locked_state_update \
|
|
1771
1906
|
--argjson num "$issue_num" \
|
|
1772
1907
|
--argjson pid "$pid" \
|
|
@@ -1855,6 +1990,16 @@ daemon_reap_completed() {
|
|
|
1855
1990
|
[[ "$start_epoch" -gt 0 ]] && dur_s=$((end_epoch - start_epoch))
|
|
1856
1991
|
emit_event "daemon.reap" "issue=$issue_num" "result=$result_str" "duration_s=$dur_s"
|
|
1857
1992
|
|
|
1993
|
+
# Update SQLite (mark job complete/failed)
|
|
1994
|
+
if type db_complete_job &>/dev/null && type db_fail_job &>/dev/null; then
|
|
1995
|
+
local _db_job_id="daemon-${issue_num}-${start_epoch}"
|
|
1996
|
+
if [[ "$exit_code" -eq 0 ]]; then
|
|
1997
|
+
db_complete_job "$_db_job_id" "$result_str" 2>/dev/null || true
|
|
1998
|
+
else
|
|
1999
|
+
db_fail_job "$_db_job_id" "$result_str" 2>/dev/null || true
|
|
2000
|
+
fi
|
|
2001
|
+
fi
|
|
2002
|
+
|
|
1858
2003
|
if [[ "$exit_code" -eq 0 ]]; then
|
|
1859
2004
|
daemon_on_success "$issue_num" "$duration_str"
|
|
1860
2005
|
else
|
|
@@ -1904,15 +2049,18 @@ daemon_reap_completed() {
|
|
|
1904
2049
|
reap_machine_name=$(jq -r '.machines[] | select(.role == "primary") | .name' "$HOME/.shipwright/machines.json" 2>/dev/null || hostname -s)
|
|
1905
2050
|
release_claim "$issue_num" "$reap_machine_name"
|
|
1906
2051
|
|
|
1907
|
-
#
|
|
2052
|
+
# Always remove the OLD job entry from active_jobs to prevent
|
|
2053
|
+
# re-reaping of the dead PID on the next cycle. When a retry was
|
|
2054
|
+
# spawned, daemon_spawn_pipeline already added a fresh entry with
|
|
2055
|
+
# the new PID — we must not leave the stale one behind.
|
|
2056
|
+
locked_state_update --argjson num "$issue_num" \
|
|
2057
|
+
--argjson old_pid "${pid:-0}" \
|
|
2058
|
+
'.active_jobs = [.active_jobs[] | select(.issue != $num or .pid != $old_pid)]'
|
|
2059
|
+
untrack_priority_job "$issue_num"
|
|
2060
|
+
|
|
1908
2061
|
if [[ "$_retry_spawned_for" == "$issue_num" ]]; then
|
|
1909
2062
|
daemon_log INFO "Retry spawned for issue #${issue_num} — skipping worktree cleanup"
|
|
1910
2063
|
else
|
|
1911
|
-
# Remove from active_jobs and priority lane tracking (locked)
|
|
1912
|
-
locked_state_update --argjson num "$issue_num" \
|
|
1913
|
-
'.active_jobs = [.active_jobs[] | select(.issue != $num)]'
|
|
1914
|
-
untrack_priority_job "$issue_num"
|
|
1915
|
-
|
|
1916
2064
|
# Clean up worktree (skip for org-mode clones — they persist)
|
|
1917
2065
|
local job_repo
|
|
1918
2066
|
job_repo=$(echo "$job" | jq -r '.repo // ""')
|
|
@@ -1951,6 +2099,9 @@ daemon_reap_completed() {
|
|
|
1951
2099
|
daemon_on_success() {
|
|
1952
2100
|
local issue_num="$1" duration="${2:-}"
|
|
1953
2101
|
|
|
2102
|
+
# Reset consecutive failure tracking on any success
|
|
2103
|
+
reset_failure_tracking
|
|
2104
|
+
|
|
1954
2105
|
daemon_log SUCCESS "Pipeline completed for issue #${issue_num} (${duration:-unknown})"
|
|
1955
2106
|
|
|
1956
2107
|
# Record pipeline duration for adaptive threshold learning
|
|
@@ -2009,6 +2160,149 @@ Check the associated PR for the implementation." 2>/dev/null || true
|
|
|
2009
2160
|
notify "Pipeline Complete — Issue #${issue_num}" \
|
|
2010
2161
|
"Duration: ${duration:-unknown}" "success"
|
|
2011
2162
|
"$SCRIPT_DIR/sw-tracker.sh" notify "completed" "$issue_num" 2>/dev/null || true
|
|
2163
|
+
|
|
2164
|
+
# PM agent: record success for learning
|
|
2165
|
+
if [[ -x "$SCRIPT_DIR/sw-pm.sh" ]]; then
|
|
2166
|
+
bash "$SCRIPT_DIR/sw-pm.sh" learn "$issue_num" success 2>/dev/null || true
|
|
2167
|
+
fi
|
|
2168
|
+
}
|
|
2169
|
+
|
|
2170
|
+
# ─── Failure Classification ─────────────────────────────────────────────────
|
|
2171
|
+
|
|
2172
|
+
# Classify why the pipeline for an issue failed, from the tail of its log.
# Arguments: $1 - issue number
# Globals:   LOG_DIR, WORKTREE_DIR, REPO_DIR (all read)
# Outputs:   one of auth_error | api_error | invalid_issue |
#            context_exhaustion | build_failure | unknown on stdout
# Checks run in that order; the first match wins.
classify_failure() {
  local issue_num="$1"
  if [[ -z "${LOG_DIR:-}" ]]; then
    echo "unknown"
    return
  fi
  local log_path="$LOG_DIR/issue-${issue_num}.log"
  if [[ ! -f "$log_path" ]]; then
    echo "unknown"
    return
  fi
  local tail_content
  tail_content=$(tail -200 "$log_path" 2>/dev/null || true)

  # NOTE: the greps read from a here-string rather than `echo ... | grep -q`.
  # With `set -o pipefail`, grep -q exiting on its first match can SIGPIPE the
  # echo on a large tail, making the pipeline (and the match) report failure.

  # Auth errors
  if grep -qiE 'not logged in|unauthorized|auth.*fail|401 |invalid.*token|CLAUDE_CODE_OAUTH_TOKEN|api key.*invalid|authentication required' <<<"$tail_content"; then
    echo "auth_error"
    return
  fi
  # API errors (rate limits, timeouts, server errors)
  if grep -qiE 'rate limit|429 |503 |502 |overloaded|timeout|ETIMEDOUT|ECONNRESET|socket hang up|service unavailable' <<<"$tail_content"; then
    echo "api_error"
    return
  fi
  # Invalid issue (not found, empty body)
  if grep -qiE 'issue not found|404 |no body|could not resolve|GraphQL.*not found|issue.*does not exist' <<<"$tail_content"; then
    echo "invalid_issue"
    return
  fi
  # Context exhaustion — check progress file
  # NOTE(review): WORKTREE_DIR may be a relative path, in which case this
  # lookup depends on the current working directory — confirm against callers.
  local issue_worktree_path="${WORKTREE_DIR:-${REPO_DIR}/.worktrees}/daemon-issue-${issue_num}"
  local progress_file="${issue_worktree_path}/.claude/loop-logs/progress.md"
  if [[ -f "$progress_file" ]]; then
    local cf_iter
    cf_iter=$(grep -oE 'Iteration: [0-9]+' "$progress_file" 2>/dev/null | tail -1 | grep -oE '[0-9]+' || echo "0")
    if ! [[ "${cf_iter:-0}" =~ ^[0-9]+$ ]]; then cf_iter="0"; fi
    # Use only the LAST "Tests passing" line: earlier iterations would make
    # the value multi-line, so it would never equal "false" or "unknown".
    local cf_tests
    cf_tests=$(grep -oE 'Tests passing: (true|false)' "$progress_file" 2>/dev/null | tail -1 | awk '{print $NF}' || true)
    [[ -n "$cf_tests" ]] || cf_tests="unknown"
    if [[ "${cf_iter:-0}" -gt 0 ]] && { [[ "$cf_tests" == "false" ]] || [[ "$cf_tests" == "unknown" ]]; }; then
      echo "context_exhaustion"
      return
    fi
  fi
  # Build failure (test errors, compile errors)
  if grep -qiE 'test.*fail|FAIL|build.*error|compile.*error|lint.*fail|npm ERR|exit code [1-9]' <<<"$tail_content"; then
    echo "build_failure"
    return
  fi
  echo "unknown"
}
|
|
2222
|
+
|
|
2223
|
+
# ─── Consecutive Failure Tracking (persisted + adaptive) ─────────────────────
|
|
2224
|
+
|
|
2225
|
+
DAEMON_CONSECUTIVE_FAILURE_CLASS=""
|
|
2226
|
+
DAEMON_CONSECUTIVE_FAILURE_COUNT=0
|
|
2227
|
+
|
|
2228
|
+
# Max retries per failure class (adaptive retry strategy)
|
|
2229
|
+
# Print the retry budget for a failure class.
# Arguments: $1 - failure class (defaults to "unknown")
# Globals:   MAX_RETRIES_API_ERROR, MAX_RETRIES_CONTEXT_EXHAUSTION,
#            MAX_RETRIES_BUILD, MAX_RETRIES (all read, all optional)
# Outputs:   the maximum number of retries on stdout
get_max_retries_for_class() {
  local failure_kind="${1:-unknown}"
  local limit

  if [[ "$failure_kind" == "auth_error" || "$failure_kind" == "invalid_issue" ]]; then
    # These classes get a zero retry budget.
    limit=0
  elif [[ "$failure_kind" == "api_error" ]]; then
    limit="${MAX_RETRIES_API_ERROR:-4}"
  elif [[ "$failure_kind" == "context_exhaustion" ]]; then
    limit="${MAX_RETRIES_CONTEXT_EXHAUSTION:-2}"
  elif [[ "$failure_kind" == "build_failure" ]]; then
    limit="${MAX_RETRIES_BUILD:-2}"
  else
    limit="${MAX_RETRIES:-2}"
  fi

  echo "$limit"
}
|
|
2239
|
+
|
|
2240
|
+
# Append failure to persisted history and compute consecutive count; smart pause with exponential backoff
|
|
2241
|
+
# Record one pipeline failure and auto-pause the daemon after repeated ones.
# Arguments: $1 - failure class (as printed by classify_failure)
# Globals:   DAEMON_CONSECUTIVE_FAILURE_CLASS, DAEMON_CONSECUTIVE_FAILURE_COUNT
#            (read/written), STATE_FILE (read/updated), PAUSE_FLAG (written)
# Effects:   appends {ts, class} to .failure_history in the state file (last
#            100 kept); once 3+ consecutive failures of the same class are
#            seen, writes the pause flag with a resume_after timestamp using
#            exponential backoff (5m, 10m, 20m, ... capped at 480m).
record_failure_class() {
  local failure_class="$1"
  # In-memory consecutive (for backward compat)
  if [[ "$failure_class" == "$DAEMON_CONSECUTIVE_FAILURE_CLASS" ]]; then
    DAEMON_CONSECUTIVE_FAILURE_COUNT=$((DAEMON_CONSECUTIVE_FAILURE_COUNT + 1))
  else
    DAEMON_CONSECUTIVE_FAILURE_CLASS="$failure_class"
    DAEMON_CONSECUTIVE_FAILURE_COUNT=1
  fi

  # Persist failure to state (failure_history) for pattern tracking
  if [[ -f "${STATE_FILE:-}" ]]; then
    local entry
    entry=$(jq -n --arg ts "$(now_iso)" --arg class "$failure_class" '{ts: $ts, class: $class}')
    locked_state_update --argjson entry "$entry" \
      '.failure_history = ((.failure_history // []) + [$entry] | .[-100:])' 2>/dev/null || true
  fi

  # Consecutive count from persisted tail: count only the unbroken run of $failure_class
  # from the newest entry backwards (not total occurrences)
  local consecutive="$DAEMON_CONSECUTIVE_FAILURE_COUNT"
  if [[ -f "${STATE_FILE:-}" ]]; then
    local from_state
    from_state=$(jq -r --arg c "$failure_class" '
      (.failure_history // []) | [.[].class] | reverse |
      if length == 0 then 0
      elif .[0] != $c then 0
      else
        reduce .[] as $x (
          {count: 0, done: false};
          if .done then . elif $x == $c then .count += 1 else .done = true end
        ) | .count
      end
    ' "$STATE_FILE" 2>/dev/null || echo "1")
    consecutive="${from_state:-1}"
    # Fall back to the in-memory count when the persisted value is zero or is
    # not a plain integer (e.g. partial jq output), which would break -eq/-ge.
    if ! [[ "$consecutive" =~ ^[0-9]+$ ]] || [[ "$consecutive" -eq 0 ]]; then
      consecutive="$DAEMON_CONSECUTIVE_FAILURE_COUNT"
    fi
    DAEMON_CONSECUTIVE_FAILURE_COUNT="$consecutive"
  fi

  # Smart pause: exponential backoff instead of hard stop (resume_after so daemon can auto-resume)
  if [[ "$consecutive" -ge 3 ]]; then
    # Clamp the exponent before shifting: 5 * 2^7 = 640 already exceeds the
    # 480m cap, and an unclamped shift overflows 64-bit arithmetic on long
    # failure runs, producing a negative backoff that slips past the cap.
    local backoff_exp=$((consecutive - 3))
    if [[ "$backoff_exp" -gt 7 ]]; then
      backoff_exp=7
    fi
    local pause_mins=$((5 * (1 << backoff_exp)))
    [[ "$pause_mins" -gt 480 ]] && pause_mins=480
    local resume_ts resume_after
    resume_ts=$(($(date +%s) + pause_mins * 60))
    resume_after=$(epoch_to_iso "$resume_ts")
    daemon_log ERROR "${consecutive} consecutive failures (class: ${failure_class}) — auto-pausing until ${resume_after} (${pause_mins}m backoff)"
    local pause_json
    pause_json=$(jq -n \
      --arg reason "consecutive_${failure_class}" \
      --arg ts "$(now_iso)" \
      --arg resume "$resume_after" \
      --argjson count "$consecutive" \
      '{reason: $reason, timestamp: $ts, resume_after: $resume, consecutive_count: $count}')
    # Stage the temp file next to $PAUSE_FLAG so mv is a same-filesystem
    # rename and readers never observe a partially written flag.
    local _tmp_pause
    _tmp_pause=$(mktemp "${PAUSE_FLAG}.XXXXXX")
    echo "$pause_json" > "$_tmp_pause"
    mv "$_tmp_pause" "$PAUSE_FLAG"
    emit_event "daemon.auto_pause" "reason=consecutive_failures" "class=$failure_class" "count=$consecutive" "resume_after=$resume_after"
  fi
}
|
|
2302
|
+
|
|
2303
|
+
# Reset consecutive-failure tracking (called after a successful pipeline).
# Globals:   DAEMON_CONSECUTIVE_FAILURE_CLASS, DAEMON_CONSECUTIVE_FAILURE_COUNT
#            (written), STATE_FILE (read/updated)
# The consecutive count is recomputed from the persisted .failure_history
# tail whenever a failure is recorded, so clearing only the in-memory counters
# does not break a run. A {ts, class: "success"} marker is therefore appended
# to the history (at most one per run of failures) so failures from before
# this success no longer count as consecutive.
# NOTE(review): other readers of .failure_history, if any, will now see
# entries with class "success" — confirm they tolerate that.
reset_failure_tracking() {
  DAEMON_CONSECUTIVE_FAILURE_CLASS=""
  DAEMON_CONSECUTIVE_FAILURE_COUNT=0

  # Best-effort: a failure to persist the marker must never fail the caller.
  if [[ -f "${STATE_FILE:-}" ]] && type locked_state_update &>/dev/null; then
    local marker
    marker=$(jq -n --arg ts "$(now_iso)" '{ts: $ts, class: "success"}' 2>/dev/null) || marker=""
    if [[ -n "$marker" ]]; then
      locked_state_update --argjson entry "$marker" \
        '(.failure_history // []) as $h
         | if ($h | length) > 0 and $h[-1].class != "success"
           then .failure_history = ($h + [$entry] | .[-100:])
           else . end' 2>/dev/null || true
    fi
  fi
}
|
|
2013
2307
|
|
|
2014
2308
|
# ─── Failure Handler ────────────────────────────────────────────────────────
|
|
@@ -2047,126 +2341,152 @@ daemon_on_failure() {
|
|
|
2047
2341
|
completed_at: $completed_at
|
|
2048
2342
|
}] | .completed = .completed[-500:]'
|
|
2049
2343
|
|
|
2344
|
+
# ── Classify failure and decide retry strategy ──
|
|
2345
|
+
local failure_class
|
|
2346
|
+
failure_class=$(classify_failure "$issue_num")
|
|
2347
|
+
daemon_log INFO "Failure classified as: ${failure_class} for issue #${issue_num}"
|
|
2348
|
+
emit_event "daemon.failure_classified" "issue=$issue_num" "class=$failure_class"
|
|
2349
|
+
record_failure_class "$failure_class"
|
|
2350
|
+
|
|
2050
2351
|
# ── Auto-retry with strategy escalation ──
|
|
2051
2352
|
if [[ "${RETRY_ESCALATION:-true}" == "true" ]]; then
|
|
2052
2353
|
local retry_count
|
|
2053
2354
|
retry_count=$(jq -r --arg num "$issue_num" \
|
|
2054
2355
|
'.retry_counts[$num] // 0' "$STATE_FILE" 2>/dev/null || echo "0")
|
|
2055
2356
|
|
|
2056
|
-
|
|
2057
|
-
|
|
2058
|
-
|
|
2059
|
-
|
|
2060
|
-
|
|
2061
|
-
|
|
2062
|
-
|
|
2063
|
-
|
|
2064
|
-
daemon_log WARN "Auto-retry #${retry_count}/${MAX_RETRIES:-2} for issue #${issue_num}"
|
|
2065
|
-
emit_event "daemon.retry" "issue=$issue_num" "retry=$retry_count" "max=${MAX_RETRIES:-2}"
|
|
2066
|
-
|
|
2067
|
-
# Check for checkpoint to enable resume-from-checkpoint
|
|
2068
|
-
local checkpoint_args=()
|
|
2069
|
-
if [[ "${CHECKPOINT_ENABLED:-true}" == "true" ]]; then
|
|
2070
|
-
# Try to find worktree for this issue to check for checkpoints
|
|
2071
|
-
local issue_worktree="${REPO_DIR}/.worktrees/daemon-issue-${issue_num}"
|
|
2072
|
-
if [[ -d "$issue_worktree/.claude/pipeline-artifacts/checkpoints" ]]; then
|
|
2073
|
-
local latest_checkpoint=""
|
|
2074
|
-
for cp_file in "$issue_worktree/.claude/pipeline-artifacts/checkpoints"/*-checkpoint.json; do
|
|
2075
|
-
[[ -f "$cp_file" ]] && latest_checkpoint="$cp_file"
|
|
2076
|
-
done
|
|
2077
|
-
if [[ -n "$latest_checkpoint" ]]; then
|
|
2078
|
-
daemon_log INFO "Found checkpoint: $latest_checkpoint"
|
|
2079
|
-
emit_event "daemon.recovery" "issue=$issue_num" "checkpoint=$latest_checkpoint"
|
|
2080
|
-
checkpoint_args+=("--resume")
|
|
2081
|
-
fi
|
|
2082
|
-
fi
|
|
2083
|
-
fi
|
|
2084
|
-
|
|
2085
|
-
# Detect context exhaustion from progress file
|
|
2086
|
-
local failure_reason="unknown"
|
|
2087
|
-
local issue_worktree_path="${WORKTREE_DIR:-${REPO_DIR}/.worktrees}/daemon-issue-${issue_num}"
|
|
2088
|
-
local progress_file="${issue_worktree_path}/.claude/loop-logs/progress.md"
|
|
2089
|
-
if [[ -f "$progress_file" ]]; then
|
|
2090
|
-
local progress_iter
|
|
2091
|
-
progress_iter=$(grep -oE 'Iteration: [0-9]+' "$progress_file" 2>/dev/null | tail -1 | grep -oE '[0-9]+' || echo "0")
|
|
2092
|
-
if ! [[ "${progress_iter:-0}" =~ ^[0-9]+$ ]]; then
|
|
2093
|
-
progress_iter="0"
|
|
2357
|
+
# Non-retryable failures — skip retry entirely
|
|
2358
|
+
case "$failure_class" in
|
|
2359
|
+
auth_error)
|
|
2360
|
+
daemon_log ERROR "Auth error for issue #${issue_num} — skipping retry"
|
|
2361
|
+
emit_event "daemon.skip_retry" "issue=$issue_num" "reason=auth_error"
|
|
2362
|
+
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
2363
|
+
gh issue edit "$issue_num" --add-label "pipeline/auth-error" 2>/dev/null || true
|
|
2094
2364
|
fi
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2098
|
-
|
|
2099
|
-
|
|
2100
|
-
|
|
2365
|
+
;;
|
|
2366
|
+
invalid_issue)
|
|
2367
|
+
daemon_log ERROR "Invalid issue #${issue_num} — skipping retry"
|
|
2368
|
+
emit_event "daemon.skip_retry" "issue=$issue_num" "reason=invalid_issue"
|
|
2369
|
+
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
2370
|
+
gh issue comment "$issue_num" --body "Pipeline skipped retry: issue appears invalid or has no body." 2>/dev/null || true
|
|
2101
2371
|
fi
|
|
2102
|
-
|
|
2372
|
+
;;
|
|
2373
|
+
*)
|
|
2374
|
+
# Retryable failures — per-class max retries and escalation
|
|
2375
|
+
local effective_max
|
|
2376
|
+
effective_max=$(get_max_retries_for_class "$failure_class")
|
|
2377
|
+
if [[ "$retry_count" -lt "$effective_max" ]]; then
|
|
2378
|
+
retry_count=$((retry_count + 1))
|
|
2379
|
+
|
|
2380
|
+
# Update retry count in state (locked to prevent race)
|
|
2381
|
+
locked_state_update \
|
|
2382
|
+
--arg num "$issue_num" --argjson count "$retry_count" \
|
|
2383
|
+
'.retry_counts[$num] = $count'
|
|
2384
|
+
|
|
2385
|
+
daemon_log WARN "Auto-retry #${retry_count}/${effective_max} for issue #${issue_num} (class: ${failure_class})"
|
|
2386
|
+
emit_event "daemon.retry" "issue=$issue_num" "retry=$retry_count" "max=$effective_max" "class=$failure_class"
|
|
2387
|
+
|
|
2388
|
+
# Check for checkpoint to enable resume-from-checkpoint
|
|
2389
|
+
local checkpoint_args=()
|
|
2390
|
+
if [[ "${CHECKPOINT_ENABLED:-true}" == "true" ]]; then
|
|
2391
|
+
local issue_worktree="${REPO_DIR}/.worktrees/daemon-issue-${issue_num}"
|
|
2392
|
+
if [[ -d "$issue_worktree/.claude/pipeline-artifacts/checkpoints" ]]; then
|
|
2393
|
+
local latest_checkpoint=""
|
|
2394
|
+
for cp_file in "$issue_worktree/.claude/pipeline-artifacts/checkpoints"/*-checkpoint.json; do
|
|
2395
|
+
[[ -f "$cp_file" ]] && latest_checkpoint="$cp_file"
|
|
2396
|
+
done
|
|
2397
|
+
if [[ -n "$latest_checkpoint" ]]; then
|
|
2398
|
+
daemon_log INFO "Found checkpoint: $latest_checkpoint"
|
|
2399
|
+
emit_event "daemon.recovery" "issue=$issue_num" "checkpoint=$latest_checkpoint"
|
|
2400
|
+
checkpoint_args+=("--resume")
|
|
2401
|
+
fi
|
|
2402
|
+
fi
|
|
2403
|
+
fi
|
|
2103
2404
|
|
|
2104
|
-
|
|
2105
|
-
|
|
2106
|
-
|
|
2107
|
-
|
|
2108
|
-
|
|
2109
|
-
|
|
2110
|
-
|
|
2111
|
-
|
|
2112
|
-
|
|
2113
|
-
|
|
2114
|
-
|
|
2115
|
-
|
|
2116
|
-
|
|
2117
|
-
|
|
2118
|
-
|
|
2119
|
-
daemon_log INFO "Escalation: template=full, compound_cycles=5"
|
|
2120
|
-
fi
|
|
2405
|
+
# Build escalated pipeline args
|
|
2406
|
+
local retry_template="$PIPELINE_TEMPLATE"
|
|
2407
|
+
local retry_model="${MODEL:-opus}"
|
|
2408
|
+
local extra_args=()
|
|
2409
|
+
|
|
2410
|
+
if [[ "$retry_count" -eq 1 ]]; then
|
|
2411
|
+
retry_model="opus"
|
|
2412
|
+
extra_args+=("--max-iterations" "30")
|
|
2413
|
+
daemon_log INFO "Escalation: model=opus, max_iterations=30"
|
|
2414
|
+
elif [[ "$retry_count" -ge 2 ]]; then
|
|
2415
|
+
retry_template="full"
|
|
2416
|
+
retry_model="opus"
|
|
2417
|
+
extra_args+=("--max-iterations" "30" "--compound-cycles" "5")
|
|
2418
|
+
daemon_log INFO "Escalation: template=full, compound_cycles=5"
|
|
2419
|
+
fi
|
|
2121
2420
|
|
|
2122
|
-
|
|
2123
|
-
|
|
2124
|
-
|
|
2125
|
-
|
|
2126
|
-
|
|
2127
|
-
|
|
2128
|
-
|
|
2129
|
-
|
|
2130
|
-
|
|
2131
|
-
|
|
2421
|
+
# Increase restarts on context exhaustion
|
|
2422
|
+
if [[ "$failure_class" == "context_exhaustion" ]]; then
|
|
2423
|
+
local boosted_restarts=$(( ${MAX_RESTARTS_CFG:-3} + retry_count ))
|
|
2424
|
+
if [[ "$boosted_restarts" -gt 5 ]]; then
|
|
2425
|
+
boosted_restarts=5
|
|
2426
|
+
fi
|
|
2427
|
+
extra_args+=("--max-restarts" "$boosted_restarts")
|
|
2428
|
+
daemon_log INFO "Boosting max-restarts to $boosted_restarts (context exhaustion)"
|
|
2429
|
+
fi
|
|
2430
|
+
|
|
2431
|
+
# Exponential backoff (per-class base); cap at 1h
|
|
2432
|
+
local base_secs=30
|
|
2433
|
+
[[ "$failure_class" == "api_error" ]] && base_secs=300
|
|
2434
|
+
local backoff_secs=$((base_secs * (1 << (retry_count - 1))))
|
|
2435
|
+
[[ "$backoff_secs" -gt 3600 ]] && backoff_secs=3600
|
|
2436
|
+
[[ "$failure_class" == "api_error" ]] && daemon_log INFO "API error — exponential backoff ${backoff_secs}s"
|
|
2132
2437
|
|
|
2133
|
-
|
|
2134
|
-
|
|
2438
|
+
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
2439
|
+
gh issue comment "$issue_num" --body "## 🔄 Auto-Retry #${retry_count}
|
|
2135
2440
|
|
|
2136
|
-
Pipeline failed — retrying with escalated strategy.
|
|
2441
|
+
Pipeline failed (${failure_class}) — retrying with escalated strategy.
|
|
2137
2442
|
|
|
2138
2443
|
| Field | Value |
|
|
2139
2444
|
|-------|-------|
|
|
2140
2445
|
| Retry | ${retry_count} / ${MAX_RETRIES:-2} |
|
|
2446
|
+
| Failure | \`${failure_class}\` |
|
|
2141
2447
|
| Template | \`${retry_template}\` |
|
|
2142
2448
|
| Model | \`${retry_model}\` |
|
|
2143
2449
|
| Started | $(now_iso) |
|
|
2144
2450
|
|
|
2145
2451
|
_Escalation: $(if [[ "$retry_count" -eq 1 ]]; then echo "upgraded model + increased iterations"; else echo "full template + compound quality"; fi)_" 2>/dev/null || true
|
|
2146
|
-
|
|
2452
|
+
fi
|
|
2147
2453
|
|
|
2148
|
-
|
|
2149
|
-
|
|
2150
|
-
daemon_log INFO "Waiting ${backoff_secs}s before retry #${retry_count}"
|
|
2151
|
-
sleep "$backoff_secs"
|
|
2152
|
-
|
|
2153
|
-
# Re-spawn with escalated strategy
|
|
2154
|
-
local orig_template="$PIPELINE_TEMPLATE"
|
|
2155
|
-
local orig_model="$MODEL"
|
|
2156
|
-
PIPELINE_TEMPLATE="$retry_template"
|
|
2157
|
-
MODEL="$retry_model"
|
|
2158
|
-
daemon_spawn_pipeline "$issue_num" "retry-${retry_count}"
|
|
2159
|
-
_retry_spawned_for="$issue_num"
|
|
2160
|
-
PIPELINE_TEMPLATE="$orig_template"
|
|
2161
|
-
MODEL="$orig_model"
|
|
2162
|
-
return
|
|
2163
|
-
fi
|
|
2454
|
+
daemon_log INFO "Waiting ${backoff_secs}s before retry #${retry_count}"
|
|
2455
|
+
sleep "$backoff_secs"
|
|
2164
2456
|
|
|
2165
|
-
|
|
2166
|
-
|
|
2457
|
+
# Merge checkpoint args + extra args for passthrough
|
|
2458
|
+
local all_extra_args=()
|
|
2459
|
+
if [[ ${#checkpoint_args[@]} -gt 0 ]]; then
|
|
2460
|
+
all_extra_args+=("${checkpoint_args[@]}")
|
|
2461
|
+
fi
|
|
2462
|
+
if [[ ${#extra_args[@]} -gt 0 ]]; then
|
|
2463
|
+
all_extra_args+=("${extra_args[@]}")
|
|
2464
|
+
fi
|
|
2465
|
+
|
|
2466
|
+
# Re-spawn with escalated strategy
|
|
2467
|
+
local orig_template="$PIPELINE_TEMPLATE"
|
|
2468
|
+
local orig_model="$MODEL"
|
|
2469
|
+
PIPELINE_TEMPLATE="$retry_template"
|
|
2470
|
+
MODEL="$retry_model"
|
|
2471
|
+
daemon_spawn_pipeline "$issue_num" "retry-${retry_count}" "" "${all_extra_args[@]}"
|
|
2472
|
+
_retry_spawned_for="$issue_num"
|
|
2473
|
+
PIPELINE_TEMPLATE="$orig_template"
|
|
2474
|
+
MODEL="$orig_model"
|
|
2475
|
+
return
|
|
2476
|
+
fi
|
|
2477
|
+
|
|
2478
|
+
daemon_log WARN "Max retries (${effective_max}) exhausted for issue #${issue_num}"
|
|
2479
|
+
emit_event "daemon.retry_exhausted" "issue=$issue_num" "retries=$retry_count"
|
|
2480
|
+
;;
|
|
2481
|
+
esac
|
|
2167
2482
|
fi
|
|
2168
2483
|
|
|
2169
2484
|
# ── No retry — report final failure ──
|
|
2485
|
+
# PM agent: record failure for learning (only when we're done with this issue)
|
|
2486
|
+
if [[ -x "$SCRIPT_DIR/sw-pm.sh" ]]; then
|
|
2487
|
+
bash "$SCRIPT_DIR/sw-pm.sh" learn "$issue_num" failure 2>/dev/null || true
|
|
2488
|
+
fi
|
|
2489
|
+
|
|
2170
2490
|
if [[ "$NO_GITHUB" != "true" ]]; then
|
|
2171
2491
|
# Add failure label and remove watch label (prevent re-processing)
|
|
2172
2492
|
gh issue edit "$issue_num" \
|
|
@@ -2191,10 +2511,11 @@ _Escalation: $(if [[ "$retry_count" -eq 1 ]]; then echo "upgraded model + increa
|
|
|
2191
2511
|
|
|
2192
2512
|
local retry_info=""
|
|
2193
2513
|
if [[ "${RETRY_ESCALATION:-true}" == "true" ]]; then
|
|
2194
|
-
local final_count
|
|
2514
|
+
local final_count final_max
|
|
2195
2515
|
final_count=$(jq -r --arg num "$issue_num" \
|
|
2196
2516
|
'.retry_counts[$num] // 0' "$STATE_FILE" 2>/dev/null || echo "0")
|
|
2197
|
-
|
|
2517
|
+
final_max=$(get_max_retries_for_class "$failure_class")
|
|
2518
|
+
retry_info="| Retries | ${final_count} / ${final_max} (exhausted) |"
|
|
2198
2519
|
fi
|
|
2199
2520
|
|
|
2200
2521
|
gh issue comment "$issue_num" --body "## ❌ Pipeline Failed
|
|
@@ -3770,6 +4091,13 @@ Patrol pre-filter findings to confirm: ${patrol_findings_summary}"
|
|
|
3770
4091
|
patrol_meta_run
|
|
3771
4092
|
fi
|
|
3772
4093
|
|
|
4094
|
+
# ── Strategic Intelligence Patrol (requires CLAUDE_CODE_OAUTH_TOKEN) ──
|
|
4095
|
+
if [[ -f "$SCRIPT_DIR/sw-strategic.sh" ]] && [[ -n "${CLAUDE_CODE_OAUTH_TOKEN:-}" ]]; then
|
|
4096
|
+
# shellcheck source=sw-strategic.sh
|
|
4097
|
+
source "$SCRIPT_DIR/sw-strategic.sh"
|
|
4098
|
+
strategic_patrol_run || true
|
|
4099
|
+
fi
|
|
4100
|
+
|
|
3773
4101
|
# ── Summary ──
|
|
3774
4102
|
emit_event "patrol.completed" "findings=$total_findings" "issues_created=$issues_created" "dry_run=$dry_run"
|
|
3775
4103
|
|
|
@@ -3795,10 +4123,27 @@ daemon_poll_issues() {
|
|
|
3795
4123
|
return
|
|
3796
4124
|
fi
|
|
3797
4125
|
|
|
3798
|
-
# Check for pause flag (set by dashboard or
|
|
3799
|
-
|
|
3800
|
-
|
|
3801
|
-
|
|
4126
|
+
# Check for pause flag (set by dashboard, disk_low, or consecutive-failure backoff)
|
|
4127
|
+
local pause_file="${PAUSE_FLAG:-$HOME/.shipwright/daemon-pause.flag}"
|
|
4128
|
+
if [[ -f "$pause_file" ]]; then
|
|
4129
|
+
local resume_after
|
|
4130
|
+
resume_after=$(jq -r '.resume_after // empty' "$pause_file" 2>/dev/null || true)
|
|
4131
|
+
if [[ -n "$resume_after" ]]; then
|
|
4132
|
+
local now_epoch resume_epoch
|
|
4133
|
+
now_epoch=$(date +%s)
|
|
4134
|
+
resume_epoch=$(TZ=UTC date -j -f "%Y-%m-%dT%H:%M:%SZ" "$resume_after" +%s 2>/dev/null || \
|
|
4135
|
+
date -d "$resume_after" +%s 2>/dev/null || echo 0)
|
|
4136
|
+
if [[ "$resume_epoch" -gt 0 ]] && [[ "$now_epoch" -ge "$resume_epoch" ]]; then
|
|
4137
|
+
rm -f "$pause_file"
|
|
4138
|
+
daemon_log INFO "Auto-resuming after backoff (resume_after passed)"
|
|
4139
|
+
else
|
|
4140
|
+
daemon_log INFO "Daemon paused until ${resume_after} — skipping poll"
|
|
4141
|
+
return
|
|
4142
|
+
fi
|
|
4143
|
+
else
|
|
4144
|
+
daemon_log INFO "Daemon paused — skipping poll"
|
|
4145
|
+
return
|
|
4146
|
+
fi
|
|
3802
4147
|
fi
|
|
3803
4148
|
|
|
3804
4149
|
# Circuit breaker: skip poll if in backoff window
|
|
@@ -4036,9 +4381,25 @@ daemon_poll_issues() {
|
|
|
4036
4381
|
continue
|
|
4037
4382
|
fi
|
|
4038
4383
|
|
|
4039
|
-
# Auto-select pipeline template
|
|
4384
|
+
# Auto-select pipeline template: PM recommendation (if available) else labels + triage score
|
|
4040
4385
|
local template
|
|
4041
|
-
|
|
4386
|
+
if [[ "$NO_GITHUB" != "true" ]] && [[ -x "$SCRIPT_DIR/sw-pm.sh" ]]; then
|
|
4387
|
+
local pm_rec
|
|
4388
|
+
pm_rec=$(bash "$SCRIPT_DIR/sw-pm.sh" recommend --json "$issue_num" 2>/dev/null) || true
|
|
4389
|
+
if [[ -n "$pm_rec" ]]; then
|
|
4390
|
+
template=$(echo "$pm_rec" | jq -r '.team_composition.template // empty' 2>/dev/null) || true
|
|
4391
|
+
# Capability self-assessment: low confidence → upgrade to full template
|
|
4392
|
+
local confidence
|
|
4393
|
+
confidence=$(echo "$pm_rec" | jq -r '.team_composition.confidence_percent // 100' 2>/dev/null) || true
|
|
4394
|
+
if [[ -n "$confidence" && "$confidence" != "null" && "$confidence" -lt 60 ]]; then
|
|
4395
|
+
daemon_log INFO "Low PM confidence (${confidence}%) — upgrading to full template"
|
|
4396
|
+
template="full"
|
|
4397
|
+
fi
|
|
4398
|
+
fi
|
|
4399
|
+
fi
|
|
4400
|
+
if [[ -z "$template" ]]; then
|
|
4401
|
+
template=$(select_pipeline_template "$labels_csv" "$score" 2>/dev/null | tail -1)
|
|
4402
|
+
fi
|
|
4042
4403
|
template=$(printf '%s' "$template" | sed $'s/\x1b\\[[0-9;]*m//g' | tr -cd '[:alnum:]-_')
|
|
4043
4404
|
[[ -z "$template" ]] && template="$PIPELINE_TEMPLATE"
|
|
4044
4405
|
daemon_log INFO "Triage: issue #${issue_num} scored ${score}, template=${template}"
|
|
@@ -4095,13 +4456,15 @@ daemon_health_check() {
|
|
|
4095
4456
|
now_e=$(now_epoch)
|
|
4096
4457
|
|
|
4097
4458
|
if [[ -f "$STATE_FILE" ]]; then
|
|
4098
|
-
# ──
|
|
4099
|
-
# Instead of killing after a
|
|
4100
|
-
#
|
|
4101
|
-
#
|
|
4459
|
+
# ── Intelligent Health Monitoring ──
|
|
4460
|
+
# Instead of killing after a countdown, sense what the agent is doing.
|
|
4461
|
+
# Agents think for long stretches — that's normal and expected.
|
|
4462
|
+
# Strategy: sense → understand → be patient → nudge → only kill as last resort.
|
|
4102
4463
|
|
|
4103
|
-
local hard_limit="${PROGRESS_HARD_LIMIT_S:-
|
|
4464
|
+
local hard_limit="${PROGRESS_HARD_LIMIT_S:-0}"
|
|
4104
4465
|
local use_progress="${PROGRESS_MONITORING:-true}"
|
|
4466
|
+
local nudge_enabled="${NUDGE_ENABLED:-true}"
|
|
4467
|
+
local nudge_after="${NUDGE_AFTER_CHECKS:-40}"
|
|
4105
4468
|
|
|
4106
4469
|
while IFS= read -r job; do
|
|
4107
4470
|
local pid started_at issue_num worktree
|
|
@@ -4122,8 +4485,8 @@ daemon_health_check() {
|
|
|
4122
4485
|
elapsed=$(( now_e - start_e ))
|
|
4123
4486
|
fi
|
|
4124
4487
|
|
|
4125
|
-
# Hard wall-clock limit —
|
|
4126
|
-
if [[ "$elapsed" -gt "$hard_limit" ]]; then
|
|
4488
|
+
# Hard wall-clock limit — disabled by default (0 = off)
|
|
4489
|
+
if [[ "$hard_limit" -gt 0 && "$elapsed" -gt "$hard_limit" ]]; then
|
|
4127
4490
|
daemon_log WARN "Hard limit exceeded: issue #${issue_num} (${elapsed}s > ${hard_limit}s, PID $pid) — killing"
|
|
4128
4491
|
emit_event "daemon.hard_limit" "issue=$issue_num" "elapsed_s=$elapsed" "limit_s=$hard_limit" "pid=$pid"
|
|
4129
4492
|
kill "$pid" 2>/dev/null || true
|
|
@@ -4132,7 +4495,7 @@ daemon_health_check() {
|
|
|
4132
4495
|
continue
|
|
4133
4496
|
fi
|
|
4134
4497
|
|
|
4135
|
-
# Progress
|
|
4498
|
+
# ── Intelligent Progress Sensing ──
|
|
4136
4499
|
if [[ "$use_progress" == "true" && -n "$worktree" ]]; then
|
|
4137
4500
|
local snapshot verdict
|
|
4138
4501
|
snapshot=$(daemon_collect_snapshot "$issue_num" "$worktree" "$pid" 2>/dev/null || echo '{}')
|
|
@@ -4140,29 +4503,87 @@ daemon_health_check() {
|
|
|
4140
4503
|
if [[ "$snapshot" != "{}" ]]; then
|
|
4141
4504
|
verdict=$(daemon_assess_progress "$issue_num" "$snapshot" 2>/dev/null || echo "healthy")
|
|
4142
4505
|
|
|
4506
|
+
local no_progress_count=0
|
|
4507
|
+
no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
|
|
4508
|
+
local cur_stage
|
|
4509
|
+
cur_stage=$(echo "$snapshot" | jq -r '.stage // "unknown"')
|
|
4510
|
+
|
|
4143
4511
|
case "$verdict" in
|
|
4144
4512
|
healthy)
|
|
4145
4513
|
# All good — agent is making progress
|
|
4146
4514
|
;;
|
|
4147
4515
|
slowing)
|
|
4148
|
-
daemon_log INFO "Issue #${issue_num} slowing (no
|
|
4516
|
+
daemon_log INFO "Issue #${issue_num} slowing (no visible changes for ${no_progress_count} checks, ${elapsed}s elapsed, stage=${cur_stage})"
|
|
4149
4517
|
;;
|
|
4150
4518
|
stalled)
|
|
4151
|
-
|
|
4152
|
-
|
|
4153
|
-
|
|
4154
|
-
|
|
4519
|
+
# Check if agent subprocess is alive and consuming CPU
|
|
4520
|
+
local agent_alive=false
|
|
4521
|
+
local child_cpu=0
|
|
4522
|
+
child_cpu=$(pgrep -P "$pid" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
|
|
4523
|
+
if [[ "${child_cpu:-0}" -gt 0 ]]; then
|
|
4524
|
+
agent_alive=true
|
|
4525
|
+
fi
|
|
4526
|
+
|
|
4527
|
+
if [[ "$agent_alive" == "true" ]]; then
|
|
4528
|
+
daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}, ${elapsed}s) — being patient"
|
|
4529
|
+
else
|
|
4530
|
+
daemon_log WARN "Issue #${issue_num} stalled: no progress for ${no_progress_count} checks, no CPU activity (${elapsed}s elapsed, PID $pid)"
|
|
4531
|
+
emit_event "daemon.stalled" "issue=$issue_num" "no_progress=$no_progress_count" "elapsed_s=$elapsed" "pid=$pid"
|
|
4532
|
+
fi
|
|
4155
4533
|
;;
|
|
4156
4534
|
stuck)
|
|
4157
|
-
local
|
|
4158
|
-
no_progress_count=$(jq -r '.no_progress_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
|
|
4535
|
+
local repeated_errors
|
|
4159
4536
|
repeated_errors=$(jq -r '.repeated_error_count // 0' "$PROGRESS_DIR/issue-${issue_num}.json" 2>/dev/null || echo 0)
|
|
4160
|
-
|
|
4161
|
-
|
|
4162
|
-
|
|
4163
|
-
|
|
4164
|
-
|
|
4165
|
-
|
|
4537
|
+
|
|
4538
|
+
# Even "stuck" — check if the process tree is alive first
|
|
4539
|
+
local agent_alive=false
|
|
4540
|
+
local child_cpu=0
|
|
4541
|
+
child_cpu=$(pgrep -P "$pid" 2>/dev/null | xargs -I{} ps -o pcpu= -p {} 2>/dev/null | awk '{sum+=$1} END{printf "%d", sum+0}' || echo "0")
|
|
4542
|
+
if [[ "${child_cpu:-0}" -gt 0 ]]; then
|
|
4543
|
+
agent_alive=true
|
|
4544
|
+
fi
|
|
4545
|
+
|
|
4546
|
+
if [[ "$agent_alive" == "true" && "$repeated_errors" -lt 3 ]]; then
|
|
4547
|
+
# Agent is alive — nudge instead of kill
|
|
4548
|
+
if [[ "$nudge_enabled" == "true" && "$no_progress_count" -ge "$nudge_after" ]]; then
|
|
4549
|
+
local nudge_file="${worktree}/.claude/nudge.md"
|
|
4550
|
+
if [[ ! -f "$nudge_file" ]]; then
|
|
4551
|
+
cat > "$nudge_file" <<NUDGE_EOF
|
|
4552
|
+
# Nudge from Daemon Health Monitor
|
|
4553
|
+
|
|
4554
|
+
The daemon has noticed no visible progress for $(( no_progress_count * 30 / 60 )) minutes.
|
|
4555
|
+
Current stage: ${cur_stage}
|
|
4556
|
+
|
|
4557
|
+
If you're stuck, consider:
|
|
4558
|
+
- Breaking the task into smaller steps
|
|
4559
|
+
- Committing partial progress
|
|
4560
|
+
- Running tests to validate current state
|
|
4561
|
+
|
|
4562
|
+
This is just a gentle check-in — take your time if you're working through a complex problem.
|
|
4563
|
+
NUDGE_EOF
|
|
4564
|
+
daemon_log INFO "Issue #${issue_num} nudged (${no_progress_count} checks, stage=${cur_stage}, CPU=${child_cpu}%) — file written to worktree"
|
|
4565
|
+
emit_event "daemon.nudge" "issue=$issue_num" "no_progress=$no_progress_count" "stage=$cur_stage" "elapsed_s=$elapsed"
|
|
4566
|
+
fi
|
|
4567
|
+
else
|
|
4568
|
+
daemon_log INFO "Issue #${issue_num} no visible progress (${no_progress_count} checks) but agent is alive (CPU: ${child_cpu}%, stage=${cur_stage}) — waiting"
|
|
4569
|
+
fi
|
|
4570
|
+
elif [[ "$repeated_errors" -ge 5 ]]; then
|
|
4571
|
+
# Truly stuck in an error loop — kill as last resort
|
|
4572
|
+
daemon_log WARN "Issue #${issue_num} in error loop: ${repeated_errors} repeated errors (stage=${cur_stage}, ${elapsed}s, PID $pid) — killing"
|
|
4573
|
+
emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=error_loop"
|
|
4574
|
+
kill "$pid" 2>/dev/null || true
|
|
4575
|
+
daemon_clear_progress "$issue_num"
|
|
4576
|
+
findings=$((findings + 1))
|
|
4577
|
+
elif [[ "$agent_alive" != "true" && "$no_progress_count" -ge "$((PROGRESS_CHECKS_BEFORE_KILL * 2))" ]]; then
|
|
4578
|
+
# Process tree is dead AND no progress for very long time
|
|
4579
|
+
daemon_log WARN "Issue #${issue_num} appears dead: no CPU, no progress for ${no_progress_count} checks (${elapsed}s, PID $pid) — killing"
|
|
4580
|
+
emit_event "daemon.stuck_kill" "issue=$issue_num" "no_progress=$no_progress_count" "repeated_errors=$repeated_errors" "stage=$cur_stage" "elapsed_s=$elapsed" "pid=$pid" "reason=dead_process"
|
|
4581
|
+
kill "$pid" 2>/dev/null || true
|
|
4582
|
+
daemon_clear_progress "$issue_num"
|
|
4583
|
+
findings=$((findings + 1))
|
|
4584
|
+
else
|
|
4585
|
+
daemon_log WARN "Issue #${issue_num} struggling (${no_progress_count} checks, ${repeated_errors} errors, CPU=${child_cpu}%, stage=${cur_stage}) — monitoring"
|
|
4586
|
+
fi
|
|
4166
4587
|
;;
|
|
4167
4588
|
esac
|
|
4168
4589
|
fi
|
|
@@ -4171,8 +4592,9 @@ daemon_health_check() {
|
|
|
4171
4592
|
local stale_timeout
|
|
4172
4593
|
stale_timeout=$(get_adaptive_stale_timeout "$PIPELINE_TEMPLATE")
|
|
4173
4594
|
if [[ "$elapsed" -gt "$stale_timeout" ]]; then
|
|
4174
|
-
daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid)
|
|
4175
|
-
kill
|
|
4595
|
+
daemon_log WARN "Stale job (legacy): issue #${issue_num} (${elapsed}s > ${stale_timeout}s, PID $pid)"
|
|
4596
|
+
# Don't kill — just log. Let the process run.
|
|
4597
|
+
emit_event "daemon.stale_warning" "issue=$issue_num" "elapsed_s=$elapsed" "pid=$pid"
|
|
4176
4598
|
findings=$((findings + 1))
|
|
4177
4599
|
fi
|
|
4178
4600
|
fi
|
|
@@ -4765,6 +5187,7 @@ daemon_poll_loop() {
|
|
|
4765
5187
|
# All poll loop calls are error-guarded to prevent set -e from killing the daemon.
|
|
4766
5188
|
# The || operator disables set -e for the entire call chain, so transient failures
|
|
4767
5189
|
# (GitHub API timeouts, jq errors, intelligence failures) are logged and skipped.
|
|
5190
|
+
daemon_preflight_auth_check || daemon_log WARN "Auth check failed — daemon may be paused"
|
|
4768
5191
|
daemon_poll_issues || daemon_log WARN "daemon_poll_issues failed — continuing"
|
|
4769
5192
|
daemon_reap_completed || daemon_log WARN "daemon_reap_completed failed — continuing"
|
|
4770
5193
|
daemon_health_check || daemon_log WARN "daemon_health_check failed — continuing"
|
|
@@ -4848,7 +5271,8 @@ cleanup_on_exit() {
|
|
|
4848
5271
|
while IFS= read -r cpid; do
|
|
4849
5272
|
[[ -z "$cpid" ]] && continue
|
|
4850
5273
|
if kill -0 "$cpid" 2>/dev/null; then
|
|
4851
|
-
daemon_log INFO "Killing pipeline process PID ${cpid}"
|
|
5274
|
+
daemon_log INFO "Killing pipeline process tree PID ${cpid}"
|
|
5275
|
+
pkill -TERM -P "$cpid" 2>/dev/null || true
|
|
4852
5276
|
kill "$cpid" 2>/dev/null || true
|
|
4853
5277
|
killed=$((killed + 1))
|
|
4854
5278
|
fi
|
|
@@ -4860,7 +5284,8 @@ cleanup_on_exit() {
|
|
|
4860
5284
|
while IFS= read -r cpid; do
|
|
4861
5285
|
[[ -z "$cpid" ]] && continue
|
|
4862
5286
|
if kill -0 "$cpid" 2>/dev/null; then
|
|
4863
|
-
daemon_log WARN "Force-killing pipeline PID ${cpid}"
|
|
5287
|
+
daemon_log WARN "Force-killing pipeline tree PID ${cpid}"
|
|
5288
|
+
pkill -9 -P "$cpid" 2>/dev/null || true
|
|
4864
5289
|
kill -9 "$cpid" 2>/dev/null || true
|
|
4865
5290
|
fi
|
|
4866
5291
|
done <<< "$child_pids"
|
|
@@ -4951,6 +5376,11 @@ daemon_start() {
|
|
|
4951
5376
|
# Remove stale shutdown flag
|
|
4952
5377
|
rm -f "$SHUTDOWN_FLAG"
|
|
4953
5378
|
|
|
5379
|
+
# Initialize SQLite database (if available)
|
|
5380
|
+
if type init_schema &>/dev/null; then
|
|
5381
|
+
init_schema 2>/dev/null || true
|
|
5382
|
+
fi
|
|
5383
|
+
|
|
4954
5384
|
# Initialize state
|
|
4955
5385
|
init_state
|
|
4956
5386
|
|