prizmkit 1.1.21 → 1.1.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bundled/VERSION.json +3 -3
- package/bundled/dev-pipeline/lib/heartbeat.sh +50 -7
- package/bundled/dev-pipeline/reset-bug.sh +21 -13
- package/bundled/dev-pipeline/reset-feature.sh +21 -13
- package/bundled/dev-pipeline/reset-refactor.sh +21 -13
- package/bundled/dev-pipeline/run-bugfix.sh +40 -2
- package/bundled/dev-pipeline/run-feature.sh +41 -1
- package/bundled/dev-pipeline/run-refactor.sh +40 -2
- package/bundled/dev-pipeline/scripts/detect-stuck.py +25 -14
- package/bundled/dev-pipeline/scripts/init-bugfix-pipeline.py +0 -5
- package/bundled/dev-pipeline/scripts/init-pipeline.py +0 -5
- package/bundled/dev-pipeline/scripts/init-refactor-pipeline.py +0 -5
- package/bundled/dev-pipeline/scripts/update-bug-status.py +40 -31
- package/bundled/dev-pipeline/scripts/update-feature-status.py +54 -60
- package/bundled/dev-pipeline/scripts/update-refactor-status.py +43 -34
- package/bundled/dev-pipeline/templates/bootstrap-tier1.md +50 -7
- package/bundled/dev-pipeline/templates/bootstrap-tier2.md +50 -7
- package/bundled/dev-pipeline/templates/bootstrap-tier3.md +50 -7
- package/bundled/dev-pipeline/templates/sections/context-budget-rules.md +20 -0
- package/bundled/dev-pipeline/templates/sections/phase-browser-verification.md +84 -5
- package/bundled/dev-pipeline/templates/sections/phase-implement-agent.md +7 -0
- package/bundled/dev-pipeline/templates/sections/phase-implement-full.md +7 -0
- package/bundled/dev-pipeline/templates/sections/phase-implement-lite.md +7 -0
- package/bundled/dev-pipeline/tests/test_auto_skip.py +10 -3
- package/bundled/skills/_metadata.json +1 -1
- package/package.json +1 -1
package/bundled/VERSION.json
CHANGED
|
@@ -6,9 +6,14 @@
|
|
|
6
6
|
# structured progress from progress.json (written by
|
|
7
7
|
# parse-stream-progress.py) and fall back to tail-based monitoring.
|
|
8
8
|
#
|
|
9
|
+
# When stale_kill_threshold is set (>0), the heartbeat monitor will
|
|
10
|
+
# automatically kill the AI CLI process if its session log stops growing for
|
|
11
|
+
# the specified duration. This prevents sessions from hanging forever
|
|
12
|
+
# when the AI CLI process doesn't exit after completing its work.
|
|
13
|
+
#
|
|
9
14
|
# Usage:
|
|
10
15
|
# source "$SCRIPT_DIR/lib/heartbeat.sh"
|
|
11
|
-
# start_heartbeat "$cli_pid" "$session_log" "$progress_json" "$interval"
|
|
16
|
+
# start_heartbeat "$cli_pid" "$session_log" "$progress_json" "$interval" ["$stale_kill_threshold"]
|
|
12
17
|
# # ... wait for CLI to finish ...
|
|
13
18
|
# stop_heartbeat "$_HEARTBEAT_PID"
|
|
14
19
|
#
|
|
@@ -20,19 +25,23 @@
|
|
|
20
25
|
# Sets _HEARTBEAT_PID to the background process PID.
|
|
21
26
|
#
|
|
22
27
|
# Arguments:
|
|
23
|
-
# $1 - cli_pid
|
|
24
|
-
# $2 - session_log
|
|
25
|
-
# $3 - progress_json
|
|
26
|
-
# $4 - interval
|
|
28
|
+
# $1 - cli_pid PID of the AI CLI process to monitor
|
|
29
|
+
# $2 - session_log Path to session.log
|
|
30
|
+
# $3 - progress_json Path to progress.json (may not exist if stream-json disabled)
|
|
31
|
+
# $4 - interval Heartbeat interval in seconds
|
|
32
|
+
# $5 - stale_kill_threshold (optional) Seconds of no progress before auto-killing the process.
|
|
33
|
+
# 0 = disabled (default). Recommended: 900 (15 minutes).
|
|
27
34
|
start_heartbeat() {
|
|
28
35
|
local cli_pid="$1"
|
|
29
36
|
local session_log="$2"
|
|
30
37
|
local progress_json="$3"
|
|
31
38
|
local heartbeat_interval="$4"
|
|
39
|
+
local stale_kill_threshold="${5:-0}"
|
|
32
40
|
|
|
33
41
|
(
|
|
34
42
|
local elapsed=0
|
|
35
43
|
local prev_size=0
|
|
44
|
+
local stale_seconds=0
|
|
36
45
|
while kill -0 "$cli_pid" 2>/dev/null; do
|
|
37
46
|
sleep "$heartbeat_interval"
|
|
38
47
|
elapsed=$((elapsed + heartbeat_interval))
|
|
@@ -48,6 +57,13 @@ start_heartbeat() {
|
|
|
48
57
|
local growth=$((cur_size - prev_size))
|
|
49
58
|
prev_size=$cur_size
|
|
50
59
|
|
|
60
|
+
# Track progress staleness (no log growth = stale)
|
|
61
|
+
if [[ $growth -eq 0 ]]; then
|
|
62
|
+
stale_seconds=$((stale_seconds + heartbeat_interval))
|
|
63
|
+
else
|
|
64
|
+
stale_seconds=0
|
|
65
|
+
fi
|
|
66
|
+
|
|
51
67
|
local size_display
|
|
52
68
|
if [[ $cur_size -gt 1048576 ]]; then
|
|
53
69
|
size_display="$((cur_size / 1048576))MB"
|
|
@@ -67,6 +83,33 @@ start_heartbeat() {
|
|
|
67
83
|
status_icon="${YELLOW}⏸${NC}"
|
|
68
84
|
fi
|
|
69
85
|
|
|
86
|
+
# Stale-kill: auto-terminate process if no progress for too long
|
|
87
|
+
if [[ $stale_kill_threshold -gt 0 && $stale_seconds -ge $stale_kill_threshold ]]; then
|
|
88
|
+
local stale_mins=$((stale_seconds / 60))
|
|
89
|
+
echo -e " ${RED}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display} | ${RED}STALE-KILL: no progress for ${stale_mins}m (threshold: ${stale_kill_threshold}s)${NC}"
|
|
90
|
+
echo -e " ${RED}[HEARTBEAT]${NC} Killing AI CLI process $cli_pid (stale session)..."
|
|
91
|
+
kill -TERM "$cli_pid" 2>/dev/null || true
|
|
92
|
+
# Give process 10s to exit gracefully, then force kill
|
|
93
|
+
sleep 10
|
|
94
|
+
if kill -0 "$cli_pid" 2>/dev/null; then
|
|
95
|
+
echo -e " ${RED}[HEARTBEAT]${NC} Process still alive after SIGTERM, sending SIGKILL..."
|
|
96
|
+
kill -9 "$cli_pid" 2>/dev/null || true
|
|
97
|
+
fi
|
|
98
|
+
# Write stale-kill marker so spawn_and_wait_session knows this wasn't a crash
|
|
99
|
+
local _marker_dir
|
|
100
|
+
_marker_dir="$(dirname "$session_log")"
|
|
101
|
+
echo "{\"killed_at\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\", \"reason\": \"stale_session\", \"stale_seconds\": $stale_seconds, \"threshold\": $stale_kill_threshold}" > "$_marker_dir/stale-kill.json" 2>/dev/null || true
|
|
102
|
+
break
|
|
103
|
+
fi
|
|
104
|
+
|
|
105
|
+
# Build staleness hint for display
|
|
106
|
+
local stale_hint=""
|
|
107
|
+
if [[ $stale_kill_threshold -gt 0 && $stale_seconds -gt 0 ]]; then
|
|
108
|
+
local stale_mins=$((stale_seconds / 60))
|
|
109
|
+
local threshold_mins=$((stale_kill_threshold / 60))
|
|
110
|
+
stale_hint=" | stale: ${stale_mins}m/${threshold_mins}m"
|
|
111
|
+
fi
|
|
112
|
+
|
|
70
113
|
# Try structured progress from progress.json
|
|
71
114
|
if [[ -f "$progress_json" ]]; then
|
|
72
115
|
local phase tool msgs tools_total
|
|
@@ -86,7 +129,7 @@ try:
|
|
|
86
129
|
except Exception:
|
|
87
130
|
sys.exit(1)
|
|
88
131
|
" "$progress_json" 2>/dev/null) && {
|
|
89
|
-
echo -e " ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display} | ${phase}"
|
|
132
|
+
echo -e " ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display} | ${phase}${stale_hint}"
|
|
90
133
|
continue
|
|
91
134
|
}
|
|
92
135
|
fi
|
|
@@ -97,7 +140,7 @@ except Exception:
|
|
|
97
140
|
last_activity=$(tail -20 "$session_log" 2>/dev/null | grep -v '^$' | tail -1 | cut -c1-80 || echo "")
|
|
98
141
|
fi
|
|
99
142
|
|
|
100
|
-
echo -e " ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s elapsed | log: ${size_display} (+${growth}B) | ${last_activity}"
|
|
143
|
+
echo -e " ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s elapsed | log: ${size_display} (+${growth}B) | ${last_activity}${stale_hint}"
|
|
101
144
|
done
|
|
102
145
|
) &
|
|
103
146
|
_HEARTBEAT_PID=$!
|
|
@@ -139,22 +139,20 @@ fi
|
|
|
139
139
|
BUG_IDS=()
|
|
140
140
|
|
|
141
141
|
if [[ -n "$FILTER_MODE" ]]; then
|
|
142
|
-
# Filter by status from .
|
|
142
|
+
# Filter by status from bug-fix-list.json (single source of truth)
|
|
143
143
|
while IFS= read -r bid; do
|
|
144
144
|
[[ -n "$bid" ]] && BUG_IDS+=("$bid")
|
|
145
145
|
done < <(python3 -c "
|
|
146
|
-
import json,
|
|
147
|
-
state_dir = '$STATE_DIR'
|
|
146
|
+
import json, sys
|
|
148
147
|
filter_mode = '$FILTER_MODE'
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
for
|
|
153
|
-
|
|
154
|
-
if not os.path.isfile(status_file):
|
|
148
|
+
bug_list = '$BUG_LIST'
|
|
149
|
+
with open(bug_list) as f:
|
|
150
|
+
data = json.load(f)
|
|
151
|
+
for bug in data.get('bugs', []):
|
|
152
|
+
if not isinstance(bug, dict):
|
|
155
153
|
continue
|
|
156
|
-
|
|
157
|
-
|
|
154
|
+
bid = bug.get('id', '')
|
|
155
|
+
status = bug.get('status', '')
|
|
158
156
|
if filter_mode == 'auto_skipped' and status == 'auto_skipped':
|
|
159
157
|
print(bid)
|
|
160
158
|
elif filter_mode == 'failed' and status == 'failed':
|
|
@@ -244,13 +242,23 @@ sys.exit(1)
|
|
|
244
242
|
echo -e "${BOLD}════════════════════════════════════════════════════${NC}"
|
|
245
243
|
|
|
246
244
|
STATUS_FILE="$STATE_DIR/bugs/$CUR_BUG_ID/status.json"
|
|
245
|
+
# Read status from bug-fix-list.json (single source of truth)
|
|
246
|
+
CURRENT_STATUS=$(python3 -c "
|
|
247
|
+
import json, sys
|
|
248
|
+
with open('$BUG_LIST') as f:
|
|
249
|
+
data = json.load(f)
|
|
250
|
+
for bug in data.get('bugs', []):
|
|
251
|
+
if isinstance(bug, dict) and bug.get('id') == '$CUR_BUG_ID':
|
|
252
|
+
print(bug.get('status', '?'))
|
|
253
|
+
sys.exit(0)
|
|
254
|
+
print('?')
|
|
255
|
+
" 2>/dev/null || echo "?")
|
|
247
256
|
if [[ -f "$STATUS_FILE" ]]; then
|
|
248
|
-
CURRENT_STATUS=$(python3 -c "import json; d=json.load(open('$STATUS_FILE')); print(d.get('status','?'))")
|
|
249
257
|
CURRENT_RETRY=$(python3 -c "import json; d=json.load(open('$STATUS_FILE')); print(d.get('retry_count',0))")
|
|
250
258
|
SESSION_COUNT=$(python3 -c "import json; d=json.load(open('$STATUS_FILE')); print(len(d.get('sessions',[])))")
|
|
251
259
|
log_info "Current status: $CURRENT_STATUS (retry $CURRENT_RETRY, $SESSION_COUNT sessions)"
|
|
252
260
|
else
|
|
253
|
-
log_info "
|
|
261
|
+
log_info "Current status: $CURRENT_STATUS (no runtime state file)"
|
|
254
262
|
fi
|
|
255
263
|
|
|
256
264
|
BUGFIX_DIR="$PROJECT_ROOT/.prizmkit/bugfix/$CUR_BUG_ID"
|
|
@@ -139,22 +139,20 @@ fi
|
|
|
139
139
|
FEATURE_IDS=()
|
|
140
140
|
|
|
141
141
|
if [[ -n "$FILTER_MODE" ]]; then
|
|
142
|
-
# Filter by status from
|
|
142
|
+
# Filter by status from feature-list.json (single source of truth)
|
|
143
143
|
while IFS= read -r fid; do
|
|
144
144
|
[[ -n "$fid" ]] && FEATURE_IDS+=("$fid")
|
|
145
145
|
done < <(python3 -c "
|
|
146
|
-
import json,
|
|
147
|
-
state_dir = '$STATE_DIR'
|
|
146
|
+
import json, sys
|
|
148
147
|
filter_mode = '$FILTER_MODE'
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
for
|
|
153
|
-
|
|
154
|
-
if not os.path.isfile(status_file):
|
|
148
|
+
feature_list = '$FEATURE_LIST'
|
|
149
|
+
with open(feature_list) as f:
|
|
150
|
+
data = json.load(f)
|
|
151
|
+
for feat in data.get('features', []):
|
|
152
|
+
if not isinstance(feat, dict):
|
|
155
153
|
continue
|
|
156
|
-
|
|
157
|
-
|
|
154
|
+
fid = feat.get('id', '')
|
|
155
|
+
status = feat.get('status', '')
|
|
158
156
|
if filter_mode == 'auto_skipped' and status == 'auto_skipped':
|
|
159
157
|
print(fid)
|
|
160
158
|
elif filter_mode == 'failed' and status == 'failed':
|
|
@@ -253,13 +251,23 @@ sys.exit(1)
|
|
|
253
251
|
echo -e "${BOLD}════════════════════════════════════════════════════${NC}"
|
|
254
252
|
|
|
255
253
|
STATUS_FILE="$STATE_DIR/features/$CUR_FEATURE_ID/status.json"
|
|
254
|
+
# Read status from feature-list.json (single source of truth)
|
|
255
|
+
CURRENT_STATUS=$(python3 -c "
|
|
256
|
+
import json, sys
|
|
257
|
+
with open('$FEATURE_LIST') as f:
|
|
258
|
+
data = json.load(f)
|
|
259
|
+
for feat in data.get('features', []):
|
|
260
|
+
if isinstance(feat, dict) and feat.get('id') == '$CUR_FEATURE_ID':
|
|
261
|
+
print(feat.get('status', '?'))
|
|
262
|
+
sys.exit(0)
|
|
263
|
+
print('?')
|
|
264
|
+
" 2>/dev/null || echo "?")
|
|
256
265
|
if [[ -f "$STATUS_FILE" ]]; then
|
|
257
|
-
CURRENT_STATUS=$(python3 -c "import json; d=json.load(open('$STATUS_FILE')); print(d.get('status','?'))")
|
|
258
266
|
CURRENT_RETRY=$(python3 -c "import json; d=json.load(open('$STATUS_FILE')); print(d.get('retry_count',0))")
|
|
259
267
|
SESSION_COUNT=$(python3 -c "import json; d=json.load(open('$STATUS_FILE')); print(len(d.get('sessions',[])))")
|
|
260
268
|
log_info "Current status: $CURRENT_STATUS (retry $CURRENT_RETRY, $SESSION_COUNT sessions)"
|
|
261
269
|
else
|
|
262
|
-
log_info "
|
|
270
|
+
log_info "Current status: $CURRENT_STATUS (no runtime state file)"
|
|
263
271
|
fi
|
|
264
272
|
|
|
265
273
|
SPECS_DIR="$PROJECT_ROOT/.prizmkit/specs/$FEATURE_SLUG"
|
|
@@ -129,22 +129,20 @@ fi
|
|
|
129
129
|
REFACTOR_IDS=()
|
|
130
130
|
|
|
131
131
|
if [[ -n "$FILTER_MODE" ]]; then
|
|
132
|
-
# Filter by status from
|
|
132
|
+
# Filter by status from refactor-list.json (single source of truth)
|
|
133
133
|
while IFS= read -r rid; do
|
|
134
134
|
[[ -n "$rid" ]] && REFACTOR_IDS+=("$rid")
|
|
135
135
|
done < <(python3 -c "
|
|
136
|
-
import json,
|
|
137
|
-
state_dir = '$STATE_DIR'
|
|
136
|
+
import json, sys
|
|
138
137
|
filter_mode = '$FILTER_MODE'
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
for
|
|
143
|
-
|
|
144
|
-
if not os.path.isfile(status_file):
|
|
138
|
+
refactor_list = '$REFACTOR_LIST'
|
|
139
|
+
with open(refactor_list) as f:
|
|
140
|
+
data = json.load(f)
|
|
141
|
+
for r in data.get('refactors', []):
|
|
142
|
+
if not isinstance(r, dict):
|
|
145
143
|
continue
|
|
146
|
-
|
|
147
|
-
|
|
144
|
+
rid = r.get('id', '')
|
|
145
|
+
status = r.get('status', '')
|
|
148
146
|
if filter_mode == 'auto_skipped' and status == 'auto_skipped':
|
|
149
147
|
print(rid)
|
|
150
148
|
elif filter_mode == 'failed' and status == 'failed':
|
|
@@ -242,13 +240,23 @@ sys.exit(1)
|
|
|
242
240
|
echo -e "${BOLD}════════════════════════════════════════════════════${NC}"
|
|
243
241
|
|
|
244
242
|
STATUS_FILE="$STATE_DIR/refactors/$CUR_REFACTOR_ID/status.json"
|
|
243
|
+
# Read status from refactor-list.json (single source of truth)
|
|
244
|
+
CURRENT_STATUS=$(python3 -c "
|
|
245
|
+
import json, sys
|
|
246
|
+
with open('$REFACTOR_LIST') as f:
|
|
247
|
+
data = json.load(f)
|
|
248
|
+
for r in data.get('refactors', []):
|
|
249
|
+
if isinstance(r, dict) and r.get('id') == '$CUR_REFACTOR_ID':
|
|
250
|
+
print(r.get('status', '?'))
|
|
251
|
+
sys.exit(0)
|
|
252
|
+
print('?')
|
|
253
|
+
" 2>/dev/null || echo "?")
|
|
245
254
|
if [[ -f "$STATUS_FILE" ]]; then
|
|
246
|
-
CURRENT_STATUS=$(python3 -c "import json; d=json.load(open('$STATUS_FILE')); print(d.get('status','?'))")
|
|
247
255
|
CURRENT_RETRY=$(python3 -c "import json; d=json.load(open('$STATUS_FILE')); print(d.get('retry_count',0))")
|
|
248
256
|
SESSION_COUNT=$(python3 -c "import json; d=json.load(open('$STATUS_FILE')); print(len(d.get('sessions',[])))")
|
|
249
257
|
log_info "Current status: $CURRENT_STATUS (retry $CURRENT_RETRY, $SESSION_COUNT sessions)"
|
|
250
258
|
else
|
|
251
|
-
log_info "
|
|
259
|
+
log_info "Current status: $CURRENT_STATUS (no runtime state file)"
|
|
252
260
|
fi
|
|
253
261
|
|
|
254
262
|
SPECS_DIR="$PROJECT_ROOT/.prizmkit/specs/$REFACTOR_SLUG"
|
|
@@ -21,6 +21,7 @@ set -euo pipefail
|
|
|
21
21
|
# PRIZMKIT_PLATFORM Force platform: 'codebuddy' or 'claude' (auto-detected)
|
|
22
22
|
# VERBOSE Set to 1 to enable --verbose on AI CLI
|
|
23
23
|
# HEARTBEAT_INTERVAL Heartbeat log interval in seconds (default: 30)
|
|
24
|
+
# STALE_KILL_THRESHOLD Auto-kill session after N seconds of no progress (default: 900; 0 disables)
|
|
24
25
|
# HEARTBEAT_STALE_THRESHOLD Heartbeat stale threshold in seconds (default: 600)
|
|
25
26
|
# LOG_CLEANUP_ENABLED Run periodic log cleanup (default: 1)
|
|
26
27
|
# LOG_RETENTION_DAYS Delete logs older than N days (default: 14)
|
|
@@ -39,6 +40,7 @@ MAX_RETRIES=${MAX_RETRIES:-3}
|
|
|
39
40
|
SESSION_TIMEOUT=${SESSION_TIMEOUT:-0}
|
|
40
41
|
HEARTBEAT_STALE_THRESHOLD=${HEARTBEAT_STALE_THRESHOLD:-600}
|
|
41
42
|
HEARTBEAT_INTERVAL=${HEARTBEAT_INTERVAL:-30}
|
|
43
|
+
STALE_KILL_THRESHOLD=${STALE_KILL_THRESHOLD:-900}
|
|
42
44
|
LOG_CLEANUP_ENABLED=${LOG_CLEANUP_ENABLED:-1}
|
|
43
45
|
LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-14}
|
|
44
46
|
LOG_MAX_TOTAL_MB=${LOG_MAX_TOTAL_MB:-1024}
|
|
@@ -146,8 +148,8 @@ spawn_and_wait_session() {
|
|
|
146
148
|
watcher_pid=$!
|
|
147
149
|
fi
|
|
148
150
|
|
|
149
|
-
# Heartbeat monitor
|
|
150
|
-
start_heartbeat "$cli_pid" "$session_log" "$progress_json" "$HEARTBEAT_INTERVAL"
|
|
151
|
+
# Heartbeat monitor (with stale-kill protection)
|
|
152
|
+
start_heartbeat "$cli_pid" "$session_log" "$progress_json" "$HEARTBEAT_INTERVAL" "$STALE_KILL_THRESHOLD"
|
|
151
153
|
local heartbeat_pid="${_HEARTBEAT_PID:-}"
|
|
152
154
|
|
|
153
155
|
# Wait for AI CLI to finish
|
|
@@ -166,6 +168,14 @@ spawn_and_wait_session() {
|
|
|
166
168
|
|
|
167
169
|
[[ $exit_code -eq 143 ]] && exit_code=124
|
|
168
170
|
|
|
171
|
+
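# Check for stale-kill marker (heartbeat killed the process due to no progress). NOTE(review): exit code 143 (SIGTERM) is remapped to 124 just above, so a session the heartbeat stopped with SIGTERM may be reported as timed_out rather than stale-killed — confirm the intended precedence.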
|
|
172
|
+
local stale_kill_marker="$session_dir/logs/stale-kill.json"
|
|
173
|
+
local was_stale_killed=false
|
|
174
|
+
if [[ -f "$stale_kill_marker" ]]; then
|
|
175
|
+
was_stale_killed=true
|
|
176
|
+
log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
|
|
177
|
+
fi
|
|
178
|
+
|
|
169
179
|
# Session summary
|
|
170
180
|
if [[ -f "$session_log" ]]; then
|
|
171
181
|
local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
|
|
@@ -183,6 +193,33 @@ spawn_and_wait_session() {
|
|
|
183
193
|
if [[ $exit_code -eq 124 ]]; then
|
|
184
194
|
log_warn "Session timed out after ${SESSION_TIMEOUT}s"
|
|
185
195
|
session_status="timed_out"
|
|
196
|
+
elif [[ "$was_stale_killed" == true ]]; then
|
|
197
|
+
log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
|
|
198
|
+
local has_commits=""
|
|
199
|
+
if git -C "$project_root" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
|
|
200
|
+
has_commits=$(git -C "$project_root" log "${default_branch}..HEAD" --oneline 2>/dev/null | head -1)
|
|
201
|
+
fi
|
|
202
|
+
if [[ -n "$has_commits" ]]; then
|
|
203
|
+
log_info "Stale-killed session has commits — treating as success"
|
|
204
|
+
session_status="success"
|
|
205
|
+
else
|
|
206
|
+
local uncommitted=""
|
|
207
|
+
uncommitted=$(git -C "$project_root" status --porcelain 2>/dev/null | head -1 || true)
|
|
208
|
+
if [[ -n "$uncommitted" ]]; then
|
|
209
|
+
log_warn "Stale-killed session has uncommitted changes — auto-committing..."
|
|
210
|
+
git -C "$project_root" add -A 2>/dev/null || true
|
|
211
|
+
if git -C "$project_root" commit --no-verify -m "chore($bug_id): auto-commit session work (stale-killed)" 2>/dev/null; then
|
|
212
|
+
log_info "Auto-commit succeeded"
|
|
213
|
+
session_status="success"
|
|
214
|
+
else
|
|
215
|
+
log_warn "Auto-commit failed — no changes to commit"
|
|
216
|
+
session_status="crashed"
|
|
217
|
+
fi
|
|
218
|
+
else
|
|
219
|
+
log_warn "Stale-killed session produced no commits and no changes"
|
|
220
|
+
session_status="crashed"
|
|
221
|
+
fi
|
|
222
|
+
fi
|
|
186
223
|
elif [[ $exit_code -ne 0 ]]; then
|
|
187
224
|
log_warn "Session exited with code $exit_code"
|
|
188
225
|
session_status="crashed"
|
|
@@ -1091,6 +1128,7 @@ show_help() {
|
|
|
1091
1128
|
echo " AI_CLI AI CLI command name (auto-detected: cbc or claude)"
|
|
1092
1129
|
echo " VERBOSE Set to 1 for verbose AI CLI output"
|
|
1093
1130
|
echo " HEARTBEAT_INTERVAL Heartbeat log interval in seconds (default: 30)"
|
|
1131
|
+
echo " STALE_KILL_THRESHOLD Auto-kill session after N seconds of no progress (default: 900)"
|
|
1094
1132
|
echo " LOG_CLEANUP_ENABLED Run log cleanup before execution (default: 1)"
|
|
1095
1133
|
echo " LOG_RETENTION_DAYS Delete logs older than N days (default: 14)"
|
|
1096
1134
|
echo " LOG_MAX_TOTAL_MB Keep total logs under N MB (default: 1024)"
|
|
@@ -23,6 +23,7 @@ set -euo pipefail
|
|
|
23
23
|
# MODEL AI model to use (e.g. claude-opus-4.6, claude-sonnet-4.6, claude-haiku-4.5)
|
|
24
24
|
# VERBOSE Set to 1 to enable --verbose on AI CLI (shows subagent output)
|
|
25
25
|
# HEARTBEAT_INTERVAL Heartbeat log interval in seconds (default: 30)
|
|
26
|
+
# STALE_KILL_THRESHOLD Auto-kill session after N seconds of no progress (default: 900; 0 disables)
|
|
26
27
|
# HEARTBEAT_STALE_THRESHOLD Heartbeat stale threshold in seconds (default: 600)
|
|
27
28
|
# LOG_CLEANUP_ENABLED Run periodic log cleanup (default: 1)
|
|
28
29
|
# LOG_RETENTION_DAYS Delete logs older than N days (default: 14)
|
|
@@ -42,6 +43,7 @@ MAX_RETRIES=${MAX_RETRIES:-3}
|
|
|
42
43
|
SESSION_TIMEOUT=${SESSION_TIMEOUT:-0}
|
|
43
44
|
HEARTBEAT_STALE_THRESHOLD=${HEARTBEAT_STALE_THRESHOLD:-600}
|
|
44
45
|
HEARTBEAT_INTERVAL=${HEARTBEAT_INTERVAL:-30}
|
|
46
|
+
STALE_KILL_THRESHOLD=${STALE_KILL_THRESHOLD:-900}
|
|
45
47
|
LOG_CLEANUP_ENABLED=${LOG_CLEANUP_ENABLED:-1}
|
|
46
48
|
LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-14}
|
|
47
49
|
LOG_MAX_TOTAL_MB=${LOG_MAX_TOTAL_MB:-1024}
|
|
@@ -157,7 +159,8 @@ spawn_and_wait_session() {
|
|
|
157
159
|
fi
|
|
158
160
|
|
|
159
161
|
# Heartbeat monitor (reads progress.json when available, falls back to tail)
|
|
160
|
-
|
|
162
|
+
# Also monitors for stale sessions and auto-kills if no progress for STALE_KILL_THRESHOLD seconds
|
|
163
|
+
start_heartbeat "$cbc_pid" "$session_log" "$progress_json" "$HEARTBEAT_INTERVAL" "$STALE_KILL_THRESHOLD"
|
|
161
164
|
local heartbeat_pid="${_HEARTBEAT_PID:-}"
|
|
162
165
|
|
|
163
166
|
# Wait for AI CLI to finish
|
|
@@ -179,6 +182,14 @@ spawn_and_wait_session() {
|
|
|
179
182
|
exit_code=124
|
|
180
183
|
fi
|
|
181
184
|
|
|
185
|
+
# Check for stale-kill marker (heartbeat killed the process due to no progress)
|
|
186
|
+
local stale_kill_marker="$session_dir/logs/stale-kill.json"
|
|
187
|
+
local was_stale_killed=false
|
|
188
|
+
if [[ -f "$stale_kill_marker" ]]; then
|
|
189
|
+
was_stale_killed=true
|
|
190
|
+
log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
|
|
191
|
+
fi
|
|
192
|
+
|
|
182
193
|
# Show final session summary
|
|
183
194
|
if [[ -f "$session_log" ]]; then
|
|
184
195
|
local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
|
|
@@ -198,6 +209,34 @@ spawn_and_wait_session() {
|
|
|
198
209
|
if [[ $exit_code -eq 124 ]]; then
|
|
199
210
|
log_warn "Session timed out after ${SESSION_TIMEOUT}s"
|
|
200
211
|
session_status="timed_out"
|
|
212
|
+
elif [[ "$was_stale_killed" == true ]]; then
|
|
213
|
+
log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
|
|
214
|
+
# Treat stale-killed as potentially successful — check for commits
|
|
215
|
+
local has_commits=""
|
|
216
|
+
if git -C "$project_root" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
|
|
217
|
+
has_commits=$(git -C "$project_root" log "${default_branch}..HEAD" --oneline 2>/dev/null | head -1)
|
|
218
|
+
fi
|
|
219
|
+
if [[ -n "$has_commits" ]]; then
|
|
220
|
+
log_info "Stale-killed session has commits — treating as success"
|
|
221
|
+
session_status="success"
|
|
222
|
+
else
|
|
223
|
+
local uncommitted=""
|
|
224
|
+
uncommitted=$(git -C "$project_root" status --porcelain 2>/dev/null | head -1 || true)
|
|
225
|
+
if [[ -n "$uncommitted" ]]; then
|
|
226
|
+
log_warn "Stale-killed session has uncommitted changes — auto-committing..."
|
|
227
|
+
git -C "$project_root" add -A 2>/dev/null || true
|
|
228
|
+
if git -C "$project_root" commit --no-verify -m "chore($feature_id): auto-commit session work (stale-killed)" 2>/dev/null; then
|
|
229
|
+
log_info "Auto-commit succeeded"
|
|
230
|
+
session_status="success"
|
|
231
|
+
else
|
|
232
|
+
log_warn "Auto-commit failed — no changes to commit"
|
|
233
|
+
session_status="crashed"
|
|
234
|
+
fi
|
|
235
|
+
else
|
|
236
|
+
log_warn "Stale-killed session produced no commits and no changes"
|
|
237
|
+
session_status="crashed"
|
|
238
|
+
fi
|
|
239
|
+
fi
|
|
201
240
|
elif [[ $exit_code -ne 0 ]]; then
|
|
202
241
|
log_warn "Session exited with code $exit_code"
|
|
203
242
|
session_status="crashed"
|
|
@@ -1318,6 +1357,7 @@ show_help() {
|
|
|
1318
1357
|
echo " AI_CLI AI CLI command name (auto-detected: cbc or claude)"
|
|
1319
1358
|
echo " MODEL AI model ID (e.g. claude-opus-4.6, claude-sonnet-4.6, claude-haiku-4.5)"
|
|
1320
1359
|
echo " HEARTBEAT_INTERVAL Heartbeat log interval in seconds (default: 30)"
|
|
1360
|
+
echo " STALE_KILL_THRESHOLD Auto-kill session after N seconds of no progress (default: 900)"
|
|
1321
1361
|
echo " HEARTBEAT_STALE_THRESHOLD Heartbeat stale threshold in seconds (default: 600)"
|
|
1322
1362
|
echo " LOG_CLEANUP_ENABLED Run log cleanup before execution (default: 1)"
|
|
1323
1363
|
echo " LOG_RETENTION_DAYS Delete logs older than N days (default: 14)"
|
|
@@ -21,6 +21,7 @@ set -euo pipefail
|
|
|
21
21
|
# PRIZMKIT_PLATFORM Force platform: 'codebuddy' or 'claude' (auto-detected)
|
|
22
22
|
# VERBOSE Set to 1 to enable --verbose on AI CLI
|
|
23
23
|
# HEARTBEAT_INTERVAL Heartbeat log interval in seconds (default: 30)
|
|
24
|
+
# STALE_KILL_THRESHOLD Auto-kill session after N seconds of no progress (default: 900; 0 disables)
|
|
24
25
|
# HEARTBEAT_STALE_THRESHOLD Heartbeat stale threshold in seconds (default: 600)
|
|
25
26
|
# LOG_CLEANUP_ENABLED Run periodic log cleanup (default: 1)
|
|
26
27
|
# LOG_RETENTION_DAYS Delete logs older than N days (default: 14)
|
|
@@ -40,6 +41,7 @@ MAX_RETRIES=${MAX_RETRIES:-3}
|
|
|
40
41
|
SESSION_TIMEOUT=${SESSION_TIMEOUT:-0}
|
|
41
42
|
HEARTBEAT_STALE_THRESHOLD=${HEARTBEAT_STALE_THRESHOLD:-600}
|
|
42
43
|
HEARTBEAT_INTERVAL=${HEARTBEAT_INTERVAL:-30}
|
|
44
|
+
STALE_KILL_THRESHOLD=${STALE_KILL_THRESHOLD:-900}
|
|
43
45
|
LOG_CLEANUP_ENABLED=${LOG_CLEANUP_ENABLED:-1}
|
|
44
46
|
LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-14}
|
|
45
47
|
LOG_MAX_TOTAL_MB=${LOG_MAX_TOTAL_MB:-1024}
|
|
@@ -148,8 +150,8 @@ spawn_and_wait_session() {
|
|
|
148
150
|
watcher_pid=$!
|
|
149
151
|
fi
|
|
150
152
|
|
|
151
|
-
# Heartbeat monitor
|
|
152
|
-
start_heartbeat "$cli_pid" "$session_log" "$progress_json" "$HEARTBEAT_INTERVAL"
|
|
153
|
+
# Heartbeat monitor (with stale-kill protection)
|
|
154
|
+
start_heartbeat "$cli_pid" "$session_log" "$progress_json" "$HEARTBEAT_INTERVAL" "$STALE_KILL_THRESHOLD"
|
|
153
155
|
local heartbeat_pid="${_HEARTBEAT_PID:-}"
|
|
154
156
|
|
|
155
157
|
# Wait for AI CLI to finish
|
|
@@ -168,6 +170,14 @@ spawn_and_wait_session() {
|
|
|
168
170
|
|
|
169
171
|
[[ $exit_code -eq 143 ]] && exit_code=124
|
|
170
172
|
|
|
173
|
+
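# Check for stale-kill marker (heartbeat killed the process due to no progress). NOTE(review): exit code 143 (SIGTERM) is remapped to 124 just above, so a session the heartbeat stopped with SIGTERM may be reported as timed_out rather than stale-killed — confirm the intended precedence.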
|
|
174
|
+
local stale_kill_marker="$session_dir/logs/stale-kill.json"
|
|
175
|
+
local was_stale_killed=false
|
|
176
|
+
if [[ -f "$stale_kill_marker" ]]; then
|
|
177
|
+
was_stale_killed=true
|
|
178
|
+
log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
|
|
179
|
+
fi
|
|
180
|
+
|
|
171
181
|
# Session summary
|
|
172
182
|
if [[ -f "$session_log" ]]; then
|
|
173
183
|
local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
|
|
@@ -185,6 +195,33 @@ spawn_and_wait_session() {
|
|
|
185
195
|
if [[ $exit_code -eq 124 ]]; then
|
|
186
196
|
log_warn "Session timed out after ${SESSION_TIMEOUT}s"
|
|
187
197
|
session_status="timed_out"
|
|
198
|
+
elif [[ "$was_stale_killed" == true ]]; then
|
|
199
|
+
log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
|
|
200
|
+
local has_commits=""
|
|
201
|
+
if git -C "$project_root" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
|
|
202
|
+
has_commits=$(git -C "$project_root" log "${default_branch}..HEAD" --oneline 2>/dev/null | head -1)
|
|
203
|
+
fi
|
|
204
|
+
if [[ -n "$has_commits" ]]; then
|
|
205
|
+
log_info "Stale-killed session has commits — treating as success"
|
|
206
|
+
session_status="success"
|
|
207
|
+
else
|
|
208
|
+
local uncommitted=""
|
|
209
|
+
uncommitted=$(git -C "$project_root" status --porcelain 2>/dev/null | head -1 || true)
|
|
210
|
+
if [[ -n "$uncommitted" ]]; then
|
|
211
|
+
log_warn "Stale-killed session has uncommitted changes — auto-committing..."
|
|
212
|
+
git -C "$project_root" add -A 2>/dev/null || true
|
|
213
|
+
if git -C "$project_root" commit --no-verify -m "chore($refactor_id): auto-commit session work (stale-killed)" 2>/dev/null; then
|
|
214
|
+
log_info "Auto-commit succeeded"
|
|
215
|
+
session_status="success"
|
|
216
|
+
else
|
|
217
|
+
log_warn "Auto-commit failed — no changes to commit"
|
|
218
|
+
session_status="crashed"
|
|
219
|
+
fi
|
|
220
|
+
else
|
|
221
|
+
log_warn "Stale-killed session produced no commits and no changes"
|
|
222
|
+
session_status="crashed"
|
|
223
|
+
fi
|
|
224
|
+
fi
|
|
188
225
|
elif [[ $exit_code -ne 0 ]]; then
|
|
189
226
|
log_warn "Session exited with code $exit_code"
|
|
190
227
|
session_status="crashed"
|
|
@@ -1129,6 +1166,7 @@ show_help() {
|
|
|
1129
1166
|
echo " VERBOSE Set to 1 for verbose AI CLI output"
|
|
1130
1167
|
echo " STRICT_BEHAVIOR_CHECK Force full test suite after each refactor (default: 1)"
|
|
1131
1168
|
echo " HEARTBEAT_INTERVAL Heartbeat log interval in seconds (default: 30)"
|
|
1169
|
+
echo " STALE_KILL_THRESHOLD Auto-kill session after N seconds of no progress (default: 900)"
|
|
1132
1170
|
echo " LOG_CLEANUP_ENABLED Run log cleanup before execution (default: 1)"
|
|
1133
1171
|
echo " LOG_RETENTION_DAYS Delete logs older than N days (default: 14)"
|
|
1134
1172
|
echo " LOG_MAX_TOTAL_MB Keep total logs under N MB (default: 1024)"
|
|
@@ -226,15 +226,16 @@ def check_stuck_checkpoint(item_dir):
|
|
|
226
226
|
return None
|
|
227
227
|
|
|
228
228
|
|
|
229
|
-
def check_stale_heartbeat(item_id, item_status, state_dir, items_subdir, stale_threshold):
|
|
229
|
+
def check_stale_heartbeat(item_id, item_status, state_dir, items_subdir, stale_threshold, task_list_status=None):
|
|
230
230
|
"""Check 3: Is the heartbeat stale or missing for an in_progress item?
|
|
231
231
|
|
|
232
232
|
Only applies to items whose status indicates active work.
|
|
233
|
-
|
|
233
|
+
Status is read from task_list_status (task list JSON, single source of truth).
|
|
234
|
+
Uses last_session_id from the item's own status.json to find the active session.
|
|
234
235
|
|
|
235
236
|
Returns a stuck-report dict or None.
|
|
236
237
|
"""
|
|
237
|
-
status =
|
|
238
|
+
status = task_list_status
|
|
238
239
|
# All pipelines now use "in_progress" as the active status
|
|
239
240
|
in_progress_statuses = {"in_progress"}
|
|
240
241
|
if status not in in_progress_statuses:
|
|
@@ -287,6 +288,8 @@ def check_stale_heartbeat(item_id, item_status, state_dir, items_subdir, stale_t
|
|
|
287
288
|
def check_dependency_deadlock(item_id, task_list_data, state_dir, items_subdir, items_key):
|
|
288
289
|
"""Check 4: Does this item depend on a failed item?
|
|
289
290
|
|
|
291
|
+
Reads dependency status from task list JSON (single source of truth).
|
|
292
|
+
|
|
290
293
|
Returns a stuck-report dict or None.
|
|
291
294
|
"""
|
|
292
295
|
if task_list_data is None:
|
|
@@ -296,6 +299,12 @@ def check_dependency_deadlock(item_id, task_list_data, state_dir, items_subdir,
|
|
|
296
299
|
if not isinstance(items, list):
|
|
297
300
|
return None
|
|
298
301
|
|
|
302
|
+
# Build status map from task list
|
|
303
|
+
status_map = {}
|
|
304
|
+
for item in items:
|
|
305
|
+
if isinstance(item, dict) and item.get("id"):
|
|
306
|
+
status_map[item["id"]] = item.get("status", "pending")
|
|
307
|
+
|
|
299
308
|
# Find this item in the list to get its dependencies
|
|
300
309
|
deps = None
|
|
301
310
|
for item in items:
|
|
@@ -308,15 +317,9 @@ def check_dependency_deadlock(item_id, task_list_data, state_dir, items_subdir,
|
|
|
308
317
|
if not deps or not isinstance(deps, list):
|
|
309
318
|
return None
|
|
310
319
|
|
|
311
|
-
# Check each dependency's status
|
|
320
|
+
# Check each dependency's status from the task list
|
|
312
321
|
for dep_id in deps:
|
|
313
|
-
|
|
314
|
-
state_dir, items_subdir, dep_id, "status.json"
|
|
315
|
-
)
|
|
316
|
-
dep_status = load_json(dep_status_path)
|
|
317
|
-
if dep_status is None:
|
|
318
|
-
continue
|
|
319
|
-
dep_state = dep_status.get("status")
|
|
322
|
+
dep_state = status_map.get(dep_id)
|
|
320
323
|
if dep_state == "failed":
|
|
321
324
|
return {
|
|
322
325
|
"reason": "dependency_failed",
|
|
@@ -376,8 +379,16 @@ def check_item(item_id, state_dir, items_subdir, items_key, task_list_data, max_
|
|
|
376
379
|
item_status = load_json(status_path)
|
|
377
380
|
|
|
378
381
|
if item_status is None:
|
|
379
|
-
#
|
|
380
|
-
|
|
382
|
+
# Create a minimal runtime dict so checks can proceed
|
|
383
|
+
item_status = {}
|
|
384
|
+
|
|
385
|
+
# Look up item status from task list (single source of truth)
|
|
386
|
+
task_list_status = None
|
|
387
|
+
if task_list_data:
|
|
388
|
+
for item in task_list_data.get(items_key, []):
|
|
389
|
+
if isinstance(item, dict) and item.get("id") == item_id:
|
|
390
|
+
task_list_status = item.get("status", "pending")
|
|
391
|
+
break
|
|
381
392
|
|
|
382
393
|
reports = []
|
|
383
394
|
|
|
@@ -392,7 +403,7 @@ def check_item(item_id, state_dir, items_subdir, items_key, task_list_data, max_
|
|
|
392
403
|
reports.append(result)
|
|
393
404
|
|
|
394
405
|
# Check 3: Stale heartbeat
|
|
395
|
-
result = check_stale_heartbeat(item_id, item_status, state_dir, items_subdir, stale_threshold)
|
|
406
|
+
result = check_stale_heartbeat(item_id, item_status, state_dir, items_subdir, stale_threshold, task_list_status)
|
|
396
407
|
if result is not None:
|
|
397
408
|
reports.append(result)
|
|
398
409
|
|
|
@@ -249,13 +249,8 @@ def create_state_directory(state_dir, bug_list_path, bugs):
|
|
|
249
249
|
sessions_dir = os.path.join(bug_dir, "sessions")
|
|
250
250
|
os.makedirs(sessions_dir, exist_ok=True)
|
|
251
251
|
|
|
252
|
-
# Respect existing terminal status from bug-fix-list.json
|
|
253
|
-
bl_status = bug.get("status", "pending")
|
|
254
|
-
init_status = bl_status if bl_status in TERMINAL_STATUSES else "pending"
|
|
255
|
-
|
|
256
252
|
bug_status = {
|
|
257
253
|
"bug_id": bid,
|
|
258
|
-
"status": init_status,
|
|
259
254
|
"retry_count": 0,
|
|
260
255
|
"max_retries": 3,
|
|
261
256
|
"sessions": [],
|