prizmkit 1.1.66 → 1.1.68
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bundled/VERSION.json +3 -3
- package/bundled/adapters/codex/settings-adapter.js +1 -1
- package/bundled/dev-pipeline/.env.example +3 -0
- package/bundled/dev-pipeline/SCHEMA_ANALYSIS.md +3 -1
- package/bundled/dev-pipeline/lib/common.sh +61 -18
- package/bundled/dev-pipeline/lib/heartbeat.sh +104 -11
- package/bundled/dev-pipeline/run-bugfix.sh +26 -5
- package/bundled/dev-pipeline/run-feature.sh +20 -3
- package/bundled/dev-pipeline/run-refactor.sh +26 -5
- package/bundled/dev-pipeline/scripts/parse-stream-progress.py +144 -12
- package/bundled/dev-pipeline/scripts/update-bug-status.py +15 -0
- package/bundled/dev-pipeline/scripts/update-feature-status.py +18 -0
- package/bundled/dev-pipeline/scripts/update-refactor-status.py +15 -0
- package/bundled/dev-pipeline/tests/test_auto_skip.py +39 -0
- package/bundled/dev-pipeline-windows/.env.example +3 -2
- package/bundled/dev-pipeline-windows/SCHEMA_ANALYSIS.md +4 -3
- package/bundled/dev-pipeline-windows/lib/common.ps1 +97 -5
- package/bundled/dev-pipeline-windows/lib/pipeline.ps1 +31 -7
- package/bundled/dev-pipeline-windows/run-recovery.ps1 +8 -1
- package/bundled/dev-pipeline-windows/scripts/parse-stream-progress.py +144 -12
- package/bundled/dev-pipeline-windows/scripts/update-bug-status.py +15 -0
- package/bundled/dev-pipeline-windows/scripts/update-feature-status.py +18 -0
- package/bundled/dev-pipeline-windows/scripts/update-refactor-status.py +15 -0
- package/bundled/skills/_metadata.json +1 -1
- package/package.json +1 -1
- package/src/scaffold.js +1 -1
package/bundled/VERSION.json
CHANGED
|
@@ -41,6 +41,9 @@
|
|
|
41
41
|
# ─── Logging & Heartbeat ─────────────────────────────────────────────
|
|
42
42
|
# HEARTBEAT_INTERVAL=30 # Heartbeat log interval in seconds
|
|
43
43
|
# HEARTBEAT_STALE_THRESHOLD=600 # Max seconds without heartbeat before marking stale
|
|
44
|
+
# STALE_KILL_THRESHOLD=900 # Auto-kill after N seconds without parent log progress (0 = disabled)
|
|
45
|
+
# CODEX_WAIT_STALE_KILL_THRESHOLD=3600 # Longer no-log window while Codex waits on subagents
|
|
46
|
+
# CODEX_SUBAGENT_TIMEOUT_SECONDS=3300 # Codex subagent max runtime; defaults to wait threshold - 300
|
|
44
47
|
# LOG_CLEANUP_ENABLED=1 # Periodic log cleanup (1=on, 0=off)
|
|
45
48
|
# LOG_RETENTION_DAYS=14 # Delete logs older than N days
|
|
46
49
|
# LOG_MAX_TOTAL_MB=1024 # Keep total logs under N MB via oldest-first cleanup
|
|
@@ -353,6 +353,9 @@ pending, in_progress, completed, failed, skipped
|
|
|
353
353
|
| `DEV_BRANCH` | string | auto-generated | Custom branch name |
|
|
354
354
|
| `HEARTBEAT_INTERVAL` | integer | 30 | Heartbeat log interval (s) |
|
|
355
355
|
| `HEARTBEAT_STALE_THRESHOLD` | integer | 600 | Max seconds without heartbeat |
|
|
356
|
+
| `STALE_KILL_THRESHOLD` | integer | 900 | Auto-kill after N seconds without parent log progress |
|
|
357
|
+
| `CODEX_WAIT_STALE_KILL_THRESHOLD` | integer | 3600 | Longer no-log stale window while Codex waits on subagents |
|
|
358
|
+
| `CODEX_SUBAGENT_TIMEOUT_SECONDS` | integer | 3300 | Codex subagent max runtime |
|
|
356
359
|
| `LOG_CLEANUP_ENABLED` | integer | 1 | Periodic cleanup |
|
|
357
360
|
| `LOG_RETENTION_DAYS` | integer | 14 | Delete logs older than N days |
|
|
358
361
|
| `LOG_MAX_TOTAL_MB` | integer | 1024 | Max total logs (MB) |
|
|
@@ -532,4 +535,3 @@ Located in `/dev-pipeline/templates/`:
|
|
|
532
535
|
- Agent: 6 files
|
|
533
536
|
- Base/Shared: 7 files
|
|
534
537
|
- Singleton: 3 files
|
|
535
|
-
|
|
@@ -344,6 +344,23 @@ prizm_detect_cli_and_platform() {
|
|
|
344
344
|
# command substitution; the background process must remain a child of the
|
|
345
345
|
# runner shell so wait/heartbeat/trap handling works correctly.
|
|
346
346
|
PRIZM_AI_PID=""
|
|
347
|
+
|
|
348
|
+
# Resolve the maximum runtime (seconds) to hand to Codex subagents.
#
# Precedence:
#   1. CODEX_SUBAGENT_TIMEOUT_SECONDS, when it is a positive integer.
#   2. CODEX_WAIT_STALE_KILL_THRESHOLD (default 3600) minus 300, when that
#      threshold is an integer greater than 600.
#   3. 3300 as the final fallback.
#
# Digit strings are always read as base 10. Without the explicit `10#`
# prefix bash treats a leading zero as octal, so "0900" or "08" would raise
# "value too great for base" and silently fall through to the fallback,
# while "0120" would be echoed back unnormalized.
#
# Globals:   CODEX_SUBAGENT_TIMEOUT_SECONDS, CODEX_WAIT_STALE_KILL_THRESHOLD (read)
# Arguments: none
# Outputs:   the timeout in seconds on stdout, without leading zeros
# Returns:   0 always
_prizm_codex_subagent_timeout_seconds() {
    local configured="${CODEX_SUBAGENT_TIMEOUT_SECONDS:-}"
    if [[ "$configured" =~ ^[0-9]+$ ]] && (( 10#$configured > 0 )); then
        printf '%s\n' "$((10#$configured))"
        return 0
    fi

    local wait_threshold="${CODEX_WAIT_STALE_KILL_THRESHOLD:-3600}"
    if [[ "$wait_threshold" =~ ^[0-9]+$ ]] && (( 10#$wait_threshold > 600 )); then
        # Subtract 300s from the wait threshold to derive the subagent limit.
        printf '%s\n' "$((10#$wait_threshold - 300))"
        return 0
    fi

    printf '%s\n' 3300
}
|
|
363
|
+
|
|
347
364
|
prizm_start_ai_session() {
|
|
348
365
|
local prompt_path="$1"
|
|
349
366
|
local log_path="$2"
|
|
@@ -370,15 +387,8 @@ prizm_start_ai_session() {
|
|
|
370
387
|
;;
|
|
371
388
|
codex)
|
|
372
389
|
local codex_args=(--ask-for-approval never --sandbox danger-full-access)
|
|
373
|
-
local codex_subagent_timeout
|
|
374
|
-
|
|
375
|
-
local outer_stale_threshold="${STALE_KILL_THRESHOLD:-900}"
|
|
376
|
-
if [[ "$outer_stale_threshold" =~ ^[0-9]+$ && "$outer_stale_threshold" -gt 120 ]]; then
|
|
377
|
-
codex_subagent_timeout=$((outer_stale_threshold - 60))
|
|
378
|
-
else
|
|
379
|
-
codex_subagent_timeout=840
|
|
380
|
-
fi
|
|
381
|
-
fi
|
|
390
|
+
local codex_subagent_timeout
|
|
391
|
+
codex_subagent_timeout="$(_prizm_codex_subagent_timeout_seconds)"
|
|
382
392
|
if [[ "$codex_subagent_timeout" =~ ^[0-9]+$ && "$codex_subagent_timeout" -gt 0 ]]; then
|
|
383
393
|
codex_args+=(--config "agents.job_max_runtime_seconds=$codex_subagent_timeout")
|
|
384
394
|
fi
|
|
@@ -408,6 +418,46 @@ prizm_start_ai_session() {
|
|
|
408
418
|
PRIZM_AI_PID=$!
|
|
409
419
|
}
|
|
410
420
|
|
|
421
|
+
# Detect AI CLI/provider infrastructure failures that are outside the
# generated code's control. These should be retried without consuming the
# item's code retry budget.
#
# The last 64 KiB of the session log and the whole progress JSON are
# searched case-insensitively for known gateway, network, auth and
# rate-limit markers.
#
# Arguments: $1 - session log path (optional; ignored when missing)
#            $2 - progress JSON path (optional; ignored when missing)
# Returns:   0 when a marker is found, 1 otherwise (including no input)
#
# NOTE(review): the markers are matched against raw log text, so a session
# whose own output mentions e.g. "rate limit" or "overloaded" will also
# match. Confirm that this is acceptable for retry accounting.
prizm_detect_infra_error() {
    local session_log="${1:-}"
    local progress_json="${2:-}"

    local haystack=""
    if [[ -n "$session_log" && -f "$session_log" ]]; then
        # Best-effort read: an unreadable log simply contributes nothing.
        haystack="$(tail -c 65536 "$session_log" 2>/dev/null || true)"
    fi
    if [[ -n "$progress_json" && -f "$progress_json" ]]; then
        haystack+=$'\n'
        haystack+="$(cat "$progress_json" 2>/dev/null || true)"
    fi

    [[ -n "$haystack" ]] || return 1

    # Feed grep from a here-string instead of `printf | grep -q`. With
    # `set -o pipefail`, grep -q exiting on an early match can kill the
    # writer with SIGPIPE, making the pipeline report failure (141) even
    # though the marker was found.
    if grep -Eiq \
        'auth_unavailable|no auth available|502 Bad Gateway|503 Service Unavailable|504 Gateway Timeout|gateway timeout|upstream (connect )?error|connection reset|ECONNRESET|ETIMEDOUT|ENOTFOUND|EAI_AGAIN|rate limit|rate_limit|temporarily unavailable|overloaded' \
        <<< "$haystack"; then
        return 0
    fi

    return 1
}
|
|
446
|
+
|
|
447
|
+
# Extract the "new_status" field from status-updater output read on stdin.
#
# The whole input is tried as JSON first. Callers capture the updater with
# stderr merged into stdout, so when that fails the input is scanned for the
# first embedded JSON object that carries a "new_status" key. Input that is
# not JSON, or JSON that is not an object, yields no output instead of a
# Python traceback and a non-zero exit.
#
# Inputs:   updater output on stdin
# Outputs:  the new_status value on stdout, or nothing when unavailable
# Returns:  exit status of python3 (0 for every parse outcome)
prizm_extract_update_new_status() {
    python3 -c '
import json
import sys

raw = sys.stdin.read()
data = None
try:
    data = json.loads(raw)
except Exception:
    # Mixed output: look for an embedded object that has the key we need.
    decoder = json.JSONDecoder()
    start = raw.find("{")
    while start != -1 and data is None:
        try:
            candidate, _end = decoder.raw_decode(raw, start)
        except Exception:
            candidate = None
        if isinstance(candidate, dict) and "new_status" in candidate:
            data = candidate
        start = raw.find("{", start + 1)

if isinstance(data, dict):
    value = data.get("new_status")
    if value:
        print(value)
'
}
|
|
460
|
+
|
|
411
461
|
# Run an AI CLI session synchronously.
|
|
412
462
|
# Usage: prizm_run_ai_session <prompt_path> <log_path> <model>
|
|
413
463
|
prizm_run_ai_session() {
|
|
@@ -430,15 +480,8 @@ prizm_run_ai_session() {
|
|
|
430
480
|
;;
|
|
431
481
|
codex)
|
|
432
482
|
local codex_args=(--ask-for-approval never --sandbox danger-full-access)
|
|
433
|
-
local codex_subagent_timeout
|
|
434
|
-
|
|
435
|
-
local outer_stale_threshold="${STALE_KILL_THRESHOLD:-900}"
|
|
436
|
-
if [[ "$outer_stale_threshold" =~ ^[0-9]+$ && "$outer_stale_threshold" -gt 120 ]]; then
|
|
437
|
-
codex_subagent_timeout=$((outer_stale_threshold - 60))
|
|
438
|
-
else
|
|
439
|
-
codex_subagent_timeout=840
|
|
440
|
-
fi
|
|
441
|
-
fi
|
|
483
|
+
local codex_subagent_timeout
|
|
484
|
+
codex_subagent_timeout="$(_prizm_codex_subagent_timeout_seconds)"
|
|
442
485
|
if [[ "$codex_subagent_timeout" =~ ^[0-9]+$ && "$codex_subagent_timeout" -gt 0 ]]; then
|
|
443
486
|
codex_args+=(--config "agents.job_max_runtime_seconds=$codex_subagent_timeout")
|
|
444
487
|
fi
|
|
@@ -41,6 +41,7 @@ start_heartbeat() {
|
|
|
41
41
|
(
|
|
42
42
|
local elapsed=0
|
|
43
43
|
local prev_size=0
|
|
44
|
+
local prev_child_activity_signature=""
|
|
44
45
|
local stale_seconds=0
|
|
45
46
|
while kill -0 "$cli_pid" 2>/dev/null; do
|
|
46
47
|
sleep "$heartbeat_interval"
|
|
@@ -57,8 +58,41 @@ start_heartbeat() {
|
|
|
57
58
|
local growth=$((cur_size - prev_size))
|
|
58
59
|
prev_size=$cur_size
|
|
59
60
|
|
|
60
|
-
|
|
61
|
-
|
|
61
|
+
local child_activity_signature=""
|
|
62
|
+
local child_total_bytes=0
|
|
63
|
+
local child_session_count=0
|
|
64
|
+
if [[ -f "$progress_json" ]]; then
|
|
65
|
+
local child_activity_data
|
|
66
|
+
child_activity_data=$(python3 - "$progress_json" <<'PY' 2>/dev/null || true
|
|
67
|
+
import json
|
|
68
|
+
import sys
|
|
69
|
+
|
|
70
|
+
try:
|
|
71
|
+
with open(sys.argv[1], "r", encoding="utf-8") as fh:
|
|
72
|
+
progress = json.load(fh)
|
|
73
|
+
except Exception:
|
|
74
|
+
sys.exit(0)
|
|
75
|
+
|
|
76
|
+
signature = str(progress.get("child_activity_signature") or "")
|
|
77
|
+
total_bytes = int(progress.get("child_total_bytes") or 0)
|
|
78
|
+
session_count = len(progress.get("child_session_files") or [])
|
|
79
|
+
print(f"{signature}\t{total_bytes}\t{session_count}")
|
|
80
|
+
PY
|
|
81
|
+
)
|
|
82
|
+
if [[ -n "$child_activity_data" ]]; then
|
|
83
|
+
IFS=$'\t' read -r child_activity_signature child_total_bytes child_session_count <<< "$child_activity_data"
|
|
84
|
+
fi
|
|
85
|
+
fi
|
|
86
|
+
|
|
87
|
+
local child_growth=0
|
|
88
|
+
if [[ -n "$child_activity_signature" && "$child_activity_signature" != "$prev_child_activity_signature" ]]; then
|
|
89
|
+
child_growth=1
|
|
90
|
+
fi
|
|
91
|
+
prev_child_activity_signature="$child_activity_signature"
|
|
92
|
+
|
|
93
|
+
# Track progress staleness. A Codex parent can sit in `wait`
|
|
94
|
+
# while child transcripts keep growing, so child activity counts.
|
|
95
|
+
if [[ $growth -eq 0 && $child_growth -eq 0 ]]; then
|
|
62
96
|
stale_seconds=$((stale_seconds + heartbeat_interval))
|
|
63
97
|
else
|
|
64
98
|
stale_seconds=0
|
|
@@ -72,28 +106,87 @@ start_heartbeat() {
|
|
|
72
106
|
else
|
|
73
107
|
size_display="${cur_size}B"
|
|
74
108
|
fi
|
|
109
|
+
local child_display=""
|
|
110
|
+
if [[ ${child_total_bytes:-0} -gt 0 ]]; then
|
|
111
|
+
local child_size_display
|
|
112
|
+
if [[ $child_total_bytes -gt 1048576 ]]; then
|
|
113
|
+
child_size_display="$((child_total_bytes / 1048576))MB"
|
|
114
|
+
elif [[ $child_total_bytes -gt 1024 ]]; then
|
|
115
|
+
child_size_display="$((child_total_bytes / 1024))KB"
|
|
116
|
+
else
|
|
117
|
+
child_size_display="${child_total_bytes}B"
|
|
118
|
+
fi
|
|
119
|
+
child_display=" | child: ${child_size_display}"
|
|
120
|
+
if [[ ${child_session_count:-0} -gt 1 ]]; then
|
|
121
|
+
child_display="${child_display}/${child_session_count}"
|
|
122
|
+
fi
|
|
123
|
+
fi
|
|
75
124
|
|
|
76
125
|
local mins=$((elapsed / 60))
|
|
77
126
|
local secs=$((elapsed % 60))
|
|
78
127
|
|
|
79
128
|
local status_icon
|
|
80
|
-
if [[ $growth -gt 0 ]]; then
|
|
129
|
+
if [[ $growth -gt 0 || $child_growth -gt 0 ]]; then
|
|
81
130
|
status_icon="${GREEN}▶${NC}"
|
|
82
131
|
else
|
|
83
132
|
status_icon="${YELLOW}⏸${NC}"
|
|
84
133
|
fi
|
|
85
134
|
|
|
86
|
-
|
|
87
|
-
if [[ $stale_kill_threshold -gt 0 &&
|
|
135
|
+
local effective_stale_kill_threshold="$stale_kill_threshold"
|
|
136
|
+
if [[ $stale_kill_threshold -gt 0 && -f "$progress_json" ]]; then
|
|
137
|
+
local codex_wait_threshold
|
|
138
|
+
codex_wait_threshold=$(python3 - "$progress_json" "$stale_kill_threshold" <<'PY' 2>/dev/null || true
|
|
139
|
+
import json
|
|
140
|
+
import os
|
|
141
|
+
import sys
|
|
142
|
+
|
|
143
|
+
progress_path = sys.argv[1]
|
|
144
|
+
base_threshold = int(sys.argv[2])
|
|
145
|
+
|
|
146
|
+
with open(progress_path, "r", encoding="utf-8") as fh:
|
|
147
|
+
progress = json.load(fh)
|
|
148
|
+
|
|
149
|
+
spawn_count = 0
|
|
150
|
+
for tool in progress.get("tool_calls", []):
|
|
151
|
+
if isinstance(tool, dict) and tool.get("name") == "spawn_agent":
|
|
152
|
+
try:
|
|
153
|
+
spawn_count += int(tool.get("count", 0))
|
|
154
|
+
except (TypeError, ValueError):
|
|
155
|
+
pass
|
|
156
|
+
|
|
157
|
+
if (
|
|
158
|
+
progress.get("event_format") == "codex-json"
|
|
159
|
+
and progress.get("current_tool") == "wait"
|
|
160
|
+
and spawn_count > 0
|
|
161
|
+
):
|
|
162
|
+
configured = os.environ.get("CODEX_WAIT_STALE_KILL_THRESHOLD", "")
|
|
163
|
+
try:
|
|
164
|
+
wait_threshold = int(configured)
|
|
165
|
+
except ValueError:
|
|
166
|
+
wait_threshold = max(base_threshold * 4, 3600)
|
|
167
|
+
if wait_threshold > base_threshold:
|
|
168
|
+
print(wait_threshold)
|
|
169
|
+
PY
|
|
170
|
+
)
|
|
171
|
+
if [[ "$codex_wait_threshold" =~ ^[0-9]+$ && "$codex_wait_threshold" -gt "$stale_kill_threshold" ]]; then
|
|
172
|
+
effective_stale_kill_threshold="$codex_wait_threshold"
|
|
173
|
+
fi
|
|
174
|
+
fi
|
|
175
|
+
|
|
176
|
+
# Stale-kill: auto-terminate process if no progress for too long.
|
|
177
|
+
# Codex parent sessions can sit on the `wait` tool while a spawned
|
|
178
|
+
# subagent is still doing useful work. Give that valid wait a longer
|
|
179
|
+
# stale window; normal single-agent stalls still use the base limit.
|
|
180
|
+
if [[ $effective_stale_kill_threshold -gt 0 && $stale_seconds -ge $effective_stale_kill_threshold ]]; then
|
|
88
181
|
local stale_mins=$((stale_seconds / 60))
|
|
89
|
-
echo -e " ${RED}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display} | ${RED}STALE-KILL: no progress for ${stale_mins}m (threshold: ${
|
|
182
|
+
echo -e " ${RED}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display} | ${RED}STALE-KILL: no progress for ${stale_mins}m (threshold: ${effective_stale_kill_threshold}s)${NC}"
|
|
90
183
|
echo -e " ${RED}[HEARTBEAT]${NC} Killing AI CLI process $cli_pid (stale session)..."
|
|
91
184
|
# Write the marker before killing. Some CLIs exit quickly, and the
|
|
92
185
|
# parent runner may stop this heartbeat process immediately after
|
|
93
186
|
# wait(1) returns.
|
|
94
187
|
local _marker_dir
|
|
95
188
|
_marker_dir="$(dirname "$session_log")"
|
|
96
|
-
echo "{\"killed_at\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\", \"reason\": \"stale_session\", \"stale_seconds\": $stale_seconds, \"threshold\": $
|
|
189
|
+
echo "{\"killed_at\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\", \"reason\": \"stale_session\", \"stale_seconds\": $stale_seconds, \"threshold\": $effective_stale_kill_threshold}" > "$_marker_dir/stale-kill.json" 2>/dev/null || true
|
|
97
190
|
kill -TERM "$cli_pid" 2>/dev/null || true
|
|
98
191
|
# Give process 10s to exit gracefully, then force kill
|
|
99
192
|
local stale_kill_grace_seconds="${STALE_KILL_GRACE_SECONDS:-10}"
|
|
@@ -109,9 +202,9 @@ start_heartbeat() {
|
|
|
109
202
|
|
|
110
203
|
# Build staleness hint for display
|
|
111
204
|
local stale_hint=""
|
|
112
|
-
if [[ $
|
|
205
|
+
if [[ $effective_stale_kill_threshold -gt 0 && $stale_seconds -gt 0 ]]; then
|
|
113
206
|
local stale_mins=$((stale_seconds / 60))
|
|
114
|
-
local threshold_mins=$((
|
|
207
|
+
local threshold_mins=$((effective_stale_kill_threshold / 60))
|
|
115
208
|
stale_hint=" | stale: ${stale_mins}m/${threshold_mins}m"
|
|
116
209
|
fi
|
|
117
210
|
|
|
@@ -134,7 +227,7 @@ try:
|
|
|
134
227
|
except Exception:
|
|
135
228
|
sys.exit(1)
|
|
136
229
|
" "$progress_json" 2>/dev/null) && {
|
|
137
|
-
echo -e " ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display} | ${phase}${stale_hint}"
|
|
230
|
+
echo -e " ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display}${child_display} | ${phase}${stale_hint}"
|
|
138
231
|
continue
|
|
139
232
|
}
|
|
140
233
|
fi
|
|
@@ -145,7 +238,7 @@ except Exception:
|
|
|
145
238
|
last_activity=$(tail -20 "$session_log" 2>/dev/null | grep -v '^$' | tail -1 | cut -c1-80 || echo "")
|
|
146
239
|
fi
|
|
147
240
|
|
|
148
|
-
echo -e " ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s elapsed | log: ${size_display} (+${growth}B) | ${last_activity}${stale_hint}"
|
|
241
|
+
echo -e " ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s elapsed | log: ${size_display}${child_display} (+${growth}B) | ${last_activity}${stale_hint}"
|
|
149
242
|
done
|
|
150
243
|
) &
|
|
151
244
|
_HEARTBEAT_PID=$!
|
|
@@ -145,6 +145,11 @@ spawn_and_wait_session() {
|
|
|
145
145
|
log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
|
|
146
146
|
fi
|
|
147
147
|
|
|
148
|
+
local was_infra_error=false
|
|
149
|
+
if [[ $exit_code -ne 0 ]] && prizm_detect_infra_error "$session_log" "$progress_json"; then
|
|
150
|
+
was_infra_error=true
|
|
151
|
+
fi
|
|
152
|
+
|
|
148
153
|
# Session summary
|
|
149
154
|
if [[ -f "$session_log" ]]; then
|
|
150
155
|
local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
|
|
@@ -162,6 +167,10 @@ spawn_and_wait_session() {
|
|
|
162
167
|
if [[ $exit_code -eq 124 ]]; then
|
|
163
168
|
log_warn "Session timed out after ${SESSION_TIMEOUT}s"
|
|
164
169
|
session_status="timed_out"
|
|
170
|
+
elif [[ "$was_infra_error" == true ]]; then
|
|
171
|
+
log_warn "Session failed due to AI CLI/provider infrastructure error"
|
|
172
|
+
log_warn "Infrastructure errors are retried without consuming code retry budget"
|
|
173
|
+
session_status="infra_error"
|
|
165
174
|
elif [[ "$was_stale_killed" == true ]]; then
|
|
166
175
|
log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
|
|
167
176
|
log_warn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
|
|
@@ -259,14 +268,20 @@ sys.exit(0)
|
|
|
259
268
|
prizm_detect_subagents "$session_log"
|
|
260
269
|
|
|
261
270
|
# Update bug status (do NOT commit on dev branch — commit happens after merge)
|
|
262
|
-
|
|
271
|
+
local update_output
|
|
272
|
+
update_output=$(python3 "$SCRIPTS_DIR/update-bug-status.py" \
|
|
263
273
|
--bug-list "$bug_list" \
|
|
264
274
|
--state-dir "$STATE_DIR" \
|
|
265
275
|
--bug-id "$bug_id" \
|
|
266
276
|
--session-status "$session_status" \
|
|
267
277
|
--session-id "$session_id" \
|
|
268
278
|
--max-retries "$max_retries" \
|
|
269
|
-
--action update
|
|
279
|
+
--action update 2>&1) || {
|
|
280
|
+
log_error "Failed to update bug status: $update_output"
|
|
281
|
+
update_output=""
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
_SPAWN_ITEM_STATUS="$(printf '%s' "$update_output" | prizm_extract_update_new_status)"
|
|
270
285
|
|
|
271
286
|
_SPAWN_RESULT="$session_status"
|
|
272
287
|
}
|
|
@@ -693,6 +708,7 @@ else:
|
|
|
693
708
|
trap cleanup_single_bug SIGINT SIGTERM
|
|
694
709
|
|
|
695
710
|
_SPAWN_RESULT=""
|
|
711
|
+
_SPAWN_ITEM_STATUS=""
|
|
696
712
|
|
|
697
713
|
# Branch lifecycle: create and checkout bugfix branch
|
|
698
714
|
local _proj_root
|
|
@@ -1078,12 +1094,14 @@ DEPLOY_PROMPT_EOF
|
|
|
1078
1094
|
# Spawn session
|
|
1079
1095
|
log_info "Spawning AI CLI session: $session_id"
|
|
1080
1096
|
_SPAWN_RESULT=""
|
|
1097
|
+
_SPAWN_ITEM_STATUS=""
|
|
1081
1098
|
|
|
1082
1099
|
spawn_and_wait_session \
|
|
1083
1100
|
"$bug_id" "$bug_list" "$session_id" \
|
|
1084
1101
|
"$bootstrap_prompt" "$session_dir" "$MAX_RETRIES" "$bug_model" "$_ORIGINAL_BRANCH"
|
|
1085
1102
|
|
|
1086
1103
|
local session_status="$_SPAWN_RESULT"
|
|
1104
|
+
local item_status_after_session="${_SPAWN_ITEM_STATUS:-}"
|
|
1087
1105
|
|
|
1088
1106
|
# Merge per-bug dev branch back to original on success
|
|
1089
1107
|
if [[ "$session_status" == "success" && -n "$_DEV_BRANCH_NAME" ]]; then
|
|
@@ -1112,15 +1130,18 @@ DEPLOY_PROMPT_EOF
|
|
|
1112
1130
|
session_count=$((session_count + 1))
|
|
1113
1131
|
total_subagent_calls=$((total_subagent_calls + _SUBAGENT_COUNT))
|
|
1114
1132
|
|
|
1115
|
-
# Stop-on-failure: abort
|
|
1116
|
-
|
|
1133
|
+
# Stop-on-failure: abort only after the task is actually marked failed.
|
|
1134
|
+
# Pending retry outcomes, including infrastructure errors, keep running.
|
|
1135
|
+
if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" && "$item_status_after_session" == "failed" ]]; then
|
|
1117
1136
|
echo ""
|
|
1118
1137
|
log_error "════════════════════════════════════════════════════"
|
|
1119
|
-
log_error " STOP_ON_FAILURE: Pipeline halted after $bug_id
|
|
1138
|
+
log_error " STOP_ON_FAILURE: Pipeline halted after $bug_id exhausted retries."
|
|
1120
1139
|
log_error " Total sessions completed: $session_count"
|
|
1121
1140
|
log_error " Set STOP_ON_FAILURE=0 to continue past failures."
|
|
1122
1141
|
log_error "════════════════════════════════════════════════════"
|
|
1123
1142
|
break
|
|
1143
|
+
elif [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
|
|
1144
|
+
log_info "STOP_ON_FAILURE: $bug_id is ${item_status_after_session:-unknown}; retry budget not exhausted, continuing."
|
|
1124
1145
|
fi
|
|
1125
1146
|
|
|
1126
1147
|
# Stuck detection
|
|
@@ -153,6 +153,11 @@ spawn_and_wait_session() {
|
|
|
153
153
|
log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
|
|
154
154
|
fi
|
|
155
155
|
|
|
156
|
+
local was_infra_error=false
|
|
157
|
+
if [[ $exit_code -ne 0 ]] && prizm_detect_infra_error "$session_log" "$progress_json"; then
|
|
158
|
+
was_infra_error=true
|
|
159
|
+
fi
|
|
160
|
+
|
|
156
161
|
# Show final session summary
|
|
157
162
|
if [[ -f "$session_log" ]]; then
|
|
158
163
|
local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
|
|
@@ -172,6 +177,10 @@ spawn_and_wait_session() {
|
|
|
172
177
|
if [[ $exit_code -eq 124 ]]; then
|
|
173
178
|
log_warn "Session timed out after ${SESSION_TIMEOUT}s"
|
|
174
179
|
session_status="timed_out"
|
|
180
|
+
elif [[ "$was_infra_error" == true ]]; then
|
|
181
|
+
log_warn "Session failed due to AI CLI/provider infrastructure error"
|
|
182
|
+
log_warn "Infrastructure errors are retried without consuming code retry budget"
|
|
183
|
+
session_status="infra_error"
|
|
175
184
|
elif [[ "$was_stale_killed" == true ]]; then
|
|
176
185
|
log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
|
|
177
186
|
log_warn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
|
|
@@ -347,6 +356,8 @@ sys.exit(0)
|
|
|
347
356
|
log_error ".prizmkit/plans/feature-list.json may be out of sync. Manual intervention needed."
|
|
348
357
|
}
|
|
349
358
|
|
|
359
|
+
_SPAWN_ITEM_STATUS="$(printf '%s' "$update_output" | prizm_extract_update_new_status)"
|
|
360
|
+
|
|
350
361
|
# Return status via global variable (avoids $() swallowing stdout)
|
|
351
362
|
_SPAWN_RESULT="$session_status"
|
|
352
363
|
}
|
|
@@ -848,6 +859,7 @@ else:
|
|
|
848
859
|
trap cleanup_single_feature SIGINT SIGTERM
|
|
849
860
|
|
|
850
861
|
_SPAWN_RESULT=""
|
|
862
|
+
_SPAWN_ITEM_STATUS=""
|
|
851
863
|
|
|
852
864
|
# Branch lifecycle: create and checkout feature branch
|
|
853
865
|
local _proj_root
|
|
@@ -1300,11 +1312,13 @@ DEPLOY_PROMPT_EOF
|
|
|
1300
1312
|
log_info "Feature model: $feature_model"
|
|
1301
1313
|
fi
|
|
1302
1314
|
_SPAWN_RESULT=""
|
|
1315
|
+
_SPAWN_ITEM_STATUS=""
|
|
1303
1316
|
|
|
1304
1317
|
spawn_and_wait_session \
|
|
1305
1318
|
"$feature_id" "$feature_list" "$session_id" \
|
|
1306
1319
|
"$bootstrap_prompt" "$session_dir" "$MAX_RETRIES" "$feature_model" "$_ORIGINAL_BRANCH"
|
|
1307
1320
|
local session_status="$_SPAWN_RESULT"
|
|
1321
|
+
local item_status_after_session="${_SPAWN_ITEM_STATUS:-}"
|
|
1308
1322
|
|
|
1309
1323
|
# Merge per-feature dev branch back to original on success
|
|
1310
1324
|
if [[ "$session_status" == "success" && -n "$_DEV_BRANCH_NAME" ]]; then
|
|
@@ -1333,15 +1347,18 @@ DEPLOY_PROMPT_EOF
|
|
|
1333
1347
|
session_count=$((session_count + 1))
|
|
1334
1348
|
total_subagent_calls=$((total_subagent_calls + _SUBAGENT_COUNT))
|
|
1335
1349
|
|
|
1336
|
-
# Stop-on-failure: abort
|
|
1337
|
-
|
|
1350
|
+
# Stop-on-failure: abort only after the task is actually marked failed.
|
|
1351
|
+
# Pending retry outcomes, including infrastructure errors, keep running.
|
|
1352
|
+
if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" && "$item_status_after_session" == "failed" ]]; then
|
|
1338
1353
|
echo ""
|
|
1339
1354
|
log_error "════════════════════════════════════════════════════"
|
|
1340
|
-
log_error " STOP_ON_FAILURE: Pipeline halted after $feature_id
|
|
1355
|
+
log_error " STOP_ON_FAILURE: Pipeline halted after $feature_id exhausted retries."
|
|
1341
1356
|
log_error " Total sessions completed: $session_count"
|
|
1342
1357
|
log_error " Set STOP_ON_FAILURE=0 to continue past failures."
|
|
1343
1358
|
log_error "════════════════════════════════════════════════════"
|
|
1344
1359
|
break
|
|
1360
|
+
elif [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
|
|
1361
|
+
log_info "STOP_ON_FAILURE: $feature_id is ${item_status_after_session:-unknown}; retry budget not exhausted, continuing."
|
|
1345
1362
|
fi
|
|
1346
1363
|
|
|
1347
1364
|
# Brief pause before next iteration
|
|
@@ -147,6 +147,11 @@ spawn_and_wait_session() {
|
|
|
147
147
|
log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
|
|
148
148
|
fi
|
|
149
149
|
|
|
150
|
+
local was_infra_error=false
|
|
151
|
+
if [[ $exit_code -ne 0 ]] && prizm_detect_infra_error "$session_log" "$progress_json"; then
|
|
152
|
+
was_infra_error=true
|
|
153
|
+
fi
|
|
154
|
+
|
|
150
155
|
# Session summary
|
|
151
156
|
if [[ -f "$session_log" ]]; then
|
|
152
157
|
local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
|
|
@@ -164,6 +169,10 @@ spawn_and_wait_session() {
|
|
|
164
169
|
if [[ $exit_code -eq 124 ]]; then
|
|
165
170
|
log_warn "Session timed out after ${SESSION_TIMEOUT}s"
|
|
166
171
|
session_status="timed_out"
|
|
172
|
+
elif [[ "$was_infra_error" == true ]]; then
|
|
173
|
+
log_warn "Session failed due to AI CLI/provider infrastructure error"
|
|
174
|
+
log_warn "Infrastructure errors are retried without consuming code retry budget"
|
|
175
|
+
session_status="infra_error"
|
|
167
176
|
elif [[ "$was_stale_killed" == true ]]; then
|
|
168
177
|
log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
|
|
169
178
|
log_warn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
|
|
@@ -286,14 +295,20 @@ sys.exit(0)
|
|
|
286
295
|
fi
|
|
287
296
|
|
|
288
297
|
# Update refactor status (do NOT commit on dev branch — commit happens after merge)
|
|
289
|
-
|
|
298
|
+
local update_output
|
|
299
|
+
update_output=$(python3 "$SCRIPTS_DIR/update-refactor-status.py" \
|
|
290
300
|
--refactor-list "$refactor_list" \
|
|
291
301
|
--state-dir "$STATE_DIR" \
|
|
292
302
|
--refactor-id "$refactor_id" \
|
|
293
303
|
--session-status "$session_status" \
|
|
294
304
|
--session-id "$session_id" \
|
|
295
305
|
--max-retries "$max_retries" \
|
|
296
|
-
--action update
|
|
306
|
+
--action update 2>&1) || {
|
|
307
|
+
log_error "Failed to update refactor status: $update_output"
|
|
308
|
+
update_output=""
|
|
309
|
+
}
|
|
310
|
+
|
|
311
|
+
_SPAWN_ITEM_STATUS="$(printf '%s' "$update_output" | prizm_extract_update_new_status)"
|
|
297
312
|
|
|
298
313
|
_SPAWN_RESULT="$session_status"
|
|
299
314
|
}
|
|
@@ -723,6 +738,7 @@ else:
|
|
|
723
738
|
trap cleanup_single_refactor SIGINT SIGTERM
|
|
724
739
|
|
|
725
740
|
_SPAWN_RESULT=""
|
|
741
|
+
_SPAWN_ITEM_STATUS=""
|
|
726
742
|
|
|
727
743
|
# Branch lifecycle: create and checkout refactor branch
|
|
728
744
|
local _proj_root
|
|
@@ -1114,6 +1130,7 @@ DEPLOY_PROMPT_EOF
|
|
|
1114
1130
|
# Spawn session
|
|
1115
1131
|
log_info "Spawning AI CLI session: $session_id"
|
|
1116
1132
|
_SPAWN_RESULT=""
|
|
1133
|
+
_SPAWN_ITEM_STATUS=""
|
|
1117
1134
|
|
|
1118
1135
|
spawn_and_wait_session \
|
|
1119
1136
|
"$refactor_id" "$refactor_list" "$session_id" \
|
|
@@ -1130,6 +1147,7 @@ DEPLOY_PROMPT_EOF
|
|
|
1130
1147
|
fi
|
|
1131
1148
|
|
|
1132
1149
|
local session_status="$_SPAWN_RESULT"
|
|
1150
|
+
local item_status_after_session="${_SPAWN_ITEM_STATUS:-}"
|
|
1133
1151
|
|
|
1134
1152
|
# Merge per-refactor dev branch back to original on success
|
|
1135
1153
|
if [[ "$session_status" == "success" && -n "$_DEV_BRANCH_NAME" ]]; then
|
|
@@ -1168,15 +1186,18 @@ DEPLOY_PROMPT_EOF
|
|
|
1168
1186
|
session_count=$((session_count + 1))
|
|
1169
1187
|
total_subagent_calls=$((total_subagent_calls + _SUBAGENT_COUNT))
|
|
1170
1188
|
|
|
1171
|
-
# Stop-on-failure: abort
|
|
1172
|
-
|
|
1189
|
+
# Stop-on-failure: abort only after the task is actually marked failed.
|
|
1190
|
+
# Pending retry outcomes, including infrastructure errors, keep running.
|
|
1191
|
+
if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" && "$item_status_after_session" == "failed" ]]; then
|
|
1173
1192
|
echo ""
|
|
1174
1193
|
log_error "════════════════════════════════════════════════════"
|
|
1175
|
-
log_error " STOP_ON_FAILURE: Pipeline halted after $refactor_id
|
|
1194
|
+
log_error " STOP_ON_FAILURE: Pipeline halted after $refactor_id exhausted retries."
|
|
1176
1195
|
log_error " Total sessions completed: $session_count"
|
|
1177
1196
|
log_error " Set STOP_ON_FAILURE=0 to continue past failures."
|
|
1178
1197
|
log_error "════════════════════════════════════════════════════"
|
|
1179
1198
|
break
|
|
1199
|
+
elif [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
|
|
1200
|
+
log_info "STOP_ON_FAILURE: $refactor_id is ${item_status_after_session:-unknown}; retry budget not exhausted, continuing."
|
|
1180
1201
|
fi
|
|
1181
1202
|
|
|
1182
1203
|
log_info "Pausing 5s before next refactor..."
|