prizmkit 1.1.66 → 1.1.68

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. package/bundled/VERSION.json +3 -3
  2. package/bundled/adapters/codex/settings-adapter.js +1 -1
  3. package/bundled/dev-pipeline/.env.example +3 -0
  4. package/bundled/dev-pipeline/SCHEMA_ANALYSIS.md +3 -1
  5. package/bundled/dev-pipeline/lib/common.sh +61 -18
  6. package/bundled/dev-pipeline/lib/heartbeat.sh +104 -11
  7. package/bundled/dev-pipeline/run-bugfix.sh +26 -5
  8. package/bundled/dev-pipeline/run-feature.sh +20 -3
  9. package/bundled/dev-pipeline/run-refactor.sh +26 -5
  10. package/bundled/dev-pipeline/scripts/parse-stream-progress.py +144 -12
  11. package/bundled/dev-pipeline/scripts/update-bug-status.py +15 -0
  12. package/bundled/dev-pipeline/scripts/update-feature-status.py +18 -0
  13. package/bundled/dev-pipeline/scripts/update-refactor-status.py +15 -0
  14. package/bundled/dev-pipeline/tests/test_auto_skip.py +39 -0
  15. package/bundled/dev-pipeline-windows/.env.example +3 -2
  16. package/bundled/dev-pipeline-windows/SCHEMA_ANALYSIS.md +4 -3
  17. package/bundled/dev-pipeline-windows/lib/common.ps1 +97 -5
  18. package/bundled/dev-pipeline-windows/lib/pipeline.ps1 +31 -7
  19. package/bundled/dev-pipeline-windows/run-recovery.ps1 +8 -1
  20. package/bundled/dev-pipeline-windows/scripts/parse-stream-progress.py +144 -12
  21. package/bundled/dev-pipeline-windows/scripts/update-bug-status.py +15 -0
  22. package/bundled/dev-pipeline-windows/scripts/update-feature-status.py +18 -0
  23. package/bundled/dev-pipeline-windows/scripts/update-refactor-status.py +15 -0
  24. package/bundled/skills/_metadata.json +1 -1
  25. package/package.json +1 -1
  26. package/src/scaffold.js +1 -1
@@ -1,5 +1,5 @@
1
1
  {
2
- "frameworkVersion": "1.1.66",
3
- "bundledAt": "2026-06-08T19:45:48.630Z",
4
- "bundledFrom": "940cbd4"
2
+ "frameworkVersion": "1.1.68",
3
+ "bundledAt": "2026-06-09T14:36:58.835Z",
4
+ "bundledFrom": "82060fd"
5
5
  }
@@ -20,7 +20,7 @@ project_doc_fallback_filenames = ["CLAUDE.md", "CODEBUDDY.md"]
20
20
 
21
21
  [agents]
22
22
  max_depth = 1
23
- job_max_runtime_seconds = 840
23
+ job_max_runtime_seconds = 3300
24
24
  `;
25
25
 
26
26
  await writeFile(configPath, configToml);
@@ -41,6 +41,9 @@
41
41
  # ─── Logging & Heartbeat ─────────────────────────────────────────────
42
42
  # HEARTBEAT_INTERVAL=30 # Heartbeat log interval in seconds
43
43
  # HEARTBEAT_STALE_THRESHOLD=600 # Max seconds without heartbeat before marking stale
44
+ # STALE_KILL_THRESHOLD=900 # Auto-kill after N seconds without parent log progress (0 = disabled)
45
+ # CODEX_WAIT_STALE_KILL_THRESHOLD=3600 # Longer no-log window while Codex waits on subagents
46
+ # CODEX_SUBAGENT_TIMEOUT_SECONDS=3300 # Codex subagent max runtime; defaults to wait threshold - 300
44
47
  # LOG_CLEANUP_ENABLED=1 # Periodic log cleanup (1=on, 0=off)
45
48
  # LOG_RETENTION_DAYS=14 # Delete logs older than N days
46
49
  # LOG_MAX_TOTAL_MB=1024 # Keep total logs under N MB via oldest-first cleanup
@@ -353,6 +353,9 @@ pending, in_progress, completed, failed, skipped
353
353
  | `DEV_BRANCH` | string | auto-generated | Custom branch name |
354
354
  | `HEARTBEAT_INTERVAL` | integer | 30 | Heartbeat log interval (s) |
355
355
  | `HEARTBEAT_STALE_THRESHOLD` | integer | 600 | Max seconds without heartbeat |
356
+ | `STALE_KILL_THRESHOLD` | integer | 900 | Auto-kill after N seconds without parent log progress |
357
+ | `CODEX_WAIT_STALE_KILL_THRESHOLD` | integer | 3600 | Longer no-log stale window while Codex waits on subagents |
358
+ | `CODEX_SUBAGENT_TIMEOUT_SECONDS` | integer | 3300 | Codex subagent max runtime |
356
359
  | `LOG_CLEANUP_ENABLED` | integer | 1 | Periodic cleanup |
357
360
  | `LOG_RETENTION_DAYS` | integer | 14 | Delete logs older than N days |
358
361
  | `LOG_MAX_TOTAL_MB` | integer | 1024 | Max total logs (MB) |
@@ -532,4 +535,3 @@ Located in `/dev-pipeline/templates/`:
532
535
  - Agent: 6 files
533
536
  - Base/Shared: 7 files
534
537
  - Singleton: 3 files
535
-
@@ -344,6 +344,23 @@ prizm_detect_cli_and_platform() {
344
344
  # command substitution; the background process must remain a child of the
345
345
  # runner shell so wait/heartbeat/trap handling works correctly.
346
346
  PRIZM_AI_PID=""
347
+
348
# Resolve the max runtime (in seconds) to hand to Codex subagents.
#
# Resolution order:
#   1. CODEX_SUBAGENT_TIMEOUT_SECONDS, when it is a positive integer.
#   2. CODEX_WAIT_STALE_KILL_THRESHOLD (default 3600) minus 300, when that
#      threshold is an integer greater than 600.
#   3. The fixed fallback 3300.
#
# Globals:   CODEX_SUBAGENT_TIMEOUT_SECONDS (read),
#            CODEX_WAIT_STALE_KILL_THRESHOLD (read)
# Arguments: none
# Outputs:   the resolved timeout as a plain base-10 integer on stdout
# Returns:   0
_prizm_codex_subagent_timeout_seconds() {
  local configured="${CODEX_SUBAGENT_TIMEOUT_SECONDS:-}"
  # The regex guarantees digits only; `10#` forces base-10 so a zero-padded
  # value such as "0900" is not parsed as (invalid) octal, and the printed
  # value is normalised (no leading zeros) before it reaches the CLI config.
  if [[ "$configured" =~ ^[0-9]+$ ]] && (( 10#$configured > 0 )); then
    printf '%s\n' "$((10#$configured))"
    return 0
  fi

  local wait_threshold="${CODEX_WAIT_STALE_KILL_THRESHOLD:-3600}"
  # Only derive from the wait threshold when it leaves a positive margin
  # after subtracting the 300s head-room.
  if [[ "$wait_threshold" =~ ^[0-9]+$ ]] && (( 10#$wait_threshold > 600 )); then
    printf '%s\n' "$((10#$wait_threshold - 300))"
    return 0
  fi

  printf '%s\n' 3300
}
363
+
347
364
  prizm_start_ai_session() {
348
365
  local prompt_path="$1"
349
366
  local log_path="$2"
@@ -370,15 +387,8 @@ prizm_start_ai_session() {
370
387
  ;;
371
388
  codex)
372
389
  local codex_args=(--ask-for-approval never --sandbox danger-full-access)
373
- local codex_subagent_timeout="${CODEX_SUBAGENT_TIMEOUT_SECONDS:-}"
374
- if [[ -z "$codex_subagent_timeout" ]]; then
375
- local outer_stale_threshold="${STALE_KILL_THRESHOLD:-900}"
376
- if [[ "$outer_stale_threshold" =~ ^[0-9]+$ && "$outer_stale_threshold" -gt 120 ]]; then
377
- codex_subagent_timeout=$((outer_stale_threshold - 60))
378
- else
379
- codex_subagent_timeout=840
380
- fi
381
- fi
390
+ local codex_subagent_timeout
391
+ codex_subagent_timeout="$(_prizm_codex_subagent_timeout_seconds)"
382
392
  if [[ "$codex_subagent_timeout" =~ ^[0-9]+$ && "$codex_subagent_timeout" -gt 0 ]]; then
383
393
  codex_args+=(--config "agents.job_max_runtime_seconds=$codex_subagent_timeout")
384
394
  fi
@@ -408,6 +418,46 @@ prizm_start_ai_session() {
408
418
  PRIZM_AI_PID=$!
409
419
  }
410
420
 
421
# Detect AI CLI/provider infrastructure failures that are outside the
# generated code's control. These should be retried without consuming the
# item's code retry budget.
#
# Arguments:
#   $1 - session log path (optional); only its last 64 KiB is scanned
#   $2 - progress JSON path (optional); scanned in full
# Returns:
#   0 when a known infrastructure-error marker is present,
#   1 otherwise (including when neither file exists or both are empty)
prizm_detect_infra_error() {
  local session_log="${1:-}"
  local progress_json="${2:-}"

  local haystack=""
  if [[ -n "$session_log" && -f "$session_log" ]]; then
    # Best-effort read: an unreadable log simply contributes nothing.
    haystack="$(tail -c 65536 "$session_log" 2>/dev/null || true)"
  fi
  if [[ -n "$progress_json" && -f "$progress_json" ]]; then
    haystack+=$'\n'
    haystack+="$(cat "$progress_json" 2>/dev/null || true)"
  fi

  [[ -n "$haystack" ]] || return 1

  local infra_pattern='auth_unavailable|no auth available|502 Bad Gateway|503 Service Unavailable|504 Gateway Timeout|gateway timeout|upstream (connect )?error|connection reset|ECONNRESET|ETIMEDOUT|ENOTFOUND|EAI_AGAIN|rate limit|rate_limit|temporarily unavailable|overloaded'

  # Feed grep via a here-string rather than `printf | grep -q`: grep -q exits
  # on the first match, and under `set -o pipefail` the still-writing printf
  # would then die with SIGPIPE and turn a positive match into a failure.
  if grep -Eiq -- "$infra_pattern" <<< "$haystack"; then
    return 0
  fi

  return 1
}
446
+
447
# Extract the `new_status` field from a status-update script's output.
#
# Reads the updater's captured output on stdin and prints the value of the
# top-level `new_status` key, if any. The input is scanned for JSON objects
# instead of being parsed as a single document, so stray non-JSON lines
# (e.g. stderr text captured together with stdout) do not hide the result.
# When several objects carry `new_status`, the last one wins.
#
# Inputs:   updater output on stdin
# Outputs:  the status string on stdout, or nothing when none is found
# Returns:  0 for any input text (missing/invalid JSON is not an error)
prizm_extract_update_new_status() {
  python3 -c '
import json
import sys

raw = sys.stdin.read()
decoder = json.JSONDecoder()
status = None

pos = raw.find("{")
while pos != -1:
    try:
        obj, end = decoder.raw_decode(raw, pos)
    except ValueError:
        # Not a JSON object at this brace; try the next candidate.
        pos = raw.find("{", pos + 1)
        continue
    # Only top-level objects are considered; lists/scalars are ignored.
    if isinstance(obj, dict) and obj.get("new_status"):
        status = obj["new_status"]
    pos = raw.find("{", end)

if status:
    print(status)
'
}
460
+
411
461
  # Run an AI CLI session synchronously.
412
462
  # Usage: prizm_run_ai_session <prompt_path> <log_path> <model>
413
463
  prizm_run_ai_session() {
@@ -430,15 +480,8 @@ prizm_run_ai_session() {
430
480
  ;;
431
481
  codex)
432
482
  local codex_args=(--ask-for-approval never --sandbox danger-full-access)
433
- local codex_subagent_timeout="${CODEX_SUBAGENT_TIMEOUT_SECONDS:-}"
434
- if [[ -z "$codex_subagent_timeout" ]]; then
435
- local outer_stale_threshold="${STALE_KILL_THRESHOLD:-900}"
436
- if [[ "$outer_stale_threshold" =~ ^[0-9]+$ && "$outer_stale_threshold" -gt 120 ]]; then
437
- codex_subagent_timeout=$((outer_stale_threshold - 60))
438
- else
439
- codex_subagent_timeout=840
440
- fi
441
- fi
483
+ local codex_subagent_timeout
484
+ codex_subagent_timeout="$(_prizm_codex_subagent_timeout_seconds)"
442
485
  if [[ "$codex_subagent_timeout" =~ ^[0-9]+$ && "$codex_subagent_timeout" -gt 0 ]]; then
443
486
  codex_args+=(--config "agents.job_max_runtime_seconds=$codex_subagent_timeout")
444
487
  fi
@@ -41,6 +41,7 @@ start_heartbeat() {
41
41
  (
42
42
  local elapsed=0
43
43
  local prev_size=0
44
+ local prev_child_activity_signature=""
44
45
  local stale_seconds=0
45
46
  while kill -0 "$cli_pid" 2>/dev/null; do
46
47
  sleep "$heartbeat_interval"
@@ -57,8 +58,41 @@ start_heartbeat() {
57
58
  local growth=$((cur_size - prev_size))
58
59
  prev_size=$cur_size
59
60
 
60
- # Track progress staleness (no log growth = stale)
61
- if [[ $growth -eq 0 ]]; then
61
+ local child_activity_signature=""
62
+ local child_total_bytes=0
63
+ local child_session_count=0
64
+ if [[ -f "$progress_json" ]]; then
65
+ local child_activity_data
66
+ child_activity_data=$(python3 - "$progress_json" <<'PY' 2>/dev/null || true
67
+ import json
68
+ import sys
69
+
70
+ try:
71
+ with open(sys.argv[1], "r", encoding="utf-8") as fh:
72
+ progress = json.load(fh)
73
+ except Exception:
74
+ sys.exit(0)
75
+
76
+ signature = str(progress.get("child_activity_signature") or "")
77
+ total_bytes = int(progress.get("child_total_bytes") or 0)
78
+ session_count = len(progress.get("child_session_files") or [])
79
+ print(f"{signature}\t{total_bytes}\t{session_count}")
80
+ PY
81
+ )
82
+ if [[ -n "$child_activity_data" ]]; then
83
+ IFS=$'\t' read -r child_activity_signature child_total_bytes child_session_count <<< "$child_activity_data"
84
+ fi
85
+ fi
86
+
87
+ local child_growth=0
88
+ if [[ -n "$child_activity_signature" && "$child_activity_signature" != "$prev_child_activity_signature" ]]; then
89
+ child_growth=1
90
+ fi
91
+ prev_child_activity_signature="$child_activity_signature"
92
+
93
+ # Track progress staleness. A Codex parent can sit in `wait`
94
+ # while child transcripts keep growing, so child activity counts.
95
+ if [[ $growth -eq 0 && $child_growth -eq 0 ]]; then
62
96
  stale_seconds=$((stale_seconds + heartbeat_interval))
63
97
  else
64
98
  stale_seconds=0
@@ -72,28 +106,87 @@ start_heartbeat() {
72
106
  else
73
107
  size_display="${cur_size}B"
74
108
  fi
109
+ local child_display=""
110
+ if [[ ${child_total_bytes:-0} -gt 0 ]]; then
111
+ local child_size_display
112
+ if [[ $child_total_bytes -gt 1048576 ]]; then
113
+ child_size_display="$((child_total_bytes / 1048576))MB"
114
+ elif [[ $child_total_bytes -gt 1024 ]]; then
115
+ child_size_display="$((child_total_bytes / 1024))KB"
116
+ else
117
+ child_size_display="${child_total_bytes}B"
118
+ fi
119
+ child_display=" | child: ${child_size_display}"
120
+ if [[ ${child_session_count:-0} -gt 1 ]]; then
121
+ child_display="${child_display}/${child_session_count}"
122
+ fi
123
+ fi
75
124
 
76
125
  local mins=$((elapsed / 60))
77
126
  local secs=$((elapsed % 60))
78
127
 
79
128
  local status_icon
80
- if [[ $growth -gt 0 ]]; then
129
+ if [[ $growth -gt 0 || $child_growth -gt 0 ]]; then
81
130
  status_icon="${GREEN}▶${NC}"
82
131
  else
83
132
  status_icon="${YELLOW}⏸${NC}"
84
133
  fi
85
134
 
86
- # Stale-kill: auto-terminate process if no progress for too long
87
- if [[ $stale_kill_threshold -gt 0 && $stale_seconds -ge $stale_kill_threshold ]]; then
135
+ local effective_stale_kill_threshold="$stale_kill_threshold"
136
+ if [[ $stale_kill_threshold -gt 0 && -f "$progress_json" ]]; then
137
+ local codex_wait_threshold
138
+ codex_wait_threshold=$(python3 - "$progress_json" "$stale_kill_threshold" <<'PY' 2>/dev/null || true
139
+ import json
140
+ import os
141
+ import sys
142
+
143
+ progress_path = sys.argv[1]
144
+ base_threshold = int(sys.argv[2])
145
+
146
+ with open(progress_path, "r", encoding="utf-8") as fh:
147
+ progress = json.load(fh)
148
+
149
+ spawn_count = 0
150
+ for tool in progress.get("tool_calls", []):
151
+ if isinstance(tool, dict) and tool.get("name") == "spawn_agent":
152
+ try:
153
+ spawn_count += int(tool.get("count", 0))
154
+ except (TypeError, ValueError):
155
+ pass
156
+
157
+ if (
158
+ progress.get("event_format") == "codex-json"
159
+ and progress.get("current_tool") == "wait"
160
+ and spawn_count > 0
161
+ ):
162
+ configured = os.environ.get("CODEX_WAIT_STALE_KILL_THRESHOLD", "")
163
+ try:
164
+ wait_threshold = int(configured)
165
+ except ValueError:
166
+ wait_threshold = max(base_threshold * 4, 3600)
167
+ if wait_threshold > base_threshold:
168
+ print(wait_threshold)
169
+ PY
170
+ )
171
+ if [[ "$codex_wait_threshold" =~ ^[0-9]+$ && "$codex_wait_threshold" -gt "$stale_kill_threshold" ]]; then
172
+ effective_stale_kill_threshold="$codex_wait_threshold"
173
+ fi
174
+ fi
175
+
176
+ # Stale-kill: auto-terminate process if no progress for too long.
177
+ # Codex parent sessions can sit on the `wait` tool while a spawned
178
+ # subagent is still doing useful work. Give that valid wait a longer
179
+ # stale window; normal single-agent stalls still use the base limit.
180
+ if [[ $effective_stale_kill_threshold -gt 0 && $stale_seconds -ge $effective_stale_kill_threshold ]]; then
88
181
  local stale_mins=$((stale_seconds / 60))
89
- echo -e " ${RED}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display} | ${RED}STALE-KILL: no progress for ${stale_mins}m (threshold: ${stale_kill_threshold}s)${NC}"
182
+ echo -e " ${RED}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display} | ${RED}STALE-KILL: no progress for ${stale_mins}m (threshold: ${effective_stale_kill_threshold}s)${NC}"
90
183
  echo -e " ${RED}[HEARTBEAT]${NC} Killing AI CLI process $cli_pid (stale session)..."
91
184
  # Write the marker before killing. Some CLIs exit quickly, and the
92
185
  # parent runner may stop this heartbeat process immediately after
93
186
  # wait(1) returns.
94
187
  local _marker_dir
95
188
  _marker_dir="$(dirname "$session_log")"
96
- echo "{\"killed_at\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\", \"reason\": \"stale_session\", \"stale_seconds\": $stale_seconds, \"threshold\": $stale_kill_threshold}" > "$_marker_dir/stale-kill.json" 2>/dev/null || true
189
+ echo "{\"killed_at\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\", \"reason\": \"stale_session\", \"stale_seconds\": $stale_seconds, \"threshold\": $effective_stale_kill_threshold}" > "$_marker_dir/stale-kill.json" 2>/dev/null || true
97
190
  kill -TERM "$cli_pid" 2>/dev/null || true
98
191
  # Give process 10s to exit gracefully, then force kill
99
192
  local stale_kill_grace_seconds="${STALE_KILL_GRACE_SECONDS:-10}"
@@ -109,9 +202,9 @@ start_heartbeat() {
109
202
 
110
203
  # Build staleness hint for display
111
204
  local stale_hint=""
112
- if [[ $stale_kill_threshold -gt 0 && $stale_seconds -gt 0 ]]; then
205
+ if [[ $effective_stale_kill_threshold -gt 0 && $stale_seconds -gt 0 ]]; then
113
206
  local stale_mins=$((stale_seconds / 60))
114
- local threshold_mins=$((stale_kill_threshold / 60))
207
+ local threshold_mins=$((effective_stale_kill_threshold / 60))
115
208
  stale_hint=" | stale: ${stale_mins}m/${threshold_mins}m"
116
209
  fi
117
210
 
@@ -134,7 +227,7 @@ try:
134
227
  except Exception:
135
228
  sys.exit(1)
136
229
  " "$progress_json" 2>/dev/null) && {
137
- echo -e " ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display} | ${phase}${stale_hint}"
230
+ echo -e " ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display}${child_display} | ${phase}${stale_hint}"
138
231
  continue
139
232
  }
140
233
  fi
@@ -145,7 +238,7 @@ except Exception:
145
238
  last_activity=$(tail -20 "$session_log" 2>/dev/null | grep -v '^$' | tail -1 | cut -c1-80 || echo "")
146
239
  fi
147
240
 
148
- echo -e " ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s elapsed | log: ${size_display} (+${growth}B) | ${last_activity}${stale_hint}"
241
+ echo -e " ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s elapsed | log: ${size_display}${child_display} (+${growth}B) | ${last_activity}${stale_hint}"
149
242
  done
150
243
  ) &
151
244
  _HEARTBEAT_PID=$!
@@ -145,6 +145,11 @@ spawn_and_wait_session() {
145
145
  log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
146
146
  fi
147
147
 
148
+ local was_infra_error=false
149
+ if [[ $exit_code -ne 0 ]] && prizm_detect_infra_error "$session_log" "$progress_json"; then
150
+ was_infra_error=true
151
+ fi
152
+
148
153
  # Session summary
149
154
  if [[ -f "$session_log" ]]; then
150
155
  local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
@@ -162,6 +167,10 @@ spawn_and_wait_session() {
162
167
  if [[ $exit_code -eq 124 ]]; then
163
168
  log_warn "Session timed out after ${SESSION_TIMEOUT}s"
164
169
  session_status="timed_out"
170
+ elif [[ "$was_infra_error" == true ]]; then
171
+ log_warn "Session failed due to AI CLI/provider infrastructure error"
172
+ log_warn "Infrastructure errors are retried without consuming code retry budget"
173
+ session_status="infra_error"
165
174
  elif [[ "$was_stale_killed" == true ]]; then
166
175
  log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
167
176
  log_warn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
@@ -259,14 +268,20 @@ sys.exit(0)
259
268
  prizm_detect_subagents "$session_log"
260
269
 
261
270
  # Update bug status (do NOT commit on dev branch — commit happens after merge)
262
- python3 "$SCRIPTS_DIR/update-bug-status.py" \
271
+ local update_output
272
+ update_output=$(python3 "$SCRIPTS_DIR/update-bug-status.py" \
263
273
  --bug-list "$bug_list" \
264
274
  --state-dir "$STATE_DIR" \
265
275
  --bug-id "$bug_id" \
266
276
  --session-status "$session_status" \
267
277
  --session-id "$session_id" \
268
278
  --max-retries "$max_retries" \
269
- --action update >/dev/null 2>&1 || true
279
+ --action update 2>&1) || {
280
+ log_error "Failed to update bug status: $update_output"
281
+ update_output=""
282
+ }
283
+
284
+ _SPAWN_ITEM_STATUS="$(printf '%s' "$update_output" | prizm_extract_update_new_status)"
270
285
 
271
286
  _SPAWN_RESULT="$session_status"
272
287
  }
@@ -693,6 +708,7 @@ else:
693
708
  trap cleanup_single_bug SIGINT SIGTERM
694
709
 
695
710
  _SPAWN_RESULT=""
711
+ _SPAWN_ITEM_STATUS=""
696
712
 
697
713
  # Branch lifecycle: create and checkout bugfix branch
698
714
  local _proj_root
@@ -1078,12 +1094,14 @@ DEPLOY_PROMPT_EOF
1078
1094
  # Spawn session
1079
1095
  log_info "Spawning AI CLI session: $session_id"
1080
1096
  _SPAWN_RESULT=""
1097
+ _SPAWN_ITEM_STATUS=""
1081
1098
 
1082
1099
  spawn_and_wait_session \
1083
1100
  "$bug_id" "$bug_list" "$session_id" \
1084
1101
  "$bootstrap_prompt" "$session_dir" "$MAX_RETRIES" "$bug_model" "$_ORIGINAL_BRANCH"
1085
1102
 
1086
1103
  local session_status="$_SPAWN_RESULT"
1104
+ local item_status_after_session="${_SPAWN_ITEM_STATUS:-}"
1087
1105
 
1088
1106
  # Merge per-bug dev branch back to original on success
1089
1107
  if [[ "$session_status" == "success" && -n "$_DEV_BRANCH_NAME" ]]; then
@@ -1112,15 +1130,18 @@ DEPLOY_PROMPT_EOF
1112
1130
  session_count=$((session_count + 1))
1113
1131
  total_subagent_calls=$((total_subagent_calls + _SUBAGENT_COUNT))
1114
1132
 
1115
- # Stop-on-failure: abort pipeline if task failed and STOP_ON_FAILURE is enabled
1116
- if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
1133
+ # Stop-on-failure: abort only after the task is actually marked failed.
1134
+ # Pending retry outcomes, including infrastructure errors, keep running.
1135
+ if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" && "$item_status_after_session" == "failed" ]]; then
1117
1136
  echo ""
1118
1137
  log_error "════════════════════════════════════════════════════"
1119
- log_error " STOP_ON_FAILURE: Pipeline halted after $bug_id failed."
1138
+ log_error " STOP_ON_FAILURE: Pipeline halted after $bug_id exhausted retries."
1120
1139
  log_error " Total sessions completed: $session_count"
1121
1140
  log_error " Set STOP_ON_FAILURE=0 to continue past failures."
1122
1141
  log_error "════════════════════════════════════════════════════"
1123
1142
  break
1143
+ elif [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
1144
+ log_info "STOP_ON_FAILURE: $bug_id is ${item_status_after_session:-unknown}; retry budget not exhausted, continuing."
1124
1145
  fi
1125
1146
 
1126
1147
  # Stuck detection
@@ -153,6 +153,11 @@ spawn_and_wait_session() {
153
153
  log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
154
154
  fi
155
155
 
156
+ local was_infra_error=false
157
+ if [[ $exit_code -ne 0 ]] && prizm_detect_infra_error "$session_log" "$progress_json"; then
158
+ was_infra_error=true
159
+ fi
160
+
156
161
  # Show final session summary
157
162
  if [[ -f "$session_log" ]]; then
158
163
  local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
@@ -172,6 +177,10 @@ spawn_and_wait_session() {
172
177
  if [[ $exit_code -eq 124 ]]; then
173
178
  log_warn "Session timed out after ${SESSION_TIMEOUT}s"
174
179
  session_status="timed_out"
180
+ elif [[ "$was_infra_error" == true ]]; then
181
+ log_warn "Session failed due to AI CLI/provider infrastructure error"
182
+ log_warn "Infrastructure errors are retried without consuming code retry budget"
183
+ session_status="infra_error"
175
184
  elif [[ "$was_stale_killed" == true ]]; then
176
185
  log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
177
186
  log_warn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
@@ -347,6 +356,8 @@ sys.exit(0)
347
356
  log_error ".prizmkit/plans/feature-list.json may be out of sync. Manual intervention needed."
348
357
  }
349
358
 
359
+ _SPAWN_ITEM_STATUS="$(printf '%s' "$update_output" | prizm_extract_update_new_status)"
360
+
350
361
  # Return status via global variable (avoids $() swallowing stdout)
351
362
  _SPAWN_RESULT="$session_status"
352
363
  }
@@ -848,6 +859,7 @@ else:
848
859
  trap cleanup_single_feature SIGINT SIGTERM
849
860
 
850
861
  _SPAWN_RESULT=""
862
+ _SPAWN_ITEM_STATUS=""
851
863
 
852
864
  # Branch lifecycle: create and checkout feature branch
853
865
  local _proj_root
@@ -1300,11 +1312,13 @@ DEPLOY_PROMPT_EOF
1300
1312
  log_info "Feature model: $feature_model"
1301
1313
  fi
1302
1314
  _SPAWN_RESULT=""
1315
+ _SPAWN_ITEM_STATUS=""
1303
1316
 
1304
1317
  spawn_and_wait_session \
1305
1318
  "$feature_id" "$feature_list" "$session_id" \
1306
1319
  "$bootstrap_prompt" "$session_dir" "$MAX_RETRIES" "$feature_model" "$_ORIGINAL_BRANCH"
1307
1320
  local session_status="$_SPAWN_RESULT"
1321
+ local item_status_after_session="${_SPAWN_ITEM_STATUS:-}"
1308
1322
 
1309
1323
  # Merge per-feature dev branch back to original on success
1310
1324
  if [[ "$session_status" == "success" && -n "$_DEV_BRANCH_NAME" ]]; then
@@ -1333,15 +1347,18 @@ DEPLOY_PROMPT_EOF
1333
1347
  session_count=$((session_count + 1))
1334
1348
  total_subagent_calls=$((total_subagent_calls + _SUBAGENT_COUNT))
1335
1349
 
1336
- # Stop-on-failure: abort pipeline if task failed and STOP_ON_FAILURE is enabled
1337
- if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
1350
+ # Stop-on-failure: abort only after the task is actually marked failed.
1351
+ # Pending retry outcomes, including infrastructure errors, keep running.
1352
+ if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" && "$item_status_after_session" == "failed" ]]; then
1338
1353
  echo ""
1339
1354
  log_error "════════════════════════════════════════════════════"
1340
- log_error " STOP_ON_FAILURE: Pipeline halted after $feature_id failed."
1355
+ log_error " STOP_ON_FAILURE: Pipeline halted after $feature_id exhausted retries."
1341
1356
  log_error " Total sessions completed: $session_count"
1342
1357
  log_error " Set STOP_ON_FAILURE=0 to continue past failures."
1343
1358
  log_error "════════════════════════════════════════════════════"
1344
1359
  break
1360
+ elif [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
1361
+ log_info "STOP_ON_FAILURE: $feature_id is ${item_status_after_session:-unknown}; retry budget not exhausted, continuing."
1345
1362
  fi
1346
1363
 
1347
1364
  # Brief pause before next iteration
@@ -147,6 +147,11 @@ spawn_and_wait_session() {
147
147
  log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
148
148
  fi
149
149
 
150
+ local was_infra_error=false
151
+ if [[ $exit_code -ne 0 ]] && prizm_detect_infra_error "$session_log" "$progress_json"; then
152
+ was_infra_error=true
153
+ fi
154
+
150
155
  # Session summary
151
156
  if [[ -f "$session_log" ]]; then
152
157
  local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
@@ -164,6 +169,10 @@ spawn_and_wait_session() {
164
169
  if [[ $exit_code -eq 124 ]]; then
165
170
  log_warn "Session timed out after ${SESSION_TIMEOUT}s"
166
171
  session_status="timed_out"
172
+ elif [[ "$was_infra_error" == true ]]; then
173
+ log_warn "Session failed due to AI CLI/provider infrastructure error"
174
+ log_warn "Infrastructure errors are retried without consuming code retry budget"
175
+ session_status="infra_error"
167
176
  elif [[ "$was_stale_killed" == true ]]; then
168
177
  log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
169
178
  log_warn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
@@ -286,14 +295,20 @@ sys.exit(0)
286
295
  fi
287
296
 
288
297
  # Update refactor status (do NOT commit on dev branch — commit happens after merge)
289
- python3 "$SCRIPTS_DIR/update-refactor-status.py" \
298
+ local update_output
299
+ update_output=$(python3 "$SCRIPTS_DIR/update-refactor-status.py" \
290
300
  --refactor-list "$refactor_list" \
291
301
  --state-dir "$STATE_DIR" \
292
302
  --refactor-id "$refactor_id" \
293
303
  --session-status "$session_status" \
294
304
  --session-id "$session_id" \
295
305
  --max-retries "$max_retries" \
296
- --action update >/dev/null 2>&1 || true
306
+ --action update 2>&1) || {
307
+ log_error "Failed to update refactor status: $update_output"
308
+ update_output=""
309
+ }
310
+
311
+ _SPAWN_ITEM_STATUS="$(printf '%s' "$update_output" | prizm_extract_update_new_status)"
297
312
 
298
313
  _SPAWN_RESULT="$session_status"
299
314
  }
@@ -723,6 +738,7 @@ else:
723
738
  trap cleanup_single_refactor SIGINT SIGTERM
724
739
 
725
740
  _SPAWN_RESULT=""
741
+ _SPAWN_ITEM_STATUS=""
726
742
 
727
743
  # Branch lifecycle: create and checkout refactor branch
728
744
  local _proj_root
@@ -1114,6 +1130,7 @@ DEPLOY_PROMPT_EOF
1114
1130
  # Spawn session
1115
1131
  log_info "Spawning AI CLI session: $session_id"
1116
1132
  _SPAWN_RESULT=""
1133
+ _SPAWN_ITEM_STATUS=""
1117
1134
 
1118
1135
  spawn_and_wait_session \
1119
1136
  "$refactor_id" "$refactor_list" "$session_id" \
@@ -1130,6 +1147,7 @@ DEPLOY_PROMPT_EOF
1130
1147
  fi
1131
1148
 
1132
1149
  local session_status="$_SPAWN_RESULT"
1150
+ local item_status_after_session="${_SPAWN_ITEM_STATUS:-}"
1133
1151
 
1134
1152
  # Merge per-refactor dev branch back to original on success
1135
1153
  if [[ "$session_status" == "success" && -n "$_DEV_BRANCH_NAME" ]]; then
@@ -1168,15 +1186,18 @@ DEPLOY_PROMPT_EOF
1168
1186
  session_count=$((session_count + 1))
1169
1187
  total_subagent_calls=$((total_subagent_calls + _SUBAGENT_COUNT))
1170
1188
 
1171
- # Stop-on-failure: abort pipeline if task failed and STOP_ON_FAILURE is enabled
1172
- if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
1189
+ # Stop-on-failure: abort only after the task is actually marked failed.
1190
+ # Pending retry outcomes, including infrastructure errors, keep running.
1191
+ if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" && "$item_status_after_session" == "failed" ]]; then
1173
1192
  echo ""
1174
1193
  log_error "════════════════════════════════════════════════════"
1175
- log_error " STOP_ON_FAILURE: Pipeline halted after $refactor_id failed."
1194
+ log_error " STOP_ON_FAILURE: Pipeline halted after $refactor_id exhausted retries."
1176
1195
  log_error " Total sessions completed: $session_count"
1177
1196
  log_error " Set STOP_ON_FAILURE=0 to continue past failures."
1178
1197
  log_error "════════════════════════════════════════════════════"
1179
1198
  break
1199
+ elif [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
1200
+ log_info "STOP_ON_FAILURE: $refactor_id is ${item_status_after_session:-unknown}; retry budget not exhausted, continuing."
1180
1201
  fi
1181
1202
 
1182
1203
  log_info "Pausing 5s before next refactor..."