npm - prizmkit - Versions diffs - 1.1.66 → 1.1.68 - Mend

prizmkit 1.1.66 → 1.1.68

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

package/bundled/VERSION.json +3 -3
package/bundled/adapters/codex/settings-adapter.js +1 -1
package/bundled/dev-pipeline/.env.example +3 -0
package/bundled/dev-pipeline/SCHEMA_ANALYSIS.md +3 -1
package/bundled/dev-pipeline/lib/common.sh +61 -18
package/bundled/dev-pipeline/lib/heartbeat.sh +104 -11
package/bundled/dev-pipeline/run-bugfix.sh +26 -5
package/bundled/dev-pipeline/run-feature.sh +20 -3
package/bundled/dev-pipeline/run-refactor.sh +26 -5
package/bundled/dev-pipeline/scripts/parse-stream-progress.py +144 -12
package/bundled/dev-pipeline/scripts/update-bug-status.py +15 -0
package/bundled/dev-pipeline/scripts/update-feature-status.py +18 -0
package/bundled/dev-pipeline/scripts/update-refactor-status.py +15 -0
package/bundled/dev-pipeline/tests/test_auto_skip.py +39 -0
package/bundled/dev-pipeline-windows/.env.example +3 -2
package/bundled/dev-pipeline-windows/SCHEMA_ANALYSIS.md +4 -3
package/bundled/dev-pipeline-windows/lib/common.ps1 +97 -5
package/bundled/dev-pipeline-windows/lib/pipeline.ps1 +31 -7
package/bundled/dev-pipeline-windows/run-recovery.ps1 +8 -1
package/bundled/dev-pipeline-windows/scripts/parse-stream-progress.py +144 -12
package/bundled/dev-pipeline-windows/scripts/update-bug-status.py +15 -0
package/bundled/dev-pipeline-windows/scripts/update-feature-status.py +18 -0
package/bundled/dev-pipeline-windows/scripts/update-refactor-status.py +15 -0
package/bundled/skills/_metadata.json +1 -1
package/package.json +1 -1
package/src/scaffold.js +1 -1

package/bundled/VERSION.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "frameworkVersion": "1.1.66",
-  "bundledAt": "2026-06-08T19:45:48.630Z",
-  "bundledFrom": "940cbd4"
+  "frameworkVersion": "1.1.68",
+  "bundledAt": "2026-06-09T14:36:58.835Z",
+  "bundledFrom": "82060fd"
 }

package/bundled/adapters/codex/settings-adapter.js CHANGED Viewed

@@ -20,7 +20,7 @@ project_doc_fallback_filenames = ["CLAUDE.md", "CODEBUDDY.md"]
 [agents]
 max_depth = 1
-job_max_runtime_seconds = 840
+job_max_runtime_seconds = 3300
 `;
   await writeFile(configPath, configToml);

package/bundled/dev-pipeline/.env.example CHANGED Viewed

@@ -41,6 +41,9 @@
 # ─── Logging & Heartbeat ─────────────────────────────────────────────
 # HEARTBEAT_INTERVAL=30      # Heartbeat log interval in seconds
 # HEARTBEAT_STALE_THRESHOLD=600  # Max seconds without heartbeat before marking stale
+# STALE_KILL_THRESHOLD=900   # Auto-kill after N seconds without parent log progress (0 = disabled)
+# CODEX_WAIT_STALE_KILL_THRESHOLD=3600  # Longer no-log window while Codex waits on subagents
+# CODEX_SUBAGENT_TIMEOUT_SECONDS=3300   # Codex subagent max runtime; defaults to wait threshold - 300
 # LOG_CLEANUP_ENABLED=1      # Periodic log cleanup (1=on, 0=off)
 # LOG_RETENTION_DAYS=14      # Delete logs older than N days
 # LOG_MAX_TOTAL_MB=1024      # Keep total logs under N MB via oldest-first cleanup

package/bundled/dev-pipeline/SCHEMA_ANALYSIS.md CHANGED Viewed

@@ -353,6 +353,9 @@ pending, in_progress, completed, failed, skipped
 | `DEV_BRANCH` | string | auto-generated | Custom branch name |
 | `HEARTBEAT_INTERVAL` | integer | 30 | Heartbeat log interval (s) |
 | `HEARTBEAT_STALE_THRESHOLD` | integer | 600 | Max seconds without heartbeat |
+| `STALE_KILL_THRESHOLD` | integer | 900 | Auto-kill after N seconds without parent log progress |
+| `CODEX_WAIT_STALE_KILL_THRESHOLD` | integer | 3600 | Longer no-log stale window while Codex waits on subagents |
+| `CODEX_SUBAGENT_TIMEOUT_SECONDS` | integer | 3300 | Codex subagent max runtime |
 | `LOG_CLEANUP_ENABLED` | integer | 1 | Periodic cleanup |
 | `LOG_RETENTION_DAYS` | integer | 14 | Delete logs older than N days |
 | `LOG_MAX_TOTAL_MB` | integer | 1024 | Max total logs (MB) |
@@ -532,4 +535,3 @@ Located in `/dev-pipeline/templates/`:
 - Agent: 6 files
 - Base/Shared: 7 files
 - Singleton: 3 files

package/bundled/dev-pipeline/lib/common.sh CHANGED Viewed

@@ -344,6 +344,23 @@ prizm_detect_cli_and_platform() {
 # command substitution; the background process must remain a child of the
 # runner shell so wait/heartbeat/trap handling works correctly.
 PRIZM_AI_PID=""
+_prizm_codex_subagent_timeout_seconds() {
+    local configured="${CODEX_SUBAGENT_TIMEOUT_SECONDS:-}"
+    if [[ "$configured" =~ ^[0-9]+$ && "$configured" -gt 0 ]]; then
+        printf '%s\n' "$configured"
+        return 0
+    fi
+    local wait_threshold="${CODEX_WAIT_STALE_KILL_THRESHOLD:-3600}"
+    if [[ "$wait_threshold" =~ ^[0-9]+$ && "$wait_threshold" -gt 600 ]]; then
+        printf '%s\n' "$((wait_threshold - 300))"
+        return 0
+    fi
+    printf '%s\n' 3300
+}
 prizm_start_ai_session() {
     local prompt_path="$1"
     local log_path="$2"
@@ -370,15 +387,8 @@ prizm_start_ai_session() {
             ;;
         codex)
             local codex_args=(--ask-for-approval never --sandbox danger-full-access)
-            local codex_subagent_timeout="${CODEX_SUBAGENT_TIMEOUT_SECONDS:-}"
-            if [[ -z "$codex_subagent_timeout" ]]; then
-                local outer_stale_threshold="${STALE_KILL_THRESHOLD:-900}"
-                if [[ "$outer_stale_threshold" =~ ^[0-9]+$ && "$outer_stale_threshold" -gt 120 ]]; then
-                    codex_subagent_timeout=$((outer_stale_threshold - 60))
-                else
-                    codex_subagent_timeout=840
-                fi
-            fi
+            local codex_subagent_timeout
+            codex_subagent_timeout="$(_prizm_codex_subagent_timeout_seconds)"
             if [[ "$codex_subagent_timeout" =~ ^[0-9]+$ && "$codex_subagent_timeout" -gt 0 ]]; then
                 codex_args+=(--config "agents.job_max_runtime_seconds=$codex_subagent_timeout")
             fi
@@ -408,6 +418,46 @@ prizm_start_ai_session() {
     PRIZM_AI_PID=$!
 }
+# Detect AI CLI/provider infrastructure failures that are outside the
+# generated code's control. These should be retried without consuming the
+# item's code retry budget.
+prizm_detect_infra_error() {
+    local session_log="${1:-}"
+    local progress_json="${2:-}"
+    local haystack=""
+    if [[ -n "$session_log" && -f "$session_log" ]]; then
+        haystack="$(tail -c 65536 "$session_log" 2>/dev/null || true)"
+    fi
+    if [[ -n "$progress_json" && -f "$progress_json" ]]; then
+        haystack+=$'\n'
+        haystack+="$(cat "$progress_json" 2>/dev/null || true)"
+    fi
+    [[ -n "$haystack" ]] || return 1
+    if printf '%s' "$haystack" | grep -Eiq \
+        'auth_unavailable|no auth available|502 Bad Gateway|503 Service Unavailable|504 Gateway Timeout|gateway timeout|upstream (connect )?error|connection reset|ECONNRESET|ETIMEDOUT|ENOTFOUND|EAI_AGAIN|rate limit|rate_limit|temporarily unavailable|overloaded'; then
+        return 0
+    fi
+    return 1
+}
+prizm_extract_update_new_status() {
+    python3 -c "
+import json, sys
+raw = sys.stdin.read()
+try:
+    data = json.loads(raw)
+except Exception:
+    sys.exit(0)
+value = data.get('new_status')
+if value:
+    print(value)
+"
+}
 # Run an AI CLI session synchronously.
 # Usage: prizm_run_ai_session <prompt_path> <log_path> <model>
 prizm_run_ai_session() {
@@ -430,15 +480,8 @@ prizm_run_ai_session() {
             ;;
         codex)
             local codex_args=(--ask-for-approval never --sandbox danger-full-access)
-            local codex_subagent_timeout="${CODEX_SUBAGENT_TIMEOUT_SECONDS:-}"
-            if [[ -z "$codex_subagent_timeout" ]]; then
-                local outer_stale_threshold="${STALE_KILL_THRESHOLD:-900}"
-                if [[ "$outer_stale_threshold" =~ ^[0-9]+$ && "$outer_stale_threshold" -gt 120 ]]; then
-                    codex_subagent_timeout=$((outer_stale_threshold - 60))
-                else
-                    codex_subagent_timeout=840
-                fi
-            fi
+            local codex_subagent_timeout
+            codex_subagent_timeout="$(_prizm_codex_subagent_timeout_seconds)"
             if [[ "$codex_subagent_timeout" =~ ^[0-9]+$ && "$codex_subagent_timeout" -gt 0 ]]; then
                 codex_args+=(--config "agents.job_max_runtime_seconds=$codex_subagent_timeout")
             fi

package/bundled/dev-pipeline/lib/heartbeat.sh CHANGED Viewed

@@ -41,6 +41,7 @@ start_heartbeat() {
     (
         local elapsed=0
         local prev_size=0
+        local prev_child_activity_signature=""
         local stale_seconds=0
         while kill -0 "$cli_pid" 2>/dev/null; do
             sleep "$heartbeat_interval"
@@ -57,8 +58,41 @@ start_heartbeat() {
             local growth=$((cur_size - prev_size))
             prev_size=$cur_size
-            # Track progress staleness (no log growth = stale)
-            if [[ $growth -eq 0 ]]; then
+            local child_activity_signature=""
+            local child_total_bytes=0
+            local child_session_count=0
+            if [[ -f "$progress_json" ]]; then
+                local child_activity_data
+                child_activity_data=$(python3 - "$progress_json" <<'PY' 2>/dev/null || true
+import json
+import sys
+try:
+    with open(sys.argv[1], "r", encoding="utf-8") as fh:
+        progress = json.load(fh)
+except Exception:
+    sys.exit(0)
+signature = str(progress.get("child_activity_signature") or "")
+total_bytes = int(progress.get("child_total_bytes") or 0)
+session_count = len(progress.get("child_session_files") or [])
+print(f"{signature}\t{total_bytes}\t{session_count}")
+PY
+                )
+                if [[ -n "$child_activity_data" ]]; then
+                    IFS=$'\t' read -r child_activity_signature child_total_bytes child_session_count <<< "$child_activity_data"
+                fi
+            fi
+            local child_growth=0
+            if [[ -n "$child_activity_signature" && "$child_activity_signature" != "$prev_child_activity_signature" ]]; then
+                child_growth=1
+            fi
+            prev_child_activity_signature="$child_activity_signature"
+            # Track progress staleness. A Codex parent can sit in `wait`
+            # while child transcripts keep growing, so child activity counts.
+            if [[ $growth -eq 0 && $child_growth -eq 0 ]]; then
                 stale_seconds=$((stale_seconds + heartbeat_interval))
             else
                 stale_seconds=0
@@ -72,28 +106,87 @@ start_heartbeat() {
             else
                 size_display="${cur_size}B"
             fi
+            local child_display=""
+            if [[ ${child_total_bytes:-0} -gt 0 ]]; then
+                local child_size_display
+                if [[ $child_total_bytes -gt 1048576 ]]; then
+                    child_size_display="$((child_total_bytes / 1048576))MB"
+                elif [[ $child_total_bytes -gt 1024 ]]; then
+                    child_size_display="$((child_total_bytes / 1024))KB"
+                else
+                    child_size_display="${child_total_bytes}B"
+                fi
+                child_display=" | child: ${child_size_display}"
+                if [[ ${child_session_count:-0} -gt 1 ]]; then
+                    child_display="${child_display}/${child_session_count}"
+                fi
+            fi
             local mins=$((elapsed / 60))
             local secs=$((elapsed % 60))
             local status_icon
-            if [[ $growth -gt 0 ]]; then
+            if [[ $growth -gt 0 || $child_growth -gt 0 ]]; then
                 status_icon="${GREEN}▶${NC}"
             else
                 status_icon="${YELLOW}⏸${NC}"
             fi
-            # Stale-kill: auto-terminate process if no progress for too long
-            if [[ $stale_kill_threshold -gt 0 && $stale_seconds -ge $stale_kill_threshold ]]; then
+            local effective_stale_kill_threshold="$stale_kill_threshold"
+            if [[ $stale_kill_threshold -gt 0 && -f "$progress_json" ]]; then
+                local codex_wait_threshold
+                codex_wait_threshold=$(python3 - "$progress_json" "$stale_kill_threshold" <<'PY' 2>/dev/null || true
+import json
+import os
+import sys
+progress_path = sys.argv[1]
+base_threshold = int(sys.argv[2])
+with open(progress_path, "r", encoding="utf-8") as fh:
+    progress = json.load(fh)
+spawn_count = 0
+for tool in progress.get("tool_calls", []):
+    if isinstance(tool, dict) and tool.get("name") == "spawn_agent":
+        try:
+            spawn_count += int(tool.get("count", 0))
+        except (TypeError, ValueError):
+            pass
+if (
+    progress.get("event_format") == "codex-json"
+    and progress.get("current_tool") == "wait"
+    and spawn_count > 0
+):
+    configured = os.environ.get("CODEX_WAIT_STALE_KILL_THRESHOLD", "")
+    try:
+        wait_threshold = int(configured)
+    except ValueError:
+        wait_threshold = max(base_threshold * 4, 3600)
+    if wait_threshold > base_threshold:
+        print(wait_threshold)
+PY
+                )
+                if [[ "$codex_wait_threshold" =~ ^[0-9]+$ && "$codex_wait_threshold" -gt "$stale_kill_threshold" ]]; then
+                    effective_stale_kill_threshold="$codex_wait_threshold"
+                fi
+            fi
+            # Stale-kill: auto-terminate process if no progress for too long.
+            # Codex parent sessions can sit on the `wait` tool while a spawned
+            # subagent is still doing useful work. Give that valid wait a longer
+            # stale window; normal single-agent stalls still use the base limit.
+            if [[ $effective_stale_kill_threshold -gt 0 && $stale_seconds -ge $effective_stale_kill_threshold ]]; then
                 local stale_mins=$((stale_seconds / 60))
-                echo -e "  ${RED}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display} | ${RED}STALE-KILL: no progress for ${stale_mins}m (threshold: ${stale_kill_threshold}s)${NC}"
+                echo -e "  ${RED}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display} | ${RED}STALE-KILL: no progress for ${stale_mins}m (threshold: ${effective_stale_kill_threshold}s)${NC}"
                 echo -e "  ${RED}[HEARTBEAT]${NC} Killing AI CLI process $cli_pid (stale session)..."
                 # Write the marker before killing. Some CLIs exit quickly, and the
                 # parent runner may stop this heartbeat process immediately after
                 # wait(1) returns.
                 local _marker_dir
                 _marker_dir="$(dirname "$session_log")"
-                echo "{\"killed_at\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\", \"reason\": \"stale_session\", \"stale_seconds\": $stale_seconds, \"threshold\": $stale_kill_threshold}" > "$_marker_dir/stale-kill.json" 2>/dev/null || true
+                echo "{\"killed_at\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\", \"reason\": \"stale_session\", \"stale_seconds\": $stale_seconds, \"threshold\": $effective_stale_kill_threshold}" > "$_marker_dir/stale-kill.json" 2>/dev/null || true
                 kill -TERM "$cli_pid" 2>/dev/null || true
                 # Give process 10s to exit gracefully, then force kill
                 local stale_kill_grace_seconds="${STALE_KILL_GRACE_SECONDS:-10}"
@@ -109,9 +202,9 @@ start_heartbeat() {
             # Build staleness hint for display
             local stale_hint=""
-            if [[ $stale_kill_threshold -gt 0 && $stale_seconds -gt 0 ]]; then
+            if [[ $effective_stale_kill_threshold -gt 0 && $stale_seconds -gt 0 ]]; then
                 local stale_mins=$((stale_seconds / 60))
-                local threshold_mins=$((stale_kill_threshold / 60))
+                local threshold_mins=$((effective_stale_kill_threshold / 60))
                 stale_hint=" | stale: ${stale_mins}m/${threshold_mins}m"
             fi
@@ -134,7 +227,7 @@ try:
 except Exception:
     sys.exit(1)
 " "$progress_json" 2>/dev/null) && {
-                    echo -e "  ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display} | ${phase}${stale_hint}"
+                    echo -e "  ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display}${child_display} | ${phase}${stale_hint}"
                     continue
                 }
             fi
@@ -145,7 +238,7 @@ except Exception:
                 last_activity=$(tail -20 "$session_log" 2>/dev/null | grep -v '^$' | tail -1 | cut -c1-80 || echo "")
             fi
-            echo -e "  ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s elapsed | log: ${size_display} (+${growth}B) | ${last_activity}${stale_hint}"
+            echo -e "  ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s elapsed | log: ${size_display}${child_display} (+${growth}B) | ${last_activity}${stale_hint}"
         done
     ) &
     _HEARTBEAT_PID=$!

package/bundled/dev-pipeline/run-bugfix.sh CHANGED Viewed

@@ -145,6 +145,11 @@ spawn_and_wait_session() {
         log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
     fi
+    local was_infra_error=false
+    if [[ $exit_code -ne 0 ]] && prizm_detect_infra_error "$session_log" "$progress_json"; then
+        was_infra_error=true
+    fi
     # Session summary
     if [[ -f "$session_log" ]]; then
         local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
@@ -162,6 +167,10 @@ spawn_and_wait_session() {
     if [[ $exit_code -eq 124 ]]; then
         log_warn "Session timed out after ${SESSION_TIMEOUT}s"
         session_status="timed_out"
+    elif [[ "$was_infra_error" == true ]]; then
+        log_warn "Session failed due to AI CLI/provider infrastructure error"
+        log_warn "Infrastructure errors are retried without consuming code retry budget"
+        session_status="infra_error"
     elif [[ "$was_stale_killed" == true ]]; then
         log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
         log_warn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
@@ -259,14 +268,20 @@ sys.exit(0)
     prizm_detect_subagents "$session_log"
     # Update bug status (do NOT commit on dev branch — commit happens after merge)
-    python3 "$SCRIPTS_DIR/update-bug-status.py" \
+    local update_output
+    update_output=$(python3 "$SCRIPTS_DIR/update-bug-status.py" \
         --bug-list "$bug_list" \
         --state-dir "$STATE_DIR" \
         --bug-id "$bug_id" \
         --session-status "$session_status" \
         --session-id "$session_id" \
         --max-retries "$max_retries" \
-        --action update >/dev/null 2>&1 || true
+        --action update 2>&1) || {
+        log_error "Failed to update bug status: $update_output"
+        update_output=""
+    }
+    _SPAWN_ITEM_STATUS="$(printf '%s' "$update_output" | prizm_extract_update_new_status)"
     _SPAWN_RESULT="$session_status"
 }
@@ -693,6 +708,7 @@ else:
     trap cleanup_single_bug SIGINT SIGTERM
     _SPAWN_RESULT=""
+    _SPAWN_ITEM_STATUS=""
     # Branch lifecycle: create and checkout bugfix branch
     local _proj_root
@@ -1078,12 +1094,14 @@ DEPLOY_PROMPT_EOF
         # Spawn session
         log_info "Spawning AI CLI session: $session_id"
         _SPAWN_RESULT=""
+        _SPAWN_ITEM_STATUS=""
         spawn_and_wait_session \
             "$bug_id" "$bug_list" "$session_id" \
             "$bootstrap_prompt" "$session_dir" "$MAX_RETRIES" "$bug_model" "$_ORIGINAL_BRANCH"
         local session_status="$_SPAWN_RESULT"
+        local item_status_after_session="${_SPAWN_ITEM_STATUS:-}"
         # Merge per-bug dev branch back to original on success
         if [[ "$session_status" == "success" && -n "$_DEV_BRANCH_NAME" ]]; then
@@ -1112,15 +1130,18 @@ DEPLOY_PROMPT_EOF
         session_count=$((session_count + 1))
         total_subagent_calls=$((total_subagent_calls + _SUBAGENT_COUNT))
-        # Stop-on-failure: abort pipeline if task failed and STOP_ON_FAILURE is enabled
-        if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
+        # Stop-on-failure: abort only after the task is actually marked failed.
+        # Pending retry outcomes, including infrastructure errors, keep running.
+        if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" && "$item_status_after_session" == "failed" ]]; then
             echo ""
             log_error "════════════════════════════════════════════════════"
-            log_error "  STOP_ON_FAILURE: Pipeline halted after $bug_id failed."
+            log_error "  STOP_ON_FAILURE: Pipeline halted after $bug_id exhausted retries."
             log_error "  Total sessions completed: $session_count"
             log_error "  Set STOP_ON_FAILURE=0 to continue past failures."
             log_error "════════════════════════════════════════════════════"
             break
+        elif [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
+            log_info "STOP_ON_FAILURE: $bug_id is ${item_status_after_session:-unknown}; retry budget not exhausted, continuing."
         fi
         # Stuck detection

package/bundled/dev-pipeline/run-feature.sh CHANGED Viewed

@@ -153,6 +153,11 @@ spawn_and_wait_session() {
         log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
     fi
+    local was_infra_error=false
+    if [[ $exit_code -ne 0 ]] && prizm_detect_infra_error "$session_log" "$progress_json"; then
+        was_infra_error=true
+    fi
     # Show final session summary
     if [[ -f "$session_log" ]]; then
         local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
@@ -172,6 +177,10 @@ spawn_and_wait_session() {
     if [[ $exit_code -eq 124 ]]; then
         log_warn "Session timed out after ${SESSION_TIMEOUT}s"
         session_status="timed_out"
+    elif [[ "$was_infra_error" == true ]]; then
+        log_warn "Session failed due to AI CLI/provider infrastructure error"
+        log_warn "Infrastructure errors are retried without consuming code retry budget"
+        session_status="infra_error"
     elif [[ "$was_stale_killed" == true ]]; then
         log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
         log_warn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
@@ -347,6 +356,8 @@ sys.exit(0)
         log_error ".prizmkit/plans/feature-list.json may be out of sync. Manual intervention needed."
     }
+    _SPAWN_ITEM_STATUS="$(printf '%s' "$update_output" | prizm_extract_update_new_status)"
     # Return status via global variable (avoids $() swallowing stdout)
     _SPAWN_RESULT="$session_status"
 }
@@ -848,6 +859,7 @@ else:
     trap cleanup_single_feature SIGINT SIGTERM
     _SPAWN_RESULT=""
+    _SPAWN_ITEM_STATUS=""
     # Branch lifecycle: create and checkout feature branch
     local _proj_root
@@ -1300,11 +1312,13 @@ DEPLOY_PROMPT_EOF
             log_info "Feature model: $feature_model"
         fi
         _SPAWN_RESULT=""
+        _SPAWN_ITEM_STATUS=""
         spawn_and_wait_session \
             "$feature_id" "$feature_list" "$session_id" \
             "$bootstrap_prompt" "$session_dir" "$MAX_RETRIES" "$feature_model" "$_ORIGINAL_BRANCH"
         local session_status="$_SPAWN_RESULT"
+        local item_status_after_session="${_SPAWN_ITEM_STATUS:-}"
         # Merge per-feature dev branch back to original on success
         if [[ "$session_status" == "success" && -n "$_DEV_BRANCH_NAME" ]]; then
@@ -1333,15 +1347,18 @@ DEPLOY_PROMPT_EOF
         session_count=$((session_count + 1))
         total_subagent_calls=$((total_subagent_calls + _SUBAGENT_COUNT))
-        # Stop-on-failure: abort pipeline if task failed and STOP_ON_FAILURE is enabled
-        if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
+        # Stop-on-failure: abort only after the task is actually marked failed.
+        # Pending retry outcomes, including infrastructure errors, keep running.
+        if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" && "$item_status_after_session" == "failed" ]]; then
             echo ""
             log_error "════════════════════════════════════════════════════"
-            log_error "  STOP_ON_FAILURE: Pipeline halted after $feature_id failed."
+            log_error "  STOP_ON_FAILURE: Pipeline halted after $feature_id exhausted retries."
             log_error "  Total sessions completed: $session_count"
             log_error "  Set STOP_ON_FAILURE=0 to continue past failures."
             log_error "════════════════════════════════════════════════════"
             break
+        elif [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
+            log_info "STOP_ON_FAILURE: $feature_id is ${item_status_after_session:-unknown}; retry budget not exhausted, continuing."
         fi
         # Brief pause before next iteration

package/bundled/dev-pipeline/run-refactor.sh CHANGED Viewed

@@ -147,6 +147,11 @@ spawn_and_wait_session() {
         log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
     fi
+    local was_infra_error=false
+    if [[ $exit_code -ne 0 ]] && prizm_detect_infra_error "$session_log" "$progress_json"; then
+        was_infra_error=true
+    fi
     # Session summary
     if [[ -f "$session_log" ]]; then
         local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
@@ -164,6 +169,10 @@ spawn_and_wait_session() {
     if [[ $exit_code -eq 124 ]]; then
         log_warn "Session timed out after ${SESSION_TIMEOUT}s"
         session_status="timed_out"
+    elif [[ "$was_infra_error" == true ]]; then
+        log_warn "Session failed due to AI CLI/provider infrastructure error"
+        log_warn "Infrastructure errors are retried without consuming code retry budget"
+        session_status="infra_error"
     elif [[ "$was_stale_killed" == true ]]; then
         log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
         log_warn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
@@ -286,14 +295,20 @@ sys.exit(0)
     fi
     # Update refactor status (do NOT commit on dev branch — commit happens after merge)
-    python3 "$SCRIPTS_DIR/update-refactor-status.py" \
+    local update_output
+    update_output=$(python3 "$SCRIPTS_DIR/update-refactor-status.py" \
         --refactor-list "$refactor_list" \
         --state-dir "$STATE_DIR" \
         --refactor-id "$refactor_id" \
         --session-status "$session_status" \
         --session-id "$session_id" \
         --max-retries "$max_retries" \
-        --action update >/dev/null 2>&1 || true
+        --action update 2>&1) || {
+        log_error "Failed to update refactor status: $update_output"
+        update_output=""
+    }
+    _SPAWN_ITEM_STATUS="$(printf '%s' "$update_output" | prizm_extract_update_new_status)"
     _SPAWN_RESULT="$session_status"
 }
@@ -723,6 +738,7 @@ else:
     trap cleanup_single_refactor SIGINT SIGTERM
     _SPAWN_RESULT=""
+    _SPAWN_ITEM_STATUS=""
     # Branch lifecycle: create and checkout refactor branch
     local _proj_root
@@ -1114,6 +1130,7 @@ DEPLOY_PROMPT_EOF
         # Spawn session
         log_info "Spawning AI CLI session: $session_id"
         _SPAWN_RESULT=""
+        _SPAWN_ITEM_STATUS=""
         spawn_and_wait_session \
             "$refactor_id" "$refactor_list" "$session_id" \
@@ -1130,6 +1147,7 @@ DEPLOY_PROMPT_EOF
         fi
         local session_status="$_SPAWN_RESULT"
+        local item_status_after_session="${_SPAWN_ITEM_STATUS:-}"
         # Merge per-refactor dev branch back to original on success
         if [[ "$session_status" == "success" && -n "$_DEV_BRANCH_NAME" ]]; then
@@ -1168,15 +1186,18 @@ DEPLOY_PROMPT_EOF
         session_count=$((session_count + 1))
         total_subagent_calls=$((total_subagent_calls + _SUBAGENT_COUNT))
-        # Stop-on-failure: abort pipeline if task failed and STOP_ON_FAILURE is enabled
-        if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
+        # Stop-on-failure: abort only after the task is actually marked failed.
+        # Pending retry outcomes, including infrastructure errors, keep running.
+        if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" && "$item_status_after_session" == "failed" ]]; then
             echo ""
             log_error "════════════════════════════════════════════════════"
-            log_error "  STOP_ON_FAILURE: Pipeline halted after $refactor_id failed."
+            log_error "  STOP_ON_FAILURE: Pipeline halted after $refactor_id exhausted retries."
             log_error "  Total sessions completed: $session_count"
             log_error "  Set STOP_ON_FAILURE=0 to continue past failures."
             log_error "════════════════════════════════════════════════════"
             break
+        elif [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
+            log_info "STOP_ON_FAILURE: $refactor_id is ${item_status_after_session:-unknown}; retry budget not exhausted, continuing."
         fi
         log_info "Pausing 5s before next refactor..."