npm - prizmkit - Versions diffs - 1.1.67 → 1.1.68 - Mend

prizmkit 1.1.67 → 1.1.68

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

package/bundled/VERSION.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "frameworkVersion": "1.1.67",
-  "bundledAt": "2026-06-09T02:37:28.761Z",
-  "bundledFrom": "d4b8c30"
+  "frameworkVersion": "1.1.68",
+  "bundledAt": "2026-06-09T14:36:58.835Z",
+  "bundledFrom": "82060fd"
 }

package/bundled/dev-pipeline/lib/common.sh CHANGED Viewed

@@ -418,6 +418,46 @@ prizm_start_ai_session() {
     PRIZM_AI_PID=$!
 }
+# Detect AI CLI/provider infrastructure failures that are outside the
+# generated code's control. These should be retried without consuming the
+# item's code retry budget.
+prizm_detect_infra_error() {
+    local session_log="${1:-}"
+    local progress_json="${2:-}"
+    local haystack=""
+    if [[ -n "$session_log" && -f "$session_log" ]]; then
+        haystack="$(tail -c 65536 "$session_log" 2>/dev/null || true)"
+    fi
+    if [[ -n "$progress_json" && -f "$progress_json" ]]; then
+        haystack+=$'\n'
+        haystack+="$(cat "$progress_json" 2>/dev/null || true)"
+    fi
+    [[ -n "$haystack" ]] || return 1
+    if printf '%s' "$haystack" | grep -Eiq \
+        'auth_unavailable|no auth available|502 Bad Gateway|503 Service Unavailable|504 Gateway Timeout|gateway timeout|upstream (connect )?error|connection reset|ECONNRESET|ETIMEDOUT|ENOTFOUND|EAI_AGAIN|rate limit|rate_limit|temporarily unavailable|overloaded'; then
+        return 0
+    fi
+    return 1
+}
+prizm_extract_update_new_status() {
+    python3 -c "
+import json, sys
+raw = sys.stdin.read()
+try:
+    data = json.loads(raw)
+except Exception:
+    sys.exit(0)
+value = data.get('new_status')
+if value:
+    print(value)
+"
+}
 # Run an AI CLI session synchronously.
 # Usage: prizm_run_ai_session <prompt_path> <log_path> <model>
 prizm_run_ai_session() {

package/bundled/dev-pipeline/run-bugfix.sh CHANGED Viewed

@@ -145,6 +145,11 @@ spawn_and_wait_session() {
         log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
     fi
+    local was_infra_error=false
+    if [[ $exit_code -ne 0 ]] && prizm_detect_infra_error "$session_log" "$progress_json"; then
+        was_infra_error=true
+    fi
     # Session summary
     if [[ -f "$session_log" ]]; then
         local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
@@ -162,6 +167,10 @@ spawn_and_wait_session() {
     if [[ $exit_code -eq 124 ]]; then
         log_warn "Session timed out after ${SESSION_TIMEOUT}s"
         session_status="timed_out"
+    elif [[ "$was_infra_error" == true ]]; then
+        log_warn "Session failed due to AI CLI/provider infrastructure error"
+        log_warn "Infrastructure errors are retried without consuming code retry budget"
+        session_status="infra_error"
     elif [[ "$was_stale_killed" == true ]]; then
         log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
         log_warn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
@@ -259,14 +268,20 @@ sys.exit(0)
     prizm_detect_subagents "$session_log"
     # Update bug status (do NOT commit on dev branch — commit happens after merge)
-    python3 "$SCRIPTS_DIR/update-bug-status.py" \
+    local update_output
+    update_output=$(python3 "$SCRIPTS_DIR/update-bug-status.py" \
         --bug-list "$bug_list" \
         --state-dir "$STATE_DIR" \
         --bug-id "$bug_id" \
         --session-status "$session_status" \
         --session-id "$session_id" \
         --max-retries "$max_retries" \
-        --action update >/dev/null 2>&1 || true
+        --action update 2>&1) || {
+        log_error "Failed to update bug status: $update_output"
+        update_output=""
+    }
+    _SPAWN_ITEM_STATUS="$(printf '%s' "$update_output" | prizm_extract_update_new_status)"
     _SPAWN_RESULT="$session_status"
 }
@@ -693,6 +708,7 @@ else:
     trap cleanup_single_bug SIGINT SIGTERM
     _SPAWN_RESULT=""
+    _SPAWN_ITEM_STATUS=""
     # Branch lifecycle: create and checkout bugfix branch
     local _proj_root
@@ -1078,12 +1094,14 @@ DEPLOY_PROMPT_EOF
         # Spawn session
         log_info "Spawning AI CLI session: $session_id"
         _SPAWN_RESULT=""
+        _SPAWN_ITEM_STATUS=""
         spawn_and_wait_session \
             "$bug_id" "$bug_list" "$session_id" \
             "$bootstrap_prompt" "$session_dir" "$MAX_RETRIES" "$bug_model" "$_ORIGINAL_BRANCH"
         local session_status="$_SPAWN_RESULT"
+        local item_status_after_session="${_SPAWN_ITEM_STATUS:-}"
         # Merge per-bug dev branch back to original on success
         if [[ "$session_status" == "success" && -n "$_DEV_BRANCH_NAME" ]]; then
@@ -1112,15 +1130,18 @@ DEPLOY_PROMPT_EOF
         session_count=$((session_count + 1))
         total_subagent_calls=$((total_subagent_calls + _SUBAGENT_COUNT))
-        # Stop-on-failure: abort pipeline if task failed and STOP_ON_FAILURE is enabled
-        if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
+        # Stop-on-failure: abort only after the task is actually marked failed.
+        # Pending retry outcomes, including infrastructure errors, keep running.
+        if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" && "$item_status_after_session" == "failed" ]]; then
             echo ""
             log_error "════════════════════════════════════════════════════"
-            log_error "  STOP_ON_FAILURE: Pipeline halted after $bug_id failed."
+            log_error "  STOP_ON_FAILURE: Pipeline halted after $bug_id exhausted retries."
             log_error "  Total sessions completed: $session_count"
             log_error "  Set STOP_ON_FAILURE=0 to continue past failures."
             log_error "════════════════════════════════════════════════════"
             break
+        elif [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
+            log_info "STOP_ON_FAILURE: $bug_id is ${item_status_after_session:-unknown}; retry budget not exhausted, continuing."
         fi
         # Stuck detection

package/bundled/dev-pipeline/run-feature.sh CHANGED Viewed

@@ -153,6 +153,11 @@ spawn_and_wait_session() {
         log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
     fi
+    local was_infra_error=false
+    if [[ $exit_code -ne 0 ]] && prizm_detect_infra_error "$session_log" "$progress_json"; then
+        was_infra_error=true
+    fi
     # Show final session summary
     if [[ -f "$session_log" ]]; then
         local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
@@ -172,6 +177,10 @@ spawn_and_wait_session() {
     if [[ $exit_code -eq 124 ]]; then
         log_warn "Session timed out after ${SESSION_TIMEOUT}s"
         session_status="timed_out"
+    elif [[ "$was_infra_error" == true ]]; then
+        log_warn "Session failed due to AI CLI/provider infrastructure error"
+        log_warn "Infrastructure errors are retried without consuming code retry budget"
+        session_status="infra_error"
     elif [[ "$was_stale_killed" == true ]]; then
         log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
         log_warn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
@@ -347,6 +356,8 @@ sys.exit(0)
         log_error ".prizmkit/plans/feature-list.json may be out of sync. Manual intervention needed."
     }
+    _SPAWN_ITEM_STATUS="$(printf '%s' "$update_output" | prizm_extract_update_new_status)"
     # Return status via global variable (avoids $() swallowing stdout)
     _SPAWN_RESULT="$session_status"
 }
@@ -848,6 +859,7 @@ else:
     trap cleanup_single_feature SIGINT SIGTERM
     _SPAWN_RESULT=""
+    _SPAWN_ITEM_STATUS=""
     # Branch lifecycle: create and checkout feature branch
     local _proj_root
@@ -1300,11 +1312,13 @@ DEPLOY_PROMPT_EOF
             log_info "Feature model: $feature_model"
         fi
         _SPAWN_RESULT=""
+        _SPAWN_ITEM_STATUS=""
         spawn_and_wait_session \
             "$feature_id" "$feature_list" "$session_id" \
             "$bootstrap_prompt" "$session_dir" "$MAX_RETRIES" "$feature_model" "$_ORIGINAL_BRANCH"
         local session_status="$_SPAWN_RESULT"
+        local item_status_after_session="${_SPAWN_ITEM_STATUS:-}"
         # Merge per-feature dev branch back to original on success
         if [[ "$session_status" == "success" && -n "$_DEV_BRANCH_NAME" ]]; then
@@ -1333,15 +1347,18 @@ DEPLOY_PROMPT_EOF
         session_count=$((session_count + 1))
         total_subagent_calls=$((total_subagent_calls + _SUBAGENT_COUNT))
-        # Stop-on-failure: abort pipeline if task failed and STOP_ON_FAILURE is enabled
-        if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
+        # Stop-on-failure: abort only after the task is actually marked failed.
+        # Pending retry outcomes, including infrastructure errors, keep running.
+        if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" && "$item_status_after_session" == "failed" ]]; then
             echo ""
             log_error "════════════════════════════════════════════════════"
-            log_error "  STOP_ON_FAILURE: Pipeline halted after $feature_id failed."
+            log_error "  STOP_ON_FAILURE: Pipeline halted after $feature_id exhausted retries."
             log_error "  Total sessions completed: $session_count"
             log_error "  Set STOP_ON_FAILURE=0 to continue past failures."
             log_error "════════════════════════════════════════════════════"
             break
+        elif [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
+            log_info "STOP_ON_FAILURE: $feature_id is ${item_status_after_session:-unknown}; retry budget not exhausted, continuing."
         fi
         # Brief pause before next iteration

package/bundled/dev-pipeline/run-refactor.sh CHANGED Viewed

@@ -147,6 +147,11 @@ spawn_and_wait_session() {
         log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
     fi
+    local was_infra_error=false
+    if [[ $exit_code -ne 0 ]] && prizm_detect_infra_error "$session_log" "$progress_json"; then
+        was_infra_error=true
+    fi
     # Session summary
     if [[ -f "$session_log" ]]; then
         local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
@@ -164,6 +169,10 @@ spawn_and_wait_session() {
     if [[ $exit_code -eq 124 ]]; then
         log_warn "Session timed out after ${SESSION_TIMEOUT}s"
         session_status="timed_out"
+    elif [[ "$was_infra_error" == true ]]; then
+        log_warn "Session failed due to AI CLI/provider infrastructure error"
+        log_warn "Infrastructure errors are retried without consuming code retry budget"
+        session_status="infra_error"
     elif [[ "$was_stale_killed" == true ]]; then
         log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
         log_warn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
@@ -286,14 +295,20 @@ sys.exit(0)
     fi
     # Update refactor status (do NOT commit on dev branch — commit happens after merge)
-    python3 "$SCRIPTS_DIR/update-refactor-status.py" \
+    local update_output
+    update_output=$(python3 "$SCRIPTS_DIR/update-refactor-status.py" \
         --refactor-list "$refactor_list" \
         --state-dir "$STATE_DIR" \
         --refactor-id "$refactor_id" \
         --session-status "$session_status" \
         --session-id "$session_id" \
         --max-retries "$max_retries" \
-        --action update >/dev/null 2>&1 || true
+        --action update 2>&1) || {
+        log_error "Failed to update refactor status: $update_output"
+        update_output=""
+    }
+    _SPAWN_ITEM_STATUS="$(printf '%s' "$update_output" | prizm_extract_update_new_status)"
     _SPAWN_RESULT="$session_status"
 }
@@ -723,6 +738,7 @@ else:
     trap cleanup_single_refactor SIGINT SIGTERM
     _SPAWN_RESULT=""
+    _SPAWN_ITEM_STATUS=""
     # Branch lifecycle: create and checkout refactor branch
     local _proj_root
@@ -1114,6 +1130,7 @@ DEPLOY_PROMPT_EOF
         # Spawn session
         log_info "Spawning AI CLI session: $session_id"
         _SPAWN_RESULT=""
+        _SPAWN_ITEM_STATUS=""
         spawn_and_wait_session \
             "$refactor_id" "$refactor_list" "$session_id" \
@@ -1130,6 +1147,7 @@ DEPLOY_PROMPT_EOF
         fi
         local session_status="$_SPAWN_RESULT"
+        local item_status_after_session="${_SPAWN_ITEM_STATUS:-}"
         # Merge per-refactor dev branch back to original on success
         if [[ "$session_status" == "success" && -n "$_DEV_BRANCH_NAME" ]]; then
@@ -1168,15 +1186,18 @@ DEPLOY_PROMPT_EOF
         session_count=$((session_count + 1))
         total_subagent_calls=$((total_subagent_calls + _SUBAGENT_COUNT))
-        # Stop-on-failure: abort pipeline if task failed and STOP_ON_FAILURE is enabled
-        if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
+        # Stop-on-failure: abort only after the task is actually marked failed.
+        # Pending retry outcomes, including infrastructure errors, keep running.
+        if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" && "$item_status_after_session" == "failed" ]]; then
             echo ""
             log_error "════════════════════════════════════════════════════"
-            log_error "  STOP_ON_FAILURE: Pipeline halted after $refactor_id failed."
+            log_error "  STOP_ON_FAILURE: Pipeline halted after $refactor_id exhausted retries."
             log_error "  Total sessions completed: $session_count"
             log_error "  Set STOP_ON_FAILURE=0 to continue past failures."
             log_error "════════════════════════════════════════════════════"
             break
+        elif [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
+            log_info "STOP_ON_FAILURE: $refactor_id is ${item_status_after_session:-unknown}; retry budget not exhausted, continuing."
         fi
         log_info "Pausing 5s before next refactor..."

package/bundled/dev-pipeline/scripts/update-bug-status.py CHANGED Viewed

@@ -41,6 +41,7 @@ SESSION_STATUS_VALUES = [
     "failed",
     "crashed",
     "timed_out",
+    "infra_error",
     "commit_missing",
     "docs_missing",
     "merge_conflict",
@@ -280,6 +281,16 @@ def action_update(args, bug_list_path, state_dir):
         bs["sessions"] = []
         bs["last_session_id"] = None
+        err = update_bug_in_list(bug_list_path, bug_id, new_status)
+        if err:
+            error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
+            return
+    elif session_status == "infra_error":
+        new_status = "pending"
+        bs["infra_error_count"] = bs.get("infra_error_count", 0) + 1
+        bs["last_infra_error_session_id"] = session_id
+        bs["resume_from_phase"] = None
         err = update_bug_in_list(bug_list_path, bug_id, new_status)
         if err:
             error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
@@ -333,6 +344,10 @@ def action_update(args, bug_list_path, state_dir):
     if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
         summary["degraded_reason"] = session_status
         summary["restart_policy"] = "finalization_retry"
+    elif session_status == "infra_error":
+        summary["restart_policy"] = "infra_retry"
+        summary["infra_error_count"] = bs.get("infra_error_count", 0)
+        summary["artifacts_preserved"] = True
     elif session_status != "success":
         summary["restart_policy"] = "full_restart"
         summary["cleanup_performed"] = cleaned

package/bundled/dev-pipeline/scripts/update-feature-status.py CHANGED Viewed

@@ -45,6 +45,7 @@ SESSION_STATUS_VALUES = [
     "failed",
     "crashed",
     "timed_out",
+    "infra_error",
     "commit_missing",
     "docs_missing",
     "merge_conflict",
@@ -645,6 +646,19 @@ def action_update(args, feature_list_path, state_dir):
         fs["sessions"] = []
         fs["last_session_id"] = None
+        err = update_feature_in_list(feature_list_path, feature_id, new_status)
+        if err:
+            error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
+            return
+    elif session_status == "infra_error":
+        # AI CLI/provider outage, auth failure, gateway error, etc.
+        # This is outside the code's control, so keep the item pending without
+        # consuming the task's retry budget.
+        new_status = "pending"
+        fs["infra_error_count"] = fs.get("infra_error_count", 0) + 1
+        fs["last_infra_error_session_id"] = session_id
+        fs["resume_from_phase"] = None
         err = update_feature_in_list(feature_list_path, feature_id, new_status)
         if err:
             error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
@@ -701,6 +715,10 @@ def action_update(args, feature_list_path, state_dir):
     if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
         summary["degraded_reason"] = session_status
         summary["restart_policy"] = "finalization_retry"
+    elif session_status == "infra_error":
+        summary["restart_policy"] = "infra_retry"
+        summary["infra_error_count"] = fs.get("infra_error_count", 0)
+        summary["artifacts_preserved"] = True
     elif session_status != "success":
         summary["restart_policy"] = "preserve_and_retry"
         summary["artifacts_preserved"] = True

package/bundled/dev-pipeline/scripts/update-refactor-status.py CHANGED Viewed

@@ -42,6 +42,7 @@ SESSION_STATUS_VALUES = [
     "failed",
     "crashed",
     "timed_out",
+    "infra_error",
     "commit_missing",
     "docs_missing",
     "merge_conflict",
@@ -314,6 +315,16 @@ def action_update(args, refactor_list_path, state_dir):
         rs["sessions"] = []
         rs["last_session_id"] = None
+        err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
+        if err:
+            error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
+            return
+    elif session_status == "infra_error":
+        new_status = "pending"
+        rs["infra_error_count"] = rs.get("infra_error_count", 0) + 1
+        rs["last_infra_error_session_id"] = session_id
+        rs["resume_from_phase"] = None
         err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
         if err:
             error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
@@ -376,6 +387,10 @@ def action_update(args, refactor_list_path, state_dir):
     if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
         summary["degraded_reason"] = session_status
         summary["restart_policy"] = "finalization_retry"
+    elif session_status == "infra_error":
+        summary["restart_policy"] = "infra_retry"
+        summary["infra_error_count"] = rs.get("infra_error_count", 0)
+        summary["artifacts_preserved"] = True
     elif session_status != "success":
         summary["restart_policy"] = "full_restart"
         summary["cleanup_performed"] = cleaned

package/bundled/dev-pipeline/tests/test_auto_skip.py CHANGED Viewed

@@ -303,6 +303,45 @@ def _run_get_next(fl_path, state_dir):
     return result.stdout.strip()
+def _run_update(fl_path, state_dir, feature_id, session_status, session_id="session-1", max_retries=3):
+    cmd = [
+        "python3", _SCRIPT,
+        "--feature-list", fl_path,
+        "--state-dir", state_dir,
+        "--feature-id", feature_id,
+        "--session-status", session_status,
+        "--session-id", session_id,
+        "--max-retries", str(max_retries),
+        "--action", "update",
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    assert result.returncode == 0, result.stderr
+    return json.loads(result.stdout)
+class TestInfraErrorUpdate:
+    def test_infra_error_keeps_pending_without_consuming_retry(self, tmp_path):
+        features = [_make_feature("F-001", "Root", status="in_progress")]
+        fl_path = _write_fl(tmp_path, features)
+        state_dir = _init_state(tmp_path, ["F-001"])
+        status_path = os.path.join(state_dir, "features", "F-001", "status.json")
+        fs = load_feature_status(state_dir, "F-001")
+        fs["retry_count"] = 2
+        write_json_file(status_path, fs)
+        result = _run_update(fl_path, state_dir, "F-001", "infra_error", "session-infra", max_retries=3)
+        assert result["new_status"] == "pending"
+        assert result["retry_count"] == 2
+        assert result["restart_policy"] == "infra_retry"
+        assert _read_statuses(fl_path)["F-001"] == "pending"
+        fs = load_feature_status(state_dir, "F-001")
+        assert fs["retry_count"] == 2
+        assert fs["infra_error_count"] == 1
+        assert fs["last_infra_error_session_id"] == "session-infra"
 class TestUnskipByFeatureId:
     """Unskip with --feature-id targets a specific failed feature + downstream."""

package/bundled/dev-pipeline-windows/SCHEMA_ANALYSIS.md CHANGED Viewed

@@ -353,7 +353,7 @@ pending, in_progress, completed, failed, skipped
 | `LOG_CLEANUP_ENABLED` | boolean | 1 | Periodic session log cleanup |
 | `LOG_RETENTION_DAYS` | integer | 14 | Delete session logs older than N days |
 | `LOG_MAX_TOTAL_MB` | integer | 1024 | Keep total logs under N MB |
-| `STOP_ON_FAILURE` | boolean | 0 | Stop after the first failed task |
+| `STOP_ON_FAILURE` | boolean | 0 | Stop after a task exhausts retries |
 | `ENABLE_DEPLOY` | boolean | 0 | Start deploy session after all tasks complete |
 | `DEV_BRANCH` | string | auto-generated | Optional custom dev branch name |
 | `AUTO_PUSH` | boolean | 0 | Push original branch after successful merge |

package/bundled/dev-pipeline-windows/lib/common.ps1 CHANGED Viewed

@@ -145,6 +145,25 @@ function Invoke-PrizmPythonText {
   if ($LASTEXITCODE -ne 0) { throw "Python command failed: $($Arguments -join ' ')" }
 }
+function Test-PrizmInfraError {
+  param([string]$SessionLog, [string]$ProgressJson)
+  $parts = @()
+  if ($SessionLog -and (Test-Path $SessionLog)) {
+    try {
+      $text = Get-Content $SessionLog -Raw -ErrorAction Stop
+      if ($text.Length -gt 65536) { $text = $text.Substring($text.Length - 65536) }
+      $parts += $text
+    } catch {}
+  }
+  if ($ProgressJson -and (Test-Path $ProgressJson)) {
+    try { $parts += (Get-Content $ProgressJson -Raw -ErrorAction Stop) } catch {}
+  }
+  if ($parts.Count -eq 0) { return $false }
+  $haystack = $parts -join "`n"
+  return ($haystack -match '(?i)auth_unavailable|no auth available|502 Bad Gateway|503 Service Unavailable|504 Gateway Timeout|gateway timeout|upstream (connect )?error|connection reset|ECONNRESET|ETIMEDOUT|ENOTFOUND|EAI_AGAIN|rate limit|rate_limit|temporarily unavailable|overloaded')
+}
 function Get-PrizmConfigValue {
   param([string]$ConfigPath, [string]$Key)
   if (-not (Test-Path $ConfigPath)) { return $null }

package/bundled/dev-pipeline-windows/lib/pipeline.ps1 CHANGED Viewed

@@ -618,10 +618,16 @@ function Invoke-PrizmPipeline {
     }
     Stop-PrizmProgressParser $parserProcess
+    $wasInfraError = ($exitCode -ne 0 -and (Test-PrizmInfraError -SessionLog $sessionLog -ProgressJson $progressJson))
     $status = 'crashed'
     if ($wasTimedOut) {
       $status = 'timed_out'
       Write-PrizmWarn "AI session timed out after $timeoutSeconds seconds"
+    } elseif ($wasInfraError) {
+      $status = 'infra_error'
+      Write-PrizmWarn "AI session failed due to AI CLI/provider infrastructure error"
+      Write-PrizmWarn "Infrastructure errors are retried without consuming code retry budget"
     } elseif ($wasStaleKilled -or (Test-Path $staleKillMarker)) {
       Write-PrizmWarn "Session was stale-killed by heartbeat monitor (no progress for too long)"
       Write-PrizmWarn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
@@ -645,8 +651,12 @@ function Invoke-PrizmPipeline {
     }
     $mergeSucceeded = $true
+    $itemListStatus = ''
     if ($status -eq 'success') {
-      Invoke-PrizmPythonText $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
+      $updateResult = Invoke-PrizmPythonJson $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
+      if ($updateResult -and $updateResult.PSObject.Properties['new_status']) {
+        $itemListStatus = [string]$updateResult.new_status
+      }
       if (Test-PrizmGitDirty $paths.ProjectRoot) {
         if ($hadDirtyBaseline) {
@@ -676,7 +686,10 @@ function Invoke-PrizmPipeline {
     }
     if ($status -ne 'success') {
-      Invoke-PrizmPythonText $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
+      $updateResult = Invoke-PrizmPythonJson $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
+      if ($updateResult -and $updateResult.PSObject.Properties['new_status']) {
+        $itemListStatus = [string]$updateResult.new_status
+      }
       if ($isGitRepository) {
         Invoke-PrizmGitCommitPath $paths.ProjectRoot $listPath "chore($CurrentItemId): update $idName status" | Out-Null
       }
@@ -687,6 +700,7 @@ function Invoke-PrizmPipeline {
     } else {
       Write-PrizmError "$Kind item failed: $CurrentItemId. Log: $sessionLog"
     }
+    $script:PRIZM_ITEM_LIST_STATUS = $itemListStatus
     $script:PRIZM_ITEM_EXIT_CODE = if ($status -eq 'success' -and $mergeSucceeded) { 0 } else { 1 }
     return
   }
@@ -748,9 +762,11 @@ function Invoke-PrizmPipeline {
       $global:PRIZM_EXIT_CODE = $lastExitCode
       return
     }
-    if ($lastExitCode -ne 0 -and $stopOnFailure) {
+    if ($lastExitCode -ne 0 -and $stopOnFailure -and $script:PRIZM_ITEM_LIST_STATUS -eq 'failed') {
       $global:PRIZM_EXIT_CODE = $lastExitCode
       return
+    } elseif ($lastExitCode -ne 0 -and $stopOnFailure) {
+      Write-PrizmInfo "STOP_ON_FAILURE: $nextItemId is $($script:PRIZM_ITEM_LIST_STATUS); retry budget not exhausted, continuing."
     }
   }
 }

package/bundled/dev-pipeline-windows/scripts/update-bug-status.py CHANGED Viewed

@@ -41,6 +41,7 @@ SESSION_STATUS_VALUES = [
     "failed",
     "crashed",
     "timed_out",
+    "infra_error",
     "commit_missing",
     "docs_missing",
     "merge_conflict",
@@ -280,6 +281,16 @@ def action_update(args, bug_list_path, state_dir):
         bs["sessions"] = []
         bs["last_session_id"] = None
+        err = update_bug_in_list(bug_list_path, bug_id, new_status)
+        if err:
+            error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
+            return
+    elif session_status == "infra_error":
+        new_status = "pending"
+        bs["infra_error_count"] = bs.get("infra_error_count", 0) + 1
+        bs["last_infra_error_session_id"] = session_id
+        bs["resume_from_phase"] = None
         err = update_bug_in_list(bug_list_path, bug_id, new_status)
         if err:
             error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
@@ -333,6 +344,10 @@ def action_update(args, bug_list_path, state_dir):
     if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
         summary["degraded_reason"] = session_status
         summary["restart_policy"] = "finalization_retry"
+    elif session_status == "infra_error":
+        summary["restart_policy"] = "infra_retry"
+        summary["infra_error_count"] = bs.get("infra_error_count", 0)
+        summary["artifacts_preserved"] = True
     elif session_status != "success":
         summary["restart_policy"] = "full_restart"
         summary["cleanup_performed"] = cleaned

package/bundled/dev-pipeline-windows/scripts/update-feature-status.py CHANGED Viewed

@@ -45,6 +45,7 @@ SESSION_STATUS_VALUES = [
     "failed",
     "crashed",
     "timed_out",
+    "infra_error",
     "commit_missing",
     "docs_missing",
     "merge_conflict",
@@ -645,6 +646,19 @@ def action_update(args, feature_list_path, state_dir):
         fs["sessions"] = []
         fs["last_session_id"] = None
+        err = update_feature_in_list(feature_list_path, feature_id, new_status)
+        if err:
+            error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
+            return
+    elif session_status == "infra_error":
+        # AI CLI/provider outage, auth failure, gateway error, etc.
+        # This is outside the code's control, so keep the item pending without
+        # consuming the task's retry budget.
+        new_status = "pending"
+        fs["infra_error_count"] = fs.get("infra_error_count", 0) + 1
+        fs["last_infra_error_session_id"] = session_id
+        fs["resume_from_phase"] = None
         err = update_feature_in_list(feature_list_path, feature_id, new_status)
         if err:
             error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
@@ -701,6 +715,10 @@ def action_update(args, feature_list_path, state_dir):
     if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
         summary["degraded_reason"] = session_status
         summary["restart_policy"] = "finalization_retry"
+    elif session_status == "infra_error":
+        summary["restart_policy"] = "infra_retry"
+        summary["infra_error_count"] = fs.get("infra_error_count", 0)
+        summary["artifacts_preserved"] = True
     elif session_status != "success":
         summary["restart_policy"] = "preserve_and_retry"
         summary["artifacts_preserved"] = True

package/bundled/dev-pipeline-windows/scripts/update-refactor-status.py CHANGED Viewed

@@ -42,6 +42,7 @@ SESSION_STATUS_VALUES = [
     "failed",
     "crashed",
     "timed_out",
+    "infra_error",
     "commit_missing",
     "docs_missing",
     "merge_conflict",
@@ -314,6 +315,16 @@ def action_update(args, refactor_list_path, state_dir):
         rs["sessions"] = []
         rs["last_session_id"] = None
+        err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
+        if err:
+            error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
+            return
+    elif session_status == "infra_error":
+        new_status = "pending"
+        rs["infra_error_count"] = rs.get("infra_error_count", 0) + 1
+        rs["last_infra_error_session_id"] = session_id
+        rs["resume_from_phase"] = None
         err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
         if err:
             error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
@@ -376,6 +387,10 @@ def action_update(args, refactor_list_path, state_dir):
     if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
         summary["degraded_reason"] = session_status
         summary["restart_policy"] = "finalization_retry"
+    elif session_status == "infra_error":
+        summary["restart_policy"] = "infra_retry"
+        summary["infra_error_count"] = rs.get("infra_error_count", 0)
+        summary["artifacts_preserved"] = True
     elif session_status != "success":
         summary["restart_policy"] = "full_restart"
         summary["cleanup_performed"] = cleaned

package/bundled/skills/_metadata.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "version": "1.1.67",
+  "version": "1.1.68",
   "skills": {
     "prizm-kit": {
       "description": "Full-lifecycle dev toolkit. Covers spec-driven development, Prizm context docs, code quality, debugging, deployment, and knowledge management.",

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "prizmkit",
-  "version": "1.1.67",
+  "version": "1.1.68",
   "description": "Create a new PrizmKit-powered project with clean initialization — no framework dev files, just what you need.",
   "type": "module",
   "bin": {