npm - prizmkit - Versions diffs - 1.1.67 → 1.1.69 - Mend

prizmkit 1.1.67 → 1.1.69

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

package/bundled/dev-pipeline/scripts/update-bug-status.py CHANGED

@@ -41,6 +41,7 @@ SESSION_STATUS_VALUES = [
     "failed",
     "crashed",
     "timed_out",
+    "infra_error",
     "commit_missing",
     "docs_missing",
     "merge_conflict",
@@ -280,6 +281,16 @@ def action_update(args, bug_list_path, state_dir):
         bs["sessions"] = []
         bs["last_session_id"] = None
+        err = update_bug_in_list(bug_list_path, bug_id, new_status)
+        if err:
+            error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
+            return
+    elif session_status == "infra_error":
+        new_status = "pending"
+        bs["infra_error_count"] = bs.get("infra_error_count", 0) + 1
+        bs["last_infra_error_session_id"] = session_id
+        bs["resume_from_phase"] = None
         err = update_bug_in_list(bug_list_path, bug_id, new_status)
         if err:
             error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
@@ -333,6 +344,10 @@ def action_update(args, bug_list_path, state_dir):
     if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
         summary["degraded_reason"] = session_status
         summary["restart_policy"] = "finalization_retry"
+    elif session_status == "infra_error":
+        summary["restart_policy"] = "infra_retry"
+        summary["infra_error_count"] = bs.get("infra_error_count", 0)
+        summary["artifacts_preserved"] = True
     elif session_status != "success":
         summary["restart_policy"] = "full_restart"
         summary["cleanup_performed"] = cleaned

package/bundled/dev-pipeline/scripts/update-feature-status.py CHANGED

@@ -45,6 +45,7 @@ SESSION_STATUS_VALUES = [
     "failed",
     "crashed",
     "timed_out",
+    "infra_error",
     "commit_missing",
     "docs_missing",
     "merge_conflict",
@@ -645,6 +646,19 @@ def action_update(args, feature_list_path, state_dir):
         fs["sessions"] = []
         fs["last_session_id"] = None
+        err = update_feature_in_list(feature_list_path, feature_id, new_status)
+        if err:
+            error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
+            return
+    elif session_status == "infra_error":
+        # AI CLI/provider outage, auth failure, gateway error, etc.
+        # This is outside the code's control, so keep the item pending without
+        # consuming the task's retry budget.
+        new_status = "pending"
+        fs["infra_error_count"] = fs.get("infra_error_count", 0) + 1
+        fs["last_infra_error_session_id"] = session_id
+        fs["resume_from_phase"] = None
         err = update_feature_in_list(feature_list_path, feature_id, new_status)
         if err:
             error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
@@ -701,6 +715,10 @@ def action_update(args, feature_list_path, state_dir):
     if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
         summary["degraded_reason"] = session_status
         summary["restart_policy"] = "finalization_retry"
+    elif session_status == "infra_error":
+        summary["restart_policy"] = "infra_retry"
+        summary["infra_error_count"] = fs.get("infra_error_count", 0)
+        summary["artifacts_preserved"] = True
     elif session_status != "success":
         summary["restart_policy"] = "preserve_and_retry"
         summary["artifacts_preserved"] = True

package/bundled/dev-pipeline/scripts/update-refactor-status.py CHANGED

@@ -42,6 +42,7 @@ SESSION_STATUS_VALUES = [
     "failed",
     "crashed",
     "timed_out",
+    "infra_error",
     "commit_missing",
     "docs_missing",
     "merge_conflict",
@@ -314,6 +315,16 @@ def action_update(args, refactor_list_path, state_dir):
         rs["sessions"] = []
         rs["last_session_id"] = None
+        err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
+        if err:
+            error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
+            return
+    elif session_status == "infra_error":
+        new_status = "pending"
+        rs["infra_error_count"] = rs.get("infra_error_count", 0) + 1
+        rs["last_infra_error_session_id"] = session_id
+        rs["resume_from_phase"] = None
         err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
         if err:
             error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
@@ -376,6 +387,10 @@ def action_update(args, refactor_list_path, state_dir):
     if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
         summary["degraded_reason"] = session_status
         summary["restart_policy"] = "finalization_retry"
+    elif session_status == "infra_error":
+        summary["restart_policy"] = "infra_retry"
+        summary["infra_error_count"] = rs.get("infra_error_count", 0)
+        summary["artifacts_preserved"] = True
     elif session_status != "success":
         summary["restart_policy"] = "full_restart"
         summary["cleanup_performed"] = cleaned

package/bundled/dev-pipeline/templates/bootstrap-tier2.md CHANGED

@@ -14,6 +14,12 @@ You are the **session orchestrator**. Implement Feature {{FEATURE_ID}}: "{{FEATU
 **Tier 2 — Dual Agent**: You handle context + planning directly. Then spawn Dev and Reviewer subagents. Spawn Dev and Reviewer agents via the Agent tool.
+**Agent spawn failure policy (all Agent tool calls)**:
+- If spawning Dev, Reviewer, or Critic fails with team/config/lock errors, retry at most once.
+- If the second attempt fails, do not keep spawning variants and do not enter artifact polling for Implementation Log, challenge report, or review report markers.
+- Use the documented inline/recovery fallback for that phase: write the required report yourself where possible, complete remaining Dev work directly in the orchestrator when safe, or write `failure-log.md` with the spawn error and last observable state before stopping for recovery.
+- Apply the same cap to any re-spawn for report repair or resume prompts; do not burn multiple minutes on identical team/config/lock failures.
 ### Feature Description
 {{FEATURE_DESCRIPTION}}
@@ -163,6 +169,8 @@ Before proceeding past CP-1, verify:
 Spawn Reviewer agent (Agent tool, subagent_type="prizm-dev-team-reviewer", mode="plan", run_in_background=false).
+Spawn failure cap: for team/config/lock errors, retry at most once for this Reviewer spawn. If the second attempt fails, do not poll for report artifacts; fix/check the plan inline or write `failure-log.md` before stopping for recovery.
 Prompt:
 > "Read {{REVIEWER_SUBAGENT_PATH}}. For feature {{FEATURE_ID}} (slug: {{FEATURE_SLUG}}):
 > 1. Read `.prizmkit/specs/{{FEATURE_SLUG}}/context-snapshot.md` FIRST — Section 3 has project context, Section 4 has file manifest.
@@ -186,6 +194,8 @@ If CRITIC:MISSING — skip Phase 3.5 entirely and proceed to Phase 4. Log: "Crit
 Spawn Critic agent (Agent tool, subagent_type="prizm-dev-team-critic", mode="plan", run_in_background=false).
+Spawn failure cap: for team/config/lock errors, retry at most once for this Critic spawn. If the second attempt fails, do not poll for `challenge-report.md`; perform the plan challenge inline and record the fallback.
 Prompt:
 > "Read {{CRITIC_SUBAGENT_PATH}}. For feature {{FEATURE_ID}} (slug: {{FEATURE_SLUG}}):
 > **MODE: Plan Challenge**
@@ -208,6 +218,8 @@ Wait for Critic to return.
 Spawn Dev subagent (Agent tool, subagent_type="prizm-dev-team-dev", run_in_background=false).
+Spawn failure cap: for team/config/lock errors, retry at most once for this Dev spawn. If the second attempt fails, do not poll for `## Implementation Log`; write `failure-log.md` and either implement remaining tasks directly in the orchestrator or stop for recovery.
 Prompt:
 > "Read {{DEV_SUBAGENT_PATH}}. Implement feature {{FEATURE_ID}} (slug: {{FEATURE_SLUG}}).
 > **IMPORTANT**: Read `.prizmkit/specs/{{FEATURE_SLUG}}/context-snapshot.md` FIRST — Section 3 has Prizm Context (TRAPS/RULES), Section 4 has File Manifest with paths and interfaces.
@@ -232,6 +244,8 @@ If GATE:MISSING — send message to Dev (re-spawn if needed): "Write the '## Imp
 Spawn Reviewer subagent (Agent tool, subagent_type="prizm-dev-team-reviewer", run_in_background=false).
+Spawn failure cap: for team/config/lock errors, retry at most once for this Reviewer spawn. If the second attempt fails, do not poll for `review-report.md`; write `failure-log.md` with the spawn error and last observable state before stopping or performing an inline fallback.
 Prompt:
 > "Read {{REVIEWER_SUBAGENT_PATH}}. For feature {{FEATURE_ID}} (slug: {{FEATURE_SLUG}}):
 > 1. Read `.prizmkit/specs/{{FEATURE_SLUG}}/spec.md` for goals and acceptance criteria
@@ -248,7 +262,11 @@ After Reviewer agent returns, verify the review report was written:
 ```bash
 grep -q "## Verdict" .prizmkit/specs/{{FEATURE_SLUG}}/review-report.md && echo "GATE:PASS" || echo "GATE:MISSING"
 ```
-If GATE:MISSING — send message to Reviewer (re-spawn if needed): "Write review-report.md to .prizmkit/specs/{{FEATURE_SLUG}}/."
+If GATE:MISSING:
+- Do not re-spawn Reviewer or re-run `/prizmkit-code-review` in an unbounded report-repair loop.
+- Perform one bounded status check; retry at most once: inspect Reviewer output, code-review skill output, `review-report.md` path, and any Reviewer/Dev spawn messages.
+- If the missing report is caused by team/config/lock errors from Reviewer or the internal code-review loop, write `failure-log.md` with the spawn/skill error and last observable state.
+- If the report is still missing after that single check/retry, either perform a safe inline fallback review and write `review-report.md` with `## Verdict`, or stop with a clear recovery failure.
 Read `review-report.md` and check the Verdict:
 - `PASS` → proceed to next phase

package/bundled/dev-pipeline/templates/bootstrap-tier3.md CHANGED

@@ -14,6 +14,12 @@ You are the **session orchestrator**. Implement Feature {{FEATURE_ID}}: "{{FEATU
 **Tier 3 — Full Team**: For complex features, use the full pipeline (Phase 0–6) with Dev + Reviewer agents spawned via the Agent tool.
+**Agent spawn failure policy (all Agent tool calls)**:
+- If spawning Dev, Reviewer, or Critic fails with team/config/lock errors, retry at most once.
+- If the second attempt fails, do not keep spawning variants and do not enter artifact polling for Implementation Log, challenge report, or review report markers.
+- Use the documented inline/recovery fallback for that phase: write the required report yourself where possible, complete remaining Dev work directly in the orchestrator when safe, or write `failure-log.md` with the spawn error and last observable state before stopping for recovery.
+- Apply the same cap to any re-spawn for report repair or resume prompts; do not burn multiple minutes on identical team/config/lock failures.
 ### Feature Description
 {{FEATURE_DESCRIPTION}}
@@ -190,6 +196,8 @@ Before proceeding past CP-1, verify:
 Spawn Reviewer agent (Agent tool, subagent_type="prizm-dev-team-reviewer", mode="plan", run_in_background=false).
+Spawn failure cap: for team/config/lock errors, retry at most once for this Reviewer spawn. If the second attempt fails, do not poll for report artifacts; fix/check the plan inline or write `failure-log.md` before stopping for recovery.
 Prompt:
 > "Read {{REVIEWER_SUBAGENT_PATH}}. For feature {{FEATURE_ID}} (slug: {{FEATURE_SLUG}}):
 > 1. Read `.prizmkit/specs/{{FEATURE_SLUG}}/context-snapshot.md` FIRST — Section 3 has project context, Section 4 has file manifest.
@@ -217,6 +225,8 @@ If CRITIC:MISSING — skip Phase 3.5 entirely and proceed to Phase 4. Log: "Crit
 Spawn Critic agent (Agent tool, subagent_type="prizm-dev-team-critic", mode="plan", run_in_background=false).
+Spawn failure cap: for team/config/lock errors, retry at most once for this Critic spawn. If the second attempt fails, do not poll for challenge reports; perform the plan challenge inline and record the fallback.
 Prompt:
 > "Read {{CRITIC_SUBAGENT_PATH}}. For feature {{FEATURE_ID}} (slug: {{FEATURE_SLUG}}):
 > **MODE: Plan Challenge**
@@ -263,6 +273,8 @@ grep -c '^\- \[ \]' .prizmkit/specs/{{FEATURE_SLUG}}/plan.md 2>/dev/null || true
 Spawn Dev agent (Agent tool, subagent_type="prizm-dev-team-dev", run_in_background=false).
+Spawn failure cap: for team/config/lock errors, retry at most once for this Dev spawn. If the second attempt fails, do not poll for `## Implementation Log`; write `failure-log.md` and either implement remaining tasks directly in the orchestrator or stop for recovery.
 Prompt:
 > "Read {{DEV_SUBAGENT_PATH}}. Implement feature {{FEATURE_ID}} (slug: {{FEATURE_SLUG}}).
 > **IMPORTANT**: Read `.prizmkit/specs/{{FEATURE_SLUG}}/context-snapshot.md` FIRST — Section 3 has Prizm Context (TRAPS/RULES), Section 4 has File Manifest with paths and interfaces.
@@ -297,6 +309,8 @@ All tasks `[x]`, tests pass.
 Spawn Reviewer agent (Agent tool, subagent_type="prizm-dev-team-reviewer", run_in_background=false).
+Spawn failure cap: for team/config/lock errors, retry at most once for this Reviewer spawn. If the second attempt fails, do not poll for `review-report.md`; write `failure-log.md` with the spawn error and last observable state before stopping or performing an inline fallback.
 Prompt:
 > "Read {{REVIEWER_SUBAGENT_PATH}}. For feature {{FEATURE_ID}} (slug: {{FEATURE_SLUG}}):
 > 1. Read `.prizmkit/specs/{{FEATURE_SLUG}}/spec.md` for goals and acceptance criteria
@@ -313,7 +327,11 @@ After Reviewer agent returns, verify the review report was written:
 ```bash
 grep -q "## Verdict" .prizmkit/specs/{{FEATURE_SLUG}}/review-report.md && echo "GATE:PASS" || echo "GATE:MISSING"
 ```
-If GATE:MISSING — send message to Reviewer (re-spawn if needed): "Write review-report.md to .prizmkit/specs/{{FEATURE_SLUG}}/."
+If GATE:MISSING:
+- Do not re-spawn Reviewer or re-run `/prizmkit-code-review` in an unbounded report-repair loop.
+- Perform one bounded status check; retry at most once: inspect Reviewer output, code-review skill output, `review-report.md` path, and any Reviewer/Dev spawn messages.
+- If the missing report is caused by team/config/lock errors from Reviewer or the internal code-review loop, write `failure-log.md` with the spawn/skill error and last observable state.
+- If the report is still missing after that single check/retry, either perform a safe inline fallback review and write `review-report.md` with `## Verdict`, or stop with a clear recovery failure.
 Read `review-report.md` and check the Verdict:
 - `PASS` → proceed to next phase

package/bundled/dev-pipeline/templates/refactor-bootstrap-prompt.md CHANGED

@@ -80,6 +80,12 @@ You are the **refactor session orchestrator**. Execute Refactor {{REFACTOR_ID}}:
 **YOU are the orchestrator. Execute each phase by spawning the appropriate team agent with run_in_background=false.**
+**Agent spawn failure policy (all Agent tool calls)**:
+- If spawning Dev or Reviewer fails with team/config/lock errors, retry at most once.
+- If the second attempt fails, do not keep spawning variants and do not enter artifact polling for Implementation Log, review-report, or refactor-report markers.
+- Use the documented inline/recovery fallback for that phase: complete remaining refactor work directly in the orchestrator when safe, write the required report yourself where possible, or write `failure-log.md` with the spawn error and last observable state before stopping for recovery.
+- Apply the same cap to any re-spawn for report repair or resume prompts; do not burn multiple minutes on identical team/config/lock failures.
 ## Workflow Checkpoint System
 A checkpoint file tracks your progress through this workflow:
@@ -164,6 +170,7 @@ Include browser verification approach in plan.md:
 **Goal**: Execute all tasks from plan.md while preserving existing behavior.
 - Spawn Dev agent (Agent tool, subagent_type="prizm-dev-team-dev", run_in_background=false)
+  Spawn failure cap: for team/config/lock errors, retry at most once for this Dev spawn. If the second attempt fails, do not poll for `## Implementation Log`; write `failure-log.md` and either complete remaining refactor work directly in the orchestrator or stop for recovery.
   Prompt: "Read {{DEV_SUBAGENT_PATH}}. For refactor {{REFACTOR_ID}} ('{{REFACTOR_TITLE}}'):
   1. Read `.prizmkit/refactor/{{REFACTOR_ID}}/spec.md` and `.prizmkit/refactor/{{REFACTOR_ID}}/plan.md`
   2. Read `.prizmkit/prizm-docs/` for affected modules (TRAPS, RULES, PATTERNS)
@@ -201,6 +208,7 @@ Include browser verification approach in plan.md:
 **Goal**: Verify refactoring quality and behavior preservation.
 - Spawn Reviewer agent (Agent tool, subagent_type="prizm-dev-team-reviewer", run_in_background=false)
+  Spawn failure cap: for team/config/lock errors, retry at most once for this Reviewer spawn. If the second attempt fails, do not poll for `review-report.md`; write `failure-log.md` with the spawn error and last observable state before stopping or performing an inline fallback.
   Prompt: "Read {{REVIEWER_SUBAGENT_PATH}}. For refactor {{REFACTOR_ID}}:
   1. Read `.prizmkit/refactor/{{REFACTOR_ID}}/spec.md` for goals and behavior preservation contracts
   2. Read `.prizmkit/refactor/{{REFACTOR_ID}}/plan.md` for architecture decisions and completed tasks
@@ -221,7 +229,20 @@ Include browser verification approach in plan.md:
   7. Report: verdict (PASS/NEEDS_FIXES), number of rounds, findings fixed/rejected
   "
 - **Wait for Reviewer to return**
-- Read `review-report.md` — if PASS proceed, if NEEDS_FIXES log remaining findings and proceed.
+- **Gate Check — Review Report**:
+  After Reviewer returns, verify the review report contains a verdict:
+  ```bash
+  grep -q "## Verdict" .prizmkit/refactor/{{REFACTOR_ID}}/review-report.md && echo "GATE:PASS" || echo "GATE:MISSING"
+  ```
+  If GATE:MISSING:
+  - Do not enter an unbounded report-repair loop and do not repeatedly re-spawn Reviewer.
+  - Perform one bounded status check; retry at most once: inspect the Reviewer output, `review-report.md` path, and any internal Reviewer/Dev spawn messages from `/prizmkit-code-review`.
+  - If the missing report is caused by team/config/lock errors from the Reviewer or internal Reviewer/Dev agent spawn, retry the Reviewer agent at most once only if it appears transient.
+  - If the report is still missing after that single check/retry, write `.prizmkit/refactor/{{REFACTOR_ID}}/failure-log.md` with the spawn/skill error and last observable state, then either perform a safe inline fallback review (spec/plan/diff/tests → write `review-report.md` with `## Verdict`) or stop with a clear recovery failure.
+Read `review-report.md` and check the Verdict:
+- `PASS` → proceed to next phase
+- `NEEDS_FIXES` → log remaining findings and proceed (do not retry externally)
 - **CP-RF-3**: Code review complete, tests pass, behavior preserved
 - **Checkpoint update**: set step `prizmkit-code-review` to `"completed"` in `{{CHECKPOINT_PATH}}`

package/bundled/dev-pipeline/templates/sections/phase-critic-plan-full.md CHANGED

@@ -8,6 +8,16 @@ If CRITIC:MISSING — skip this phase entirely and proceed. Log: "Critic agent n
 **Choose ONE path based on `{{CRITIC_COUNT}}`:**
+**Agent spawn failure policy**:
+- If spawning Critic fails with team/config/lock errors, retry at most once.
+- If the second attempt fails, do not keep spawning variants. Either create the required team once (when team tooling is available) or perform the plan challenge inline and write the required challenge report yourself.
+- Record the fallback in the report; do not burn multiple minutes on repeated identical spawn failures.
+**No silent report polling**:
+- Do NOT run a long no-output loop waiting for `challenge-report*.md`.
+- If you need to wait for a report file, use a short bounded check (≤120s) that prints elapsed time and reports present on every iteration.
+- If reports are still missing after the bounded check, request one status update; if still missing, perform the missing challenge lens inline and continue.
 **If {{CRITIC_COUNT}} = 1 → Single Critic** (skip to CP-2.5 after this):
 **Spawn Agent**:

package/bundled/dev-pipeline/templates/sections/phase-critic-plan.md CHANGED

@@ -16,6 +16,16 @@ If CRITIC:MISSING — skip this phase entirely and proceed. Log: "Critic agent n
 **Prompt**:
 > {{AGENT_PROMPT_CRITIC_PLAN_CHALLENGE}}
+**Agent spawn failure policy**:
+- If spawning Critic fails with team/config/lock errors, retry at most once.
+- If the second attempt fails, do not keep spawning variants. Either create the required team once (when team tooling is available) or perform the plan challenge inline and write `challenge-report.md` yourself.
+- Record the fallback in the report; do not burn multiple minutes on repeated identical spawn failures.
+**No silent report polling**:
+- Do NOT run a long no-output loop waiting for `challenge-report.md`.
+- If you need to wait for the report file, use a short bounded check (≤120s) that prints elapsed time and whether the report exists on every iteration.
+- If the report is still missing after the bounded check, request one status update; if still missing, perform the challenge inline and continue.
 Wait for Critic to return.
 - Read challenge-report.md. For items marked CRITICAL/HIGH: decide whether to adjust plan.md or document why the plan stands.
 - Max 1 plan revision round.

package/bundled/dev-pipeline/templates/sections/phase-implement-agent.md CHANGED

@@ -15,11 +15,23 @@
 | subagent_type | prizm-dev-team-dev |
 | run_in_background | false |
+**Agent spawn failure policy**:
+- If spawning Dev fails with team/config/lock errors, retry at most once.
+- If the second attempt fails, do not enter Implementation Log polling or repeated recovery spawn loops.
+- Use the documented inline/recovery fallback: write `failure-log.md` with the spawn error and last observable state, then either complete remaining tasks directly in the orchestrator or stop with a clear failure for recovery.
+- Apply the same cap to Dev re-spawns for Implementation Log repair or resume prompts; do not burn multiple minutes on identical team/config/lock failures.
 **Prompt**:
 > {{AGENT_PROMPT_DEV_IMPLEMENT}}
 Wait for Dev to return. All tasks must be `[x]`, tests pass.
+**No silent artifact polling**:
+- Do NOT run a long no-output loop that only waits for `## Implementation Log` or any other file marker.
+- If you must wait for Dev after spawning or sending a status request, use short bounded checks (≤120s) that print a heartbeat line each iteration with: elapsed time, remaining unchecked task count, whether `## Implementation Log` exists, and whether `git diff --stat` changed.
+- If Dev has no transcript/file/diff progress for one bounded check, send one status request. If there is still no progress on the next bounded check, stop waiting, write `failure-log.md` with the last observable state, and follow Subagent Timeout Recovery.
+- Prefer the Agent tool's completion notification or Dev's `COMPLETION_SIGNAL`; file presence alone is not a liveness signal.
 **Gate Check — Implementation Log**:
 After Dev agent returns, verify the Implementation Log was written:
 ```bash

package/bundled/dev-pipeline/templates/sections/phase-implement-full.md CHANGED

@@ -22,9 +22,21 @@ grep -c '^\- \[ \]' .prizmkit/specs/{{FEATURE_SLUG}}/plan.md 2>/dev/null || true
 | subagent_type | prizm-dev-team-dev |
 | run_in_background | false |
+**Agent spawn failure policy**:
+- If spawning Dev fails with team/config/lock errors, retry at most once.
+- If the second attempt fails, do not enter Implementation Log polling or repeated recovery spawn loops.
+- Use the documented inline/recovery fallback: write `failure-log.md` with the spawn error and last observable state, then either complete remaining tasks directly in the orchestrator or stop with a clear failure for recovery.
+- Apply the same cap to Dev re-spawns for Implementation Log repair or resume prompts; do not burn multiple minutes on identical team/config/lock failures.
 **Prompt**:
 > {{AGENT_PROMPT_DEV_IMPLEMENT}}
+**No silent artifact polling**:
+- Do NOT run a long no-output loop that only waits for `## Implementation Log` or any other file marker.
+- If you must wait for Dev after spawning or sending a status request, use short bounded checks (≤120s) that print a heartbeat line each iteration with: elapsed time, remaining unchecked task count, whether `## Implementation Log` exists, and whether `git diff --stat` changed.
+- If Dev has no transcript/file/diff progress for one bounded check, send one status request. If there is still no progress on the next bounded check, stop waiting, write `failure-log.md` with the last observable state, and follow Subagent Timeout Recovery.
+- Prefer the Agent tool's completion notification or Dev's `COMPLETION_SIGNAL`; file presence alone is not a liveness signal.
 **Gate Check — Implementation Log**:
 After Dev agent returns, verify the Implementation Log was written:
 ```bash

package/bundled/dev-pipeline/templates/sections/phase-review-agent.md CHANGED

@@ -9,7 +9,11 @@ After `/prizmkit-code-review` returns, verify the review report:
 ```bash
 grep -q "## Verdict" .prizmkit/specs/{{FEATURE_SLUG}}/review-report.md && echo "GATE:PASS" || echo "GATE:MISSING"
 ```
-If GATE:MISSING — re-run `/prizmkit-code-review`.
+If GATE:MISSING:
+- Do not re-run `/prizmkit-code-review` in an unbounded report-repair loop.
+- Perform one bounded status check; retry at most once: inspect the skill output, `review-report.md` path, and any Reviewer/Dev spawn messages.
+- If the missing report is caused by team/config/lock errors from the internal Reviewer/Dev agent spawn, retry `/prizmkit-code-review` at most once only if it appears transient.
+- If the report is still missing after that single check/retry, write `failure-log.md` with the spawn/skill error and last observable state, then either perform a safe inline fallback review (spec/plan/diff/tests → write `review-report.md` with `## Verdict`) or stop with a clear recovery failure.
 Read `review-report.md` and check the Verdict:
 - `PASS` → proceed to next phase

package/bundled/dev-pipeline/templates/sections/phase-review-full.md CHANGED

@@ -9,7 +9,11 @@ After `/prizmkit-code-review` returns, verify the review report:
 ```bash
 grep -q "## Verdict" .prizmkit/specs/{{FEATURE_SLUG}}/review-report.md && echo "GATE:PASS" || echo "GATE:MISSING"
 ```
-If GATE:MISSING — re-run `/prizmkit-code-review`.
+If GATE:MISSING:
+- Do not re-run `/prizmkit-code-review` in an unbounded report-repair loop.
+- Perform one bounded status check; retry at most once: inspect the skill output, `review-report.md` path, and any Reviewer/Dev spawn messages.
+- If the missing report is caused by team/config/lock errors from the internal Reviewer/Dev agent spawn, retry `/prizmkit-code-review` at most once only if it appears transient.
+- If the report is still missing after that single check/retry, write `failure-log.md` with the spawn/skill error and last observable state, then either perform a safe inline fallback review (spec/plan/diff/tests → write `review-report.md` with `## Verdict`) or stop with a clear recovery failure.
 Read `review-report.md` and check the Verdict:
 - `PASS` → proceed to next phase

package/bundled/dev-pipeline/tests/test_auto_skip.py CHANGED

@@ -303,6 +303,45 @@ def _run_get_next(fl_path, state_dir):
     return result.stdout.strip()
+def _run_update(fl_path, state_dir, feature_id, session_status, session_id="session-1", max_retries=3):
+    cmd = [
+        "python3", _SCRIPT,
+        "--feature-list", fl_path,
+        "--state-dir", state_dir,
+        "--feature-id", feature_id,
+        "--session-status", session_status,
+        "--session-id", session_id,
+        "--max-retries", str(max_retries),
+        "--action", "update",
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    assert result.returncode == 0, result.stderr
+    return json.loads(result.stdout)
+class TestInfraErrorUpdate:
+    def test_infra_error_keeps_pending_without_consuming_retry(self, tmp_path):
+        features = [_make_feature("F-001", "Root", status="in_progress")]
+        fl_path = _write_fl(tmp_path, features)
+        state_dir = _init_state(tmp_path, ["F-001"])
+        status_path = os.path.join(state_dir, "features", "F-001", "status.json")
+        fs = load_feature_status(state_dir, "F-001")
+        fs["retry_count"] = 2
+        write_json_file(status_path, fs)
+        result = _run_update(fl_path, state_dir, "F-001", "infra_error", "session-infra", max_retries=3)
+        assert result["new_status"] == "pending"
+        assert result["retry_count"] == 2
+        assert result["restart_policy"] == "infra_retry"
+        assert _read_statuses(fl_path)["F-001"] == "pending"
+        fs = load_feature_status(state_dir, "F-001")
+        assert fs["retry_count"] == 2
+        assert fs["infra_error_count"] == 1
+        assert fs["last_infra_error_session_id"] == "session-infra"
 class TestUnskipByFeatureId:
     """Unskip with --feature-id targets a specific failed feature + downstream."""

package/bundled/dev-pipeline-windows/SCHEMA_ANALYSIS.md CHANGED

@@ -353,7 +353,7 @@ pending, in_progress, completed, failed, skipped
 | `LOG_CLEANUP_ENABLED` | boolean | 1 | Periodic session log cleanup |
 | `LOG_RETENTION_DAYS` | integer | 14 | Delete session logs older than N days |
 | `LOG_MAX_TOTAL_MB` | integer | 1024 | Keep total logs under N MB |
-| `STOP_ON_FAILURE` | boolean | 0 | Stop after the first failed task |
+| `STOP_ON_FAILURE` | boolean | 0 | Stop after a task exhausts retries |
 | `ENABLE_DEPLOY` | boolean | 0 | Start deploy session after all tasks complete |
 | `DEV_BRANCH` | string | auto-generated | Optional custom dev branch name |
 | `AUTO_PUSH` | boolean | 0 | Push original branch after successful merge |

package/bundled/dev-pipeline-windows/lib/common.ps1 CHANGED

@@ -145,6 +145,25 @@ function Invoke-PrizmPythonText {
   if ($LASTEXITCODE -ne 0) { throw "Python command failed: $($Arguments -join ' ')" }
 }
+function Test-PrizmInfraError {
+  param([string]$SessionLog, [string]$ProgressJson)
+  $parts = @()
+  if ($SessionLog -and (Test-Path $SessionLog)) {
+    try {
+      $text = Get-Content $SessionLog -Raw -ErrorAction Stop
+      if ($text.Length -gt 65536) { $text = $text.Substring($text.Length - 65536) }
+      $parts += $text
+    } catch {}
+  }
+  if ($ProgressJson -and (Test-Path $ProgressJson)) {
+    try { $parts += (Get-Content $ProgressJson -Raw -ErrorAction Stop) } catch {}
+  }
+  if ($parts.Count -eq 0) { return $false }
+  $haystack = $parts -join "`n"
+  return ($haystack -match '(?i)auth_unavailable|no auth available|502 Bad Gateway|503 Service Unavailable|504 Gateway Timeout|gateway timeout|upstream (connect )?error|connection reset|ECONNRESET|ETIMEDOUT|ENOTFOUND|EAI_AGAIN|rate limit|rate_limit|temporarily unavailable|overloaded')
+}
 function Get-PrizmConfigValue {
   param([string]$ConfigPath, [string]$Key)
   if (-not (Test-Path $ConfigPath)) { return $null }

package/bundled/dev-pipeline-windows/lib/pipeline.ps1 CHANGED

@@ -618,10 +618,16 @@ function Invoke-PrizmPipeline {
     }
     Stop-PrizmProgressParser $parserProcess
+    $wasInfraError = ($exitCode -ne 0 -and (Test-PrizmInfraError -SessionLog $sessionLog -ProgressJson $progressJson))
     $status = 'crashed'
     if ($wasTimedOut) {
       $status = 'timed_out'
       Write-PrizmWarn "AI session timed out after $timeoutSeconds seconds"
+    } elseif ($wasInfraError) {
+      $status = 'infra_error'
+      Write-PrizmWarn "AI session failed due to AI CLI/provider infrastructure error"
+      Write-PrizmWarn "Infrastructure errors are retried without consuming code retry budget"
     } elseif ($wasStaleKilled -or (Test-Path $staleKillMarker)) {
       Write-PrizmWarn "Session was stale-killed by heartbeat monitor (no progress for too long)"
       Write-PrizmWarn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
@@ -645,8 +651,12 @@ function Invoke-PrizmPipeline {
     }
     $mergeSucceeded = $true
+    $itemListStatus = ''
     if ($status -eq 'success') {
-      Invoke-PrizmPythonText $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
+      $updateResult = Invoke-PrizmPythonJson $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
+      if ($updateResult -and $updateResult.PSObject.Properties['new_status']) {
+        $itemListStatus = [string]$updateResult.new_status
+      }
       if (Test-PrizmGitDirty $paths.ProjectRoot) {
         if ($hadDirtyBaseline) {
@@ -676,7 +686,10 @@ function Invoke-PrizmPipeline {
     }
     if ($status -ne 'success') {
-      Invoke-PrizmPythonText $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
+      $updateResult = Invoke-PrizmPythonJson $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
+      if ($updateResult -and $updateResult.PSObject.Properties['new_status']) {
+        $itemListStatus = [string]$updateResult.new_status
+      }
       if ($isGitRepository) {
         Invoke-PrizmGitCommitPath $paths.ProjectRoot $listPath "chore($CurrentItemId): update $idName status" | Out-Null
       }
@@ -687,6 +700,7 @@ function Invoke-PrizmPipeline {
     } else {
       Write-PrizmError "$Kind item failed: $CurrentItemId. Log: $sessionLog"
     }
+    $script:PRIZM_ITEM_LIST_STATUS = $itemListStatus
     $script:PRIZM_ITEM_EXIT_CODE = if ($status -eq 'success' -and $mergeSucceeded) { 0 } else { 1 }
     return
   }
@@ -748,9 +762,11 @@ function Invoke-PrizmPipeline {
       $global:PRIZM_EXIT_CODE = $lastExitCode
       return
     }
-    if ($lastExitCode -ne 0 -and $stopOnFailure) {
+    if ($lastExitCode -ne 0 -and $stopOnFailure -and $script:PRIZM_ITEM_LIST_STATUS -eq 'failed') {
       $global:PRIZM_EXIT_CODE = $lastExitCode
       return
+    } elseif ($lastExitCode -ne 0 -and $stopOnFailure) {
+      Write-PrizmInfo "STOP_ON_FAILURE: $nextItemId is $($script:PRIZM_ITEM_LIST_STATUS); retry budget not exhausted, continuing."
     }
   }
 }