prizmkit 1.1.67 → 1.1.69

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/bundled/VERSION.json +3 -3
  2. package/bundled/dev-pipeline/lib/common.sh +40 -0
  3. package/bundled/dev-pipeline/lib/heartbeat.sh +5 -5
  4. package/bundled/dev-pipeline/run-bugfix.sh +26 -5
  5. package/bundled/dev-pipeline/run-feature.sh +20 -3
  6. package/bundled/dev-pipeline/run-refactor.sh +26 -5
  7. package/bundled/dev-pipeline/scripts/parse-stream-progress.py +217 -18
  8. package/bundled/dev-pipeline/scripts/update-bug-status.py +15 -0
  9. package/bundled/dev-pipeline/scripts/update-feature-status.py +18 -0
  10. package/bundled/dev-pipeline/scripts/update-refactor-status.py +15 -0
  11. package/bundled/dev-pipeline/templates/bootstrap-tier2.md +19 -1
  12. package/bundled/dev-pipeline/templates/bootstrap-tier3.md +19 -1
  13. package/bundled/dev-pipeline/templates/refactor-bootstrap-prompt.md +22 -1
  14. package/bundled/dev-pipeline/templates/sections/phase-critic-plan-full.md +10 -0
  15. package/bundled/dev-pipeline/templates/sections/phase-critic-plan.md +10 -0
  16. package/bundled/dev-pipeline/templates/sections/phase-implement-agent.md +12 -0
  17. package/bundled/dev-pipeline/templates/sections/phase-implement-full.md +12 -0
  18. package/bundled/dev-pipeline/templates/sections/phase-review-agent.md +5 -1
  19. package/bundled/dev-pipeline/templates/sections/phase-review-full.md +5 -1
  20. package/bundled/dev-pipeline/tests/test_auto_skip.py +39 -0
  21. package/bundled/dev-pipeline-windows/SCHEMA_ANALYSIS.md +1 -1
  22. package/bundled/dev-pipeline-windows/lib/common.ps1 +19 -0
  23. package/bundled/dev-pipeline-windows/lib/pipeline.ps1 +19 -3
  24. package/bundled/dev-pipeline-windows/scripts/parse-stream-progress.py +217 -18
  25. package/bundled/dev-pipeline-windows/scripts/update-bug-status.py +15 -0
  26. package/bundled/dev-pipeline-windows/scripts/update-feature-status.py +18 -0
  27. package/bundled/dev-pipeline-windows/scripts/update-refactor-status.py +15 -0
  28. package/bundled/dev-pipeline-windows/templates/refactor-bootstrap-prompt.md +22 -1
  29. package/bundled/dev-pipeline-windows/templates/sections/phase-critic-plan-full.md +10 -0
  30. package/bundled/dev-pipeline-windows/templates/sections/phase-critic-plan.md +10 -0
  31. package/bundled/dev-pipeline-windows/templates/sections/phase-implement-agent.md +12 -0
  32. package/bundled/dev-pipeline-windows/templates/sections/phase-implement-full.md +12 -0
  33. package/bundled/dev-pipeline-windows/templates/sections/phase-review-agent.md +5 -1
  34. package/bundled/dev-pipeline-windows/templates/sections/phase-review-full.md +5 -1
  35. package/bundled/skills/_metadata.json +1 -1
  36. package/package.json +1 -1
@@ -41,6 +41,7 @@ SESSION_STATUS_VALUES = [
41
41
  "failed",
42
42
  "crashed",
43
43
  "timed_out",
44
+ "infra_error",
44
45
  "commit_missing",
45
46
  "docs_missing",
46
47
  "merge_conflict",
@@ -280,6 +281,16 @@ def action_update(args, bug_list_path, state_dir):
280
281
  bs["sessions"] = []
281
282
  bs["last_session_id"] = None
282
283
 
284
+ err = update_bug_in_list(bug_list_path, bug_id, new_status)
285
+ if err:
286
+ error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
287
+ return
288
+ elif session_status == "infra_error":
289
+ new_status = "pending"
290
+ bs["infra_error_count"] = bs.get("infra_error_count", 0) + 1
291
+ bs["last_infra_error_session_id"] = session_id
292
+ bs["resume_from_phase"] = None
293
+
283
294
  err = update_bug_in_list(bug_list_path, bug_id, new_status)
284
295
  if err:
285
296
  error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
@@ -333,6 +344,10 @@ def action_update(args, bug_list_path, state_dir):
333
344
  if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
334
345
  summary["degraded_reason"] = session_status
335
346
  summary["restart_policy"] = "finalization_retry"
347
+ elif session_status == "infra_error":
348
+ summary["restart_policy"] = "infra_retry"
349
+ summary["infra_error_count"] = bs.get("infra_error_count", 0)
350
+ summary["artifacts_preserved"] = True
336
351
  elif session_status != "success":
337
352
  summary["restart_policy"] = "full_restart"
338
353
  summary["cleanup_performed"] = cleaned
@@ -45,6 +45,7 @@ SESSION_STATUS_VALUES = [
45
45
  "failed",
46
46
  "crashed",
47
47
  "timed_out",
48
+ "infra_error",
48
49
  "commit_missing",
49
50
  "docs_missing",
50
51
  "merge_conflict",
@@ -645,6 +646,19 @@ def action_update(args, feature_list_path, state_dir):
645
646
  fs["sessions"] = []
646
647
  fs["last_session_id"] = None
647
648
 
649
+ err = update_feature_in_list(feature_list_path, feature_id, new_status)
650
+ if err:
651
+ error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
652
+ return
653
+ elif session_status == "infra_error":
654
+ # AI CLI/provider outage, auth failure, gateway error, etc.
655
+ # This is outside the code's control, so keep the item pending without
656
+ # consuming the task's retry budget.
657
+ new_status = "pending"
658
+ fs["infra_error_count"] = fs.get("infra_error_count", 0) + 1
659
+ fs["last_infra_error_session_id"] = session_id
660
+ fs["resume_from_phase"] = None
661
+
648
662
  err = update_feature_in_list(feature_list_path, feature_id, new_status)
649
663
  if err:
650
664
  error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
@@ -701,6 +715,10 @@ def action_update(args, feature_list_path, state_dir):
701
715
  if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
702
716
  summary["degraded_reason"] = session_status
703
717
  summary["restart_policy"] = "finalization_retry"
718
+ elif session_status == "infra_error":
719
+ summary["restart_policy"] = "infra_retry"
720
+ summary["infra_error_count"] = fs.get("infra_error_count", 0)
721
+ summary["artifacts_preserved"] = True
704
722
  elif session_status != "success":
705
723
  summary["restart_policy"] = "preserve_and_retry"
706
724
  summary["artifacts_preserved"] = True
@@ -42,6 +42,7 @@ SESSION_STATUS_VALUES = [
42
42
  "failed",
43
43
  "crashed",
44
44
  "timed_out",
45
+ "infra_error",
45
46
  "commit_missing",
46
47
  "docs_missing",
47
48
  "merge_conflict",
@@ -314,6 +315,16 @@ def action_update(args, refactor_list_path, state_dir):
314
315
  rs["sessions"] = []
315
316
  rs["last_session_id"] = None
316
317
 
318
+ err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
319
+ if err:
320
+ error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
321
+ return
322
+ elif session_status == "infra_error":
323
+ new_status = "pending"
324
+ rs["infra_error_count"] = rs.get("infra_error_count", 0) + 1
325
+ rs["last_infra_error_session_id"] = session_id
326
+ rs["resume_from_phase"] = None
327
+
317
328
  err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
318
329
  if err:
319
330
  error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
@@ -376,6 +387,10 @@ def action_update(args, refactor_list_path, state_dir):
376
387
  if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
377
388
  summary["degraded_reason"] = session_status
378
389
  summary["restart_policy"] = "finalization_retry"
390
+ elif session_status == "infra_error":
391
+ summary["restart_policy"] = "infra_retry"
392
+ summary["infra_error_count"] = rs.get("infra_error_count", 0)
393
+ summary["artifacts_preserved"] = True
379
394
  elif session_status != "success":
380
395
  summary["restart_policy"] = "full_restart"
381
396
  summary["cleanup_performed"] = cleaned
@@ -14,6 +14,12 @@ You are the **session orchestrator**. Implement Feature {{FEATURE_ID}}: "{{FEATU
14
14
 
15
15
  **Tier 2 — Dual Agent**: You handle context + planning directly. Then spawn Dev and Reviewer subagents. Spawn Dev and Reviewer agents via the Agent tool.
16
16
 
17
+ **Agent spawn failure policy (all Agent tool calls)**:
18
+ - If spawning Dev, Reviewer, or Critic fails with team/config/lock errors, retry at most once.
19
+ - If the second attempt fails, do not keep spawning variants and do not enter artifact polling for Implementation Log, challenge report, or review report markers.
20
+ - Use the documented inline/recovery fallback for that phase: write the required report yourself where possible, complete remaining Dev work directly in the orchestrator when safe, or write `failure-log.md` with the spawn error and last observable state before stopping for recovery.
21
+ - Apply the same cap to any re-spawn for report repair or resume prompts; do not burn multiple minutes on identical team/config/lock failures.
22
+
17
23
  ### Feature Description
18
24
 
19
25
  {{FEATURE_DESCRIPTION}}
@@ -163,6 +169,8 @@ Before proceeding past CP-1, verify:
163
169
 
164
170
  Spawn Reviewer agent (Agent tool, subagent_type="prizm-dev-team-reviewer", mode="plan", run_in_background=false).
165
171
 
172
+ Spawn failure cap: for team/config/lock errors, retry at most once for this Reviewer spawn. If the second attempt fails, do not poll for report artifacts; fix/check the plan inline or write `failure-log.md` before stopping for recovery.
173
+
166
174
  Prompt:
167
175
  > "Read {{REVIEWER_SUBAGENT_PATH}}. For feature {{FEATURE_ID}} (slug: {{FEATURE_SLUG}}):
168
176
  > 1. Read `.prizmkit/specs/{{FEATURE_SLUG}}/context-snapshot.md` FIRST — Section 3 has project context, Section 4 has file manifest.
@@ -186,6 +194,8 @@ If CRITIC:MISSING — skip Phase 3.5 entirely and proceed to Phase 4. Log: "Crit
186
194
 
187
195
  Spawn Critic agent (Agent tool, subagent_type="prizm-dev-team-critic", mode="plan", run_in_background=false).
188
196
 
197
+ Spawn failure cap: for team/config/lock errors, retry at most once for this Critic spawn. If the second attempt fails, do not poll for `challenge-report.md`; perform the plan challenge inline and record the fallback.
198
+
189
199
  Prompt:
190
200
  > "Read {{CRITIC_SUBAGENT_PATH}}. For feature {{FEATURE_ID}} (slug: {{FEATURE_SLUG}}):
191
201
  > **MODE: Plan Challenge**
@@ -208,6 +218,8 @@ Wait for Critic to return.
208
218
 
209
219
  Spawn Dev subagent (Agent tool, subagent_type="prizm-dev-team-dev", run_in_background=false).
210
220
 
221
+ Spawn failure cap: for team/config/lock errors, retry at most once for this Dev spawn. If the second attempt fails, do not poll for `## Implementation Log`; write `failure-log.md` and either implement remaining tasks directly in the orchestrator or stop for recovery.
222
+
211
223
  Prompt:
212
224
  > "Read {{DEV_SUBAGENT_PATH}}. Implement feature {{FEATURE_ID}} (slug: {{FEATURE_SLUG}}).
213
225
  > **IMPORTANT**: Read `.prizmkit/specs/{{FEATURE_SLUG}}/context-snapshot.md` FIRST — Section 3 has Prizm Context (TRAPS/RULES), Section 4 has File Manifest with paths and interfaces.
@@ -232,6 +244,8 @@ If GATE:MISSING — send message to Dev (re-spawn if needed): "Write the '## Imp
232
244
 
233
245
  Spawn Reviewer subagent (Agent tool, subagent_type="prizm-dev-team-reviewer", run_in_background=false).
234
246
 
247
+ Spawn failure cap: for team/config/lock errors, retry at most once for this Reviewer spawn. If the second attempt fails, do not poll for `review-report.md`; write `failure-log.md` with the spawn error and last observable state before stopping or performing an inline fallback.
248
+
235
249
  Prompt:
236
250
  > "Read {{REVIEWER_SUBAGENT_PATH}}. For feature {{FEATURE_ID}} (slug: {{FEATURE_SLUG}}):
237
251
  > 1. Read `.prizmkit/specs/{{FEATURE_SLUG}}/spec.md` for goals and acceptance criteria
@@ -248,7 +262,11 @@ After Reviewer agent returns, verify the review report was written:
248
262
  ```bash
249
263
  grep -q "## Verdict" .prizmkit/specs/{{FEATURE_SLUG}}/review-report.md && echo "GATE:PASS" || echo "GATE:MISSING"
250
264
  ```
251
- If GATE:MISSING — send message to Reviewer (re-spawn if needed): "Write review-report.md to .prizmkit/specs/{{FEATURE_SLUG}}/."
265
+ If GATE:MISSING:
266
+ - Do not re-spawn Reviewer or re-run `/prizmkit-code-review` in an unbounded report-repair loop.
267
+ - Perform one bounded status check; retry at most once: inspect Reviewer output, code-review skill output, `review-report.md` path, and any Reviewer/Dev spawn messages.
268
+ - If the missing report is caused by team/config/lock errors from Reviewer or the internal code-review loop, write `failure-log.md` with the spawn/skill error and last observable state.
269
+ - If the report is still missing after that single check/retry, either perform a safe inline fallback review and write `review-report.md` with `## Verdict`, or stop with a clear recovery failure.
252
270
 
253
271
  Read `review-report.md` and check the Verdict:
254
272
  - `PASS` → proceed to next phase
@@ -14,6 +14,12 @@ You are the **session orchestrator**. Implement Feature {{FEATURE_ID}}: "{{FEATU
14
14
 
15
15
  **Tier 3 — Full Team**: For complex features, use the full pipeline (Phase 0–6) with Dev + Reviewer agents spawned via the Agent tool.
16
16
 
17
+ **Agent spawn failure policy (all Agent tool calls)**:
18
+ - If spawning Dev, Reviewer, or Critic fails with team/config/lock errors, retry at most once.
19
+ - If the second attempt fails, do not keep spawning variants and do not enter artifact polling for Implementation Log, challenge report, or review report markers.
20
+ - Use the documented inline/recovery fallback for that phase: write the required report yourself where possible, complete remaining Dev work directly in the orchestrator when safe, or write `failure-log.md` with the spawn error and last observable state before stopping for recovery.
21
+ - Apply the same cap to any re-spawn for report repair or resume prompts; do not burn multiple minutes on identical team/config/lock failures.
22
+
17
23
  ### Feature Description
18
24
 
19
25
  {{FEATURE_DESCRIPTION}}
@@ -190,6 +196,8 @@ Before proceeding past CP-1, verify:
190
196
 
191
197
  Spawn Reviewer agent (Agent tool, subagent_type="prizm-dev-team-reviewer", mode="plan", run_in_background=false).
192
198
 
199
+ Spawn failure cap: for team/config/lock errors, retry at most once for this Reviewer spawn. If the second attempt fails, do not poll for report artifacts; fix/check the plan inline or write `failure-log.md` before stopping for recovery.
200
+
193
201
  Prompt:
194
202
  > "Read {{REVIEWER_SUBAGENT_PATH}}. For feature {{FEATURE_ID}} (slug: {{FEATURE_SLUG}}):
195
203
  > 1. Read `.prizmkit/specs/{{FEATURE_SLUG}}/context-snapshot.md` FIRST — Section 3 has project context, Section 4 has file manifest.
@@ -217,6 +225,8 @@ If CRITIC:MISSING — skip Phase 3.5 entirely and proceed to Phase 4. Log: "Crit
217
225
 
218
226
  Spawn Critic agent (Agent tool, subagent_type="prizm-dev-team-critic", mode="plan", run_in_background=false).
219
227
 
228
+ Spawn failure cap: for team/config/lock errors, retry at most once for this Critic spawn. If the second attempt fails, do not poll for challenge reports; perform the plan challenge inline and record the fallback.
229
+
220
230
  Prompt:
221
231
  > "Read {{CRITIC_SUBAGENT_PATH}}. For feature {{FEATURE_ID}} (slug: {{FEATURE_SLUG}}):
222
232
  > **MODE: Plan Challenge**
@@ -263,6 +273,8 @@ grep -c '^\- \[ \]' .prizmkit/specs/{{FEATURE_SLUG}}/plan.md 2>/dev/null || true
263
273
 
264
274
  Spawn Dev agent (Agent tool, subagent_type="prizm-dev-team-dev", run_in_background=false).
265
275
 
276
+ Spawn failure cap: for team/config/lock errors, retry at most once for this Dev spawn. If the second attempt fails, do not poll for `## Implementation Log`; write `failure-log.md` and either implement remaining tasks directly in the orchestrator or stop for recovery.
277
+
266
278
  Prompt:
267
279
  > "Read {{DEV_SUBAGENT_PATH}}. Implement feature {{FEATURE_ID}} (slug: {{FEATURE_SLUG}}).
268
280
  > **IMPORTANT**: Read `.prizmkit/specs/{{FEATURE_SLUG}}/context-snapshot.md` FIRST — Section 3 has Prizm Context (TRAPS/RULES), Section 4 has File Manifest with paths and interfaces.
@@ -297,6 +309,8 @@ All tasks `[x]`, tests pass.
297
309
 
298
310
  Spawn Reviewer agent (Agent tool, subagent_type="prizm-dev-team-reviewer", run_in_background=false).
299
311
 
312
+ Spawn failure cap: for team/config/lock errors, retry at most once for this Reviewer spawn. If the second attempt fails, do not poll for `review-report.md`; write `failure-log.md` with the spawn error and last observable state before stopping or performing an inline fallback.
313
+
300
314
  Prompt:
301
315
  > "Read {{REVIEWER_SUBAGENT_PATH}}. For feature {{FEATURE_ID}} (slug: {{FEATURE_SLUG}}):
302
316
  > 1. Read `.prizmkit/specs/{{FEATURE_SLUG}}/spec.md` for goals and acceptance criteria
@@ -313,7 +327,11 @@ After Reviewer agent returns, verify the review report was written:
313
327
  ```bash
314
328
  grep -q "## Verdict" .prizmkit/specs/{{FEATURE_SLUG}}/review-report.md && echo "GATE:PASS" || echo "GATE:MISSING"
315
329
  ```
316
- If GATE:MISSING — send message to Reviewer (re-spawn if needed): "Write review-report.md to .prizmkit/specs/{{FEATURE_SLUG}}/."
330
+ If GATE:MISSING:
331
+ - Do not re-spawn Reviewer or re-run `/prizmkit-code-review` in an unbounded report-repair loop.
332
+ - Perform one bounded status check; retry at most once: inspect Reviewer output, code-review skill output, `review-report.md` path, and any Reviewer/Dev spawn messages.
333
+ - If the missing report is caused by team/config/lock errors from Reviewer or the internal code-review loop, write `failure-log.md` with the spawn/skill error and last observable state.
334
+ - If the report is still missing after that single check/retry, either perform a safe inline fallback review and write `review-report.md` with `## Verdict`, or stop with a clear recovery failure.
317
335
 
318
336
  Read `review-report.md` and check the Verdict:
319
337
  - `PASS` → proceed to next phase
@@ -80,6 +80,12 @@ You are the **refactor session orchestrator**. Execute Refactor {{REFACTOR_ID}}:
80
80
 
81
81
  **YOU are the orchestrator. Execute each phase by spawning the appropriate team agent with run_in_background=false.**
82
82
 
83
+ **Agent spawn failure policy (all Agent tool calls)**:
84
+ - If spawning Dev or Reviewer fails with team/config/lock errors, retry at most once.
85
+ - If the second attempt fails, do not keep spawning variants and do not enter artifact polling for Implementation Log, review-report, or refactor-report markers.
86
+ - Use the documented inline/recovery fallback for that phase: complete remaining refactor work directly in the orchestrator when safe, write the required report yourself where possible, or write `failure-log.md` with the spawn error and last observable state before stopping for recovery.
87
+ - Apply the same cap to any re-spawn for report repair or resume prompts; do not burn multiple minutes on identical team/config/lock failures.
88
+
83
89
  ## Workflow Checkpoint System
84
90
 
85
91
  A checkpoint file tracks your progress through this workflow:
@@ -164,6 +170,7 @@ Include browser verification approach in plan.md:
164
170
  **Goal**: Execute all tasks from plan.md while preserving existing behavior.
165
171
 
166
172
  - Spawn Dev agent (Agent tool, subagent_type="prizm-dev-team-dev", run_in_background=false)
173
+ Spawn failure cap: for team/config/lock errors, retry at most once for this Dev spawn. If the second attempt fails, do not poll for `## Implementation Log`; write `failure-log.md` and either complete remaining refactor work directly in the orchestrator or stop for recovery.
167
174
  Prompt: "Read {{DEV_SUBAGENT_PATH}}. For refactor {{REFACTOR_ID}} ('{{REFACTOR_TITLE}}'):
168
175
  1. Read `.prizmkit/refactor/{{REFACTOR_ID}}/spec.md` and `.prizmkit/refactor/{{REFACTOR_ID}}/plan.md`
169
176
  2. Read `.prizmkit/prizm-docs/` for affected modules (TRAPS, RULES, PATTERNS)
@@ -201,6 +208,7 @@ Include browser verification approach in plan.md:
201
208
  **Goal**: Verify refactoring quality and behavior preservation.
202
209
 
203
210
  - Spawn Reviewer agent (Agent tool, subagent_type="prizm-dev-team-reviewer", run_in_background=false)
211
+ Spawn failure cap: for team/config/lock errors, retry at most once for this Reviewer spawn. If the second attempt fails, do not poll for `review-report.md`; write `failure-log.md` with the spawn error and last observable state before stopping or performing an inline fallback.
204
212
  Prompt: "Read {{REVIEWER_SUBAGENT_PATH}}. For refactor {{REFACTOR_ID}}:
205
213
  1. Read `.prizmkit/refactor/{{REFACTOR_ID}}/spec.md` for goals and behavior preservation contracts
206
214
  2. Read `.prizmkit/refactor/{{REFACTOR_ID}}/plan.md` for architecture decisions and completed tasks
@@ -221,7 +229,20 @@ Include browser verification approach in plan.md:
221
229
  7. Report: verdict (PASS/NEEDS_FIXES), number of rounds, findings fixed/rejected
222
230
  "
223
231
  - **Wait for Reviewer to return**
224
- - Read `review-report.md` — if PASS proceed, if NEEDS_FIXES log remaining findings and proceed.
232
+ - **Gate Check — Review Report**:
233
+ After Reviewer returns, verify the review report contains a verdict:
234
+ ```bash
235
+ grep -q "## Verdict" .prizmkit/refactor/{{REFACTOR_ID}}/review-report.md && echo "GATE:PASS" || echo "GATE:MISSING"
236
+ ```
237
+ If GATE:MISSING:
238
+ - Do not enter an unbounded report-repair loop and do not repeatedly re-spawn Reviewer.
239
+ - Perform one bounded status check; retry at most once: inspect the Reviewer output, `review-report.md` path, and any internal Reviewer/Dev spawn messages from `/prizmkit-code-review`.
240
+ - If the missing report is caused by team/config/lock errors from the Reviewer or internal Reviewer/Dev agent spawn, retry the Reviewer agent at most once only if it appears transient.
241
+ - If the report is still missing after that single check/retry, write `.prizmkit/refactor/{{REFACTOR_ID}}/failure-log.md` with the spawn/skill error and last observable state, then either perform a safe inline fallback review (spec/plan/diff/tests → write `review-report.md` with `## Verdict`) or stop with a clear recovery failure.
242
+
243
+ Read `review-report.md` and check the Verdict:
244
+ - `PASS` → proceed to next phase
245
+ - `NEEDS_FIXES` → log remaining findings and proceed (do not retry externally)
225
246
  - **CP-RF-3**: Code review complete, tests pass, behavior preserved
226
247
  - **Checkpoint update**: set step `prizmkit-code-review` to `"completed"` in `{{CHECKPOINT_PATH}}`
227
248
 
@@ -8,6 +8,16 @@ If CRITIC:MISSING — skip this phase entirely and proceed. Log: "Critic agent n
8
8
 
9
9
  **Choose ONE path based on `{{CRITIC_COUNT}}`:**
10
10
 
11
+ **Agent spawn failure policy**:
12
+ - If spawning Critic fails with team/config/lock errors, retry at most once.
13
+ - If the second attempt fails, do not keep spawning variants. Either create the required team once (when team tooling is available) or perform the plan challenge inline and write the required challenge report yourself.
14
+ - Record the fallback in the report; do not burn multiple minutes on repeated identical spawn failures.
15
+
16
+ **No silent report polling**:
17
+ - Do NOT run a long no-output loop waiting for `challenge-report*.md`.
18
+ - If you need to wait for a report file, use a short bounded check (≤120s) that prints elapsed time and reports present on every iteration.
19
+ - If reports are still missing after the bounded check, request one status update; if still missing, perform the missing challenge lens inline and continue.
20
+
11
21
  **If {{CRITIC_COUNT}} = 1 → Single Critic** (skip to CP-2.5 after this):
12
22
 
13
23
  **Spawn Agent**:
@@ -16,6 +16,16 @@ If CRITIC:MISSING — skip this phase entirely and proceed. Log: "Critic agent n
16
16
  **Prompt**:
17
17
  > {{AGENT_PROMPT_CRITIC_PLAN_CHALLENGE}}
18
18
 
19
+ **Agent spawn failure policy**:
20
+ - If spawning Critic fails with team/config/lock errors, retry at most once.
21
+ - If the second attempt fails, do not keep spawning variants. Either create the required team once (when team tooling is available) or perform the plan challenge inline and write `challenge-report.md` yourself.
22
+ - Record the fallback in the report; do not burn multiple minutes on repeated identical spawn failures.
23
+
24
+ **No silent report polling**:
25
+ - Do NOT run a long no-output loop waiting for `challenge-report.md`.
26
+ - If you need to wait for the report file, use a short bounded check (≤120s) that prints elapsed time and whether the report exists on every iteration.
27
+ - If the report is still missing after the bounded check, request one status update; if still missing, perform the challenge inline and continue.
28
+
19
29
  Wait for Critic to return.
20
30
  - Read challenge-report.md. For items marked CRITICAL/HIGH: decide whether to adjust plan.md or document why the plan stands.
21
31
  - Max 1 plan revision round.
@@ -15,11 +15,23 @@
15
15
  | subagent_type | prizm-dev-team-dev |
16
16
  | run_in_background | false |
17
17
 
18
+ **Agent spawn failure policy**:
19
+ - If spawning Dev fails with team/config/lock errors, retry at most once.
20
+ - If the second attempt fails, do not enter Implementation Log polling or repeated recovery spawn loops.
21
+ - Use the documented inline/recovery fallback: write `failure-log.md` with the spawn error and last observable state, then either complete remaining tasks directly in the orchestrator or stop with a clear failure for recovery.
22
+ - Apply the same cap to Dev re-spawns for Implementation Log repair or resume prompts; do not burn multiple minutes on identical team/config/lock failures.
23
+
18
24
  **Prompt**:
19
25
  > {{AGENT_PROMPT_DEV_IMPLEMENT}}
20
26
 
21
27
  Wait for Dev to return. All tasks must be `[x]`, tests pass.
22
28
 
29
+ **No silent artifact polling**:
30
+ - Do NOT run a long no-output loop that only waits for `## Implementation Log` or any other file marker.
31
+ - If you must wait for Dev after spawning or sending a status request, use short bounded checks (≤120s) that print a heartbeat line each iteration with: elapsed time, remaining unchecked task count, whether `## Implementation Log` exists, and whether `git diff --stat` changed.
32
+ - If Dev has no transcript/file/diff progress for one bounded check, send one status request. If there is still no progress on the next bounded check, stop waiting, write `failure-log.md` with the last observable state, and follow Subagent Timeout Recovery.
33
+ - Prefer the Agent tool's completion notification or Dev's `COMPLETION_SIGNAL`; file presence alone is not a liveness signal.
34
+
23
35
  **Gate Check — Implementation Log**:
24
36
  After Dev agent returns, verify the Implementation Log was written:
25
37
  ```bash
@@ -22,9 +22,21 @@ grep -c '^\- \[ \]' .prizmkit/specs/{{FEATURE_SLUG}}/plan.md 2>/dev/null || true
22
22
  | subagent_type | prizm-dev-team-dev |
23
23
  | run_in_background | false |
24
24
 
25
+ **Agent spawn failure policy**:
26
+ - If spawning Dev fails with team/config/lock errors, retry at most once.
27
+ - If the second attempt fails, do not enter Implementation Log polling or repeated recovery spawn loops.
28
+ - Use the documented inline/recovery fallback: write `failure-log.md` with the spawn error and last observable state, then either complete remaining tasks directly in the orchestrator or stop with a clear failure for recovery.
29
+ - Apply the same cap to Dev re-spawns for Implementation Log repair or resume prompts; do not burn multiple minutes on identical team/config/lock failures.
30
+
25
31
  **Prompt**:
26
32
  > {{AGENT_PROMPT_DEV_IMPLEMENT}}
27
33
 
34
+ **No silent artifact polling**:
35
+ - Do NOT run a long no-output loop that only waits for `## Implementation Log` or any other file marker.
36
+ - If you must wait for Dev after spawning or sending a status request, use short bounded checks (≤120s) that print a heartbeat line each iteration with: elapsed time, remaining unchecked task count, whether `## Implementation Log` exists, and whether `git diff --stat` changed.
37
+ - If Dev has no transcript/file/diff progress for one bounded check, send one status request. If there is still no progress on the next bounded check, stop waiting, write `failure-log.md` with the last observable state, and follow Subagent Timeout Recovery.
38
+ - Prefer the Agent tool's completion notification or Dev's `COMPLETION_SIGNAL`; file presence alone is not a liveness signal.
39
+
28
40
  **Gate Check — Implementation Log**:
29
41
  After Dev agent returns, verify the Implementation Log was written:
30
42
  ```bash
@@ -9,7 +9,11 @@ After `/prizmkit-code-review` returns, verify the review report:
9
9
  ```bash
10
10
  grep -q "## Verdict" .prizmkit/specs/{{FEATURE_SLUG}}/review-report.md && echo "GATE:PASS" || echo "GATE:MISSING"
11
11
  ```
12
- If GATE:MISSING — re-run `/prizmkit-code-review`.
12
+ If GATE:MISSING:
13
+ - Do not re-run `/prizmkit-code-review` in an unbounded report-repair loop.
14
+ - Perform one bounded status check; retry at most once: inspect the skill output, `review-report.md` path, and any Reviewer/Dev spawn messages.
15
+ - If the missing report is caused by team/config/lock errors from the internal Reviewer/Dev agent spawn, retry `/prizmkit-code-review` at most once only if it appears transient.
16
+ - If the report is still missing after that single check/retry, write `failure-log.md` with the spawn/skill error and last observable state, then either perform a safe inline fallback review (spec/plan/diff/tests → write `review-report.md` with `## Verdict`) or stop with a clear recovery failure.
13
17
 
14
18
  Read `review-report.md` and check the Verdict:
15
19
  - `PASS` → proceed to next phase
@@ -9,7 +9,11 @@ After `/prizmkit-code-review` returns, verify the review report:
9
9
  ```bash
10
10
  grep -q "## Verdict" .prizmkit/specs/{{FEATURE_SLUG}}/review-report.md && echo "GATE:PASS" || echo "GATE:MISSING"
11
11
  ```
12
- If GATE:MISSING — re-run `/prizmkit-code-review`.
12
+ If GATE:MISSING:
13
+ - Do not re-run `/prizmkit-code-review` in an unbounded report-repair loop.
14
+ - Perform one bounded status check; retry at most once: inspect the skill output, `review-report.md` path, and any Reviewer/Dev spawn messages.
15
+ - If the missing report is caused by team/config/lock errors from the internal Reviewer/Dev agent spawn, retry `/prizmkit-code-review` at most once only if it appears transient.
16
+ - If the report is still missing after that single check/retry, write `failure-log.md` with the spawn/skill error and last observable state, then either perform a safe inline fallback review (spec/plan/diff/tests → write `review-report.md` with `## Verdict`) or stop with a clear recovery failure.
13
17
 
14
18
  Read `review-report.md` and check the Verdict:
15
19
  - `PASS` → proceed to next phase
@@ -303,6 +303,45 @@ def _run_get_next(fl_path, state_dir):
303
303
  return result.stdout.strip()
304
304
 
305
305
 
306
def _run_update(fl_path, state_dir, feature_id, session_status, session_id="session-1", max_retries=3):
    """Run the status script's ``update`` action and return its decoded JSON output.

    The script referenced by ``_SCRIPT`` is launched with ``python3`` in a
    subprocess. A non-zero exit code fails the calling test, with the
    subprocess's stderr as the assertion message.

    Args:
        fl_path: Value passed as ``--feature-list``.
        state_dir: Value passed as ``--state-dir``.
        feature_id: Value passed as ``--feature-id``.
        session_status: Value passed as ``--session-status``.
        session_id: Value passed as ``--session-id``.
        max_retries: Value passed (stringified) as ``--max-retries``.

    Returns:
        The object obtained by parsing the subprocess's stdout as JSON.
    """
    # Flag/value pairs are kept in the same order the script is invoked with.
    option_values = (
        ("--feature-list", fl_path),
        ("--state-dir", state_dir),
        ("--feature-id", feature_id),
        ("--session-status", session_status),
        ("--session-id", session_id),
        ("--max-retries", str(max_retries)),
        ("--action", "update"),
    )
    argv = ["python3", _SCRIPT]
    for flag, value in option_values:
        argv.extend((flag, value))

    completed = subprocess.run(argv, capture_output=True, text=True)
    assert completed.returncode == 0, completed.stderr
    return json.loads(completed.stdout)
320
+
321
+
322
class TestInfraErrorUpdate:
    """Checks how the ``update`` action handles an ``infra_error`` session status."""

    def test_infra_error_keeps_pending_without_consuming_retry(self, tmp_path):
        """An ``infra_error`` update returns the feature to ``pending`` and leaves ``retry_count`` alone.

        The feature starts ``in_progress`` with ``retry_count`` seeded to 2
        (one below the ``max_retries=3`` passed to the update). After the
        update, both the command output and the persisted status must still
        report 2 retries, and the infra-error bookkeeping fields must record
        this session.
        """
        feature_id = "F-001"
        seeded_retries = 2
        infra_session = "session-infra"

        fl_path = _write_fl(
            tmp_path,
            [_make_feature(feature_id, "Root", status="in_progress")],
        )
        state_dir = _init_state(tmp_path, [feature_id])

        # Seed a non-zero retry count so an accidental increment is detectable.
        seeded = load_feature_status(state_dir, feature_id)
        seeded["retry_count"] = seeded_retries
        write_json_file(
            os.path.join(state_dir, "features", feature_id, "status.json"),
            seeded,
        )

        outcome = _run_update(
            fl_path,
            state_dir,
            feature_id,
            "infra_error",
            infra_session,
            max_retries=3,
        )

        # Command output: back to pending, retry budget untouched.
        assert outcome["new_status"] == "pending"
        assert outcome["retry_count"] == seeded_retries
        assert outcome["restart_policy"] == "infra_retry"
        assert _read_statuses(fl_path)[feature_id] == "pending"

        # Persisted per-feature state: same retry count plus infra-error tracking.
        persisted = load_feature_status(state_dir, feature_id)
        assert persisted["retry_count"] == seeded_retries
        assert persisted["infra_error_count"] == 1
        assert persisted["last_infra_error_session_id"] == infra_session
343
+
344
+
306
345
  class TestUnskipByFeatureId:
307
346
  """Unskip with --feature-id targets a specific failed feature + downstream."""
308
347
 
@@ -353,7 +353,7 @@ pending, in_progress, completed, failed, skipped
353
353
  | `LOG_CLEANUP_ENABLED` | boolean | 1 | Periodic session log cleanup |
354
354
  | `LOG_RETENTION_DAYS` | integer | 14 | Delete session logs older than N days |
355
355
  | `LOG_MAX_TOTAL_MB` | integer | 1024 | Keep total logs under N MB |
356
- | `STOP_ON_FAILURE` | boolean | 0 | Stop after the first failed task |
356
+ | `STOP_ON_FAILURE` | boolean | 0 | Stop after a task exhausts retries |
357
357
  | `ENABLE_DEPLOY` | boolean | 0 | Start deploy session after all tasks complete |
358
358
  | `DEV_BRANCH` | string | auto-generated | Optional custom dev branch name |
359
359
  | `AUTO_PUSH` | boolean | 0 | Push original branch after successful merge |
@@ -145,6 +145,25 @@ function Invoke-PrizmPythonText {
145
145
  if ($LASTEXITCODE -ne 0) { throw "Python command failed: $($Arguments -join ' ')" }
146
146
  }
147
147
 
148
function Test-PrizmInfraError {
    <#
    .SYNOPSIS
    Reports whether a session's log or progress file contains a known infrastructure-error marker.

    .DESCRIPTION
    Collects the last 65536 characters of the session log (when the path is
    given and exists) and the full text of the progress JSON file (when the
    path is given and exists), then runs a case-insensitive regex over the
    combined text. Read failures are swallowed: a file that cannot be read
    simply contributes nothing.

    .PARAMETER SessionLog
    Path to the session log file. May be empty or point to a missing file.

    .PARAMETER ProgressJson
    Path to the progress JSON file. May be empty or point to a missing file.

    .OUTPUTS
    Boolean. $false when nothing could be read or no marker matched; $true otherwise.
    #>
    param([string]$SessionLog, [string]$ProgressJson)

    # Only the tail of the log is inspected.
    $maxLogChars = 65536
    $chunks = @()

    if ($SessionLog -and (Test-Path $SessionLog)) {
        try {
            $logText = Get-Content $SessionLog -Raw -ErrorAction Stop
            if ($logText.Length -gt $maxLogChars) {
                $logText = $logText.Substring($logText.Length - $maxLogChars)
            }
            $chunks += $logText
        } catch {}
    }

    if ($ProgressJson -and (Test-Path $ProgressJson)) {
        try {
            $progressText = Get-Content $ProgressJson -Raw -ErrorAction Stop
            $chunks += $progressText
        } catch {}
    }

    if ($chunks.Count -eq 0) { return $false }

    $infraPattern = '(?i)auth_unavailable|no auth available|502 Bad Gateway|503 Service Unavailable|504 Gateway Timeout|gateway timeout|upstream (connect )?error|connection reset|ECONNRESET|ETIMEDOUT|ENOTFOUND|EAI_AGAIN|rate limit|rate_limit|temporarily unavailable|overloaded'
    $combined = $chunks -join "`n"
    return ($combined -match $infraPattern)
}
166
+
148
167
  function Get-PrizmConfigValue {
149
168
  param([string]$ConfigPath, [string]$Key)
150
169
  if (-not (Test-Path $ConfigPath)) { return $null }
@@ -618,10 +618,16 @@ function Invoke-PrizmPipeline {
618
618
  }
619
619
  Stop-PrizmProgressParser $parserProcess
620
620
 
621
+ $wasInfraError = ($exitCode -ne 0 -and (Test-PrizmInfraError -SessionLog $sessionLog -ProgressJson $progressJson))
622
+
621
623
  $status = 'crashed'
622
624
  if ($wasTimedOut) {
623
625
  $status = 'timed_out'
624
626
  Write-PrizmWarn "AI session timed out after $timeoutSeconds seconds"
627
+ } elseif ($wasInfraError) {
628
+ $status = 'infra_error'
629
+ Write-PrizmWarn "AI session failed due to AI CLI/provider infrastructure error"
630
+ Write-PrizmWarn "Infrastructure errors are retried without consuming code retry budget"
625
631
  } elseif ($wasStaleKilled -or (Test-Path $staleKillMarker)) {
626
632
  Write-PrizmWarn "Session was stale-killed by heartbeat monitor (no progress for too long)"
627
633
  Write-PrizmWarn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
@@ -645,8 +651,12 @@ function Invoke-PrizmPipeline {
645
651
  }
646
652
 
647
653
  $mergeSucceeded = $true
654
+ $itemListStatus = ''
648
655
  if ($status -eq 'success') {
649
- Invoke-PrizmPythonText $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
656
+ $updateResult = Invoke-PrizmPythonJson $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
657
+ if ($updateResult -and $updateResult.PSObject.Properties['new_status']) {
658
+ $itemListStatus = [string]$updateResult.new_status
659
+ }
650
660
 
651
661
  if (Test-PrizmGitDirty $paths.ProjectRoot) {
652
662
  if ($hadDirtyBaseline) {
@@ -676,7 +686,10 @@ function Invoke-PrizmPipeline {
676
686
  }
677
687
 
678
688
  if ($status -ne 'success') {
679
- Invoke-PrizmPythonText $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
689
+ $updateResult = Invoke-PrizmPythonJson $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
690
+ if ($updateResult -and $updateResult.PSObject.Properties['new_status']) {
691
+ $itemListStatus = [string]$updateResult.new_status
692
+ }
680
693
  if ($isGitRepository) {
681
694
  Invoke-PrizmGitCommitPath $paths.ProjectRoot $listPath "chore($CurrentItemId): update $idName status" | Out-Null
682
695
  }
@@ -687,6 +700,7 @@ function Invoke-PrizmPipeline {
687
700
  } else {
688
701
  Write-PrizmError "$Kind item failed: $CurrentItemId. Log: $sessionLog"
689
702
  }
703
+ $script:PRIZM_ITEM_LIST_STATUS = $itemListStatus
690
704
  $script:PRIZM_ITEM_EXIT_CODE = if ($status -eq 'success' -and $mergeSucceeded) { 0 } else { 1 }
691
705
  return
692
706
  }
@@ -748,9 +762,11 @@ function Invoke-PrizmPipeline {
748
762
  $global:PRIZM_EXIT_CODE = $lastExitCode
749
763
  return
750
764
  }
751
- if ($lastExitCode -ne 0 -and $stopOnFailure) {
765
+ if ($lastExitCode -ne 0 -and $stopOnFailure -and $script:PRIZM_ITEM_LIST_STATUS -eq 'failed') {
752
766
  $global:PRIZM_EXIT_CODE = $lastExitCode
753
767
  return
768
+ } elseif ($lastExitCode -ne 0 -and $stopOnFailure) {
769
+ Write-PrizmInfo "STOP_ON_FAILURE: $nextItemId is $($script:PRIZM_ITEM_LIST_STATUS); retry budget not exhausted, continuing."
754
770
  }
755
771
  }
756
772
  }