prizmkit 1.1.67 → 1.1.68

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  {
2
- "frameworkVersion": "1.1.67",
3
- "bundledAt": "2026-06-09T02:37:28.761Z",
4
- "bundledFrom": "d4b8c30"
2
+ "frameworkVersion": "1.1.68",
3
+ "bundledAt": "2026-06-09T14:36:58.835Z",
4
+ "bundledFrom": "82060fd"
5
5
  }
@@ -418,6 +418,46 @@ prizm_start_ai_session() {
418
418
  PRIZM_AI_PID=$!
419
419
  }
420
420
 
421
# Detect AI CLI/provider infrastructure failures that are outside the
# generated code's control. These should be retried without consuming the
# item's code retry budget.
#
# Arguments:
#   $1 - path to the session log (optional); only its last 64 KiB is scanned
#   $2 - path to the progress JSON file (optional); scanned in full
# Returns:
#   0 if a known infrastructure-error marker is present, 1 otherwise
#   (including when neither file exists or both are empty).
prizm_detect_infra_error() {
  local session_log="${1:-}"
  local progress_json="${2:-}"

  local haystack=""
  if [[ -n "$session_log" && -f "$session_log" ]]; then
    # Only the tail matters: the failure that ended the session is at the end.
    haystack="$(tail -c 65536 "$session_log" 2>/dev/null || true)"
  fi
  if [[ -n "$progress_json" && -f "$progress_json" ]]; then
    haystack+=$'\n'
    haystack+="$(cat "$progress_json" 2>/dev/null || true)"
  fi

  [[ -n "$haystack" ]] || return 1

  # Feed grep through a here-string instead of `printf ... | grep -q`.
  # With a pipeline, grep -q exits on the first match while the writer may
  # still be pushing data; the writer then dies from SIGPIPE and, when the
  # caller runs with `set -o pipefail`, the pipeline reports failure even
  # though a match was found.
  if grep -Eiq \
    'auth_unavailable|no auth available|502 Bad Gateway|503 Service Unavailable|504 Gateway Timeout|gateway timeout|upstream (connect )?error|connection reset|ECONNRESET|ETIMEDOUT|ENOTFOUND|EAI_AGAIN|rate limit|rate_limit|temporarily unavailable|overloaded' \
    <<<"$haystack"; then
    return 0
  fi

  return 1
}
446
+
447
# Read the output of an update-*-status.py invocation on stdin and print the
# value of its "new_status" field, if any.
#
# The callers capture the update script with stderr merged into stdout, so the
# JSON summary may be surrounded by warnings or other diagnostics. The whole
# input is tried as JSON first; if that fails, JSON objects embedded in the
# text are decoded one by one and the last object carrying a truthy
# "new_status" wins.
#
# Output: the status on stdout, or nothing when no status can be found.
# Always exits 0 on unparseable input so callers can treat "empty" as unknown.
prizm_extract_update_new_status() {
  python3 -c '
import json, sys

raw = sys.stdin.read()
candidates = []
try:
    candidates.append(json.loads(raw))
except Exception:
    # Mixed output: pull out every top-level JSON object found in the text.
    decoder = json.JSONDecoder()
    pos = raw.find("{")
    while pos != -1:
        try:
            obj, end = decoder.raw_decode(raw, pos)
        except Exception:
            pos = raw.find("{", pos + 1)
            continue
        candidates.append(obj)
        pos = raw.find("{", end)

value = None
for obj in candidates:
    # Only JSON objects can carry the field; lists, strings, etc. are ignored.
    if isinstance(obj, dict) and obj.get("new_status"):
        value = obj["new_status"]
if value:
    print(value)
'
}
460
+
421
461
  # Run an AI CLI session synchronously.
422
462
  # Usage: prizm_run_ai_session <prompt_path> <log_path> <model>
423
463
  prizm_run_ai_session() {
@@ -145,6 +145,11 @@ spawn_and_wait_session() {
145
145
  log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
146
146
  fi
147
147
 
148
+ local was_infra_error=false
149
+ if [[ $exit_code -ne 0 ]] && prizm_detect_infra_error "$session_log" "$progress_json"; then
150
+ was_infra_error=true
151
+ fi
152
+
148
153
  # Session summary
149
154
  if [[ -f "$session_log" ]]; then
150
155
  local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
@@ -162,6 +167,10 @@ spawn_and_wait_session() {
162
167
  if [[ $exit_code -eq 124 ]]; then
163
168
  log_warn "Session timed out after ${SESSION_TIMEOUT}s"
164
169
  session_status="timed_out"
170
+ elif [[ "$was_infra_error" == true ]]; then
171
+ log_warn "Session failed due to AI CLI/provider infrastructure error"
172
+ log_warn "Infrastructure errors are retried without consuming code retry budget"
173
+ session_status="infra_error"
165
174
  elif [[ "$was_stale_killed" == true ]]; then
166
175
  log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
167
176
  log_warn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
@@ -259,14 +268,20 @@ sys.exit(0)
259
268
  prizm_detect_subagents "$session_log"
260
269
 
261
270
  # Update bug status (do NOT commit on dev branch — commit happens after merge)
262
- python3 "$SCRIPTS_DIR/update-bug-status.py" \
271
+ local update_output
272
+ update_output=$(python3 "$SCRIPTS_DIR/update-bug-status.py" \
263
273
  --bug-list "$bug_list" \
264
274
  --state-dir "$STATE_DIR" \
265
275
  --bug-id "$bug_id" \
266
276
  --session-status "$session_status" \
267
277
  --session-id "$session_id" \
268
278
  --max-retries "$max_retries" \
269
- --action update >/dev/null 2>&1 || true
279
+ --action update 2>&1) || {
280
+ log_error "Failed to update bug status: $update_output"
281
+ update_output=""
282
+ }
283
+
284
+ _SPAWN_ITEM_STATUS="$(printf '%s' "$update_output" | prizm_extract_update_new_status)"
270
285
 
271
286
  _SPAWN_RESULT="$session_status"
272
287
  }
@@ -693,6 +708,7 @@ else:
693
708
  trap cleanup_single_bug SIGINT SIGTERM
694
709
 
695
710
  _SPAWN_RESULT=""
711
+ _SPAWN_ITEM_STATUS=""
696
712
 
697
713
  # Branch lifecycle: create and checkout bugfix branch
698
714
  local _proj_root
@@ -1078,12 +1094,14 @@ DEPLOY_PROMPT_EOF
1078
1094
  # Spawn session
1079
1095
  log_info "Spawning AI CLI session: $session_id"
1080
1096
  _SPAWN_RESULT=""
1097
+ _SPAWN_ITEM_STATUS=""
1081
1098
 
1082
1099
  spawn_and_wait_session \
1083
1100
  "$bug_id" "$bug_list" "$session_id" \
1084
1101
  "$bootstrap_prompt" "$session_dir" "$MAX_RETRIES" "$bug_model" "$_ORIGINAL_BRANCH"
1085
1102
 
1086
1103
  local session_status="$_SPAWN_RESULT"
1104
+ local item_status_after_session="${_SPAWN_ITEM_STATUS:-}"
1087
1105
 
1088
1106
  # Merge per-bug dev branch back to original on success
1089
1107
  if [[ "$session_status" == "success" && -n "$_DEV_BRANCH_NAME" ]]; then
@@ -1112,15 +1130,18 @@ DEPLOY_PROMPT_EOF
1112
1130
  session_count=$((session_count + 1))
1113
1131
  total_subagent_calls=$((total_subagent_calls + _SUBAGENT_COUNT))
1114
1132
 
1115
- # Stop-on-failure: abort pipeline if task failed and STOP_ON_FAILURE is enabled
1116
- if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
1133
+ # Stop-on-failure: abort only after the task is actually marked failed.
1134
+ # Pending retry outcomes, including infrastructure errors, keep running.
1135
+ if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" && "$item_status_after_session" == "failed" ]]; then
1117
1136
  echo ""
1118
1137
  log_error "════════════════════════════════════════════════════"
1119
- log_error " STOP_ON_FAILURE: Pipeline halted after $bug_id failed."
1138
+ log_error " STOP_ON_FAILURE: Pipeline halted after $bug_id exhausted retries."
1120
1139
  log_error " Total sessions completed: $session_count"
1121
1140
  log_error " Set STOP_ON_FAILURE=0 to continue past failures."
1122
1141
  log_error "════════════════════════════════════════════════════"
1123
1142
  break
1143
+ elif [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
1144
+ log_info "STOP_ON_FAILURE: $bug_id is ${item_status_after_session:-unknown}; retry budget not exhausted, continuing."
1124
1145
  fi
1125
1146
 
1126
1147
  # Stuck detection
@@ -153,6 +153,11 @@ spawn_and_wait_session() {
153
153
  log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
154
154
  fi
155
155
 
156
+ local was_infra_error=false
157
+ if [[ $exit_code -ne 0 ]] && prizm_detect_infra_error "$session_log" "$progress_json"; then
158
+ was_infra_error=true
159
+ fi
160
+
156
161
  # Show final session summary
157
162
  if [[ -f "$session_log" ]]; then
158
163
  local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
@@ -172,6 +177,10 @@ spawn_and_wait_session() {
172
177
  if [[ $exit_code -eq 124 ]]; then
173
178
  log_warn "Session timed out after ${SESSION_TIMEOUT}s"
174
179
  session_status="timed_out"
180
+ elif [[ "$was_infra_error" == true ]]; then
181
+ log_warn "Session failed due to AI CLI/provider infrastructure error"
182
+ log_warn "Infrastructure errors are retried without consuming code retry budget"
183
+ session_status="infra_error"
175
184
  elif [[ "$was_stale_killed" == true ]]; then
176
185
  log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
177
186
  log_warn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
@@ -347,6 +356,8 @@ sys.exit(0)
347
356
  log_error ".prizmkit/plans/feature-list.json may be out of sync. Manual intervention needed."
348
357
  }
349
358
 
359
+ _SPAWN_ITEM_STATUS="$(printf '%s' "$update_output" | prizm_extract_update_new_status)"
360
+
350
361
  # Return status via global variable (avoids $() swallowing stdout)
351
362
  _SPAWN_RESULT="$session_status"
352
363
  }
@@ -848,6 +859,7 @@ else:
848
859
  trap cleanup_single_feature SIGINT SIGTERM
849
860
 
850
861
  _SPAWN_RESULT=""
862
+ _SPAWN_ITEM_STATUS=""
851
863
 
852
864
  # Branch lifecycle: create and checkout feature branch
853
865
  local _proj_root
@@ -1300,11 +1312,13 @@ DEPLOY_PROMPT_EOF
1300
1312
  log_info "Feature model: $feature_model"
1301
1313
  fi
1302
1314
  _SPAWN_RESULT=""
1315
+ _SPAWN_ITEM_STATUS=""
1303
1316
 
1304
1317
  spawn_and_wait_session \
1305
1318
  "$feature_id" "$feature_list" "$session_id" \
1306
1319
  "$bootstrap_prompt" "$session_dir" "$MAX_RETRIES" "$feature_model" "$_ORIGINAL_BRANCH"
1307
1320
  local session_status="$_SPAWN_RESULT"
1321
+ local item_status_after_session="${_SPAWN_ITEM_STATUS:-}"
1308
1322
 
1309
1323
  # Merge per-feature dev branch back to original on success
1310
1324
  if [[ "$session_status" == "success" && -n "$_DEV_BRANCH_NAME" ]]; then
@@ -1333,15 +1347,18 @@ DEPLOY_PROMPT_EOF
1333
1347
  session_count=$((session_count + 1))
1334
1348
  total_subagent_calls=$((total_subagent_calls + _SUBAGENT_COUNT))
1335
1349
 
1336
- # Stop-on-failure: abort pipeline if task failed and STOP_ON_FAILURE is enabled
1337
- if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
1350
+ # Stop-on-failure: abort only after the task is actually marked failed.
1351
+ # Pending retry outcomes, including infrastructure errors, keep running.
1352
+ if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" && "$item_status_after_session" == "failed" ]]; then
1338
1353
  echo ""
1339
1354
  log_error "════════════════════════════════════════════════════"
1340
- log_error " STOP_ON_FAILURE: Pipeline halted after $feature_id failed."
1355
+ log_error " STOP_ON_FAILURE: Pipeline halted after $feature_id exhausted retries."
1341
1356
  log_error " Total sessions completed: $session_count"
1342
1357
  log_error " Set STOP_ON_FAILURE=0 to continue past failures."
1343
1358
  log_error "════════════════════════════════════════════════════"
1344
1359
  break
1360
+ elif [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
1361
+ log_info "STOP_ON_FAILURE: $feature_id is ${item_status_after_session:-unknown}; retry budget not exhausted, continuing."
1345
1362
  fi
1346
1363
 
1347
1364
  # Brief pause before next iteration
@@ -147,6 +147,11 @@ spawn_and_wait_session() {
147
147
  log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
148
148
  fi
149
149
 
150
+ local was_infra_error=false
151
+ if [[ $exit_code -ne 0 ]] && prizm_detect_infra_error "$session_log" "$progress_json"; then
152
+ was_infra_error=true
153
+ fi
154
+
150
155
  # Session summary
151
156
  if [[ -f "$session_log" ]]; then
152
157
  local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
@@ -164,6 +169,10 @@ spawn_and_wait_session() {
164
169
  if [[ $exit_code -eq 124 ]]; then
165
170
  log_warn "Session timed out after ${SESSION_TIMEOUT}s"
166
171
  session_status="timed_out"
172
+ elif [[ "$was_infra_error" == true ]]; then
173
+ log_warn "Session failed due to AI CLI/provider infrastructure error"
174
+ log_warn "Infrastructure errors are retried without consuming code retry budget"
175
+ session_status="infra_error"
167
176
  elif [[ "$was_stale_killed" == true ]]; then
168
177
  log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
169
178
  log_warn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
@@ -286,14 +295,20 @@ sys.exit(0)
286
295
  fi
287
296
 
288
297
  # Update refactor status (do NOT commit on dev branch — commit happens after merge)
289
- python3 "$SCRIPTS_DIR/update-refactor-status.py" \
298
+ local update_output
299
+ update_output=$(python3 "$SCRIPTS_DIR/update-refactor-status.py" \
290
300
  --refactor-list "$refactor_list" \
291
301
  --state-dir "$STATE_DIR" \
292
302
  --refactor-id "$refactor_id" \
293
303
  --session-status "$session_status" \
294
304
  --session-id "$session_id" \
295
305
  --max-retries "$max_retries" \
296
- --action update >/dev/null 2>&1 || true
306
+ --action update 2>&1) || {
307
+ log_error "Failed to update refactor status: $update_output"
308
+ update_output=""
309
+ }
310
+
311
+ _SPAWN_ITEM_STATUS="$(printf '%s' "$update_output" | prizm_extract_update_new_status)"
297
312
 
298
313
  _SPAWN_RESULT="$session_status"
299
314
  }
@@ -723,6 +738,7 @@ else:
723
738
  trap cleanup_single_refactor SIGINT SIGTERM
724
739
 
725
740
  _SPAWN_RESULT=""
741
+ _SPAWN_ITEM_STATUS=""
726
742
 
727
743
  # Branch lifecycle: create and checkout refactor branch
728
744
  local _proj_root
@@ -1114,6 +1130,7 @@ DEPLOY_PROMPT_EOF
1114
1130
  # Spawn session
1115
1131
  log_info "Spawning AI CLI session: $session_id"
1116
1132
  _SPAWN_RESULT=""
1133
+ _SPAWN_ITEM_STATUS=""
1117
1134
 
1118
1135
  spawn_and_wait_session \
1119
1136
  "$refactor_id" "$refactor_list" "$session_id" \
@@ -1130,6 +1147,7 @@ DEPLOY_PROMPT_EOF
1130
1147
  fi
1131
1148
 
1132
1149
  local session_status="$_SPAWN_RESULT"
1150
+ local item_status_after_session="${_SPAWN_ITEM_STATUS:-}"
1133
1151
 
1134
1152
  # Merge per-refactor dev branch back to original on success
1135
1153
  if [[ "$session_status" == "success" && -n "$_DEV_BRANCH_NAME" ]]; then
@@ -1168,15 +1186,18 @@ DEPLOY_PROMPT_EOF
1168
1186
  session_count=$((session_count + 1))
1169
1187
  total_subagent_calls=$((total_subagent_calls + _SUBAGENT_COUNT))
1170
1188
 
1171
- # Stop-on-failure: abort pipeline if task failed and STOP_ON_FAILURE is enabled
1172
- if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
1189
+ # Stop-on-failure: abort only after the task is actually marked failed.
1190
+ # Pending retry outcomes, including infrastructure errors, keep running.
1191
+ if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" && "$item_status_after_session" == "failed" ]]; then
1173
1192
  echo ""
1174
1193
  log_error "════════════════════════════════════════════════════"
1175
- log_error " STOP_ON_FAILURE: Pipeline halted after $refactor_id failed."
1194
+ log_error " STOP_ON_FAILURE: Pipeline halted after $refactor_id exhausted retries."
1176
1195
  log_error " Total sessions completed: $session_count"
1177
1196
  log_error " Set STOP_ON_FAILURE=0 to continue past failures."
1178
1197
  log_error "════════════════════════════════════════════════════"
1179
1198
  break
1199
+ elif [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
1200
+ log_info "STOP_ON_FAILURE: $refactor_id is ${item_status_after_session:-unknown}; retry budget not exhausted, continuing."
1180
1201
  fi
1181
1202
 
1182
1203
  log_info "Pausing 5s before next refactor..."
@@ -41,6 +41,7 @@ SESSION_STATUS_VALUES = [
41
41
  "failed",
42
42
  "crashed",
43
43
  "timed_out",
44
+ "infra_error",
44
45
  "commit_missing",
45
46
  "docs_missing",
46
47
  "merge_conflict",
@@ -280,6 +281,16 @@ def action_update(args, bug_list_path, state_dir):
280
281
  bs["sessions"] = []
281
282
  bs["last_session_id"] = None
282
283
 
284
+ err = update_bug_in_list(bug_list_path, bug_id, new_status)
285
+ if err:
286
+ error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
287
+ return
288
+ elif session_status == "infra_error":
289
+ new_status = "pending"
290
+ bs["infra_error_count"] = bs.get("infra_error_count", 0) + 1
291
+ bs["last_infra_error_session_id"] = session_id
292
+ bs["resume_from_phase"] = None
293
+
283
294
  err = update_bug_in_list(bug_list_path, bug_id, new_status)
284
295
  if err:
285
296
  error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
@@ -333,6 +344,10 @@ def action_update(args, bug_list_path, state_dir):
333
344
  if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
334
345
  summary["degraded_reason"] = session_status
335
346
  summary["restart_policy"] = "finalization_retry"
347
+ elif session_status == "infra_error":
348
+ summary["restart_policy"] = "infra_retry"
349
+ summary["infra_error_count"] = bs.get("infra_error_count", 0)
350
+ summary["artifacts_preserved"] = True
336
351
  elif session_status != "success":
337
352
  summary["restart_policy"] = "full_restart"
338
353
  summary["cleanup_performed"] = cleaned
@@ -45,6 +45,7 @@ SESSION_STATUS_VALUES = [
45
45
  "failed",
46
46
  "crashed",
47
47
  "timed_out",
48
+ "infra_error",
48
49
  "commit_missing",
49
50
  "docs_missing",
50
51
  "merge_conflict",
@@ -645,6 +646,19 @@ def action_update(args, feature_list_path, state_dir):
645
646
  fs["sessions"] = []
646
647
  fs["last_session_id"] = None
647
648
 
649
+ err = update_feature_in_list(feature_list_path, feature_id, new_status)
650
+ if err:
651
+ error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
652
+ return
653
+ elif session_status == "infra_error":
654
+ # AI CLI/provider outage, auth failure, gateway error, etc.
655
+ # This is outside the code's control, so keep the item pending without
656
+ # consuming the task's retry budget.
657
+ new_status = "pending"
658
+ fs["infra_error_count"] = fs.get("infra_error_count", 0) + 1
659
+ fs["last_infra_error_session_id"] = session_id
660
+ fs["resume_from_phase"] = None
661
+
648
662
  err = update_feature_in_list(feature_list_path, feature_id, new_status)
649
663
  if err:
650
664
  error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
@@ -701,6 +715,10 @@ def action_update(args, feature_list_path, state_dir):
701
715
  if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
702
716
  summary["degraded_reason"] = session_status
703
717
  summary["restart_policy"] = "finalization_retry"
718
+ elif session_status == "infra_error":
719
+ summary["restart_policy"] = "infra_retry"
720
+ summary["infra_error_count"] = fs.get("infra_error_count", 0)
721
+ summary["artifacts_preserved"] = True
704
722
  elif session_status != "success":
705
723
  summary["restart_policy"] = "preserve_and_retry"
706
724
  summary["artifacts_preserved"] = True
@@ -42,6 +42,7 @@ SESSION_STATUS_VALUES = [
42
42
  "failed",
43
43
  "crashed",
44
44
  "timed_out",
45
+ "infra_error",
45
46
  "commit_missing",
46
47
  "docs_missing",
47
48
  "merge_conflict",
@@ -314,6 +315,16 @@ def action_update(args, refactor_list_path, state_dir):
314
315
  rs["sessions"] = []
315
316
  rs["last_session_id"] = None
316
317
 
318
+ err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
319
+ if err:
320
+ error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
321
+ return
322
+ elif session_status == "infra_error":
323
+ new_status = "pending"
324
+ rs["infra_error_count"] = rs.get("infra_error_count", 0) + 1
325
+ rs["last_infra_error_session_id"] = session_id
326
+ rs["resume_from_phase"] = None
327
+
317
328
  err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
318
329
  if err:
319
330
  error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
@@ -376,6 +387,10 @@ def action_update(args, refactor_list_path, state_dir):
376
387
  if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
377
388
  summary["degraded_reason"] = session_status
378
389
  summary["restart_policy"] = "finalization_retry"
390
+ elif session_status == "infra_error":
391
+ summary["restart_policy"] = "infra_retry"
392
+ summary["infra_error_count"] = rs.get("infra_error_count", 0)
393
+ summary["artifacts_preserved"] = True
379
394
  elif session_status != "success":
380
395
  summary["restart_policy"] = "full_restart"
381
396
  summary["cleanup_performed"] = cleaned
@@ -303,6 +303,45 @@ def _run_get_next(fl_path, state_dir):
303
303
  return result.stdout.strip()
304
304
 
305
305
 
306
def _run_update(fl_path, state_dir, feature_id, session_status, session_id="session-1", max_retries=3):
    """Invoke the status script with ``--action update`` and return its JSON output.

    The script is run as a subprocess via ``python3``. The helper asserts that
    the process exits with code 0 (surfacing stderr on failure) and parses
    stdout as JSON.
    """
    options = (
        ("--feature-list", fl_path),
        ("--state-dir", state_dir),
        ("--feature-id", feature_id),
        ("--session-status", session_status),
        ("--session-id", session_id),
        ("--max-retries", str(max_retries)),
        ("--action", "update"),
    )
    argv = ["python3", _SCRIPT]
    for flag, value in options:
        argv.extend((flag, value))

    proc = subprocess.run(argv, capture_output=True, text=True)
    assert proc.returncode == 0, proc.stderr
    return json.loads(proc.stdout)
320
+
321
+
322
class TestInfraErrorUpdate:
    """An ``infra_error`` session update must leave the retry budget untouched."""

    def test_infra_error_keeps_pending_without_consuming_retry(self, tmp_path):
        feature_id = "F-001"
        fl_path = _write_fl(tmp_path, [_make_feature(feature_id, "Root", status="in_progress")])
        state_dir = _init_state(tmp_path, [feature_id])

        # Seed the persisted state with two retries already consumed.
        seeded = load_feature_status(state_dir, feature_id)
        seeded["retry_count"] = 2
        write_json_file(os.path.join(state_dir, "features", feature_id, "status.json"), seeded)

        summary = _run_update(fl_path, state_dir, feature_id, "infra_error", "session-infra", max_retries=3)

        # The reported summary keeps the item pending with the same retry count.
        assert summary["new_status"] == "pending"
        assert summary["retry_count"] == 2
        assert summary["restart_policy"] == "infra_retry"
        assert _read_statuses(fl_path)[feature_id] == "pending"

        # The persisted state records the infra error separately from retries.
        persisted = load_feature_status(state_dir, feature_id)
        assert persisted["retry_count"] == 2
        assert persisted["infra_error_count"] == 1
        assert persisted["last_infra_error_session_id"] == "session-infra"
343
+
344
+
306
345
  class TestUnskipByFeatureId:
307
346
  """Unskip with --feature-id targets a specific failed feature + downstream."""
308
347
 
@@ -353,7 +353,7 @@ pending, in_progress, completed, failed, skipped
353
353
  | `LOG_CLEANUP_ENABLED` | boolean | 1 | Periodic session log cleanup |
354
354
  | `LOG_RETENTION_DAYS` | integer | 14 | Delete session logs older than N days |
355
355
  | `LOG_MAX_TOTAL_MB` | integer | 1024 | Keep total logs under N MB |
356
- | `STOP_ON_FAILURE` | boolean | 0 | Stop after the first failed task |
356
+ | `STOP_ON_FAILURE` | boolean | 0 | Stop after a task exhausts retries |
357
357
  | `ENABLE_DEPLOY` | boolean | 0 | Start deploy session after all tasks complete |
358
358
  | `DEV_BRANCH` | string | auto-generated | Optional custom dev branch name |
359
359
  | `AUTO_PUSH` | boolean | 0 | Push original branch after successful merge |
@@ -145,6 +145,25 @@ function Invoke-PrizmPythonText {
145
145
  if ($LASTEXITCODE -ne 0) { throw "Python command failed: $($Arguments -join ' ')" }
146
146
  }
147
147
 
148
# Returns $true when the session log tail or the progress JSON contains a
# known AI CLI/provider infrastructure-error marker, $false otherwise.
#   -SessionLog   : path to the session log; only the last 65536 characters are scanned
#   -ProgressJson : path to the progress JSON file; scanned in full
# Read failures on either file are ignored (best effort).
function Test-PrizmInfraError {
    param([string]$SessionLog, [string]$ProgressJson)

    $tailLimit = 65536
    $chunks = @()

    if ($SessionLog -and (Test-Path $SessionLog)) {
        try {
            $logText = Get-Content $SessionLog -Raw -ErrorAction Stop
            if ($logText.Length -gt $tailLimit) {
                # Keep only the end of the log, where the terminating failure is.
                $logText = $logText.Substring($logText.Length - $tailLimit)
            }
            $chunks += $logText
        } catch {}
    }

    if ($ProgressJson -and (Test-Path $ProgressJson)) {
        try {
            $chunks += (Get-Content $ProgressJson -Raw -ErrorAction Stop)
        } catch {}
    }

    if ($chunks.Count -eq 0) {
        return $false
    }

    $infraPattern = '(?i)auth_unavailable|no auth available|502 Bad Gateway|503 Service Unavailable|504 Gateway Timeout|gateway timeout|upstream (connect )?error|connection reset|ECONNRESET|ETIMEDOUT|ENOTFOUND|EAI_AGAIN|rate limit|rate_limit|temporarily unavailable|overloaded'
    $combined = $chunks -join "`n"
    return ($combined -match $infraPattern)
}
166
+
148
167
  function Get-PrizmConfigValue {
149
168
  param([string]$ConfigPath, [string]$Key)
150
169
  if (-not (Test-Path $ConfigPath)) { return $null }
@@ -618,10 +618,16 @@ function Invoke-PrizmPipeline {
618
618
  }
619
619
  Stop-PrizmProgressParser $parserProcess
620
620
 
621
+ $wasInfraError = ($exitCode -ne 0 -and (Test-PrizmInfraError -SessionLog $sessionLog -ProgressJson $progressJson))
622
+
621
623
  $status = 'crashed'
622
624
  if ($wasTimedOut) {
623
625
  $status = 'timed_out'
624
626
  Write-PrizmWarn "AI session timed out after $timeoutSeconds seconds"
627
+ } elseif ($wasInfraError) {
628
+ $status = 'infra_error'
629
+ Write-PrizmWarn "AI session failed due to AI CLI/provider infrastructure error"
630
+ Write-PrizmWarn "Infrastructure errors are retried without consuming code retry budget"
625
631
  } elseif ($wasStaleKilled -or (Test-Path $staleKillMarker)) {
626
632
  Write-PrizmWarn "Session was stale-killed by heartbeat monitor (no progress for too long)"
627
633
  Write-PrizmWarn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
@@ -645,8 +651,12 @@ function Invoke-PrizmPipeline {
645
651
  }
646
652
 
647
653
  $mergeSucceeded = $true
654
+ $itemListStatus = ''
648
655
  if ($status -eq 'success') {
649
- Invoke-PrizmPythonText $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
656
+ $updateResult = Invoke-PrizmPythonJson $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
657
+ if ($updateResult -and $updateResult.PSObject.Properties['new_status']) {
658
+ $itemListStatus = [string]$updateResult.new_status
659
+ }
650
660
 
651
661
  if (Test-PrizmGitDirty $paths.ProjectRoot) {
652
662
  if ($hadDirtyBaseline) {
@@ -676,7 +686,10 @@ function Invoke-PrizmPipeline {
676
686
  }
677
687
 
678
688
  if ($status -ne 'success') {
679
- Invoke-PrizmPythonText $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
689
+ $updateResult = Invoke-PrizmPythonJson $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
690
+ if ($updateResult -and $updateResult.PSObject.Properties['new_status']) {
691
+ $itemListStatus = [string]$updateResult.new_status
692
+ }
680
693
  if ($isGitRepository) {
681
694
  Invoke-PrizmGitCommitPath $paths.ProjectRoot $listPath "chore($CurrentItemId): update $idName status" | Out-Null
682
695
  }
@@ -687,6 +700,7 @@ function Invoke-PrizmPipeline {
687
700
  } else {
688
701
  Write-PrizmError "$Kind item failed: $CurrentItemId. Log: $sessionLog"
689
702
  }
703
+ $script:PRIZM_ITEM_LIST_STATUS = $itemListStatus
690
704
  $script:PRIZM_ITEM_EXIT_CODE = if ($status -eq 'success' -and $mergeSucceeded) { 0 } else { 1 }
691
705
  return
692
706
  }
@@ -748,9 +762,11 @@ function Invoke-PrizmPipeline {
748
762
  $global:PRIZM_EXIT_CODE = $lastExitCode
749
763
  return
750
764
  }
751
- if ($lastExitCode -ne 0 -and $stopOnFailure) {
765
+ if ($lastExitCode -ne 0 -and $stopOnFailure -and $script:PRIZM_ITEM_LIST_STATUS -eq 'failed') {
752
766
  $global:PRIZM_EXIT_CODE = $lastExitCode
753
767
  return
768
+ } elseif ($lastExitCode -ne 0 -and $stopOnFailure) {
769
+ Write-PrizmInfo "STOP_ON_FAILURE: $nextItemId is $($script:PRIZM_ITEM_LIST_STATUS); retry budget not exhausted, continuing."
754
770
  }
755
771
  }
756
772
  }
@@ -41,6 +41,7 @@ SESSION_STATUS_VALUES = [
41
41
  "failed",
42
42
  "crashed",
43
43
  "timed_out",
44
+ "infra_error",
44
45
  "commit_missing",
45
46
  "docs_missing",
46
47
  "merge_conflict",
@@ -280,6 +281,16 @@ def action_update(args, bug_list_path, state_dir):
280
281
  bs["sessions"] = []
281
282
  bs["last_session_id"] = None
282
283
 
284
+ err = update_bug_in_list(bug_list_path, bug_id, new_status)
285
+ if err:
286
+ error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
287
+ return
288
+ elif session_status == "infra_error":
289
+ new_status = "pending"
290
+ bs["infra_error_count"] = bs.get("infra_error_count", 0) + 1
291
+ bs["last_infra_error_session_id"] = session_id
292
+ bs["resume_from_phase"] = None
293
+
283
294
  err = update_bug_in_list(bug_list_path, bug_id, new_status)
284
295
  if err:
285
296
  error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
@@ -333,6 +344,10 @@ def action_update(args, bug_list_path, state_dir):
333
344
  if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
334
345
  summary["degraded_reason"] = session_status
335
346
  summary["restart_policy"] = "finalization_retry"
347
+ elif session_status == "infra_error":
348
+ summary["restart_policy"] = "infra_retry"
349
+ summary["infra_error_count"] = bs.get("infra_error_count", 0)
350
+ summary["artifacts_preserved"] = True
336
351
  elif session_status != "success":
337
352
  summary["restart_policy"] = "full_restart"
338
353
  summary["cleanup_performed"] = cleaned
@@ -45,6 +45,7 @@ SESSION_STATUS_VALUES = [
45
45
  "failed",
46
46
  "crashed",
47
47
  "timed_out",
48
+ "infra_error",
48
49
  "commit_missing",
49
50
  "docs_missing",
50
51
  "merge_conflict",
@@ -645,6 +646,19 @@ def action_update(args, feature_list_path, state_dir):
645
646
  fs["sessions"] = []
646
647
  fs["last_session_id"] = None
647
648
 
649
+ err = update_feature_in_list(feature_list_path, feature_id, new_status)
650
+ if err:
651
+ error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
652
+ return
653
+ elif session_status == "infra_error":
654
+ # AI CLI/provider outage, auth failure, gateway error, etc.
655
+ # This is outside the code's control, so keep the item pending without
656
+ # consuming the task's retry budget.
657
+ new_status = "pending"
658
+ fs["infra_error_count"] = fs.get("infra_error_count", 0) + 1
659
+ fs["last_infra_error_session_id"] = session_id
660
+ fs["resume_from_phase"] = None
661
+
648
662
  err = update_feature_in_list(feature_list_path, feature_id, new_status)
649
663
  if err:
650
664
  error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
@@ -701,6 +715,10 @@ def action_update(args, feature_list_path, state_dir):
701
715
  if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
702
716
  summary["degraded_reason"] = session_status
703
717
  summary["restart_policy"] = "finalization_retry"
718
+ elif session_status == "infra_error":
719
+ summary["restart_policy"] = "infra_retry"
720
+ summary["infra_error_count"] = fs.get("infra_error_count", 0)
721
+ summary["artifacts_preserved"] = True
704
722
  elif session_status != "success":
705
723
  summary["restart_policy"] = "preserve_and_retry"
706
724
  summary["artifacts_preserved"] = True
@@ -42,6 +42,7 @@ SESSION_STATUS_VALUES = [
42
42
  "failed",
43
43
  "crashed",
44
44
  "timed_out",
45
+ "infra_error",
45
46
  "commit_missing",
46
47
  "docs_missing",
47
48
  "merge_conflict",
@@ -314,6 +315,16 @@ def action_update(args, refactor_list_path, state_dir):
314
315
  rs["sessions"] = []
315
316
  rs["last_session_id"] = None
316
317
 
318
+ err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
319
+ if err:
320
+ error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
321
+ return
322
+ elif session_status == "infra_error":
323
+ new_status = "pending"
324
+ rs["infra_error_count"] = rs.get("infra_error_count", 0) + 1
325
+ rs["last_infra_error_session_id"] = session_id
326
+ rs["resume_from_phase"] = None
327
+
317
328
  err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
318
329
  if err:
319
330
  error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
@@ -376,6 +387,10 @@ def action_update(args, refactor_list_path, state_dir):
376
387
  if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
377
388
  summary["degraded_reason"] = session_status
378
389
  summary["restart_policy"] = "finalization_retry"
390
+ elif session_status == "infra_error":
391
+ summary["restart_policy"] = "infra_retry"
392
+ summary["infra_error_count"] = rs.get("infra_error_count", 0)
393
+ summary["artifacts_preserved"] = True
379
394
  elif session_status != "success":
380
395
  summary["restart_policy"] = "full_restart"
381
396
  summary["cleanup_performed"] = cleaned
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "1.1.67",
2
+ "version": "1.1.68",
3
3
  "skills": {
4
4
  "prizm-kit": {
5
5
  "description": "Full-lifecycle dev toolkit. Covers spec-driven development, Prizm context docs, code quality, debugging, deployment, and knowledge management.",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "prizmkit",
3
- "version": "1.1.67",
3
+ "version": "1.1.68",
4
4
  "description": "Create a new PrizmKit-powered project with clean initialization — no framework dev files, just what you need.",
5
5
  "type": "module",
6
6
  "bin": {