prizmkit 1.1.67 → 1.1.68
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bundled/VERSION.json +3 -3
- package/bundled/dev-pipeline/lib/common.sh +40 -0
- package/bundled/dev-pipeline/run-bugfix.sh +26 -5
- package/bundled/dev-pipeline/run-feature.sh +20 -3
- package/bundled/dev-pipeline/run-refactor.sh +26 -5
- package/bundled/dev-pipeline/scripts/update-bug-status.py +15 -0
- package/bundled/dev-pipeline/scripts/update-feature-status.py +18 -0
- package/bundled/dev-pipeline/scripts/update-refactor-status.py +15 -0
- package/bundled/dev-pipeline/tests/test_auto_skip.py +39 -0
- package/bundled/dev-pipeline-windows/SCHEMA_ANALYSIS.md +1 -1
- package/bundled/dev-pipeline-windows/lib/common.ps1 +19 -0
- package/bundled/dev-pipeline-windows/lib/pipeline.ps1 +19 -3
- package/bundled/dev-pipeline-windows/scripts/update-bug-status.py +15 -0
- package/bundled/dev-pipeline-windows/scripts/update-feature-status.py +18 -0
- package/bundled/dev-pipeline-windows/scripts/update-refactor-status.py +15 -0
- package/bundled/skills/_metadata.json +1 -1
- package/package.json +1 -1
package/bundled/VERSION.json
CHANGED
|
@@ -418,6 +418,46 @@ prizm_start_ai_session() {
|
|
|
418
418
|
PRIZM_AI_PID=$!
|
|
419
419
|
}
|
|
420
420
|
|
|
421
|
+
# Detect AI CLI/provider infrastructure failures that are outside the
|
|
422
|
+
# generated code's control. These should be retried without consuming the
|
|
423
|
+
# item's code retry budget.
|
|
424
|
+
prizm_detect_infra_error() {
|
|
425
|
+
local session_log="${1:-}"
|
|
426
|
+
local progress_json="${2:-}"
|
|
427
|
+
|
|
428
|
+
local haystack=""
|
|
429
|
+
if [[ -n "$session_log" && -f "$session_log" ]]; then
|
|
430
|
+
haystack="$(tail -c 65536 "$session_log" 2>/dev/null || true)"
|
|
431
|
+
fi
|
|
432
|
+
if [[ -n "$progress_json" && -f "$progress_json" ]]; then
|
|
433
|
+
haystack+=$'\n'
|
|
434
|
+
haystack+="$(cat "$progress_json" 2>/dev/null || true)"
|
|
435
|
+
fi
|
|
436
|
+
|
|
437
|
+
[[ -n "$haystack" ]] || return 1
|
|
438
|
+
|
|
439
|
+
if printf '%s' "$haystack" | grep -Eiq \
|
|
440
|
+
'auth_unavailable|no auth available|502 Bad Gateway|503 Service Unavailable|504 Gateway Timeout|gateway timeout|upstream (connect )?error|connection reset|ECONNRESET|ETIMEDOUT|ENOTFOUND|EAI_AGAIN|rate limit|rate_limit|temporarily unavailable|overloaded'; then
|
|
441
|
+
return 0
|
|
442
|
+
fi
|
|
443
|
+
|
|
444
|
+
return 1
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
prizm_extract_update_new_status() {
|
|
448
|
+
python3 -c "
|
|
449
|
+
import json, sys
|
|
450
|
+
raw = sys.stdin.read()
|
|
451
|
+
try:
|
|
452
|
+
data = json.loads(raw)
|
|
453
|
+
except Exception:
|
|
454
|
+
sys.exit(0)
|
|
455
|
+
value = data.get('new_status')
|
|
456
|
+
if value:
|
|
457
|
+
print(value)
|
|
458
|
+
"
|
|
459
|
+
}
|
|
460
|
+
|
|
421
461
|
# Run an AI CLI session synchronously.
|
|
422
462
|
# Usage: prizm_run_ai_session <prompt_path> <log_path> <model>
|
|
423
463
|
prizm_run_ai_session() {
|
|
@@ -145,6 +145,11 @@ spawn_and_wait_session() {
|
|
|
145
145
|
log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
|
|
146
146
|
fi
|
|
147
147
|
|
|
148
|
+
local was_infra_error=false
|
|
149
|
+
if [[ $exit_code -ne 0 ]] && prizm_detect_infra_error "$session_log" "$progress_json"; then
|
|
150
|
+
was_infra_error=true
|
|
151
|
+
fi
|
|
152
|
+
|
|
148
153
|
# Session summary
|
|
149
154
|
if [[ -f "$session_log" ]]; then
|
|
150
155
|
local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
|
|
@@ -162,6 +167,10 @@ spawn_and_wait_session() {
|
|
|
162
167
|
if [[ $exit_code -eq 124 ]]; then
|
|
163
168
|
log_warn "Session timed out after ${SESSION_TIMEOUT}s"
|
|
164
169
|
session_status="timed_out"
|
|
170
|
+
elif [[ "$was_infra_error" == true ]]; then
|
|
171
|
+
log_warn "Session failed due to AI CLI/provider infrastructure error"
|
|
172
|
+
log_warn "Infrastructure errors are retried without consuming code retry budget"
|
|
173
|
+
session_status="infra_error"
|
|
165
174
|
elif [[ "$was_stale_killed" == true ]]; then
|
|
166
175
|
log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
|
|
167
176
|
log_warn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
|
|
@@ -259,14 +268,20 @@ sys.exit(0)
|
|
|
259
268
|
prizm_detect_subagents "$session_log"
|
|
260
269
|
|
|
261
270
|
# Update bug status (do NOT commit on dev branch — commit happens after merge)
|
|
262
|
-
|
|
271
|
+
local update_output
|
|
272
|
+
update_output=$(python3 "$SCRIPTS_DIR/update-bug-status.py" \
|
|
263
273
|
--bug-list "$bug_list" \
|
|
264
274
|
--state-dir "$STATE_DIR" \
|
|
265
275
|
--bug-id "$bug_id" \
|
|
266
276
|
--session-status "$session_status" \
|
|
267
277
|
--session-id "$session_id" \
|
|
268
278
|
--max-retries "$max_retries" \
|
|
269
|
-
--action update
|
|
279
|
+
--action update 2>&1) || {
|
|
280
|
+
log_error "Failed to update bug status: $update_output"
|
|
281
|
+
update_output=""
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
_SPAWN_ITEM_STATUS="$(printf '%s' "$update_output" | prizm_extract_update_new_status)"
|
|
270
285
|
|
|
271
286
|
_SPAWN_RESULT="$session_status"
|
|
272
287
|
}
|
|
@@ -693,6 +708,7 @@ else:
|
|
|
693
708
|
trap cleanup_single_bug SIGINT SIGTERM
|
|
694
709
|
|
|
695
710
|
_SPAWN_RESULT=""
|
|
711
|
+
_SPAWN_ITEM_STATUS=""
|
|
696
712
|
|
|
697
713
|
# Branch lifecycle: create and checkout bugfix branch
|
|
698
714
|
local _proj_root
|
|
@@ -1078,12 +1094,14 @@ DEPLOY_PROMPT_EOF
|
|
|
1078
1094
|
# Spawn session
|
|
1079
1095
|
log_info "Spawning AI CLI session: $session_id"
|
|
1080
1096
|
_SPAWN_RESULT=""
|
|
1097
|
+
_SPAWN_ITEM_STATUS=""
|
|
1081
1098
|
|
|
1082
1099
|
spawn_and_wait_session \
|
|
1083
1100
|
"$bug_id" "$bug_list" "$session_id" \
|
|
1084
1101
|
"$bootstrap_prompt" "$session_dir" "$MAX_RETRIES" "$bug_model" "$_ORIGINAL_BRANCH"
|
|
1085
1102
|
|
|
1086
1103
|
local session_status="$_SPAWN_RESULT"
|
|
1104
|
+
local item_status_after_session="${_SPAWN_ITEM_STATUS:-}"
|
|
1087
1105
|
|
|
1088
1106
|
# Merge per-bug dev branch back to original on success
|
|
1089
1107
|
if [[ "$session_status" == "success" && -n "$_DEV_BRANCH_NAME" ]]; then
|
|
@@ -1112,15 +1130,18 @@ DEPLOY_PROMPT_EOF
|
|
|
1112
1130
|
session_count=$((session_count + 1))
|
|
1113
1131
|
total_subagent_calls=$((total_subagent_calls + _SUBAGENT_COUNT))
|
|
1114
1132
|
|
|
1115
|
-
# Stop-on-failure: abort
|
|
1116
|
-
|
|
1133
|
+
# Stop-on-failure: abort only after the task is actually marked failed.
|
|
1134
|
+
# Pending retry outcomes, including infrastructure errors, keep running.
|
|
1135
|
+
if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" && "$item_status_after_session" == "failed" ]]; then
|
|
1117
1136
|
echo ""
|
|
1118
1137
|
log_error "════════════════════════════════════════════════════"
|
|
1119
|
-
log_error " STOP_ON_FAILURE: Pipeline halted after $bug_id
|
|
1138
|
+
log_error " STOP_ON_FAILURE: Pipeline halted after $bug_id exhausted retries."
|
|
1120
1139
|
log_error " Total sessions completed: $session_count"
|
|
1121
1140
|
log_error " Set STOP_ON_FAILURE=0 to continue past failures."
|
|
1122
1141
|
log_error "════════════════════════════════════════════════════"
|
|
1123
1142
|
break
|
|
1143
|
+
elif [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
|
|
1144
|
+
log_info "STOP_ON_FAILURE: $bug_id is ${item_status_after_session:-unknown}; retry budget not exhausted, continuing."
|
|
1124
1145
|
fi
|
|
1125
1146
|
|
|
1126
1147
|
# Stuck detection
|
|
@@ -153,6 +153,11 @@ spawn_and_wait_session() {
|
|
|
153
153
|
log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
|
|
154
154
|
fi
|
|
155
155
|
|
|
156
|
+
local was_infra_error=false
|
|
157
|
+
if [[ $exit_code -ne 0 ]] && prizm_detect_infra_error "$session_log" "$progress_json"; then
|
|
158
|
+
was_infra_error=true
|
|
159
|
+
fi
|
|
160
|
+
|
|
156
161
|
# Show final session summary
|
|
157
162
|
if [[ -f "$session_log" ]]; then
|
|
158
163
|
local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
|
|
@@ -172,6 +177,10 @@ spawn_and_wait_session() {
|
|
|
172
177
|
if [[ $exit_code -eq 124 ]]; then
|
|
173
178
|
log_warn "Session timed out after ${SESSION_TIMEOUT}s"
|
|
174
179
|
session_status="timed_out"
|
|
180
|
+
elif [[ "$was_infra_error" == true ]]; then
|
|
181
|
+
log_warn "Session failed due to AI CLI/provider infrastructure error"
|
|
182
|
+
log_warn "Infrastructure errors are retried without consuming code retry budget"
|
|
183
|
+
session_status="infra_error"
|
|
175
184
|
elif [[ "$was_stale_killed" == true ]]; then
|
|
176
185
|
log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
|
|
177
186
|
log_warn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
|
|
@@ -347,6 +356,8 @@ sys.exit(0)
|
|
|
347
356
|
log_error ".prizmkit/plans/feature-list.json may be out of sync. Manual intervention needed."
|
|
348
357
|
}
|
|
349
358
|
|
|
359
|
+
_SPAWN_ITEM_STATUS="$(printf '%s' "$update_output" | prizm_extract_update_new_status)"
|
|
360
|
+
|
|
350
361
|
# Return status via global variable (avoids $() swallowing stdout)
|
|
351
362
|
_SPAWN_RESULT="$session_status"
|
|
352
363
|
}
|
|
@@ -848,6 +859,7 @@ else:
|
|
|
848
859
|
trap cleanup_single_feature SIGINT SIGTERM
|
|
849
860
|
|
|
850
861
|
_SPAWN_RESULT=""
|
|
862
|
+
_SPAWN_ITEM_STATUS=""
|
|
851
863
|
|
|
852
864
|
# Branch lifecycle: create and checkout feature branch
|
|
853
865
|
local _proj_root
|
|
@@ -1300,11 +1312,13 @@ DEPLOY_PROMPT_EOF
|
|
|
1300
1312
|
log_info "Feature model: $feature_model"
|
|
1301
1313
|
fi
|
|
1302
1314
|
_SPAWN_RESULT=""
|
|
1315
|
+
_SPAWN_ITEM_STATUS=""
|
|
1303
1316
|
|
|
1304
1317
|
spawn_and_wait_session \
|
|
1305
1318
|
"$feature_id" "$feature_list" "$session_id" \
|
|
1306
1319
|
"$bootstrap_prompt" "$session_dir" "$MAX_RETRIES" "$feature_model" "$_ORIGINAL_BRANCH"
|
|
1307
1320
|
local session_status="$_SPAWN_RESULT"
|
|
1321
|
+
local item_status_after_session="${_SPAWN_ITEM_STATUS:-}"
|
|
1308
1322
|
|
|
1309
1323
|
# Merge per-feature dev branch back to original on success
|
|
1310
1324
|
if [[ "$session_status" == "success" && -n "$_DEV_BRANCH_NAME" ]]; then
|
|
@@ -1333,15 +1347,18 @@ DEPLOY_PROMPT_EOF
|
|
|
1333
1347
|
session_count=$((session_count + 1))
|
|
1334
1348
|
total_subagent_calls=$((total_subagent_calls + _SUBAGENT_COUNT))
|
|
1335
1349
|
|
|
1336
|
-
# Stop-on-failure: abort
|
|
1337
|
-
|
|
1350
|
+
# Stop-on-failure: abort only after the task is actually marked failed.
|
|
1351
|
+
# Pending retry outcomes, including infrastructure errors, keep running.
|
|
1352
|
+
if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" && "$item_status_after_session" == "failed" ]]; then
|
|
1338
1353
|
echo ""
|
|
1339
1354
|
log_error "════════════════════════════════════════════════════"
|
|
1340
|
-
log_error " STOP_ON_FAILURE: Pipeline halted after $feature_id
|
|
1355
|
+
log_error " STOP_ON_FAILURE: Pipeline halted after $feature_id exhausted retries."
|
|
1341
1356
|
log_error " Total sessions completed: $session_count"
|
|
1342
1357
|
log_error " Set STOP_ON_FAILURE=0 to continue past failures."
|
|
1343
1358
|
log_error "════════════════════════════════════════════════════"
|
|
1344
1359
|
break
|
|
1360
|
+
elif [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
|
|
1361
|
+
log_info "STOP_ON_FAILURE: $feature_id is ${item_status_after_session:-unknown}; retry budget not exhausted, continuing."
|
|
1345
1362
|
fi
|
|
1346
1363
|
|
|
1347
1364
|
# Brief pause before next iteration
|
|
@@ -147,6 +147,11 @@ spawn_and_wait_session() {
|
|
|
147
147
|
log_warn "Session was stale-killed by heartbeat monitor (no progress for too long)"
|
|
148
148
|
fi
|
|
149
149
|
|
|
150
|
+
local was_infra_error=false
|
|
151
|
+
if [[ $exit_code -ne 0 ]] && prizm_detect_infra_error "$session_log" "$progress_json"; then
|
|
152
|
+
was_infra_error=true
|
|
153
|
+
fi
|
|
154
|
+
|
|
150
155
|
# Session summary
|
|
151
156
|
if [[ -f "$session_log" ]]; then
|
|
152
157
|
local final_size=$(wc -c < "$session_log" 2>/dev/null | tr -d ' ')
|
|
@@ -164,6 +169,10 @@ spawn_and_wait_session() {
|
|
|
164
169
|
if [[ $exit_code -eq 124 ]]; then
|
|
165
170
|
log_warn "Session timed out after ${SESSION_TIMEOUT}s"
|
|
166
171
|
session_status="timed_out"
|
|
172
|
+
elif [[ "$was_infra_error" == true ]]; then
|
|
173
|
+
log_warn "Session failed due to AI CLI/provider infrastructure error"
|
|
174
|
+
log_warn "Infrastructure errors are retried without consuming code retry budget"
|
|
175
|
+
session_status="infra_error"
|
|
167
176
|
elif [[ "$was_stale_killed" == true ]]; then
|
|
168
177
|
log_warn "Session stale-killed (no progress for ${STALE_KILL_THRESHOLD}s)"
|
|
169
178
|
log_warn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
|
|
@@ -286,14 +295,20 @@ sys.exit(0)
|
|
|
286
295
|
fi
|
|
287
296
|
|
|
288
297
|
# Update refactor status (do NOT commit on dev branch — commit happens after merge)
|
|
289
|
-
|
|
298
|
+
local update_output
|
|
299
|
+
update_output=$(python3 "$SCRIPTS_DIR/update-refactor-status.py" \
|
|
290
300
|
--refactor-list "$refactor_list" \
|
|
291
301
|
--state-dir "$STATE_DIR" \
|
|
292
302
|
--refactor-id "$refactor_id" \
|
|
293
303
|
--session-status "$session_status" \
|
|
294
304
|
--session-id "$session_id" \
|
|
295
305
|
--max-retries "$max_retries" \
|
|
296
|
-
--action update
|
|
306
|
+
--action update 2>&1) || {
|
|
307
|
+
log_error "Failed to update refactor status: $update_output"
|
|
308
|
+
update_output=""
|
|
309
|
+
}
|
|
310
|
+
|
|
311
|
+
_SPAWN_ITEM_STATUS="$(printf '%s' "$update_output" | prizm_extract_update_new_status)"
|
|
297
312
|
|
|
298
313
|
_SPAWN_RESULT="$session_status"
|
|
299
314
|
}
|
|
@@ -723,6 +738,7 @@ else:
|
|
|
723
738
|
trap cleanup_single_refactor SIGINT SIGTERM
|
|
724
739
|
|
|
725
740
|
_SPAWN_RESULT=""
|
|
741
|
+
_SPAWN_ITEM_STATUS=""
|
|
726
742
|
|
|
727
743
|
# Branch lifecycle: create and checkout refactor branch
|
|
728
744
|
local _proj_root
|
|
@@ -1114,6 +1130,7 @@ DEPLOY_PROMPT_EOF
|
|
|
1114
1130
|
# Spawn session
|
|
1115
1131
|
log_info "Spawning AI CLI session: $session_id"
|
|
1116
1132
|
_SPAWN_RESULT=""
|
|
1133
|
+
_SPAWN_ITEM_STATUS=""
|
|
1117
1134
|
|
|
1118
1135
|
spawn_and_wait_session \
|
|
1119
1136
|
"$refactor_id" "$refactor_list" "$session_id" \
|
|
@@ -1130,6 +1147,7 @@ DEPLOY_PROMPT_EOF
|
|
|
1130
1147
|
fi
|
|
1131
1148
|
|
|
1132
1149
|
local session_status="$_SPAWN_RESULT"
|
|
1150
|
+
local item_status_after_session="${_SPAWN_ITEM_STATUS:-}"
|
|
1133
1151
|
|
|
1134
1152
|
# Merge per-refactor dev branch back to original on success
|
|
1135
1153
|
if [[ "$session_status" == "success" && -n "$_DEV_BRANCH_NAME" ]]; then
|
|
@@ -1168,15 +1186,18 @@ DEPLOY_PROMPT_EOF
|
|
|
1168
1186
|
session_count=$((session_count + 1))
|
|
1169
1187
|
total_subagent_calls=$((total_subagent_calls + _SUBAGENT_COUNT))
|
|
1170
1188
|
|
|
1171
|
-
# Stop-on-failure: abort
|
|
1172
|
-
|
|
1189
|
+
# Stop-on-failure: abort only after the task is actually marked failed.
|
|
1190
|
+
# Pending retry outcomes, including infrastructure errors, keep running.
|
|
1191
|
+
if [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" && "$item_status_after_session" == "failed" ]]; then
|
|
1173
1192
|
echo ""
|
|
1174
1193
|
log_error "════════════════════════════════════════════════════"
|
|
1175
|
-
log_error " STOP_ON_FAILURE: Pipeline halted after $refactor_id
|
|
1194
|
+
log_error " STOP_ON_FAILURE: Pipeline halted after $refactor_id exhausted retries."
|
|
1176
1195
|
log_error " Total sessions completed: $session_count"
|
|
1177
1196
|
log_error " Set STOP_ON_FAILURE=0 to continue past failures."
|
|
1178
1197
|
log_error "════════════════════════════════════════════════════"
|
|
1179
1198
|
break
|
|
1199
|
+
elif [[ "$session_status" != "success" && "$STOP_ON_FAILURE" == "1" ]]; then
|
|
1200
|
+
log_info "STOP_ON_FAILURE: $refactor_id is ${item_status_after_session:-unknown}; retry budget not exhausted, continuing."
|
|
1180
1201
|
fi
|
|
1181
1202
|
|
|
1182
1203
|
log_info "Pausing 5s before next refactor..."
|
|
@@ -41,6 +41,7 @@ SESSION_STATUS_VALUES = [
|
|
|
41
41
|
"failed",
|
|
42
42
|
"crashed",
|
|
43
43
|
"timed_out",
|
|
44
|
+
"infra_error",
|
|
44
45
|
"commit_missing",
|
|
45
46
|
"docs_missing",
|
|
46
47
|
"merge_conflict",
|
|
@@ -280,6 +281,16 @@ def action_update(args, bug_list_path, state_dir):
|
|
|
280
281
|
bs["sessions"] = []
|
|
281
282
|
bs["last_session_id"] = None
|
|
282
283
|
|
|
284
|
+
err = update_bug_in_list(bug_list_path, bug_id, new_status)
|
|
285
|
+
if err:
|
|
286
|
+
error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
|
|
287
|
+
return
|
|
288
|
+
elif session_status == "infra_error":
|
|
289
|
+
new_status = "pending"
|
|
290
|
+
bs["infra_error_count"] = bs.get("infra_error_count", 0) + 1
|
|
291
|
+
bs["last_infra_error_session_id"] = session_id
|
|
292
|
+
bs["resume_from_phase"] = None
|
|
293
|
+
|
|
283
294
|
err = update_bug_in_list(bug_list_path, bug_id, new_status)
|
|
284
295
|
if err:
|
|
285
296
|
error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
|
|
@@ -333,6 +344,10 @@ def action_update(args, bug_list_path, state_dir):
|
|
|
333
344
|
if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
|
|
334
345
|
summary["degraded_reason"] = session_status
|
|
335
346
|
summary["restart_policy"] = "finalization_retry"
|
|
347
|
+
elif session_status == "infra_error":
|
|
348
|
+
summary["restart_policy"] = "infra_retry"
|
|
349
|
+
summary["infra_error_count"] = bs.get("infra_error_count", 0)
|
|
350
|
+
summary["artifacts_preserved"] = True
|
|
336
351
|
elif session_status != "success":
|
|
337
352
|
summary["restart_policy"] = "full_restart"
|
|
338
353
|
summary["cleanup_performed"] = cleaned
|
|
@@ -45,6 +45,7 @@ SESSION_STATUS_VALUES = [
|
|
|
45
45
|
"failed",
|
|
46
46
|
"crashed",
|
|
47
47
|
"timed_out",
|
|
48
|
+
"infra_error",
|
|
48
49
|
"commit_missing",
|
|
49
50
|
"docs_missing",
|
|
50
51
|
"merge_conflict",
|
|
@@ -645,6 +646,19 @@ def action_update(args, feature_list_path, state_dir):
|
|
|
645
646
|
fs["sessions"] = []
|
|
646
647
|
fs["last_session_id"] = None
|
|
647
648
|
|
|
649
|
+
err = update_feature_in_list(feature_list_path, feature_id, new_status)
|
|
650
|
+
if err:
|
|
651
|
+
error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
|
|
652
|
+
return
|
|
653
|
+
elif session_status == "infra_error":
|
|
654
|
+
# AI CLI/provider outage, auth failure, gateway error, etc.
|
|
655
|
+
# This is outside the code's control, so keep the item pending without
|
|
656
|
+
# consuming the task's retry budget.
|
|
657
|
+
new_status = "pending"
|
|
658
|
+
fs["infra_error_count"] = fs.get("infra_error_count", 0) + 1
|
|
659
|
+
fs["last_infra_error_session_id"] = session_id
|
|
660
|
+
fs["resume_from_phase"] = None
|
|
661
|
+
|
|
648
662
|
err = update_feature_in_list(feature_list_path, feature_id, new_status)
|
|
649
663
|
if err:
|
|
650
664
|
error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
|
|
@@ -701,6 +715,10 @@ def action_update(args, feature_list_path, state_dir):
|
|
|
701
715
|
if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
|
|
702
716
|
summary["degraded_reason"] = session_status
|
|
703
717
|
summary["restart_policy"] = "finalization_retry"
|
|
718
|
+
elif session_status == "infra_error":
|
|
719
|
+
summary["restart_policy"] = "infra_retry"
|
|
720
|
+
summary["infra_error_count"] = fs.get("infra_error_count", 0)
|
|
721
|
+
summary["artifacts_preserved"] = True
|
|
704
722
|
elif session_status != "success":
|
|
705
723
|
summary["restart_policy"] = "preserve_and_retry"
|
|
706
724
|
summary["artifacts_preserved"] = True
|
|
@@ -42,6 +42,7 @@ SESSION_STATUS_VALUES = [
|
|
|
42
42
|
"failed",
|
|
43
43
|
"crashed",
|
|
44
44
|
"timed_out",
|
|
45
|
+
"infra_error",
|
|
45
46
|
"commit_missing",
|
|
46
47
|
"docs_missing",
|
|
47
48
|
"merge_conflict",
|
|
@@ -314,6 +315,16 @@ def action_update(args, refactor_list_path, state_dir):
|
|
|
314
315
|
rs["sessions"] = []
|
|
315
316
|
rs["last_session_id"] = None
|
|
316
317
|
|
|
318
|
+
err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
|
|
319
|
+
if err:
|
|
320
|
+
error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
|
|
321
|
+
return
|
|
322
|
+
elif session_status == "infra_error":
|
|
323
|
+
new_status = "pending"
|
|
324
|
+
rs["infra_error_count"] = rs.get("infra_error_count", 0) + 1
|
|
325
|
+
rs["last_infra_error_session_id"] = session_id
|
|
326
|
+
rs["resume_from_phase"] = None
|
|
327
|
+
|
|
317
328
|
err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
|
|
318
329
|
if err:
|
|
319
330
|
error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
|
|
@@ -376,6 +387,10 @@ def action_update(args, refactor_list_path, state_dir):
|
|
|
376
387
|
if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
|
|
377
388
|
summary["degraded_reason"] = session_status
|
|
378
389
|
summary["restart_policy"] = "finalization_retry"
|
|
390
|
+
elif session_status == "infra_error":
|
|
391
|
+
summary["restart_policy"] = "infra_retry"
|
|
392
|
+
summary["infra_error_count"] = rs.get("infra_error_count", 0)
|
|
393
|
+
summary["artifacts_preserved"] = True
|
|
379
394
|
elif session_status != "success":
|
|
380
395
|
summary["restart_policy"] = "full_restart"
|
|
381
396
|
summary["cleanup_performed"] = cleaned
|
|
@@ -303,6 +303,45 @@ def _run_get_next(fl_path, state_dir):
|
|
|
303
303
|
return result.stdout.strip()
|
|
304
304
|
|
|
305
305
|
|
|
306
|
+
def _run_update(fl_path, state_dir, feature_id, session_status, session_id="session-1", max_retries=3):
|
|
307
|
+
cmd = [
|
|
308
|
+
"python3", _SCRIPT,
|
|
309
|
+
"--feature-list", fl_path,
|
|
310
|
+
"--state-dir", state_dir,
|
|
311
|
+
"--feature-id", feature_id,
|
|
312
|
+
"--session-status", session_status,
|
|
313
|
+
"--session-id", session_id,
|
|
314
|
+
"--max-retries", str(max_retries),
|
|
315
|
+
"--action", "update",
|
|
316
|
+
]
|
|
317
|
+
result = subprocess.run(cmd, capture_output=True, text=True)
|
|
318
|
+
assert result.returncode == 0, result.stderr
|
|
319
|
+
return json.loads(result.stdout)
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
class TestInfraErrorUpdate:
|
|
323
|
+
def test_infra_error_keeps_pending_without_consuming_retry(self, tmp_path):
|
|
324
|
+
features = [_make_feature("F-001", "Root", status="in_progress")]
|
|
325
|
+
fl_path = _write_fl(tmp_path, features)
|
|
326
|
+
state_dir = _init_state(tmp_path, ["F-001"])
|
|
327
|
+
status_path = os.path.join(state_dir, "features", "F-001", "status.json")
|
|
328
|
+
fs = load_feature_status(state_dir, "F-001")
|
|
329
|
+
fs["retry_count"] = 2
|
|
330
|
+
write_json_file(status_path, fs)
|
|
331
|
+
|
|
332
|
+
result = _run_update(fl_path, state_dir, "F-001", "infra_error", "session-infra", max_retries=3)
|
|
333
|
+
|
|
334
|
+
assert result["new_status"] == "pending"
|
|
335
|
+
assert result["retry_count"] == 2
|
|
336
|
+
assert result["restart_policy"] == "infra_retry"
|
|
337
|
+
assert _read_statuses(fl_path)["F-001"] == "pending"
|
|
338
|
+
|
|
339
|
+
fs = load_feature_status(state_dir, "F-001")
|
|
340
|
+
assert fs["retry_count"] == 2
|
|
341
|
+
assert fs["infra_error_count"] == 1
|
|
342
|
+
assert fs["last_infra_error_session_id"] == "session-infra"
|
|
343
|
+
|
|
344
|
+
|
|
306
345
|
class TestUnskipByFeatureId:
|
|
307
346
|
"""Unskip with --feature-id targets a specific failed feature + downstream."""
|
|
308
347
|
|
|
@@ -353,7 +353,7 @@ pending, in_progress, completed, failed, skipped
|
|
|
353
353
|
| `LOG_CLEANUP_ENABLED` | boolean | 1 | Periodic session log cleanup |
|
|
354
354
|
| `LOG_RETENTION_DAYS` | integer | 14 | Delete session logs older than N days |
|
|
355
355
|
| `LOG_MAX_TOTAL_MB` | integer | 1024 | Keep total logs under N MB |
|
|
356
|
-
| `STOP_ON_FAILURE` | boolean | 0 | Stop after
|
|
356
|
+
| `STOP_ON_FAILURE` | boolean | 0 | Stop after a task exhausts retries |
|
|
357
357
|
| `ENABLE_DEPLOY` | boolean | 0 | Start deploy session after all tasks complete |
|
|
358
358
|
| `DEV_BRANCH` | string | auto-generated | Optional custom dev branch name |
|
|
359
359
|
| `AUTO_PUSH` | boolean | 0 | Push original branch after successful merge |
|
|
@@ -145,6 +145,25 @@ function Invoke-PrizmPythonText {
|
|
|
145
145
|
if ($LASTEXITCODE -ne 0) { throw "Python command failed: $($Arguments -join ' ')" }
|
|
146
146
|
}
|
|
147
147
|
|
|
148
|
+
function Test-PrizmInfraError {
|
|
149
|
+
param([string]$SessionLog, [string]$ProgressJson)
|
|
150
|
+
$parts = @()
|
|
151
|
+
if ($SessionLog -and (Test-Path $SessionLog)) {
|
|
152
|
+
try {
|
|
153
|
+
$text = Get-Content $SessionLog -Raw -ErrorAction Stop
|
|
154
|
+
if ($text.Length -gt 65536) { $text = $text.Substring($text.Length - 65536) }
|
|
155
|
+
$parts += $text
|
|
156
|
+
} catch {}
|
|
157
|
+
}
|
|
158
|
+
if ($ProgressJson -and (Test-Path $ProgressJson)) {
|
|
159
|
+
try { $parts += (Get-Content $ProgressJson -Raw -ErrorAction Stop) } catch {}
|
|
160
|
+
}
|
|
161
|
+
if ($parts.Count -eq 0) { return $false }
|
|
162
|
+
|
|
163
|
+
$haystack = $parts -join "`n"
|
|
164
|
+
return ($haystack -match '(?i)auth_unavailable|no auth available|502 Bad Gateway|503 Service Unavailable|504 Gateway Timeout|gateway timeout|upstream (connect )?error|connection reset|ECONNRESET|ETIMEDOUT|ENOTFOUND|EAI_AGAIN|rate limit|rate_limit|temporarily unavailable|overloaded')
|
|
165
|
+
}
|
|
166
|
+
|
|
148
167
|
function Get-PrizmConfigValue {
|
|
149
168
|
param([string]$ConfigPath, [string]$Key)
|
|
150
169
|
if (-not (Test-Path $ConfigPath)) { return $null }
|
|
@@ -618,10 +618,16 @@ function Invoke-PrizmPipeline {
|
|
|
618
618
|
}
|
|
619
619
|
Stop-PrizmProgressParser $parserProcess
|
|
620
620
|
|
|
621
|
+
$wasInfraError = ($exitCode -ne 0 -and (Test-PrizmInfraError -SessionLog $sessionLog -ProgressJson $progressJson))
|
|
622
|
+
|
|
621
623
|
$status = 'crashed'
|
|
622
624
|
if ($wasTimedOut) {
|
|
623
625
|
$status = 'timed_out'
|
|
624
626
|
Write-PrizmWarn "AI session timed out after $timeoutSeconds seconds"
|
|
627
|
+
} elseif ($wasInfraError) {
|
|
628
|
+
$status = 'infra_error'
|
|
629
|
+
Write-PrizmWarn "AI session failed due to AI CLI/provider infrastructure error"
|
|
630
|
+
Write-PrizmWarn "Infrastructure errors are retried without consuming code retry budget"
|
|
625
631
|
} elseif ($wasStaleKilled -or (Test-Path $staleKillMarker)) {
|
|
626
632
|
Write-PrizmWarn "Session was stale-killed by heartbeat monitor (no progress for too long)"
|
|
627
633
|
Write-PrizmWarn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
|
|
@@ -645,8 +651,12 @@ function Invoke-PrizmPipeline {
|
|
|
645
651
|
}
|
|
646
652
|
|
|
647
653
|
$mergeSucceeded = $true
|
|
654
|
+
$itemListStatus = ''
|
|
648
655
|
if ($status -eq 'success') {
|
|
649
|
-
Invoke-
|
|
656
|
+
$updateResult = Invoke-PrizmPythonJson $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
|
|
657
|
+
if ($updateResult -and $updateResult.PSObject.Properties['new_status']) {
|
|
658
|
+
$itemListStatus = [string]$updateResult.new_status
|
|
659
|
+
}
|
|
650
660
|
|
|
651
661
|
if (Test-PrizmGitDirty $paths.ProjectRoot) {
|
|
652
662
|
if ($hadDirtyBaseline) {
|
|
@@ -676,7 +686,10 @@ function Invoke-PrizmPipeline {
|
|
|
676
686
|
}
|
|
677
687
|
|
|
678
688
|
if ($status -ne 'success') {
|
|
679
|
-
Invoke-
|
|
689
|
+
$updateResult = Invoke-PrizmPythonJson $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
|
|
690
|
+
if ($updateResult -and $updateResult.PSObject.Properties['new_status']) {
|
|
691
|
+
$itemListStatus = [string]$updateResult.new_status
|
|
692
|
+
}
|
|
680
693
|
if ($isGitRepository) {
|
|
681
694
|
Invoke-PrizmGitCommitPath $paths.ProjectRoot $listPath "chore($CurrentItemId): update $idName status" | Out-Null
|
|
682
695
|
}
|
|
@@ -687,6 +700,7 @@ function Invoke-PrizmPipeline {
|
|
|
687
700
|
} else {
|
|
688
701
|
Write-PrizmError "$Kind item failed: $CurrentItemId. Log: $sessionLog"
|
|
689
702
|
}
|
|
703
|
+
$script:PRIZM_ITEM_LIST_STATUS = $itemListStatus
|
|
690
704
|
$script:PRIZM_ITEM_EXIT_CODE = if ($status -eq 'success' -and $mergeSucceeded) { 0 } else { 1 }
|
|
691
705
|
return
|
|
692
706
|
}
|
|
@@ -748,9 +762,11 @@ function Invoke-PrizmPipeline {
|
|
|
748
762
|
$global:PRIZM_EXIT_CODE = $lastExitCode
|
|
749
763
|
return
|
|
750
764
|
}
|
|
751
|
-
if ($lastExitCode -ne 0 -and $stopOnFailure) {
|
|
765
|
+
if ($lastExitCode -ne 0 -and $stopOnFailure -and $script:PRIZM_ITEM_LIST_STATUS -eq 'failed') {
|
|
752
766
|
$global:PRIZM_EXIT_CODE = $lastExitCode
|
|
753
767
|
return
|
|
768
|
+
} elseif ($lastExitCode -ne 0 -and $stopOnFailure) {
|
|
769
|
+
Write-PrizmInfo "STOP_ON_FAILURE: $nextItemId is $($script:PRIZM_ITEM_LIST_STATUS); retry budget not exhausted, continuing."
|
|
754
770
|
}
|
|
755
771
|
}
|
|
756
772
|
}
|
|
@@ -41,6 +41,7 @@ SESSION_STATUS_VALUES = [
|
|
|
41
41
|
"failed",
|
|
42
42
|
"crashed",
|
|
43
43
|
"timed_out",
|
|
44
|
+
"infra_error",
|
|
44
45
|
"commit_missing",
|
|
45
46
|
"docs_missing",
|
|
46
47
|
"merge_conflict",
|
|
@@ -280,6 +281,16 @@ def action_update(args, bug_list_path, state_dir):
|
|
|
280
281
|
bs["sessions"] = []
|
|
281
282
|
bs["last_session_id"] = None
|
|
282
283
|
|
|
284
|
+
err = update_bug_in_list(bug_list_path, bug_id, new_status)
|
|
285
|
+
if err:
|
|
286
|
+
error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
|
|
287
|
+
return
|
|
288
|
+
elif session_status == "infra_error":
|
|
289
|
+
new_status = "pending"
|
|
290
|
+
bs["infra_error_count"] = bs.get("infra_error_count", 0) + 1
|
|
291
|
+
bs["last_infra_error_session_id"] = session_id
|
|
292
|
+
bs["resume_from_phase"] = None
|
|
293
|
+
|
|
283
294
|
err = update_bug_in_list(bug_list_path, bug_id, new_status)
|
|
284
295
|
if err:
|
|
285
296
|
error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
|
|
@@ -333,6 +344,10 @@ def action_update(args, bug_list_path, state_dir):
|
|
|
333
344
|
if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
|
|
334
345
|
summary["degraded_reason"] = session_status
|
|
335
346
|
summary["restart_policy"] = "finalization_retry"
|
|
347
|
+
elif session_status == "infra_error":
|
|
348
|
+
summary["restart_policy"] = "infra_retry"
|
|
349
|
+
summary["infra_error_count"] = bs.get("infra_error_count", 0)
|
|
350
|
+
summary["artifacts_preserved"] = True
|
|
336
351
|
elif session_status != "success":
|
|
337
352
|
summary["restart_policy"] = "full_restart"
|
|
338
353
|
summary["cleanup_performed"] = cleaned
|
|
@@ -45,6 +45,7 @@ SESSION_STATUS_VALUES = [
|
|
|
45
45
|
"failed",
|
|
46
46
|
"crashed",
|
|
47
47
|
"timed_out",
|
|
48
|
+
"infra_error",
|
|
48
49
|
"commit_missing",
|
|
49
50
|
"docs_missing",
|
|
50
51
|
"merge_conflict",
|
|
@@ -645,6 +646,19 @@ def action_update(args, feature_list_path, state_dir):
|
|
|
645
646
|
fs["sessions"] = []
|
|
646
647
|
fs["last_session_id"] = None
|
|
647
648
|
|
|
649
|
+
err = update_feature_in_list(feature_list_path, feature_id, new_status)
|
|
650
|
+
if err:
|
|
651
|
+
error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
|
|
652
|
+
return
|
|
653
|
+
elif session_status == "infra_error":
|
|
654
|
+
# AI CLI/provider outage, auth failure, gateway error, etc.
|
|
655
|
+
# This is outside the code's control, so keep the item pending without
|
|
656
|
+
# consuming the task's retry budget.
|
|
657
|
+
new_status = "pending"
|
|
658
|
+
fs["infra_error_count"] = fs.get("infra_error_count", 0) + 1
|
|
659
|
+
fs["last_infra_error_session_id"] = session_id
|
|
660
|
+
fs["resume_from_phase"] = None
|
|
661
|
+
|
|
648
662
|
err = update_feature_in_list(feature_list_path, feature_id, new_status)
|
|
649
663
|
if err:
|
|
650
664
|
error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
|
|
@@ -701,6 +715,10 @@ def action_update(args, feature_list_path, state_dir):
|
|
|
701
715
|
if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
|
|
702
716
|
summary["degraded_reason"] = session_status
|
|
703
717
|
summary["restart_policy"] = "finalization_retry"
|
|
718
|
+
elif session_status == "infra_error":
|
|
719
|
+
summary["restart_policy"] = "infra_retry"
|
|
720
|
+
summary["infra_error_count"] = fs.get("infra_error_count", 0)
|
|
721
|
+
summary["artifacts_preserved"] = True
|
|
704
722
|
elif session_status != "success":
|
|
705
723
|
summary["restart_policy"] = "preserve_and_retry"
|
|
706
724
|
summary["artifacts_preserved"] = True
|
|
@@ -42,6 +42,7 @@ SESSION_STATUS_VALUES = [
|
|
|
42
42
|
"failed",
|
|
43
43
|
"crashed",
|
|
44
44
|
"timed_out",
|
|
45
|
+
"infra_error",
|
|
45
46
|
"commit_missing",
|
|
46
47
|
"docs_missing",
|
|
47
48
|
"merge_conflict",
|
|
@@ -314,6 +315,16 @@ def action_update(args, refactor_list_path, state_dir):
|
|
|
314
315
|
rs["sessions"] = []
|
|
315
316
|
rs["last_session_id"] = None
|
|
316
317
|
|
|
318
|
+
err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
|
|
319
|
+
if err:
|
|
320
|
+
error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
|
|
321
|
+
return
|
|
322
|
+
elif session_status == "infra_error":
|
|
323
|
+
new_status = "pending"
|
|
324
|
+
rs["infra_error_count"] = rs.get("infra_error_count", 0) + 1
|
|
325
|
+
rs["last_infra_error_session_id"] = session_id
|
|
326
|
+
rs["resume_from_phase"] = None
|
|
327
|
+
|
|
317
328
|
err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
|
|
318
329
|
if err:
|
|
319
330
|
error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
|
|
@@ -376,6 +387,10 @@ def action_update(args, refactor_list_path, state_dir):
|
|
|
376
387
|
if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
|
|
377
388
|
summary["degraded_reason"] = session_status
|
|
378
389
|
summary["restart_policy"] = "finalization_retry"
|
|
390
|
+
elif session_status == "infra_error":
|
|
391
|
+
summary["restart_policy"] = "infra_retry"
|
|
392
|
+
summary["infra_error_count"] = rs.get("infra_error_count", 0)
|
|
393
|
+
summary["artifacts_preserved"] = True
|
|
379
394
|
elif session_status != "success":
|
|
380
395
|
summary["restart_policy"] = "full_restart"
|
|
381
396
|
summary["cleanup_performed"] = cleaned
|