prizmkit 1.1.66 → 1.1.68
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bundled/VERSION.json +3 -3
- package/bundled/adapters/codex/settings-adapter.js +1 -1
- package/bundled/dev-pipeline/.env.example +3 -0
- package/bundled/dev-pipeline/SCHEMA_ANALYSIS.md +3 -1
- package/bundled/dev-pipeline/lib/common.sh +61 -18
- package/bundled/dev-pipeline/lib/heartbeat.sh +104 -11
- package/bundled/dev-pipeline/run-bugfix.sh +26 -5
- package/bundled/dev-pipeline/run-feature.sh +20 -3
- package/bundled/dev-pipeline/run-refactor.sh +26 -5
- package/bundled/dev-pipeline/scripts/parse-stream-progress.py +144 -12
- package/bundled/dev-pipeline/scripts/update-bug-status.py +15 -0
- package/bundled/dev-pipeline/scripts/update-feature-status.py +18 -0
- package/bundled/dev-pipeline/scripts/update-refactor-status.py +15 -0
- package/bundled/dev-pipeline/tests/test_auto_skip.py +39 -0
- package/bundled/dev-pipeline-windows/.env.example +3 -2
- package/bundled/dev-pipeline-windows/SCHEMA_ANALYSIS.md +4 -3
- package/bundled/dev-pipeline-windows/lib/common.ps1 +97 -5
- package/bundled/dev-pipeline-windows/lib/pipeline.ps1 +31 -7
- package/bundled/dev-pipeline-windows/run-recovery.ps1 +8 -1
- package/bundled/dev-pipeline-windows/scripts/parse-stream-progress.py +144 -12
- package/bundled/dev-pipeline-windows/scripts/update-bug-status.py +15 -0
- package/bundled/dev-pipeline-windows/scripts/update-feature-status.py +18 -0
- package/bundled/dev-pipeline-windows/scripts/update-refactor-status.py +15 -0
- package/bundled/skills/_metadata.json +1 -1
- package/package.json +1 -1
- package/src/scaffold.js +1 -1
|
@@ -23,6 +23,7 @@ import tempfile
|
|
|
23
23
|
import time
|
|
24
24
|
from collections import Counter
|
|
25
25
|
from datetime import datetime, timezone
|
|
26
|
+
from pathlib import Path
|
|
26
27
|
|
|
27
28
|
|
|
28
29
|
# Ordered pipeline phases — index defines forward-only progression.
|
|
@@ -76,6 +77,13 @@ class ProgressTracker:
|
|
|
76
77
|
self.event_format = ""
|
|
77
78
|
self.active_subagent_count = 0
|
|
78
79
|
self.subagent_status_counts = Counter()
|
|
80
|
+
self.codex_child_thread_ids = set()
|
|
81
|
+
self.child_session_files = []
|
|
82
|
+
self.child_total_bytes = 0
|
|
83
|
+
self.child_activity_signature = ""
|
|
84
|
+
self.last_child_activity_at = ""
|
|
85
|
+
self._codex_child_session_paths = {}
|
|
86
|
+
self._last_child_scan_at = 0.0
|
|
79
87
|
self._text_buffer = ""
|
|
80
88
|
self._in_tool_use = False
|
|
81
89
|
self._current_tool_input_parts = []
|
|
@@ -113,6 +121,9 @@ class ProgressTracker:
|
|
|
113
121
|
|
|
114
122
|
elif item_type == "collab_tool_call":
|
|
115
123
|
tool_name = item.get("tool", "collab")
|
|
124
|
+
self._record_codex_child_thread_ids(
|
|
125
|
+
item.get("receiver_thread_ids")
|
|
126
|
+
)
|
|
116
127
|
if event_type == "item.started":
|
|
117
128
|
self.current_tool = tool_name
|
|
118
129
|
self.tool_call_counts[tool_name] += 1
|
|
@@ -345,8 +356,117 @@ class ProgressTracker:
|
|
|
345
356
|
self.subagent_status_counts = counts
|
|
346
357
|
self.active_subagent_count = active
|
|
347
358
|
|
|
359
|
+
def _record_codex_child_thread_ids(self, thread_ids):
|
|
360
|
+
"""Remember Codex child thread IDs reported by collab tool calls."""
|
|
361
|
+
if not isinstance(thread_ids, list):
|
|
362
|
+
return
|
|
363
|
+
for thread_id in thread_ids:
|
|
364
|
+
if isinstance(thread_id, str) and thread_id.strip():
|
|
365
|
+
self.codex_child_thread_ids.add(thread_id.strip())
|
|
366
|
+
|
|
367
|
+
def _codex_sessions_dir(self):
|
|
368
|
+
"""Return the Codex sessions directory for the current environment."""
|
|
369
|
+
codex_home = os.environ.get("CODEX_HOME")
|
|
370
|
+
if codex_home:
|
|
371
|
+
return Path(codex_home).expanduser() / "sessions"
|
|
372
|
+
return Path.home() / ".codex" / "sessions"
|
|
373
|
+
|
|
374
|
+
def _find_codex_child_session_file(self, thread_id):
|
|
375
|
+
"""Find a Codex transcript file for a child thread ID."""
|
|
376
|
+
sessions_dir = self._codex_sessions_dir()
|
|
377
|
+
if not sessions_dir.exists():
|
|
378
|
+
return None
|
|
379
|
+
|
|
380
|
+
try:
|
|
381
|
+
matches = list(sessions_dir.rglob(f"*{thread_id}.jsonl"))
|
|
382
|
+
except OSError:
|
|
383
|
+
return None
|
|
384
|
+
|
|
385
|
+
if not matches:
|
|
386
|
+
return None
|
|
387
|
+
|
|
388
|
+
try:
|
|
389
|
+
matches.sort(key=lambda path: path.stat().st_mtime, reverse=True)
|
|
390
|
+
except OSError:
|
|
391
|
+
pass
|
|
392
|
+
return str(matches[0])
|
|
393
|
+
|
|
394
|
+
def refresh_child_session_activity(self, force=False):
|
|
395
|
+
"""Refresh Codex child transcript file stats.
|
|
396
|
+
|
|
397
|
+
The heartbeat monitor uses this activity signature to treat subagent
|
|
398
|
+
transcript growth as real progress while the parent Codex session is
|
|
399
|
+
blocked in `wait`.
|
|
400
|
+
"""
|
|
401
|
+
previous_signature = self.child_activity_signature
|
|
402
|
+
|
|
403
|
+
if not self.codex_child_thread_ids:
|
|
404
|
+
self.child_session_files = []
|
|
405
|
+
self.child_total_bytes = 0
|
|
406
|
+
self.child_activity_signature = ""
|
|
407
|
+
self.last_child_activity_at = ""
|
|
408
|
+
return previous_signature != self.child_activity_signature
|
|
409
|
+
|
|
410
|
+
now = time.monotonic()
|
|
411
|
+
should_scan = (
|
|
412
|
+
force
|
|
413
|
+
or self._last_child_scan_at == 0.0
|
|
414
|
+
or (now - self._last_child_scan_at >= 2.0)
|
|
415
|
+
)
|
|
416
|
+
if should_scan:
|
|
417
|
+
for thread_id in sorted(self.codex_child_thread_ids):
|
|
418
|
+
path = self._codex_child_session_paths.get(thread_id)
|
|
419
|
+
if not path or not os.path.exists(path):
|
|
420
|
+
found = self._find_codex_child_session_file(thread_id)
|
|
421
|
+
if found:
|
|
422
|
+
self._codex_child_session_paths[thread_id] = found
|
|
423
|
+
self._last_child_scan_at = now
|
|
424
|
+
|
|
425
|
+
files = []
|
|
426
|
+
signature_parts = []
|
|
427
|
+
total_bytes = 0
|
|
428
|
+
max_mtime = 0.0
|
|
429
|
+
|
|
430
|
+
for thread_id in sorted(self.codex_child_thread_ids):
|
|
431
|
+
path = self._codex_child_session_paths.get(thread_id)
|
|
432
|
+
if not path:
|
|
433
|
+
continue
|
|
434
|
+
try:
|
|
435
|
+
stat = os.stat(path)
|
|
436
|
+
except OSError:
|
|
437
|
+
continue
|
|
438
|
+
|
|
439
|
+
total_bytes += stat.st_size
|
|
440
|
+
max_mtime = max(max_mtime, stat.st_mtime)
|
|
441
|
+
signature_parts.append(
|
|
442
|
+
f"{thread_id}:{stat.st_size}:{getattr(stat, 'st_mtime_ns', int(stat.st_mtime * 1_000_000_000))}"
|
|
443
|
+
)
|
|
444
|
+
files.append(
|
|
445
|
+
{
|
|
446
|
+
"thread_id": thread_id,
|
|
447
|
+
"path": path,
|
|
448
|
+
"size": stat.st_size,
|
|
449
|
+
"mtime": datetime.fromtimestamp(
|
|
450
|
+
stat.st_mtime, timezone.utc
|
|
451
|
+
).strftime("%Y-%m-%dT%H:%M:%SZ"),
|
|
452
|
+
}
|
|
453
|
+
)
|
|
454
|
+
|
|
455
|
+
self.child_session_files = files
|
|
456
|
+
self.child_total_bytes = total_bytes
|
|
457
|
+
self.child_activity_signature = "|".join(signature_parts)
|
|
458
|
+
self.last_child_activity_at = (
|
|
459
|
+
datetime.fromtimestamp(max_mtime, timezone.utc).strftime(
|
|
460
|
+
"%Y-%m-%dT%H:%M:%SZ"
|
|
461
|
+
)
|
|
462
|
+
if max_mtime
|
|
463
|
+
else ""
|
|
464
|
+
)
|
|
465
|
+
return previous_signature != self.child_activity_signature
|
|
466
|
+
|
|
348
467
|
def to_dict(self):
|
|
349
468
|
"""Export current state as a dictionary for JSON serialization."""
|
|
469
|
+
self.refresh_child_session_activity()
|
|
350
470
|
tool_calls = [
|
|
351
471
|
{"name": name, "count": count}
|
|
352
472
|
for name, count in self.tool_call_counts.most_common()
|
|
@@ -367,6 +487,11 @@ class ProgressTracker:
|
|
|
367
487
|
"total_tool_calls": self.total_tool_calls,
|
|
368
488
|
"active_subagent_count": self.active_subagent_count,
|
|
369
489
|
"subagent_states": subagent_states,
|
|
490
|
+
"child_thread_ids": sorted(self.codex_child_thread_ids),
|
|
491
|
+
"child_session_files": self.child_session_files,
|
|
492
|
+
"child_total_bytes": self.child_total_bytes,
|
|
493
|
+
"child_activity_signature": self.child_activity_signature,
|
|
494
|
+
"last_child_activity_at": self.last_child_activity_at,
|
|
370
495
|
"last_text_snippet": self.last_text_snippet,
|
|
371
496
|
"is_active": self.is_active,
|
|
372
497
|
"errors": self.errors[-10:], # Keep last 10 errors
|
|
@@ -397,6 +522,15 @@ def tail_and_parse(session_log, progress_file, poll_interval=0.5):
|
|
|
397
522
|
tracker = ProgressTracker()
|
|
398
523
|
last_write_state = None
|
|
399
524
|
|
|
525
|
+
def state_key(state):
|
|
526
|
+
return (
|
|
527
|
+
state["message_count"],
|
|
528
|
+
state["current_tool"],
|
|
529
|
+
state["current_phase"],
|
|
530
|
+
state["total_tool_calls"],
|
|
531
|
+
state.get("child_activity_signature", ""),
|
|
532
|
+
)
|
|
533
|
+
|
|
400
534
|
# Wait for log file to appear
|
|
401
535
|
wait_count = 0
|
|
402
536
|
while not os.path.exists(session_log):
|
|
@@ -428,22 +562,20 @@ def tail_and_parse(session_log, progress_file, poll_interval=0.5):
|
|
|
428
562
|
|
|
429
563
|
# Write progress if state changed
|
|
430
564
|
current_state = tracker.to_dict()
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
current_state["current_tool"],
|
|
434
|
-
current_state["current_phase"],
|
|
435
|
-
current_state["total_tool_calls"],
|
|
436
|
-
)
|
|
437
|
-
if state_key != last_write_state:
|
|
565
|
+
current_state_key = state_key(current_state)
|
|
566
|
+
if current_state_key != last_write_state:
|
|
438
567
|
atomic_write_json(current_state, progress_file)
|
|
439
|
-
last_write_state =
|
|
568
|
+
last_write_state = current_state_key
|
|
440
569
|
else:
|
|
441
570
|
idle_count += 1
|
|
442
|
-
#
|
|
443
|
-
#
|
|
444
|
-
if idle_count ==
|
|
571
|
+
# Every 2 seconds of no parent log data, refresh child Codex
|
|
572
|
+
# transcript stats and write if child activity advanced.
|
|
573
|
+
if idle_count % 4 == 0:
|
|
445
574
|
current_state = tracker.to_dict()
|
|
446
|
-
|
|
575
|
+
current_state_key = state_key(current_state)
|
|
576
|
+
if current_state_key != last_write_state or idle_count == 4:
|
|
577
|
+
atomic_write_json(current_state, progress_file)
|
|
578
|
+
last_write_state = current_state_key
|
|
447
579
|
|
|
448
580
|
# After 3600 idle cycles (30 min), mark inactive and exit
|
|
449
581
|
if idle_count > 3600:
|
|
@@ -41,6 +41,7 @@ SESSION_STATUS_VALUES = [
|
|
|
41
41
|
"failed",
|
|
42
42
|
"crashed",
|
|
43
43
|
"timed_out",
|
|
44
|
+
"infra_error",
|
|
44
45
|
"commit_missing",
|
|
45
46
|
"docs_missing",
|
|
46
47
|
"merge_conflict",
|
|
@@ -280,6 +281,16 @@ def action_update(args, bug_list_path, state_dir):
|
|
|
280
281
|
bs["sessions"] = []
|
|
281
282
|
bs["last_session_id"] = None
|
|
282
283
|
|
|
284
|
+
err = update_bug_in_list(bug_list_path, bug_id, new_status)
|
|
285
|
+
if err:
|
|
286
|
+
error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
|
|
287
|
+
return
|
|
288
|
+
elif session_status == "infra_error":
|
|
289
|
+
new_status = "pending"
|
|
290
|
+
bs["infra_error_count"] = bs.get("infra_error_count", 0) + 1
|
|
291
|
+
bs["last_infra_error_session_id"] = session_id
|
|
292
|
+
bs["resume_from_phase"] = None
|
|
293
|
+
|
|
283
294
|
err = update_bug_in_list(bug_list_path, bug_id, new_status)
|
|
284
295
|
if err:
|
|
285
296
|
error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
|
|
@@ -333,6 +344,10 @@ def action_update(args, bug_list_path, state_dir):
|
|
|
333
344
|
if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
|
|
334
345
|
summary["degraded_reason"] = session_status
|
|
335
346
|
summary["restart_policy"] = "finalization_retry"
|
|
347
|
+
elif session_status == "infra_error":
|
|
348
|
+
summary["restart_policy"] = "infra_retry"
|
|
349
|
+
summary["infra_error_count"] = bs.get("infra_error_count", 0)
|
|
350
|
+
summary["artifacts_preserved"] = True
|
|
336
351
|
elif session_status != "success":
|
|
337
352
|
summary["restart_policy"] = "full_restart"
|
|
338
353
|
summary["cleanup_performed"] = cleaned
|
|
@@ -45,6 +45,7 @@ SESSION_STATUS_VALUES = [
|
|
|
45
45
|
"failed",
|
|
46
46
|
"crashed",
|
|
47
47
|
"timed_out",
|
|
48
|
+
"infra_error",
|
|
48
49
|
"commit_missing",
|
|
49
50
|
"docs_missing",
|
|
50
51
|
"merge_conflict",
|
|
@@ -645,6 +646,19 @@ def action_update(args, feature_list_path, state_dir):
|
|
|
645
646
|
fs["sessions"] = []
|
|
646
647
|
fs["last_session_id"] = None
|
|
647
648
|
|
|
649
|
+
err = update_feature_in_list(feature_list_path, feature_id, new_status)
|
|
650
|
+
if err:
|
|
651
|
+
error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
|
|
652
|
+
return
|
|
653
|
+
elif session_status == "infra_error":
|
|
654
|
+
# AI CLI/provider outage, auth failure, gateway error, etc.
|
|
655
|
+
# This is outside the code's control, so keep the item pending without
|
|
656
|
+
# consuming the task's retry budget.
|
|
657
|
+
new_status = "pending"
|
|
658
|
+
fs["infra_error_count"] = fs.get("infra_error_count", 0) + 1
|
|
659
|
+
fs["last_infra_error_session_id"] = session_id
|
|
660
|
+
fs["resume_from_phase"] = None
|
|
661
|
+
|
|
648
662
|
err = update_feature_in_list(feature_list_path, feature_id, new_status)
|
|
649
663
|
if err:
|
|
650
664
|
error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
|
|
@@ -701,6 +715,10 @@ def action_update(args, feature_list_path, state_dir):
|
|
|
701
715
|
if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
|
|
702
716
|
summary["degraded_reason"] = session_status
|
|
703
717
|
summary["restart_policy"] = "finalization_retry"
|
|
718
|
+
elif session_status == "infra_error":
|
|
719
|
+
summary["restart_policy"] = "infra_retry"
|
|
720
|
+
summary["infra_error_count"] = fs.get("infra_error_count", 0)
|
|
721
|
+
summary["artifacts_preserved"] = True
|
|
704
722
|
elif session_status != "success":
|
|
705
723
|
summary["restart_policy"] = "preserve_and_retry"
|
|
706
724
|
summary["artifacts_preserved"] = True
|
|
@@ -42,6 +42,7 @@ SESSION_STATUS_VALUES = [
|
|
|
42
42
|
"failed",
|
|
43
43
|
"crashed",
|
|
44
44
|
"timed_out",
|
|
45
|
+
"infra_error",
|
|
45
46
|
"commit_missing",
|
|
46
47
|
"docs_missing",
|
|
47
48
|
"merge_conflict",
|
|
@@ -314,6 +315,16 @@ def action_update(args, refactor_list_path, state_dir):
|
|
|
314
315
|
rs["sessions"] = []
|
|
315
316
|
rs["last_session_id"] = None
|
|
316
317
|
|
|
318
|
+
err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
|
|
319
|
+
if err:
|
|
320
|
+
error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
|
|
321
|
+
return
|
|
322
|
+
elif session_status == "infra_error":
|
|
323
|
+
new_status = "pending"
|
|
324
|
+
rs["infra_error_count"] = rs.get("infra_error_count", 0) + 1
|
|
325
|
+
rs["last_infra_error_session_id"] = session_id
|
|
326
|
+
rs["resume_from_phase"] = None
|
|
327
|
+
|
|
317
328
|
err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
|
|
318
329
|
if err:
|
|
319
330
|
error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
|
|
@@ -376,6 +387,10 @@ def action_update(args, refactor_list_path, state_dir):
|
|
|
376
387
|
if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
|
|
377
388
|
summary["degraded_reason"] = session_status
|
|
378
389
|
summary["restart_policy"] = "finalization_retry"
|
|
390
|
+
elif session_status == "infra_error":
|
|
391
|
+
summary["restart_policy"] = "infra_retry"
|
|
392
|
+
summary["infra_error_count"] = rs.get("infra_error_count", 0)
|
|
393
|
+
summary["artifacts_preserved"] = True
|
|
379
394
|
elif session_status != "success":
|
|
380
395
|
summary["restart_policy"] = "full_restart"
|
|
381
396
|
summary["cleanup_performed"] = cleaned
|
|
@@ -303,6 +303,45 @@ def _run_get_next(fl_path, state_dir):
|
|
|
303
303
|
return result.stdout.strip()
|
|
304
304
|
|
|
305
305
|
|
|
306
|
+
def _run_update(fl_path, state_dir, feature_id, session_status, session_id="session-1", max_retries=3):
|
|
307
|
+
cmd = [
|
|
308
|
+
"python3", _SCRIPT,
|
|
309
|
+
"--feature-list", fl_path,
|
|
310
|
+
"--state-dir", state_dir,
|
|
311
|
+
"--feature-id", feature_id,
|
|
312
|
+
"--session-status", session_status,
|
|
313
|
+
"--session-id", session_id,
|
|
314
|
+
"--max-retries", str(max_retries),
|
|
315
|
+
"--action", "update",
|
|
316
|
+
]
|
|
317
|
+
result = subprocess.run(cmd, capture_output=True, text=True)
|
|
318
|
+
assert result.returncode == 0, result.stderr
|
|
319
|
+
return json.loads(result.stdout)
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
class TestInfraErrorUpdate:
|
|
323
|
+
def test_infra_error_keeps_pending_without_consuming_retry(self, tmp_path):
|
|
324
|
+
features = [_make_feature("F-001", "Root", status="in_progress")]
|
|
325
|
+
fl_path = _write_fl(tmp_path, features)
|
|
326
|
+
state_dir = _init_state(tmp_path, ["F-001"])
|
|
327
|
+
status_path = os.path.join(state_dir, "features", "F-001", "status.json")
|
|
328
|
+
fs = load_feature_status(state_dir, "F-001")
|
|
329
|
+
fs["retry_count"] = 2
|
|
330
|
+
write_json_file(status_path, fs)
|
|
331
|
+
|
|
332
|
+
result = _run_update(fl_path, state_dir, "F-001", "infra_error", "session-infra", max_retries=3)
|
|
333
|
+
|
|
334
|
+
assert result["new_status"] == "pending"
|
|
335
|
+
assert result["retry_count"] == 2
|
|
336
|
+
assert result["restart_policy"] == "infra_retry"
|
|
337
|
+
assert _read_statuses(fl_path)["F-001"] == "pending"
|
|
338
|
+
|
|
339
|
+
fs = load_feature_status(state_dir, "F-001")
|
|
340
|
+
assert fs["retry_count"] == 2
|
|
341
|
+
assert fs["infra_error_count"] == 1
|
|
342
|
+
assert fs["last_infra_error_session_id"] == "session-infra"
|
|
343
|
+
|
|
344
|
+
|
|
306
345
|
class TestUnskipByFeatureId:
|
|
307
346
|
"""Unskip with --feature-id targets a specific failed feature + downstream."""
|
|
308
347
|
|
|
@@ -22,9 +22,10 @@
|
|
|
22
22
|
# SESSION_TIMEOUT=0 # Session timeout in seconds (0 = no limit)
|
|
23
23
|
# VERBOSE=1 # Verbose logging (1=on, 0=off)
|
|
24
24
|
# HEARTBEAT_INTERVAL=30 # Poll interval for session progress/stale checks
|
|
25
|
-
# STALE_KILL_THRESHOLD=900 # Auto-kill session after N seconds without log progress (0 = disabled)
|
|
25
|
+
# STALE_KILL_THRESHOLD=900 # Auto-kill session after N seconds without parent log progress (0 = disabled)
|
|
26
26
|
# STALE_KILL_GRACE_SECONDS=10 # Grace period after stale-kill before force-stopping the job
|
|
27
|
-
#
|
|
27
|
+
# CODEX_WAIT_STALE_KILL_THRESHOLD=3600 # Longer no-log window while Codex waits on subagents
|
|
28
|
+
# CODEX_SUBAGENT_TIMEOUT_SECONDS=3300 # Codex subagent max runtime; defaults to wait threshold - 300
|
|
28
29
|
# LOG_CLEANUP_ENABLED=1 # Run periodic session log cleanup
|
|
29
30
|
# LOG_RETENTION_DAYS=14 # Delete session logs older than N days
|
|
30
31
|
# LOG_MAX_TOTAL_MB=1024 # Keep total logs under N MB via oldest-first cleanup
|
|
@@ -346,13 +346,14 @@ pending, in_progress, completed, failed, skipped
|
|
|
346
346
|
| `SESSION_TIMEOUT` | integer | 0 | 0 = no limit |
|
|
347
347
|
| `VERBOSE` | integer | (not specified) | 1=on, 0=off |
|
|
348
348
|
| `HEARTBEAT_INTERVAL` | integer | 30 | Poll interval for session progress/stale checks |
|
|
349
|
-
| `STALE_KILL_THRESHOLD` | integer | 900 | Auto-kill after N seconds without log progress; 0 disables |
|
|
349
|
+
| `STALE_KILL_THRESHOLD` | integer | 900 | Auto-kill after N seconds without parent log progress; 0 disables |
|
|
350
350
|
| `STALE_KILL_GRACE_SECONDS` | integer | 10 | Grace period after stale-kill before force-stopping |
|
|
351
|
-
| `
|
|
351
|
+
| `CODEX_WAIT_STALE_KILL_THRESHOLD` | integer | 3600 | Longer no-log stale window while Codex waits on subagents |
|
|
352
|
+
| `CODEX_SUBAGENT_TIMEOUT_SECONDS` | integer | 3300 | Codex subagent max runtime |
|
|
352
353
|
| `LOG_CLEANUP_ENABLED` | boolean | 1 | Periodic session log cleanup |
|
|
353
354
|
| `LOG_RETENTION_DAYS` | integer | 14 | Delete session logs older than N days |
|
|
354
355
|
| `LOG_MAX_TOTAL_MB` | integer | 1024 | Keep total logs under N MB |
|
|
355
|
-
| `STOP_ON_FAILURE` | boolean | 0 | Stop after
|
|
356
|
+
| `STOP_ON_FAILURE` | boolean | 0 | Stop after a task exhausts retries |
|
|
356
357
|
| `ENABLE_DEPLOY` | boolean | 0 | Start deploy session after all tasks complete |
|
|
357
358
|
| `DEV_BRANCH` | string | auto-generated | Optional custom dev branch name |
|
|
358
359
|
| `AUTO_PUSH` | boolean | 0 | Push original branch after successful merge |
|
|
@@ -145,6 +145,25 @@ function Invoke-PrizmPythonText {
|
|
|
145
145
|
if ($LASTEXITCODE -ne 0) { throw "Python command failed: $($Arguments -join ' ')" }
|
|
146
146
|
}
|
|
147
147
|
|
|
148
|
+
function Test-PrizmInfraError {
|
|
149
|
+
param([string]$SessionLog, [string]$ProgressJson)
|
|
150
|
+
$parts = @()
|
|
151
|
+
if ($SessionLog -and (Test-Path $SessionLog)) {
|
|
152
|
+
try {
|
|
153
|
+
$text = Get-Content $SessionLog -Raw -ErrorAction Stop
|
|
154
|
+
if ($text.Length -gt 65536) { $text = $text.Substring($text.Length - 65536) }
|
|
155
|
+
$parts += $text
|
|
156
|
+
} catch {}
|
|
157
|
+
}
|
|
158
|
+
if ($ProgressJson -and (Test-Path $ProgressJson)) {
|
|
159
|
+
try { $parts += (Get-Content $ProgressJson -Raw -ErrorAction Stop) } catch {}
|
|
160
|
+
}
|
|
161
|
+
if ($parts.Count -eq 0) { return $false }
|
|
162
|
+
|
|
163
|
+
$haystack = $parts -join "`n"
|
|
164
|
+
return ($haystack -match '(?i)auth_unavailable|no auth available|502 Bad Gateway|503 Service Unavailable|504 Gateway Timeout|gateway timeout|upstream (connect )?error|connection reset|ECONNRESET|ETIMEDOUT|ENOTFOUND|EAI_AGAIN|rate limit|rate_limit|temporarily unavailable|overloaded')
|
|
165
|
+
}
|
|
166
|
+
|
|
148
167
|
function Get-PrizmConfigValue {
|
|
149
168
|
param([string]$ConfigPath, [string]$Key)
|
|
150
169
|
if (-not (Test-Path $ConfigPath)) { return $null }
|
|
@@ -345,13 +364,86 @@ function Get-PrizmCodexSubagentTimeoutSeconds {
|
|
|
345
364
|
return $configuredTimeout
|
|
346
365
|
}
|
|
347
366
|
|
|
348
|
-
$
|
|
349
|
-
$
|
|
350
|
-
if ([int]::TryParse($
|
|
351
|
-
return ($
|
|
367
|
+
$waitThreshold = 0
|
|
368
|
+
$waitThresholdText = if ($env:CODEX_WAIT_STALE_KILL_THRESHOLD) { $env:CODEX_WAIT_STALE_KILL_THRESHOLD } else { '3600' }
|
|
369
|
+
if ([int]::TryParse($waitThresholdText, [ref]$waitThreshold) -and $waitThreshold -gt 600) {
|
|
370
|
+
return ($waitThreshold - 300)
|
|
352
371
|
}
|
|
353
372
|
|
|
354
|
-
return
|
|
373
|
+
return 3300
|
|
374
|
+
}
|
|
375
|
+
|
|
376
|
+
function Get-PrizmEffectiveStaleKillThreshold {
|
|
377
|
+
param(
|
|
378
|
+
[string]$ProgressFile,
|
|
379
|
+
[int]$BaseThreshold
|
|
380
|
+
)
|
|
381
|
+
|
|
382
|
+
if ($BaseThreshold -le 0) { return $BaseThreshold }
|
|
383
|
+
if (-not (Test-Path $ProgressFile)) { return $BaseThreshold }
|
|
384
|
+
|
|
385
|
+
try {
|
|
386
|
+
$progress = Get-Content $ProgressFile -Raw -ErrorAction Stop | ConvertFrom-Json -ErrorAction Stop
|
|
387
|
+
} catch {
|
|
388
|
+
return $BaseThreshold
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
$spawnCount = 0
|
|
392
|
+
if ($progress.event_format -eq 'codex-json' -and $progress.current_tool -eq 'wait' -and $progress.tool_calls) {
|
|
393
|
+
foreach ($tool in @($progress.tool_calls)) {
|
|
394
|
+
if ($tool.name -eq 'spawn_agent') {
|
|
395
|
+
$count = 0
|
|
396
|
+
if ([int]::TryParse([string]$tool.count, [ref]$count)) { $spawnCount += $count }
|
|
397
|
+
}
|
|
398
|
+
}
|
|
399
|
+
}
|
|
400
|
+
|
|
401
|
+
if ($spawnCount -le 0) { return $BaseThreshold }
|
|
402
|
+
|
|
403
|
+
$waitThreshold = 0
|
|
404
|
+
if ([int]::TryParse($env:CODEX_WAIT_STALE_KILL_THRESHOLD, [ref]$waitThreshold) -and $waitThreshold -gt $BaseThreshold) {
|
|
405
|
+
return $waitThreshold
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
return [Math]::Max($BaseThreshold * 4, 3600)
|
|
409
|
+
}
|
|
410
|
+
|
|
411
|
+
function Get-PrizmProgressChildActivity {
|
|
412
|
+
param([string]$ProgressFile)
|
|
413
|
+
|
|
414
|
+
$empty = [pscustomobject]@{
|
|
415
|
+
Signature = ''
|
|
416
|
+
TotalBytes = 0
|
|
417
|
+
SessionCount = 0
|
|
418
|
+
}
|
|
419
|
+
if (-not (Test-Path $ProgressFile)) { return $empty }
|
|
420
|
+
|
|
421
|
+
try {
|
|
422
|
+
$progress = Get-Content $ProgressFile -Raw -ErrorAction Stop | ConvertFrom-Json -ErrorAction Stop
|
|
423
|
+
} catch {
|
|
424
|
+
return $empty
|
|
425
|
+
}
|
|
426
|
+
|
|
427
|
+
$signature = ''
|
|
428
|
+
if ($progress.PSObject.Properties['child_activity_signature'] -and $progress.child_activity_signature) {
|
|
429
|
+
$signature = [string]$progress.child_activity_signature
|
|
430
|
+
}
|
|
431
|
+
|
|
432
|
+
$totalBytes = [int64]0
|
|
433
|
+
if ($progress.PSObject.Properties['child_total_bytes']) {
|
|
434
|
+
[int64]::TryParse([string]$progress.child_total_bytes, [ref]$totalBytes) | Out-Null
|
|
435
|
+
}
|
|
436
|
+
|
|
437
|
+
$sessionCount = 0
|
|
438
|
+
if ($progress.PSObject.Properties['child_session_files'] -and $progress.child_session_files) {
|
|
439
|
+
$sessionCount = @($progress.child_session_files).Count
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
return [pscustomobject]@{
|
|
443
|
+
Signature = $signature
|
|
444
|
+
TotalBytes = $totalBytes
|
|
445
|
+
SessionCount = $sessionCount
|
|
446
|
+
}
|
|
355
447
|
}
|
|
356
448
|
|
|
357
449
|
function Test-PrizmCodexJsonSupport {
|
|
@@ -552,6 +552,7 @@ function Invoke-PrizmPipeline {
|
|
|
552
552
|
$elapsedSeconds = 0
|
|
553
553
|
$staleSeconds = 0
|
|
554
554
|
$previousLogSize = 0
|
|
555
|
+
$previousChildActivitySignature = ''
|
|
555
556
|
$wasTimedOut = $false
|
|
556
557
|
$staleKillMarker = Join-Path $logsDir 'stale-kill.json'
|
|
557
558
|
$wasStaleKilled = $false
|
|
@@ -568,7 +569,13 @@ function Invoke-PrizmPipeline {
|
|
|
568
569
|
}
|
|
569
570
|
$growth = $currentLogSize - $previousLogSize
|
|
570
571
|
$previousLogSize = $currentLogSize
|
|
571
|
-
|
|
572
|
+
|
|
573
|
+
$childActivity = Get-PrizmProgressChildActivity -ProgressFile $progressJson
|
|
574
|
+
$childSignature = [string]$childActivity.Signature
|
|
575
|
+
$childAdvanced = ($childSignature -and $childSignature -ne $previousChildActivitySignature)
|
|
576
|
+
$previousChildActivitySignature = $childSignature
|
|
577
|
+
|
|
578
|
+
if ($growth -gt 0 -or $childAdvanced) {
|
|
572
579
|
$staleSeconds = 0
|
|
573
580
|
} else {
|
|
574
581
|
$staleSeconds += $waitSeconds
|
|
@@ -580,10 +587,11 @@ function Invoke-PrizmPipeline {
|
|
|
580
587
|
break
|
|
581
588
|
}
|
|
582
589
|
|
|
583
|
-
|
|
590
|
+
$effectiveStaleKillThreshold = Get-PrizmEffectiveStaleKillThreshold -ProgressFile $progressJson -BaseThreshold $staleKillThreshold
|
|
591
|
+
if ($effectiveStaleKillThreshold -gt 0 -and $staleSeconds -ge $effectiveStaleKillThreshold) {
|
|
584
592
|
$wasStaleKilled = $true
|
|
585
|
-
Write-PrizmWarn "Session stale-killed (no progress for ${
|
|
586
|
-
Write-PrizmStaleKillMarker $staleKillMarker $staleSeconds $
|
|
593
|
+
Write-PrizmWarn "Session stale-killed (no progress for ${effectiveStaleKillThreshold}s)"
|
|
594
|
+
Write-PrizmStaleKillMarker $staleKillMarker $staleSeconds $effectiveStaleKillThreshold
|
|
587
595
|
Stop-PrizmSessionProcess $pidPath
|
|
588
596
|
if ($staleKillGraceSeconds -gt 0) { Start-Sleep -Seconds $staleKillGraceSeconds }
|
|
589
597
|
break
|
|
@@ -610,10 +618,16 @@ function Invoke-PrizmPipeline {
|
|
|
610
618
|
}
|
|
611
619
|
Stop-PrizmProgressParser $parserProcess
|
|
612
620
|
|
|
621
|
+
$wasInfraError = ($exitCode -ne 0 -and (Test-PrizmInfraError -SessionLog $sessionLog -ProgressJson $progressJson))
|
|
622
|
+
|
|
613
623
|
$status = 'crashed'
|
|
614
624
|
if ($wasTimedOut) {
|
|
615
625
|
$status = 'timed_out'
|
|
616
626
|
Write-PrizmWarn "AI session timed out after $timeoutSeconds seconds"
|
|
627
|
+
} elseif ($wasInfraError) {
|
|
628
|
+
$status = 'infra_error'
|
|
629
|
+
Write-PrizmWarn "AI session failed due to AI CLI/provider infrastructure error"
|
|
630
|
+
Write-PrizmWarn "Infrastructure errors are retried without consuming code retry budget"
|
|
617
631
|
} elseif ($wasStaleKilled -or (Test-Path $staleKillMarker)) {
|
|
618
632
|
Write-PrizmWarn "Session was stale-killed by heartbeat monitor (no progress for too long)"
|
|
619
633
|
Write-PrizmWarn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
|
|
@@ -637,8 +651,12 @@ function Invoke-PrizmPipeline {
|
|
|
637
651
|
}
|
|
638
652
|
|
|
639
653
|
$mergeSucceeded = $true
|
|
654
|
+
$itemListStatus = ''
|
|
640
655
|
if ($status -eq 'success') {
|
|
641
|
-
Invoke-
|
|
656
|
+
$updateResult = Invoke-PrizmPythonJson $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
|
|
657
|
+
if ($updateResult -and $updateResult.PSObject.Properties['new_status']) {
|
|
658
|
+
$itemListStatus = [string]$updateResult.new_status
|
|
659
|
+
}
|
|
642
660
|
|
|
643
661
|
if (Test-PrizmGitDirty $paths.ProjectRoot) {
|
|
644
662
|
if ($hadDirtyBaseline) {
|
|
@@ -668,7 +686,10 @@ function Invoke-PrizmPipeline {
|
|
|
668
686
|
}
|
|
669
687
|
|
|
670
688
|
if ($status -ne 'success') {
|
|
671
|
-
Invoke-
|
|
689
|
+
$updateResult = Invoke-PrizmPythonJson $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
|
|
690
|
+
if ($updateResult -and $updateResult.PSObject.Properties['new_status']) {
|
|
691
|
+
$itemListStatus = [string]$updateResult.new_status
|
|
692
|
+
}
|
|
672
693
|
if ($isGitRepository) {
|
|
673
694
|
Invoke-PrizmGitCommitPath $paths.ProjectRoot $listPath "chore($CurrentItemId): update $idName status" | Out-Null
|
|
674
695
|
}
|
|
@@ -679,6 +700,7 @@ function Invoke-PrizmPipeline {
|
|
|
679
700
|
} else {
|
|
680
701
|
Write-PrizmError "$Kind item failed: $CurrentItemId. Log: $sessionLog"
|
|
681
702
|
}
|
|
703
|
+
$script:PRIZM_ITEM_LIST_STATUS = $itemListStatus
|
|
682
704
|
$script:PRIZM_ITEM_EXIT_CODE = if ($status -eq 'success' -and $mergeSucceeded) { 0 } else { 1 }
|
|
683
705
|
return
|
|
684
706
|
}
|
|
@@ -740,9 +762,11 @@ function Invoke-PrizmPipeline {
|
|
|
740
762
|
$global:PRIZM_EXIT_CODE = $lastExitCode
|
|
741
763
|
return
|
|
742
764
|
}
|
|
743
|
-
if ($lastExitCode -ne 0 -and $stopOnFailure) {
|
|
765
|
+
if ($lastExitCode -ne 0 -and $stopOnFailure -and $script:PRIZM_ITEM_LIST_STATUS -eq 'failed') {
|
|
744
766
|
$global:PRIZM_EXIT_CODE = $lastExitCode
|
|
745
767
|
return
|
|
768
|
+
} elseif ($lastExitCode -ne 0 -and $stopOnFailure) {
|
|
769
|
+
Write-PrizmInfo "STOP_ON_FAILURE: $nextItemId is $($script:PRIZM_ITEM_LIST_STATUS); retry budget not exhausted, continuing."
|
|
746
770
|
}
|
|
747
771
|
}
|
|
748
772
|
}
|
|
@@ -110,6 +110,7 @@ $job = Start-Job -ScriptBlock {
|
|
|
110
110
|
$elapsedSeconds = 0
|
|
111
111
|
$staleSeconds = 0
|
|
112
112
|
$previousLogSize = 0
|
|
113
|
+
$previousChildActivitySignature = ''
|
|
113
114
|
$wasTimedOut = $false
|
|
114
115
|
$wasStaleKilled = $false
|
|
115
116
|
while ($true) {
|
|
@@ -123,7 +124,13 @@ while ($true) {
|
|
|
123
124
|
if (Test-Path $logPath) { $currentLogSize = [int64](Get-Item $logPath).Length }
|
|
124
125
|
$growth = $currentLogSize - $previousLogSize
|
|
125
126
|
$previousLogSize = $currentLogSize
|
|
126
|
-
|
|
127
|
+
|
|
128
|
+
$childActivity = Get-PrizmProgressChildActivity -ProgressFile $progressPath
|
|
129
|
+
$childSignature = [string]$childActivity.Signature
|
|
130
|
+
$childAdvanced = ($childSignature -and $childSignature -ne $previousChildActivitySignature)
|
|
131
|
+
$previousChildActivitySignature = $childSignature
|
|
132
|
+
|
|
133
|
+
if ($growth -gt 0 -or $childAdvanced) { $staleSeconds = 0 } else { $staleSeconds += $waitSeconds }
|
|
127
134
|
|
|
128
135
|
if ($timeoutSeconds -gt 0 -and $elapsedSeconds -ge $timeoutSeconds) {
|
|
129
136
|
$wasTimedOut = $true
|