prizmkit 1.1.66 → 1.1.68

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. package/bundled/VERSION.json +3 -3
  2. package/bundled/adapters/codex/settings-adapter.js +1 -1
  3. package/bundled/dev-pipeline/.env.example +3 -0
  4. package/bundled/dev-pipeline/SCHEMA_ANALYSIS.md +3 -1
  5. package/bundled/dev-pipeline/lib/common.sh +61 -18
  6. package/bundled/dev-pipeline/lib/heartbeat.sh +104 -11
  7. package/bundled/dev-pipeline/run-bugfix.sh +26 -5
  8. package/bundled/dev-pipeline/run-feature.sh +20 -3
  9. package/bundled/dev-pipeline/run-refactor.sh +26 -5
  10. package/bundled/dev-pipeline/scripts/parse-stream-progress.py +144 -12
  11. package/bundled/dev-pipeline/scripts/update-bug-status.py +15 -0
  12. package/bundled/dev-pipeline/scripts/update-feature-status.py +18 -0
  13. package/bundled/dev-pipeline/scripts/update-refactor-status.py +15 -0
  14. package/bundled/dev-pipeline/tests/test_auto_skip.py +39 -0
  15. package/bundled/dev-pipeline-windows/.env.example +3 -2
  16. package/bundled/dev-pipeline-windows/SCHEMA_ANALYSIS.md +4 -3
  17. package/bundled/dev-pipeline-windows/lib/common.ps1 +97 -5
  18. package/bundled/dev-pipeline-windows/lib/pipeline.ps1 +31 -7
  19. package/bundled/dev-pipeline-windows/run-recovery.ps1 +8 -1
  20. package/bundled/dev-pipeline-windows/scripts/parse-stream-progress.py +144 -12
  21. package/bundled/dev-pipeline-windows/scripts/update-bug-status.py +15 -0
  22. package/bundled/dev-pipeline-windows/scripts/update-feature-status.py +18 -0
  23. package/bundled/dev-pipeline-windows/scripts/update-refactor-status.py +15 -0
  24. package/bundled/skills/_metadata.json +1 -1
  25. package/package.json +1 -1
  26. package/src/scaffold.js +1 -1
@@ -23,6 +23,7 @@ import tempfile
23
23
  import time
24
24
  from collections import Counter
25
25
  from datetime import datetime, timezone
26
+ from pathlib import Path
26
27
 
27
28
 
28
29
  # Ordered pipeline phases — index defines forward-only progression.
@@ -76,6 +77,13 @@ class ProgressTracker:
76
77
  self.event_format = ""
77
78
  self.active_subagent_count = 0
78
79
  self.subagent_status_counts = Counter()
80
+ self.codex_child_thread_ids = set()
81
+ self.child_session_files = []
82
+ self.child_total_bytes = 0
83
+ self.child_activity_signature = ""
84
+ self.last_child_activity_at = ""
85
+ self._codex_child_session_paths = {}
86
+ self._last_child_scan_at = 0.0
79
87
  self._text_buffer = ""
80
88
  self._in_tool_use = False
81
89
  self._current_tool_input_parts = []
@@ -113,6 +121,9 @@ class ProgressTracker:
113
121
 
114
122
  elif item_type == "collab_tool_call":
115
123
  tool_name = item.get("tool", "collab")
124
+ self._record_codex_child_thread_ids(
125
+ item.get("receiver_thread_ids")
126
+ )
116
127
  if event_type == "item.started":
117
128
  self.current_tool = tool_name
118
129
  self.tool_call_counts[tool_name] += 1
@@ -345,8 +356,117 @@ class ProgressTracker:
345
356
  self.subagent_status_counts = counts
346
357
  self.active_subagent_count = active
347
358
 
359
+ def _record_codex_child_thread_ids(self, thread_ids):
360
+ """Remember Codex child thread IDs reported by collab tool calls."""
361
+ if not isinstance(thread_ids, list):
362
+ return
363
+ for thread_id in thread_ids:
364
+ if isinstance(thread_id, str) and thread_id.strip():
365
+ self.codex_child_thread_ids.add(thread_id.strip())
366
+
367
+ def _codex_sessions_dir(self):
368
+ """Return the Codex sessions directory for the current environment."""
369
+ codex_home = os.environ.get("CODEX_HOME")
370
+ if codex_home:
371
+ return Path(codex_home).expanduser() / "sessions"
372
+ return Path.home() / ".codex" / "sessions"
373
+
374
+ def _find_codex_child_session_file(self, thread_id):
375
+ """Find a Codex transcript file for a child thread ID."""
376
+ sessions_dir = self._codex_sessions_dir()
377
+ if not sessions_dir.exists():
378
+ return None
379
+
380
+ try:
381
+ matches = list(sessions_dir.rglob(f"*{thread_id}.jsonl"))
382
+ except OSError:
383
+ return None
384
+
385
+ if not matches:
386
+ return None
387
+
388
+ try:
389
+ matches.sort(key=lambda path: path.stat().st_mtime, reverse=True)
390
+ except OSError:
391
+ pass
392
+ return str(matches[0])
393
+
394
def refresh_child_session_activity(self, force=False):
    """Re-stat the Codex child transcripts and update the activity fields.

    While the parent Codex session is blocked in `wait`, the heartbeat
    monitor relies on the signature computed here so that growth of a
    subagent transcript is treated as real progress.

    Args:
        force: Resolve transcript paths now, bypassing the two-second
            throttle between lookups.

    Returns:
        True when the activity signature differs from its value before
        this call, False otherwise.
    """
    signature_before = self.child_activity_signature
    stamp_format = "%Y-%m-%dT%H:%M:%SZ"
    known_ids = sorted(self.codex_child_thread_ids)

    if not known_ids:
        # No child threads recorded: clear every derived field.
        self.child_session_files = []
        self.child_total_bytes = 0
        self.child_activity_signature = ""
        self.last_child_activity_at = ""
        return signature_before != self.child_activity_signature

    now = time.monotonic()
    throttled = (
        not force
        and self._last_child_scan_at != 0.0
        and now - self._last_child_scan_at < 2.0
    )
    if not throttled:
        # Only look up threads whose cached transcript path is missing.
        for thread_id in known_ids:
            cached = self._codex_child_session_paths.get(thread_id)
            if cached and os.path.exists(cached):
                continue
            located = self._find_codex_child_session_file(thread_id)
            if located:
                self._codex_child_session_paths[thread_id] = located
        self._last_child_scan_at = now

    entries = []
    fingerprints = []
    byte_total = 0
    newest_mtime = 0.0

    for thread_id in known_ids:
        transcript = self._codex_child_session_paths.get(thread_id)
        if not transcript:
            continue
        try:
            info = os.stat(transcript)
        except OSError:
            # Unreadable or vanished transcript: leave it out of the stats.
            continue

        mtime_ns = getattr(
            info, "st_mtime_ns", int(info.st_mtime * 1_000_000_000)
        )
        byte_total += info.st_size
        newest_mtime = max(newest_mtime, info.st_mtime)
        fingerprints.append(f"{thread_id}:{info.st_size}:{mtime_ns}")
        entries.append(
            {
                "thread_id": thread_id,
                "path": transcript,
                "size": info.st_size,
                "mtime": datetime.fromtimestamp(
                    info.st_mtime, timezone.utc
                ).strftime(stamp_format),
            }
        )

    self.child_session_files = entries
    self.child_total_bytes = byte_total
    self.child_activity_signature = "|".join(fingerprints)
    if newest_mtime:
        self.last_child_activity_at = datetime.fromtimestamp(
            newest_mtime, timezone.utc
        ).strftime(stamp_format)
    else:
        self.last_child_activity_at = ""
    return signature_before != self.child_activity_signature
466
+
348
467
  def to_dict(self):
349
468
  """Export current state as a dictionary for JSON serialization."""
469
+ self.refresh_child_session_activity()
350
470
  tool_calls = [
351
471
  {"name": name, "count": count}
352
472
  for name, count in self.tool_call_counts.most_common()
@@ -367,6 +487,11 @@ class ProgressTracker:
367
487
  "total_tool_calls": self.total_tool_calls,
368
488
  "active_subagent_count": self.active_subagent_count,
369
489
  "subagent_states": subagent_states,
490
+ "child_thread_ids": sorted(self.codex_child_thread_ids),
491
+ "child_session_files": self.child_session_files,
492
+ "child_total_bytes": self.child_total_bytes,
493
+ "child_activity_signature": self.child_activity_signature,
494
+ "last_child_activity_at": self.last_child_activity_at,
370
495
  "last_text_snippet": self.last_text_snippet,
371
496
  "is_active": self.is_active,
372
497
  "errors": self.errors[-10:], # Keep last 10 errors
@@ -397,6 +522,15 @@ def tail_and_parse(session_log, progress_file, poll_interval=0.5):
397
522
  tracker = ProgressTracker()
398
523
  last_write_state = None
399
524
 
525
def state_key(state):
    """Reduce a progress snapshot to the tuple compared before rewriting the progress file."""
    required = (
        "message_count",
        "current_tool",
        "current_phase",
        "total_tool_calls",
    )
    # The child signature is optional in the snapshot; the rest must exist.
    child_signature = state.get("child_activity_signature", "")
    return tuple(state[name] for name in required) + (child_signature,)
533
+
400
534
  # Wait for log file to appear
401
535
  wait_count = 0
402
536
  while not os.path.exists(session_log):
@@ -428,22 +562,20 @@ def tail_and_parse(session_log, progress_file, poll_interval=0.5):
428
562
 
429
563
  # Write progress if state changed
430
564
  current_state = tracker.to_dict()
431
- state_key = (
432
- current_state["message_count"],
433
- current_state["current_tool"],
434
- current_state["current_phase"],
435
- current_state["total_tool_calls"],
436
- )
437
- if state_key != last_write_state:
565
+ current_state_key = state_key(current_state)
566
+ if current_state_key != last_write_state:
438
567
  atomic_write_json(current_state, progress_file)
439
- last_write_state = state_key
568
+ last_write_state = current_state_key
440
569
  else:
441
570
  idle_count += 1
442
- # After 2 seconds of no new data, write current state anyway
443
- # (ensures progress.json stays fresh)
444
- if idle_count == 4:
571
+ # Every 2 seconds of no parent log data, refresh child Codex
572
+ # transcript stats and write if child activity advanced.
573
+ if idle_count % 4 == 0:
445
574
  current_state = tracker.to_dict()
446
- atomic_write_json(current_state, progress_file)
575
+ current_state_key = state_key(current_state)
576
+ if current_state_key != last_write_state or idle_count == 4:
577
+ atomic_write_json(current_state, progress_file)
578
+ last_write_state = current_state_key
447
579
 
448
580
  # After 3600 idle cycles (30 min), mark inactive and exit
449
581
  if idle_count > 3600:
@@ -41,6 +41,7 @@ SESSION_STATUS_VALUES = [
41
41
  "failed",
42
42
  "crashed",
43
43
  "timed_out",
44
+ "infra_error",
44
45
  "commit_missing",
45
46
  "docs_missing",
46
47
  "merge_conflict",
@@ -280,6 +281,16 @@ def action_update(args, bug_list_path, state_dir):
280
281
  bs["sessions"] = []
281
282
  bs["last_session_id"] = None
282
283
 
284
+ err = update_bug_in_list(bug_list_path, bug_id, new_status)
285
+ if err:
286
+ error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
287
+ return
288
+ elif session_status == "infra_error":
289
+ new_status = "pending"
290
+ bs["infra_error_count"] = bs.get("infra_error_count", 0) + 1
291
+ bs["last_infra_error_session_id"] = session_id
292
+ bs["resume_from_phase"] = None
293
+
283
294
  err = update_bug_in_list(bug_list_path, bug_id, new_status)
284
295
  if err:
285
296
  error_out("Failed to update .prizmkit/plans/bug-fix-list.json: {}".format(err))
@@ -333,6 +344,10 @@ def action_update(args, bug_list_path, state_dir):
333
344
  if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
334
345
  summary["degraded_reason"] = session_status
335
346
  summary["restart_policy"] = "finalization_retry"
347
+ elif session_status == "infra_error":
348
+ summary["restart_policy"] = "infra_retry"
349
+ summary["infra_error_count"] = bs.get("infra_error_count", 0)
350
+ summary["artifacts_preserved"] = True
336
351
  elif session_status != "success":
337
352
  summary["restart_policy"] = "full_restart"
338
353
  summary["cleanup_performed"] = cleaned
@@ -45,6 +45,7 @@ SESSION_STATUS_VALUES = [
45
45
  "failed",
46
46
  "crashed",
47
47
  "timed_out",
48
+ "infra_error",
48
49
  "commit_missing",
49
50
  "docs_missing",
50
51
  "merge_conflict",
@@ -645,6 +646,19 @@ def action_update(args, feature_list_path, state_dir):
645
646
  fs["sessions"] = []
646
647
  fs["last_session_id"] = None
647
648
 
649
+ err = update_feature_in_list(feature_list_path, feature_id, new_status)
650
+ if err:
651
+ error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
652
+ return
653
+ elif session_status == "infra_error":
654
+ # AI CLI/provider outage, auth failure, gateway error, etc.
655
+ # This is outside the code's control, so keep the item pending without
656
+ # consuming the task's retry budget.
657
+ new_status = "pending"
658
+ fs["infra_error_count"] = fs.get("infra_error_count", 0) + 1
659
+ fs["last_infra_error_session_id"] = session_id
660
+ fs["resume_from_phase"] = None
661
+
648
662
  err = update_feature_in_list(feature_list_path, feature_id, new_status)
649
663
  if err:
650
664
  error_out("Failed to update .prizmkit/plans/feature-list.json: {}".format(err))
@@ -701,6 +715,10 @@ def action_update(args, feature_list_path, state_dir):
701
715
  if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
702
716
  summary["degraded_reason"] = session_status
703
717
  summary["restart_policy"] = "finalization_retry"
718
+ elif session_status == "infra_error":
719
+ summary["restart_policy"] = "infra_retry"
720
+ summary["infra_error_count"] = fs.get("infra_error_count", 0)
721
+ summary["artifacts_preserved"] = True
704
722
  elif session_status != "success":
705
723
  summary["restart_policy"] = "preserve_and_retry"
706
724
  summary["artifacts_preserved"] = True
@@ -42,6 +42,7 @@ SESSION_STATUS_VALUES = [
42
42
  "failed",
43
43
  "crashed",
44
44
  "timed_out",
45
+ "infra_error",
45
46
  "commit_missing",
46
47
  "docs_missing",
47
48
  "merge_conflict",
@@ -314,6 +315,16 @@ def action_update(args, refactor_list_path, state_dir):
314
315
  rs["sessions"] = []
315
316
  rs["last_session_id"] = None
316
317
 
318
+ err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
319
+ if err:
320
+ error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
321
+ return
322
+ elif session_status == "infra_error":
323
+ new_status = "pending"
324
+ rs["infra_error_count"] = rs.get("infra_error_count", 0) + 1
325
+ rs["last_infra_error_session_id"] = session_id
326
+ rs["resume_from_phase"] = None
327
+
317
328
  err = update_refactor_in_list(refactor_list_path, refactor_id, new_status)
318
329
  if err:
319
330
  error_out("Failed to update .prizmkit/plans/refactor-list.json: {}".format(err))
@@ -376,6 +387,10 @@ def action_update(args, refactor_list_path, state_dir):
376
387
  if session_status in ("commit_missing", "docs_missing", "merge_conflict"):
377
388
  summary["degraded_reason"] = session_status
378
389
  summary["restart_policy"] = "finalization_retry"
390
+ elif session_status == "infra_error":
391
+ summary["restart_policy"] = "infra_retry"
392
+ summary["infra_error_count"] = rs.get("infra_error_count", 0)
393
+ summary["artifacts_preserved"] = True
379
394
  elif session_status != "success":
380
395
  summary["restart_policy"] = "full_restart"
381
396
  summary["cleanup_performed"] = cleaned
@@ -303,6 +303,45 @@ def _run_get_next(fl_path, state_dir):
303
303
  return result.stdout.strip()
304
304
 
305
305
 
306
def _run_update(fl_path, state_dir, feature_id, session_status, session_id="session-1", max_retries=3):
    """Run the status script's ``update`` action and return its JSON output.

    Args:
        fl_path: Value passed as ``--feature-list``.
        state_dir: Value passed as ``--state-dir``.
        feature_id: Value passed as ``--feature-id``.
        session_status: Value passed as ``--session-status``.
        session_id: Value passed as ``--session-id``.
        max_retries: Value passed (as text) as ``--max-retries``.

    Returns:
        The script's stdout parsed as JSON.

    Raises:
        AssertionError: If the script exits non-zero; the message is the
            script's stderr.
    """
    import sys  # local import: keeps this helper self-contained

    # Run the script with the interpreter executing the tests instead of
    # whichever ``python3`` is first on PATH (it may be missing entirely or
    # belong to a different environment).
    interpreter = sys.executable or "python3"
    cmd = [
        interpreter, _SCRIPT,
        "--feature-list", fl_path,
        "--state-dir", state_dir,
        "--feature-id", feature_id,
        "--session-status", session_status,
        "--session-id", session_id,
        "--max-retries", str(max_retries),
        "--action", "update",
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    assert result.returncode == 0, result.stderr
    return json.loads(result.stdout)
320
+
321
+
322
class TestInfraErrorUpdate:
    """An ``infra_error`` session re-queues the feature without spending a retry."""

    def test_infra_error_keeps_pending_without_consuming_retry(self, tmp_path):
        feature_id = "F-001"
        fl_path = _write_fl(
            tmp_path, [_make_feature(feature_id, "Root", status="in_progress")]
        )
        state_dir = _init_state(tmp_path, [feature_id])

        # Seed a non-zero retry count so an accidental increment is visible.
        seeded = load_feature_status(state_dir, feature_id)
        seeded["retry_count"] = 2
        write_json_file(
            os.path.join(state_dir, "features", feature_id, "status.json"),
            seeded,
        )

        outcome = _run_update(
            fl_path, state_dir, feature_id, "infra_error", "session-infra", max_retries=3
        )

        # What the script reports back...
        assert outcome["new_status"] == "pending"
        assert outcome["retry_count"] == 2
        assert outcome["restart_policy"] == "infra_retry"
        assert _read_statuses(fl_path)[feature_id] == "pending"

        # ...and what it persisted to the per-feature status file.
        persisted = load_feature_status(state_dir, feature_id)
        assert persisted["retry_count"] == 2
        assert persisted["infra_error_count"] == 1
        assert persisted["last_infra_error_session_id"] == "session-infra"
343
+
344
+
306
345
  class TestUnskipByFeatureId:
307
346
  """Unskip with --feature-id targets a specific failed feature + downstream."""
308
347
 
@@ -22,9 +22,10 @@
22
22
  # SESSION_TIMEOUT=0 # Session timeout in seconds (0 = no limit)
23
23
  # VERBOSE=1 # Verbose logging (1=on, 0=off)
24
24
  # HEARTBEAT_INTERVAL=30 # Poll interval for session progress/stale checks
25
- # STALE_KILL_THRESHOLD=900 # Auto-kill session after N seconds without log progress (0 = disabled)
25
+ # STALE_KILL_THRESHOLD=900 # Auto-kill session after N seconds without parent log progress (0 = disabled)
26
26
  # STALE_KILL_GRACE_SECONDS=10 # Grace period after stale-kill before force-stopping the job
27
- # CODEX_SUBAGENT_TIMEOUT_SECONDS=840 # Codex subagent max runtime; defaults to stale threshold - 60
27
+ # CODEX_WAIT_STALE_KILL_THRESHOLD=3600 # Longer no-log window while Codex waits on subagents
28
+ # CODEX_SUBAGENT_TIMEOUT_SECONDS=3300 # Codex subagent max runtime; defaults to wait threshold - 300
28
29
  # LOG_CLEANUP_ENABLED=1 # Run periodic session log cleanup
29
30
  # LOG_RETENTION_DAYS=14 # Delete session logs older than N days
30
31
  # LOG_MAX_TOTAL_MB=1024 # Keep total logs under N MB via oldest-first cleanup
@@ -346,13 +346,14 @@ pending, in_progress, completed, failed, skipped
346
346
  | `SESSION_TIMEOUT` | integer | 0 | 0 = no limit |
347
347
  | `VERBOSE` | integer | (not specified) | 1=on, 0=off |
348
348
  | `HEARTBEAT_INTERVAL` | integer | 30 | Poll interval for session progress/stale checks |
349
- | `STALE_KILL_THRESHOLD` | integer | 900 | Auto-kill after N seconds without log progress; 0 disables |
349
+ | `STALE_KILL_THRESHOLD` | integer | 900 | Auto-kill after N seconds without parent log progress; 0 disables |
350
350
  | `STALE_KILL_GRACE_SECONDS` | integer | 10 | Grace period after stale-kill before force-stopping |
351
- | `CODEX_SUBAGENT_TIMEOUT_SECONDS` | integer | 840 | Codex subagent max runtime |
351
+ | `CODEX_WAIT_STALE_KILL_THRESHOLD` | integer | 3600 | Longer no-log stale window while Codex waits on subagents |
352
+ | `CODEX_SUBAGENT_TIMEOUT_SECONDS` | integer | 3300 | Codex subagent max runtime |
352
353
  | `LOG_CLEANUP_ENABLED` | boolean | 1 | Periodic session log cleanup |
353
354
  | `LOG_RETENTION_DAYS` | integer | 14 | Delete session logs older than N days |
354
355
  | `LOG_MAX_TOTAL_MB` | integer | 1024 | Keep total logs under N MB |
355
- | `STOP_ON_FAILURE` | boolean | 0 | Stop after the first failed task |
356
+ | `STOP_ON_FAILURE` | boolean | 0 | Stop after a task exhausts retries |
356
357
  | `ENABLE_DEPLOY` | boolean | 0 | Start deploy session after all tasks complete |
357
358
  | `DEV_BRANCH` | string | auto-generated | Optional custom dev branch name |
358
359
  | `AUTO_PUSH` | boolean | 0 | Push original branch after successful merge |
@@ -145,6 +145,25 @@ function Invoke-PrizmPythonText {
145
145
  if ($LASTEXITCODE -ne 0) { throw "Python command failed: $($Arguments -join ' ')" }
146
146
  }
147
147
 
148
function Test-PrizmInfraError {
    # Returns $true when the end of the session log or the progress JSON
    # contains one of the known AI CLI/provider infrastructure failure
    # markers (auth unavailable, gateway errors, connection/DNS failures,
    # rate limiting, overload). Returns $false when neither file can be read.
    param([string]$SessionLog, [string]$ProgressJson)

    $chunks = @()

    if ($SessionLog -and (Test-Path $SessionLog)) {
        try {
            $logText = Get-Content $SessionLog -Raw -ErrorAction Stop
            # Only the last 65536 characters of the log are searched.
            if ($logText.Length -gt 65536) {
                $logText = $logText.Substring($logText.Length - 65536)
            }
            $chunks += $logText
        } catch {}
    }

    if ($ProgressJson -and (Test-Path $ProgressJson)) {
        try {
            $chunks += (Get-Content $ProgressJson -Raw -ErrorAction Stop)
        } catch {}
    }

    if ($chunks.Count -eq 0) { return $false }

    $infraPattern = '(?i)auth_unavailable|no auth available|502 Bad Gateway|503 Service Unavailable|504 Gateway Timeout|gateway timeout|upstream (connect )?error|connection reset|ECONNRESET|ETIMEDOUT|ENOTFOUND|EAI_AGAIN|rate limit|rate_limit|temporarily unavailable|overloaded'
    return (($chunks -join "`n") -match $infraPattern)
}
166
+
148
167
  function Get-PrizmConfigValue {
149
168
  param([string]$ConfigPath, [string]$Key)
150
169
  if (-not (Test-Path $ConfigPath)) { return $null }
@@ -345,13 +364,86 @@ function Get-PrizmCodexSubagentTimeoutSeconds {
345
364
  return $configuredTimeout
346
365
  }
347
366
 
348
- $outerThreshold = 0
349
- $outerThresholdText = if ($env:STALE_KILL_THRESHOLD) { $env:STALE_KILL_THRESHOLD } else { $env:SESSION_TIMEOUT }
350
- if ([int]::TryParse($outerThresholdText, [ref]$outerThreshold) -and $outerThreshold -gt 120) {
351
- return ($outerThreshold - 60)
367
+ $waitThreshold = 0
368
+ $waitThresholdText = if ($env:CODEX_WAIT_STALE_KILL_THRESHOLD) { $env:CODEX_WAIT_STALE_KILL_THRESHOLD } else { '3600' }
369
+ if ([int]::TryParse($waitThresholdText, [ref]$waitThreshold) -and $waitThreshold -gt 600) {
370
+ return ($waitThreshold - 300)
352
371
  }
353
372
 
354
- return 840
373
+ return 3300
374
+ }
375
+
376
function Get-PrizmEffectiveStaleKillThreshold {
    # Chooses the stale-kill threshold (seconds) for the current check.
    #
    # Returns $BaseThreshold unchanged unless the progress file reports a
    # 'codex-json' session whose current tool is 'wait' and that has made at
    # least one 'spawn_agent' tool call. In that case it returns
    # CODEX_WAIT_STALE_KILL_THRESHOLD when that parses as an integer larger
    # than $BaseThreshold, otherwise the larger of $BaseThreshold * 4 and 3600.
    param(
        [string]$ProgressFile,
        [int]$BaseThreshold
    )

    if ($BaseThreshold -le 0) { return $BaseThreshold }
    # Test-Path rejects an empty path, so check for one first.
    if (-not $ProgressFile -or -not (Test-Path $ProgressFile)) { return $BaseThreshold }

    try {
        $progress = Get-Content $ProgressFile -Raw -ErrorAction Stop | ConvertFrom-Json -ErrorAction Stop
    } catch {
        return $BaseThreshold
    }
    # An empty file or a JSON null yields $null; there is nothing to inspect.
    if ($null -eq $progress) { return $BaseThreshold }

    # Read fields through PSObject.Properties so a progress file that lacks
    # one of them cannot fail when strict mode is enabled.
    $props = $progress.PSObject.Properties
    $eventFormat = if ($props['event_format']) { $progress.event_format } else { $null }
    $currentTool = if ($props['current_tool']) { $progress.current_tool } else { $null }
    $toolCalls = if ($props['tool_calls']) { $progress.tool_calls } else { $null }

    $spawnCount = 0
    if ($eventFormat -eq 'codex-json' -and $currentTool -eq 'wait' -and $toolCalls) {
        foreach ($tool in @($toolCalls)) {
            if ($null -eq $tool) { continue }
            $toolProps = $tool.PSObject.Properties
            if ($toolProps['name'] -and $toolProps['count'] -and $tool.name -eq 'spawn_agent') {
                $count = 0
                if ([int]::TryParse([string]$tool.count, [ref]$count)) { $spawnCount += $count }
            }
        }
    }

    if ($spawnCount -le 0) { return $BaseThreshold }

    $waitThreshold = 0
    if ([int]::TryParse($env:CODEX_WAIT_STALE_KILL_THRESHOLD, [ref]$waitThreshold) -and $waitThreshold -gt $BaseThreshold) {
        return $waitThreshold
    }

    return [Math]::Max($BaseThreshold * 4, 3600)
}
410
+
411
function Get-PrizmProgressChildActivity {
    # Reads the child-transcript activity fields from the progress file.
    #
    # Returns an object with:
    #   Signature    - child_activity_signature as a string ('' when absent)
    #   TotalBytes   - child_total_bytes as int64 (0 when absent or unparsable)
    #   SessionCount - number of entries in child_session_files (0 when absent)
    # All three are empty/zero when the file is missing, unreadable, not
    # valid JSON, or parses to nothing.
    param([string]$ProgressFile)

    $empty = [pscustomobject]@{
        Signature = ''
        TotalBytes = 0
        SessionCount = 0
    }
    # Test-Path rejects an empty path, so check for one first.
    if (-not $ProgressFile -or -not (Test-Path $ProgressFile)) { return $empty }

    try {
        $progress = Get-Content $ProgressFile -Raw -ErrorAction Stop | ConvertFrom-Json -ErrorAction Stop
    } catch {
        return $empty
    }
    # An empty file or a JSON null yields $null; indexing its properties
    # below would throw outside the try/catch.
    if ($null -eq $progress) { return $empty }

    $props = $progress.PSObject.Properties

    $signature = ''
    if ($props['child_activity_signature'] -and $progress.child_activity_signature) {
        $signature = [string]$progress.child_activity_signature
    }

    $totalBytes = [int64]0
    if ($props['child_total_bytes']) {
        [int64]::TryParse([string]$progress.child_total_bytes, [ref]$totalBytes) | Out-Null
    }

    $sessionCount = 0
    if ($props['child_session_files'] -and $progress.child_session_files) {
        $sessionCount = @($progress.child_session_files).Count
    }

    return [pscustomobject]@{
        Signature = $signature
        TotalBytes = $totalBytes
        SessionCount = $sessionCount
    }
}
356
448
 
357
449
  function Test-PrizmCodexJsonSupport {
@@ -552,6 +552,7 @@ function Invoke-PrizmPipeline {
552
552
  $elapsedSeconds = 0
553
553
  $staleSeconds = 0
554
554
  $previousLogSize = 0
555
+ $previousChildActivitySignature = ''
555
556
  $wasTimedOut = $false
556
557
  $staleKillMarker = Join-Path $logsDir 'stale-kill.json'
557
558
  $wasStaleKilled = $false
@@ -568,7 +569,13 @@ function Invoke-PrizmPipeline {
568
569
  }
569
570
  $growth = $currentLogSize - $previousLogSize
570
571
  $previousLogSize = $currentLogSize
571
- if ($growth -gt 0) {
572
+
573
+ $childActivity = Get-PrizmProgressChildActivity -ProgressFile $progressJson
574
+ $childSignature = [string]$childActivity.Signature
575
+ $childAdvanced = ($childSignature -and $childSignature -ne $previousChildActivitySignature)
576
+ $previousChildActivitySignature = $childSignature
577
+
578
+ if ($growth -gt 0 -or $childAdvanced) {
572
579
  $staleSeconds = 0
573
580
  } else {
574
581
  $staleSeconds += $waitSeconds
@@ -580,10 +587,11 @@ function Invoke-PrizmPipeline {
580
587
  break
581
588
  }
582
589
 
583
- if ($staleKillThreshold -gt 0 -and $staleSeconds -ge $staleKillThreshold) {
590
+ $effectiveStaleKillThreshold = Get-PrizmEffectiveStaleKillThreshold -ProgressFile $progressJson -BaseThreshold $staleKillThreshold
591
+ if ($effectiveStaleKillThreshold -gt 0 -and $staleSeconds -ge $effectiveStaleKillThreshold) {
584
592
  $wasStaleKilled = $true
585
- Write-PrizmWarn "Session stale-killed (no progress for ${staleKillThreshold}s)"
586
- Write-PrizmStaleKillMarker $staleKillMarker $staleSeconds $staleKillThreshold
593
+ Write-PrizmWarn "Session stale-killed (no progress for ${effectiveStaleKillThreshold}s)"
594
+ Write-PrizmStaleKillMarker $staleKillMarker $staleSeconds $effectiveStaleKillThreshold
587
595
  Stop-PrizmSessionProcess $pidPath
588
596
  if ($staleKillGraceSeconds -gt 0) { Start-Sleep -Seconds $staleKillGraceSeconds }
589
597
  break
@@ -610,10 +618,16 @@ function Invoke-PrizmPipeline {
610
618
  }
611
619
  Stop-PrizmProgressParser $parserProcess
612
620
 
621
+ $wasInfraError = ($exitCode -ne 0 -and (Test-PrizmInfraError -SessionLog $sessionLog -ProgressJson $progressJson))
622
+
613
623
  $status = 'crashed'
614
624
  if ($wasTimedOut) {
615
625
  $status = 'timed_out'
616
626
  Write-PrizmWarn "AI session timed out after $timeoutSeconds seconds"
627
+ } elseif ($wasInfraError) {
628
+ $status = 'infra_error'
629
+ Write-PrizmWarn "AI session failed due to AI CLI/provider infrastructure error"
630
+ Write-PrizmWarn "Infrastructure errors are retried without consuming code retry budget"
617
631
  } elseif ($wasStaleKilled -or (Test-Path $staleKillMarker)) {
618
632
  Write-PrizmWarn "Session was stale-killed by heartbeat monitor (no progress for too long)"
619
633
  Write-PrizmWarn "Stale-killed sessions are treated as failed; dev branch is preserved for inspection"
@@ -637,8 +651,12 @@ function Invoke-PrizmPipeline {
637
651
  }
638
652
 
639
653
  $mergeSucceeded = $true
654
+ $itemListStatus = ''
640
655
  if ($status -eq 'success') {
641
- Invoke-PrizmPythonText $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
656
+ $updateResult = Invoke-PrizmPythonJson $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
657
+ if ($updateResult -and $updateResult.PSObject.Properties['new_status']) {
658
+ $itemListStatus = [string]$updateResult.new_status
659
+ }
642
660
 
643
661
  if (Test-PrizmGitDirty $paths.ProjectRoot) {
644
662
  if ($hadDirtyBaseline) {
@@ -668,7 +686,10 @@ function Invoke-PrizmPipeline {
668
686
  }
669
687
 
670
688
  if ($status -ne 'success') {
671
- Invoke-PrizmPythonText $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
689
+ $updateResult = Invoke-PrizmPythonJson $python (@((Join-Path $paths.ScriptsDir $updateScript), $listOption, $listPath, '--state-dir', $stateDir, '--action', 'update', $idOption, $CurrentItemId, '--session-id', $sessionId, '--session-status', $status) + $maxRetryArgs)
690
+ if ($updateResult -and $updateResult.PSObject.Properties['new_status']) {
691
+ $itemListStatus = [string]$updateResult.new_status
692
+ }
672
693
  if ($isGitRepository) {
673
694
  Invoke-PrizmGitCommitPath $paths.ProjectRoot $listPath "chore($CurrentItemId): update $idName status" | Out-Null
674
695
  }
@@ -679,6 +700,7 @@ function Invoke-PrizmPipeline {
679
700
  } else {
680
701
  Write-PrizmError "$Kind item failed: $CurrentItemId. Log: $sessionLog"
681
702
  }
703
+ $script:PRIZM_ITEM_LIST_STATUS = $itemListStatus
682
704
  $script:PRIZM_ITEM_EXIT_CODE = if ($status -eq 'success' -and $mergeSucceeded) { 0 } else { 1 }
683
705
  return
684
706
  }
@@ -740,9 +762,11 @@ function Invoke-PrizmPipeline {
740
762
  $global:PRIZM_EXIT_CODE = $lastExitCode
741
763
  return
742
764
  }
743
- if ($lastExitCode -ne 0 -and $stopOnFailure) {
765
+ if ($lastExitCode -ne 0 -and $stopOnFailure -and $script:PRIZM_ITEM_LIST_STATUS -eq 'failed') {
744
766
  $global:PRIZM_EXIT_CODE = $lastExitCode
745
767
  return
768
+ } elseif ($lastExitCode -ne 0 -and $stopOnFailure) {
769
+ Write-PrizmInfo "STOP_ON_FAILURE: $nextItemId is $($script:PRIZM_ITEM_LIST_STATUS); retry budget not exhausted, continuing."
746
770
  }
747
771
  }
748
772
  }
@@ -110,6 +110,7 @@ $job = Start-Job -ScriptBlock {
110
110
  $elapsedSeconds = 0
111
111
  $staleSeconds = 0
112
112
  $previousLogSize = 0
113
+ $previousChildActivitySignature = ''
113
114
  $wasTimedOut = $false
114
115
  $wasStaleKilled = $false
115
116
  while ($true) {
@@ -123,7 +124,13 @@ while ($true) {
123
124
  if (Test-Path $logPath) { $currentLogSize = [int64](Get-Item $logPath).Length }
124
125
  $growth = $currentLogSize - $previousLogSize
125
126
  $previousLogSize = $currentLogSize
126
- if ($growth -gt 0) { $staleSeconds = 0 } else { $staleSeconds += $waitSeconds }
127
+
128
+ $childActivity = Get-PrizmProgressChildActivity -ProgressFile $progressPath
129
+ $childSignature = [string]$childActivity.Signature
130
+ $childAdvanced = ($childSignature -and $childSignature -ne $previousChildActivitySignature)
131
+ $previousChildActivitySignature = $childSignature
132
+
133
+ if ($growth -gt 0 -or $childAdvanced) { $staleSeconds = 0 } else { $staleSeconds += $waitSeconds }
127
134
 
128
135
  if ($timeoutSeconds -gt 0 -and $elapsedSeconds -ge $timeoutSeconds) {
129
136
  $wasTimedOut = $true