prizmkit 1.1.66 → 1.1.67

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  {
2
- "frameworkVersion": "1.1.66",
3
- "bundledAt": "2026-06-08T19:45:48.630Z",
4
- "bundledFrom": "940cbd4"
2
+ "frameworkVersion": "1.1.67",
3
+ "bundledAt": "2026-06-09T02:37:28.761Z",
4
+ "bundledFrom": "d4b8c30"
5
5
  }
@@ -20,7 +20,7 @@ project_doc_fallback_filenames = ["CLAUDE.md", "CODEBUDDY.md"]
20
20
 
21
21
  [agents]
22
22
  max_depth = 1
23
- job_max_runtime_seconds = 840
23
+ job_max_runtime_seconds = 3300
24
24
  `;
25
25
 
26
26
  await writeFile(configPath, configToml);
@@ -41,6 +41,9 @@
41
41
  # ─── Logging & Heartbeat ─────────────────────────────────────────────
42
42
  # HEARTBEAT_INTERVAL=30 # Heartbeat log interval in seconds
43
43
  # HEARTBEAT_STALE_THRESHOLD=600 # Max seconds without heartbeat before marking stale
44
+ # STALE_KILL_THRESHOLD=900 # Auto-kill after N seconds without parent log progress (0 = disabled)
45
+ # CODEX_WAIT_STALE_KILL_THRESHOLD=3600 # Longer no-log window while Codex waits on subagents
46
+ # CODEX_SUBAGENT_TIMEOUT_SECONDS=3300 # Codex subagent max runtime; defaults to wait threshold - 300
44
47
  # LOG_CLEANUP_ENABLED=1 # Periodic log cleanup (1=on, 0=off)
45
48
  # LOG_RETENTION_DAYS=14 # Delete logs older than N days
46
49
  # LOG_MAX_TOTAL_MB=1024 # Keep total logs under N MB via oldest-first cleanup
@@ -353,6 +353,9 @@ pending, in_progress, completed, failed, skipped
353
353
  | `DEV_BRANCH` | string | auto-generated | Custom branch name |
354
354
  | `HEARTBEAT_INTERVAL` | integer | 30 | Heartbeat log interval (s) |
355
355
  | `HEARTBEAT_STALE_THRESHOLD` | integer | 600 | Max seconds without heartbeat |
356
+ | `STALE_KILL_THRESHOLD` | integer | 900 | Auto-kill after N seconds without parent log progress |
357
+ | `CODEX_WAIT_STALE_KILL_THRESHOLD` | integer | 3600 | Longer no-log stale window while Codex waits on subagents |
358
+ | `CODEX_SUBAGENT_TIMEOUT_SECONDS` | integer | 3300 | Codex subagent max runtime |
356
359
  | `LOG_CLEANUP_ENABLED` | integer | 1 | Periodic cleanup |
357
360
  | `LOG_RETENTION_DAYS` | integer | 14 | Delete logs older than N days |
358
361
  | `LOG_MAX_TOTAL_MB` | integer | 1024 | Max total logs (MB) |
@@ -532,4 +535,3 @@ Located in `/dev-pipeline/templates/`:
532
535
  - Agent: 6 files
533
536
  - Base/Shared: 7 files
534
537
  - Singleton: 3 files
535
-
@@ -344,6 +344,23 @@ prizm_detect_cli_and_platform() {
344
344
  # command substitution; the background process must remain a child of the
345
345
  # runner shell so wait/heartbeat/trap handling works correctly.
346
346
  PRIZM_AI_PID=""
347
+
348
+ _prizm_codex_subagent_timeout_seconds() {
349
+ local configured="${CODEX_SUBAGENT_TIMEOUT_SECONDS:-}"
350
+ if [[ "$configured" =~ ^[0-9]+$ && "$configured" -gt 0 ]]; then
351
+ printf '%s\n' "$configured"
352
+ return 0
353
+ fi
354
+
355
+ local wait_threshold="${CODEX_WAIT_STALE_KILL_THRESHOLD:-3600}"
356
+ if [[ "$wait_threshold" =~ ^[0-9]+$ && "$wait_threshold" -gt 600 ]]; then
357
+ printf '%s\n' "$((wait_threshold - 300))"
358
+ return 0
359
+ fi
360
+
361
+ printf '%s\n' 3300
362
+ }
363
+
347
364
  prizm_start_ai_session() {
348
365
  local prompt_path="$1"
349
366
  local log_path="$2"
@@ -370,15 +387,8 @@ prizm_start_ai_session() {
370
387
  ;;
371
388
  codex)
372
389
  local codex_args=(--ask-for-approval never --sandbox danger-full-access)
373
- local codex_subagent_timeout="${CODEX_SUBAGENT_TIMEOUT_SECONDS:-}"
374
- if [[ -z "$codex_subagent_timeout" ]]; then
375
- local outer_stale_threshold="${STALE_KILL_THRESHOLD:-900}"
376
- if [[ "$outer_stale_threshold" =~ ^[0-9]+$ && "$outer_stale_threshold" -gt 120 ]]; then
377
- codex_subagent_timeout=$((outer_stale_threshold - 60))
378
- else
379
- codex_subagent_timeout=840
380
- fi
381
- fi
390
+ local codex_subagent_timeout
391
+ codex_subagent_timeout="$(_prizm_codex_subagent_timeout_seconds)"
382
392
  if [[ "$codex_subagent_timeout" =~ ^[0-9]+$ && "$codex_subagent_timeout" -gt 0 ]]; then
383
393
  codex_args+=(--config "agents.job_max_runtime_seconds=$codex_subagent_timeout")
384
394
  fi
@@ -430,15 +440,8 @@ prizm_run_ai_session() {
430
440
  ;;
431
441
  codex)
432
442
  local codex_args=(--ask-for-approval never --sandbox danger-full-access)
433
- local codex_subagent_timeout="${CODEX_SUBAGENT_TIMEOUT_SECONDS:-}"
434
- if [[ -z "$codex_subagent_timeout" ]]; then
435
- local outer_stale_threshold="${STALE_KILL_THRESHOLD:-900}"
436
- if [[ "$outer_stale_threshold" =~ ^[0-9]+$ && "$outer_stale_threshold" -gt 120 ]]; then
437
- codex_subagent_timeout=$((outer_stale_threshold - 60))
438
- else
439
- codex_subagent_timeout=840
440
- fi
441
- fi
443
+ local codex_subagent_timeout
444
+ codex_subagent_timeout="$(_prizm_codex_subagent_timeout_seconds)"
442
445
  if [[ "$codex_subagent_timeout" =~ ^[0-9]+$ && "$codex_subagent_timeout" -gt 0 ]]; then
443
446
  codex_args+=(--config "agents.job_max_runtime_seconds=$codex_subagent_timeout")
444
447
  fi
@@ -41,6 +41,7 @@ start_heartbeat() {
41
41
  (
42
42
  local elapsed=0
43
43
  local prev_size=0
44
+ local prev_child_activity_signature=""
44
45
  local stale_seconds=0
45
46
  while kill -0 "$cli_pid" 2>/dev/null; do
46
47
  sleep "$heartbeat_interval"
@@ -57,8 +58,41 @@ start_heartbeat() {
57
58
  local growth=$((cur_size - prev_size))
58
59
  prev_size=$cur_size
59
60
 
60
- # Track progress staleness (no log growth = stale)
61
- if [[ $growth -eq 0 ]]; then
61
+ local child_activity_signature=""
62
+ local child_total_bytes=0
63
+ local child_session_count=0
64
+ if [[ -f "$progress_json" ]]; then
65
+ local child_activity_data
66
+ child_activity_data=$(python3 - "$progress_json" <<'PY' 2>/dev/null || true
67
+ import json
68
+ import sys
69
+
70
+ try:
71
+ with open(sys.argv[1], "r", encoding="utf-8") as fh:
72
+ progress = json.load(fh)
73
+ except Exception:
74
+ sys.exit(0)
75
+
76
+ signature = str(progress.get("child_activity_signature") or "")
77
+ total_bytes = int(progress.get("child_total_bytes") or 0)
78
+ session_count = len(progress.get("child_session_files") or [])
79
+ print(f"{signature}\t{total_bytes}\t{session_count}")
80
+ PY
81
+ )
82
+ if [[ -n "$child_activity_data" ]]; then
83
+ IFS=$'\t' read -r child_activity_signature child_total_bytes child_session_count <<< "$child_activity_data"
84
+ fi
85
+ fi
86
+
87
+ local child_growth=0
88
+ if [[ -n "$child_activity_signature" && "$child_activity_signature" != "$prev_child_activity_signature" ]]; then
89
+ child_growth=1
90
+ fi
91
+ prev_child_activity_signature="$child_activity_signature"
92
+
93
+ # Track progress staleness. A Codex parent can sit in `wait`
94
+ # while child transcripts keep growing, so child activity counts.
95
+ if [[ $growth -eq 0 && $child_growth -eq 0 ]]; then
62
96
  stale_seconds=$((stale_seconds + heartbeat_interval))
63
97
  else
64
98
  stale_seconds=0
@@ -72,28 +106,87 @@ start_heartbeat() {
72
106
  else
73
107
  size_display="${cur_size}B"
74
108
  fi
109
+ local child_display=""
110
+ if [[ ${child_total_bytes:-0} -gt 0 ]]; then
111
+ local child_size_display
112
+ if [[ $child_total_bytes -gt 1048576 ]]; then
113
+ child_size_display="$((child_total_bytes / 1048576))MB"
114
+ elif [[ $child_total_bytes -gt 1024 ]]; then
115
+ child_size_display="$((child_total_bytes / 1024))KB"
116
+ else
117
+ child_size_display="${child_total_bytes}B"
118
+ fi
119
+ child_display=" | child: ${child_size_display}"
120
+ if [[ ${child_session_count:-0} -gt 1 ]]; then
121
+ child_display="${child_display}/${child_session_count}"
122
+ fi
123
+ fi
75
124
 
76
125
  local mins=$((elapsed / 60))
77
126
  local secs=$((elapsed % 60))
78
127
 
79
128
  local status_icon
80
- if [[ $growth -gt 0 ]]; then
129
+ if [[ $growth -gt 0 || $child_growth -gt 0 ]]; then
81
130
  status_icon="${GREEN}▶${NC}"
82
131
  else
83
132
  status_icon="${YELLOW}⏸${NC}"
84
133
  fi
85
134
 
86
- # Stale-kill: auto-terminate process if no progress for too long
87
- if [[ $stale_kill_threshold -gt 0 && $stale_seconds -ge $stale_kill_threshold ]]; then
135
+ local effective_stale_kill_threshold="$stale_kill_threshold"
136
+ if [[ $stale_kill_threshold -gt 0 && -f "$progress_json" ]]; then
137
+ local codex_wait_threshold
138
+ codex_wait_threshold=$(python3 - "$progress_json" "$stale_kill_threshold" <<'PY' 2>/dev/null || true
139
+ import json
140
+ import os
141
+ import sys
142
+
143
+ progress_path = sys.argv[1]
144
+ base_threshold = int(sys.argv[2])
145
+
146
+ with open(progress_path, "r", encoding="utf-8") as fh:
147
+ progress = json.load(fh)
148
+
149
+ spawn_count = 0
150
+ for tool in progress.get("tool_calls", []):
151
+ if isinstance(tool, dict) and tool.get("name") == "spawn_agent":
152
+ try:
153
+ spawn_count += int(tool.get("count", 0))
154
+ except (TypeError, ValueError):
155
+ pass
156
+
157
+ if (
158
+ progress.get("event_format") == "codex-json"
159
+ and progress.get("current_tool") == "wait"
160
+ and spawn_count > 0
161
+ ):
162
+ configured = os.environ.get("CODEX_WAIT_STALE_KILL_THRESHOLD", "")
163
+ try:
164
+ wait_threshold = int(configured)
165
+ except ValueError:
166
+ wait_threshold = max(base_threshold * 4, 3600)
167
+ if wait_threshold > base_threshold:
168
+ print(wait_threshold)
169
+ PY
170
+ )
171
+ if [[ "$codex_wait_threshold" =~ ^[0-9]+$ && "$codex_wait_threshold" -gt "$stale_kill_threshold" ]]; then
172
+ effective_stale_kill_threshold="$codex_wait_threshold"
173
+ fi
174
+ fi
175
+
176
+ # Stale-kill: auto-terminate process if no progress for too long.
177
+ # Codex parent sessions can sit on the `wait` tool while a spawned
178
+ # subagent is still doing useful work. Give that valid wait a longer
179
+ # stale window; normal single-agent stalls still use the base limit.
180
+ if [[ $effective_stale_kill_threshold -gt 0 && $stale_seconds -ge $effective_stale_kill_threshold ]]; then
88
181
  local stale_mins=$((stale_seconds / 60))
89
- echo -e " ${RED}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display} | ${RED}STALE-KILL: no progress for ${stale_mins}m (threshold: ${stale_kill_threshold}s)${NC}"
182
+ echo -e " ${RED}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display} | ${RED}STALE-KILL: no progress for ${stale_mins}m (threshold: ${effective_stale_kill_threshold}s)${NC}"
90
183
  echo -e " ${RED}[HEARTBEAT]${NC} Killing AI CLI process $cli_pid (stale session)..."
91
184
  # Write the marker before killing. Some CLIs exit quickly, and the
92
185
  # parent runner may stop this heartbeat process immediately after
93
186
  # wait(1) returns.
94
187
  local _marker_dir
95
188
  _marker_dir="$(dirname "$session_log")"
96
- echo "{\"killed_at\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\", \"reason\": \"stale_session\", \"stale_seconds\": $stale_seconds, \"threshold\": $stale_kill_threshold}" > "$_marker_dir/stale-kill.json" 2>/dev/null || true
189
+ echo "{\"killed_at\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\", \"reason\": \"stale_session\", \"stale_seconds\": $stale_seconds, \"threshold\": $effective_stale_kill_threshold}" > "$_marker_dir/stale-kill.json" 2>/dev/null || true
97
190
  kill -TERM "$cli_pid" 2>/dev/null || true
98
191
  # Give process 10s to exit gracefully, then force kill
99
192
  local stale_kill_grace_seconds="${STALE_KILL_GRACE_SECONDS:-10}"
@@ -109,9 +202,9 @@ start_heartbeat() {
109
202
 
110
203
  # Build staleness hint for display
111
204
  local stale_hint=""
112
- if [[ $stale_kill_threshold -gt 0 && $stale_seconds -gt 0 ]]; then
205
+ if [[ $effective_stale_kill_threshold -gt 0 && $stale_seconds -gt 0 ]]; then
113
206
  local stale_mins=$((stale_seconds / 60))
114
- local threshold_mins=$((stale_kill_threshold / 60))
207
+ local threshold_mins=$((effective_stale_kill_threshold / 60))
115
208
  stale_hint=" | stale: ${stale_mins}m/${threshold_mins}m"
116
209
  fi
117
210
 
@@ -134,7 +227,7 @@ try:
134
227
  except Exception:
135
228
  sys.exit(1)
136
229
  " "$progress_json" 2>/dev/null) && {
137
- echo -e " ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display} | ${phase}${stale_hint}"
230
+ echo -e " ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display}${child_display} | ${phase}${stale_hint}"
138
231
  continue
139
232
  }
140
233
  fi
@@ -145,7 +238,7 @@ except Exception:
145
238
  last_activity=$(tail -20 "$session_log" 2>/dev/null | grep -v '^$' | tail -1 | cut -c1-80 || echo "")
146
239
  fi
147
240
 
148
- echo -e " ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s elapsed | log: ${size_display} (+${growth}B) | ${last_activity}${stale_hint}"
241
+ echo -e " ${status_icon} ${BLUE}[HEARTBEAT]${NC} ${mins}m${secs}s elapsed | log: ${size_display}${child_display} (+${growth}B) | ${last_activity}${stale_hint}"
149
242
  done
150
243
  ) &
151
244
  _HEARTBEAT_PID=$!
@@ -23,6 +23,7 @@ import tempfile
23
23
  import time
24
24
  from collections import Counter
25
25
  from datetime import datetime, timezone
26
+ from pathlib import Path
26
27
 
27
28
 
28
29
  # Ordered pipeline phases — index defines forward-only progression.
@@ -76,6 +77,13 @@ class ProgressTracker:
76
77
  self.event_format = ""
77
78
  self.active_subagent_count = 0
78
79
  self.subagent_status_counts = Counter()
80
+ self.codex_child_thread_ids = set()
81
+ self.child_session_files = []
82
+ self.child_total_bytes = 0
83
+ self.child_activity_signature = ""
84
+ self.last_child_activity_at = ""
85
+ self._codex_child_session_paths = {}
86
+ self._last_child_scan_at = 0.0
79
87
  self._text_buffer = ""
80
88
  self._in_tool_use = False
81
89
  self._current_tool_input_parts = []
@@ -113,6 +121,9 @@ class ProgressTracker:
113
121
 
114
122
  elif item_type == "collab_tool_call":
115
123
  tool_name = item.get("tool", "collab")
124
+ self._record_codex_child_thread_ids(
125
+ item.get("receiver_thread_ids")
126
+ )
116
127
  if event_type == "item.started":
117
128
  self.current_tool = tool_name
118
129
  self.tool_call_counts[tool_name] += 1
@@ -345,8 +356,117 @@ class ProgressTracker:
345
356
  self.subagent_status_counts = counts
346
357
  self.active_subagent_count = active
347
358
 
359
+ def _record_codex_child_thread_ids(self, thread_ids):
360
+ """Remember Codex child thread IDs reported by collab tool calls."""
361
+ if not isinstance(thread_ids, list):
362
+ return
363
+ for thread_id in thread_ids:
364
+ if isinstance(thread_id, str) and thread_id.strip():
365
+ self.codex_child_thread_ids.add(thread_id.strip())
366
+
367
+ def _codex_sessions_dir(self):
368
+ """Return the Codex sessions directory for the current environment."""
369
+ codex_home = os.environ.get("CODEX_HOME")
370
+ if codex_home:
371
+ return Path(codex_home).expanduser() / "sessions"
372
+ return Path.home() / ".codex" / "sessions"
373
+
374
+ def _find_codex_child_session_file(self, thread_id):
375
+ """Find a Codex transcript file for a child thread ID."""
376
+ sessions_dir = self._codex_sessions_dir()
377
+ if not sessions_dir.exists():
378
+ return None
379
+
380
+ try:
381
+ matches = list(sessions_dir.rglob(f"*{thread_id}.jsonl"))
382
+ except OSError:
383
+ return None
384
+
385
+ if not matches:
386
+ return None
387
+
388
+ try:
389
+ matches.sort(key=lambda path: path.stat().st_mtime, reverse=True)
390
+ except OSError:
391
+ pass
392
+ return str(matches[0])
393
+
394
+ def refresh_child_session_activity(self, force=False):
395
+ """Refresh Codex child transcript file stats.
396
+
397
+ The heartbeat monitor uses this activity signature to treat subagent
398
+ transcript growth as real progress while the parent Codex session is
399
+ blocked in `wait`.
400
+ """
401
+ previous_signature = self.child_activity_signature
402
+
403
+ if not self.codex_child_thread_ids:
404
+ self.child_session_files = []
405
+ self.child_total_bytes = 0
406
+ self.child_activity_signature = ""
407
+ self.last_child_activity_at = ""
408
+ return previous_signature != self.child_activity_signature
409
+
410
+ now = time.monotonic()
411
+ should_scan = (
412
+ force
413
+ or self._last_child_scan_at == 0.0
414
+ or (now - self._last_child_scan_at >= 2.0)
415
+ )
416
+ if should_scan:
417
+ for thread_id in sorted(self.codex_child_thread_ids):
418
+ path = self._codex_child_session_paths.get(thread_id)
419
+ if not path or not os.path.exists(path):
420
+ found = self._find_codex_child_session_file(thread_id)
421
+ if found:
422
+ self._codex_child_session_paths[thread_id] = found
423
+ self._last_child_scan_at = now
424
+
425
+ files = []
426
+ signature_parts = []
427
+ total_bytes = 0
428
+ max_mtime = 0.0
429
+
430
+ for thread_id in sorted(self.codex_child_thread_ids):
431
+ path = self._codex_child_session_paths.get(thread_id)
432
+ if not path:
433
+ continue
434
+ try:
435
+ stat = os.stat(path)
436
+ except OSError:
437
+ continue
438
+
439
+ total_bytes += stat.st_size
440
+ max_mtime = max(max_mtime, stat.st_mtime)
441
+ signature_parts.append(
442
+ f"{thread_id}:{stat.st_size}:{getattr(stat, 'st_mtime_ns', int(stat.st_mtime * 1_000_000_000))}"
443
+ )
444
+ files.append(
445
+ {
446
+ "thread_id": thread_id,
447
+ "path": path,
448
+ "size": stat.st_size,
449
+ "mtime": datetime.fromtimestamp(
450
+ stat.st_mtime, timezone.utc
451
+ ).strftime("%Y-%m-%dT%H:%M:%SZ"),
452
+ }
453
+ )
454
+
455
+ self.child_session_files = files
456
+ self.child_total_bytes = total_bytes
457
+ self.child_activity_signature = "|".join(signature_parts)
458
+ self.last_child_activity_at = (
459
+ datetime.fromtimestamp(max_mtime, timezone.utc).strftime(
460
+ "%Y-%m-%dT%H:%M:%SZ"
461
+ )
462
+ if max_mtime
463
+ else ""
464
+ )
465
+ return previous_signature != self.child_activity_signature
466
+
348
467
  def to_dict(self):
349
468
  """Export current state as a dictionary for JSON serialization."""
469
+ self.refresh_child_session_activity()
350
470
  tool_calls = [
351
471
  {"name": name, "count": count}
352
472
  for name, count in self.tool_call_counts.most_common()
@@ -367,6 +487,11 @@ class ProgressTracker:
367
487
  "total_tool_calls": self.total_tool_calls,
368
488
  "active_subagent_count": self.active_subagent_count,
369
489
  "subagent_states": subagent_states,
490
+ "child_thread_ids": sorted(self.codex_child_thread_ids),
491
+ "child_session_files": self.child_session_files,
492
+ "child_total_bytes": self.child_total_bytes,
493
+ "child_activity_signature": self.child_activity_signature,
494
+ "last_child_activity_at": self.last_child_activity_at,
370
495
  "last_text_snippet": self.last_text_snippet,
371
496
  "is_active": self.is_active,
372
497
  "errors": self.errors[-10:], # Keep last 10 errors
@@ -397,6 +522,15 @@ def tail_and_parse(session_log, progress_file, poll_interval=0.5):
397
522
  tracker = ProgressTracker()
398
523
  last_write_state = None
399
524
 
525
+ def state_key(state):
526
+ return (
527
+ state["message_count"],
528
+ state["current_tool"],
529
+ state["current_phase"],
530
+ state["total_tool_calls"],
531
+ state.get("child_activity_signature", ""),
532
+ )
533
+
400
534
  # Wait for log file to appear
401
535
  wait_count = 0
402
536
  while not os.path.exists(session_log):
@@ -428,22 +562,20 @@ def tail_and_parse(session_log, progress_file, poll_interval=0.5):
428
562
 
429
563
  # Write progress if state changed
430
564
  current_state = tracker.to_dict()
431
- state_key = (
432
- current_state["message_count"],
433
- current_state["current_tool"],
434
- current_state["current_phase"],
435
- current_state["total_tool_calls"],
436
- )
437
- if state_key != last_write_state:
565
+ current_state_key = state_key(current_state)
566
+ if current_state_key != last_write_state:
438
567
  atomic_write_json(current_state, progress_file)
439
- last_write_state = state_key
568
+ last_write_state = current_state_key
440
569
  else:
441
570
  idle_count += 1
442
- # After 2 seconds of no new data, write current state anyway
443
- # (ensures progress.json stays fresh)
444
- if idle_count == 4:
571
+ # Every 2 seconds of no parent log data, refresh child Codex
572
+ # transcript stats and write if child activity advanced.
573
+ if idle_count % 4 == 0:
445
574
  current_state = tracker.to_dict()
446
- atomic_write_json(current_state, progress_file)
575
+ current_state_key = state_key(current_state)
576
+ if current_state_key != last_write_state or idle_count == 4:
577
+ atomic_write_json(current_state, progress_file)
578
+ last_write_state = current_state_key
447
579
 
448
580
  # After 3600 idle cycles (30 min), mark inactive and exit
449
581
  if idle_count > 3600:
@@ -22,9 +22,10 @@
22
22
  # SESSION_TIMEOUT=0 # Session timeout in seconds (0 = no limit)
23
23
  # VERBOSE=1 # Verbose logging (1=on, 0=off)
24
24
  # HEARTBEAT_INTERVAL=30 # Poll interval for session progress/stale checks
25
- # STALE_KILL_THRESHOLD=900 # Auto-kill session after N seconds without log progress (0 = disabled)
25
+ # STALE_KILL_THRESHOLD=900 # Auto-kill session after N seconds without parent log progress (0 = disabled)
26
26
  # STALE_KILL_GRACE_SECONDS=10 # Grace period after stale-kill before force-stopping the job
27
- # CODEX_SUBAGENT_TIMEOUT_SECONDS=840 # Codex subagent max runtime; defaults to stale threshold - 60
27
+ # CODEX_WAIT_STALE_KILL_THRESHOLD=3600 # Longer no-log window while Codex waits on subagents
28
+ # CODEX_SUBAGENT_TIMEOUT_SECONDS=3300 # Codex subagent max runtime; defaults to wait threshold - 300
28
29
  # LOG_CLEANUP_ENABLED=1 # Run periodic session log cleanup
29
30
  # LOG_RETENTION_DAYS=14 # Delete session logs older than N days
30
31
  # LOG_MAX_TOTAL_MB=1024 # Keep total logs under N MB via oldest-first cleanup
@@ -346,9 +346,10 @@ pending, in_progress, completed, failed, skipped
346
346
  | `SESSION_TIMEOUT` | integer | 0 | 0 = no limit |
347
347
  | `VERBOSE` | integer | (not specified) | 1=on, 0=off |
348
348
  | `HEARTBEAT_INTERVAL` | integer | 30 | Poll interval for session progress/stale checks |
349
- | `STALE_KILL_THRESHOLD` | integer | 900 | Auto-kill after N seconds without log progress; 0 disables |
349
+ | `STALE_KILL_THRESHOLD` | integer | 900 | Auto-kill after N seconds without parent log progress; 0 disables |
350
350
  | `STALE_KILL_GRACE_SECONDS` | integer | 10 | Grace period after stale-kill before force-stopping |
351
- | `CODEX_SUBAGENT_TIMEOUT_SECONDS` | integer | 840 | Codex subagent max runtime |
351
+ | `CODEX_WAIT_STALE_KILL_THRESHOLD` | integer | 3600 | Longer no-log stale window while Codex waits on subagents |
352
+ | `CODEX_SUBAGENT_TIMEOUT_SECONDS` | integer | 3300 | Codex subagent max runtime |
352
353
  | `LOG_CLEANUP_ENABLED` | boolean | 1 | Periodic session log cleanup |
353
354
  | `LOG_RETENTION_DAYS` | integer | 14 | Delete session logs older than N days |
354
355
  | `LOG_MAX_TOTAL_MB` | integer | 1024 | Keep total logs under N MB |
@@ -345,13 +345,86 @@ function Get-PrizmCodexSubagentTimeoutSeconds {
345
345
  return $configuredTimeout
346
346
  }
347
347
 
348
- $outerThreshold = 0
349
- $outerThresholdText = if ($env:STALE_KILL_THRESHOLD) { $env:STALE_KILL_THRESHOLD } else { $env:SESSION_TIMEOUT }
350
- if ([int]::TryParse($outerThresholdText, [ref]$outerThreshold) -and $outerThreshold -gt 120) {
351
- return ($outerThreshold - 60)
348
+ $waitThreshold = 0
349
+ $waitThresholdText = if ($env:CODEX_WAIT_STALE_KILL_THRESHOLD) { $env:CODEX_WAIT_STALE_KILL_THRESHOLD } else { '3600' }
350
+ if ([int]::TryParse($waitThresholdText, [ref]$waitThreshold) -and $waitThreshold -gt 600) {
351
+ return ($waitThreshold - 300)
352
352
  }
353
353
 
354
- return 840
354
+ return 3300
355
+ }
356
+
357
+ function Get-PrizmEffectiveStaleKillThreshold {
358
+ param(
359
+ [string]$ProgressFile,
360
+ [int]$BaseThreshold
361
+ )
362
+
363
+ if ($BaseThreshold -le 0) { return $BaseThreshold }
364
+ if (-not (Test-Path $ProgressFile)) { return $BaseThreshold }
365
+
366
+ try {
367
+ $progress = Get-Content $ProgressFile -Raw -ErrorAction Stop | ConvertFrom-Json -ErrorAction Stop
368
+ } catch {
369
+ return $BaseThreshold
370
+ }
371
+
372
+ $spawnCount = 0
373
+ if ($progress.event_format -eq 'codex-json' -and $progress.current_tool -eq 'wait' -and $progress.tool_calls) {
374
+ foreach ($tool in @($progress.tool_calls)) {
375
+ if ($tool.name -eq 'spawn_agent') {
376
+ $count = 0
377
+ if ([int]::TryParse([string]$tool.count, [ref]$count)) { $spawnCount += $count }
378
+ }
379
+ }
380
+ }
381
+
382
+ if ($spawnCount -le 0) { return $BaseThreshold }
383
+
384
+ $waitThreshold = 0
385
+ if ([int]::TryParse($env:CODEX_WAIT_STALE_KILL_THRESHOLD, [ref]$waitThreshold) -and $waitThreshold -gt $BaseThreshold) {
386
+ return $waitThreshold
387
+ }
388
+
389
+ return [Math]::Max($BaseThreshold * 4, 3600)
390
+ }
391
+
392
+ function Get-PrizmProgressChildActivity {
393
+ param([string]$ProgressFile)
394
+
395
+ $empty = [pscustomobject]@{
396
+ Signature = ''
397
+ TotalBytes = 0
398
+ SessionCount = 0
399
+ }
400
+ if (-not (Test-Path $ProgressFile)) { return $empty }
401
+
402
+ try {
403
+ $progress = Get-Content $ProgressFile -Raw -ErrorAction Stop | ConvertFrom-Json -ErrorAction Stop
404
+ } catch {
405
+ return $empty
406
+ }
407
+
408
+ $signature = ''
409
+ if ($progress.PSObject.Properties['child_activity_signature'] -and $progress.child_activity_signature) {
410
+ $signature = [string]$progress.child_activity_signature
411
+ }
412
+
413
+ $totalBytes = [int64]0
414
+ if ($progress.PSObject.Properties['child_total_bytes']) {
415
+ [int64]::TryParse([string]$progress.child_total_bytes, [ref]$totalBytes) | Out-Null
416
+ }
417
+
418
+ $sessionCount = 0
419
+ if ($progress.PSObject.Properties['child_session_files'] -and $progress.child_session_files) {
420
+ $sessionCount = @($progress.child_session_files).Count
421
+ }
422
+
423
+ return [pscustomobject]@{
424
+ Signature = $signature
425
+ TotalBytes = $totalBytes
426
+ SessionCount = $sessionCount
427
+ }
355
428
  }
356
429
 
357
430
  function Test-PrizmCodexJsonSupport {
@@ -552,6 +552,7 @@ function Invoke-PrizmPipeline {
552
552
  $elapsedSeconds = 0
553
553
  $staleSeconds = 0
554
554
  $previousLogSize = 0
555
+ $previousChildActivitySignature = ''
555
556
  $wasTimedOut = $false
556
557
  $staleKillMarker = Join-Path $logsDir 'stale-kill.json'
557
558
  $wasStaleKilled = $false
@@ -568,7 +569,13 @@ function Invoke-PrizmPipeline {
568
569
  }
569
570
  $growth = $currentLogSize - $previousLogSize
570
571
  $previousLogSize = $currentLogSize
571
- if ($growth -gt 0) {
572
+
573
+ $childActivity = Get-PrizmProgressChildActivity -ProgressFile $progressJson
574
+ $childSignature = [string]$childActivity.Signature
575
+ $childAdvanced = ($childSignature -and $childSignature -ne $previousChildActivitySignature)
576
+ $previousChildActivitySignature = $childSignature
577
+
578
+ if ($growth -gt 0 -or $childAdvanced) {
572
579
  $staleSeconds = 0
573
580
  } else {
574
581
  $staleSeconds += $waitSeconds
@@ -580,10 +587,11 @@ function Invoke-PrizmPipeline {
580
587
  break
581
588
  }
582
589
 
583
- if ($staleKillThreshold -gt 0 -and $staleSeconds -ge $staleKillThreshold) {
590
+ $effectiveStaleKillThreshold = Get-PrizmEffectiveStaleKillThreshold -ProgressFile $progressJson -BaseThreshold $staleKillThreshold
591
+ if ($effectiveStaleKillThreshold -gt 0 -and $staleSeconds -ge $effectiveStaleKillThreshold) {
584
592
  $wasStaleKilled = $true
585
- Write-PrizmWarn "Session stale-killed (no progress for ${staleKillThreshold}s)"
586
- Write-PrizmStaleKillMarker $staleKillMarker $staleSeconds $staleKillThreshold
593
+ Write-PrizmWarn "Session stale-killed (no progress for ${effectiveStaleKillThreshold}s)"
594
+ Write-PrizmStaleKillMarker $staleKillMarker $staleSeconds $effectiveStaleKillThreshold
587
595
  Stop-PrizmSessionProcess $pidPath
588
596
  if ($staleKillGraceSeconds -gt 0) { Start-Sleep -Seconds $staleKillGraceSeconds }
589
597
  break
@@ -110,6 +110,7 @@ $job = Start-Job -ScriptBlock {
110
110
  $elapsedSeconds = 0
111
111
  $staleSeconds = 0
112
112
  $previousLogSize = 0
113
+ $previousChildActivitySignature = ''
113
114
  $wasTimedOut = $false
114
115
  $wasStaleKilled = $false
115
116
  while ($true) {
@@ -123,7 +124,13 @@ while ($true) {
123
124
  if (Test-Path $logPath) { $currentLogSize = [int64](Get-Item $logPath).Length }
124
125
  $growth = $currentLogSize - $previousLogSize
125
126
  $previousLogSize = $currentLogSize
126
- if ($growth -gt 0) { $staleSeconds = 0 } else { $staleSeconds += $waitSeconds }
127
+
128
+ $childActivity = Get-PrizmProgressChildActivity -ProgressFile $progressPath
129
+ $childSignature = [string]$childActivity.Signature
130
+ $childAdvanced = ($childSignature -and $childSignature -ne $previousChildActivitySignature)
131
+ $previousChildActivitySignature = $childSignature
132
+
133
+ if ($growth -gt 0 -or $childAdvanced) { $staleSeconds = 0 } else { $staleSeconds += $waitSeconds }
127
134
 
128
135
  if ($timeoutSeconds -gt 0 -and $elapsedSeconds -ge $timeoutSeconds) {
129
136
  $wasTimedOut = $true
@@ -23,6 +23,7 @@ import tempfile
23
23
  import time
24
24
  from collections import Counter
25
25
  from datetime import datetime, timezone
26
+ from pathlib import Path
26
27
 
27
28
 
28
29
  # Ordered pipeline phases — index defines forward-only progression.
@@ -76,6 +77,13 @@ class ProgressTracker:
76
77
  self.event_format = ""
77
78
  self.active_subagent_count = 0
78
79
  self.subagent_status_counts = Counter()
80
+ self.codex_child_thread_ids = set()
81
+ self.child_session_files = []
82
+ self.child_total_bytes = 0
83
+ self.child_activity_signature = ""
84
+ self.last_child_activity_at = ""
85
+ self._codex_child_session_paths = {}
86
+ self._last_child_scan_at = 0.0
79
87
  self._text_buffer = ""
80
88
  self._in_tool_use = False
81
89
  self._current_tool_input_parts = []
@@ -113,6 +121,9 @@ class ProgressTracker:
113
121
 
114
122
  elif item_type == "collab_tool_call":
115
123
  tool_name = item.get("tool", "collab")
124
+ self._record_codex_child_thread_ids(
125
+ item.get("receiver_thread_ids")
126
+ )
116
127
  if event_type == "item.started":
117
128
  self.current_tool = tool_name
118
129
  self.tool_call_counts[tool_name] += 1
@@ -345,8 +356,117 @@ class ProgressTracker:
345
356
  self.subagent_status_counts = counts
346
357
  self.active_subagent_count = active
347
358
 
359
+ def _record_codex_child_thread_ids(self, thread_ids):
360
+ """Remember Codex child thread IDs reported by collab tool calls."""
361
+ if not isinstance(thread_ids, list):
362
+ return
363
+ for thread_id in thread_ids:
364
+ if isinstance(thread_id, str) and thread_id.strip():
365
+ self.codex_child_thread_ids.add(thread_id.strip())
366
+
367
+ def _codex_sessions_dir(self):
368
+ """Return the Codex sessions directory for the current environment."""
369
+ codex_home = os.environ.get("CODEX_HOME")
370
+ if codex_home:
371
+ return Path(codex_home).expanduser() / "sessions"
372
+ return Path.home() / ".codex" / "sessions"
373
+
374
+ def _find_codex_child_session_file(self, thread_id):
375
+ """Find a Codex transcript file for a child thread ID."""
376
+ sessions_dir = self._codex_sessions_dir()
377
+ if not sessions_dir.exists():
378
+ return None
379
+
380
+ try:
381
+ matches = list(sessions_dir.rglob(f"*{thread_id}.jsonl"))
382
+ except OSError:
383
+ return None
384
+
385
+ if not matches:
386
+ return None
387
+
388
+ try:
389
+ matches.sort(key=lambda path: path.stat().st_mtime, reverse=True)
390
+ except OSError:
391
+ pass
392
+ return str(matches[0])
393
+
394
+ def refresh_child_session_activity(self, force=False):
395
+ """Refresh Codex child transcript file stats.
396
+
397
+ The heartbeat monitor uses this activity signature to treat subagent
398
+ transcript growth as real progress while the parent Codex session is
399
+ blocked in `wait`.
400
+ """
401
+ previous_signature = self.child_activity_signature
402
+
403
+ if not self.codex_child_thread_ids:
404
+ self.child_session_files = []
405
+ self.child_total_bytes = 0
406
+ self.child_activity_signature = ""
407
+ self.last_child_activity_at = ""
408
+ return previous_signature != self.child_activity_signature
409
+
410
+ now = time.monotonic()
411
+ should_scan = (
412
+ force
413
+ or self._last_child_scan_at == 0.0
414
+ or (now - self._last_child_scan_at >= 2.0)
415
+ )
416
+ if should_scan:
417
+ for thread_id in sorted(self.codex_child_thread_ids):
418
+ path = self._codex_child_session_paths.get(thread_id)
419
+ if not path or not os.path.exists(path):
420
+ found = self._find_codex_child_session_file(thread_id)
421
+ if found:
422
+ self._codex_child_session_paths[thread_id] = found
423
+ self._last_child_scan_at = now
424
+
425
+ files = []
426
+ signature_parts = []
427
+ total_bytes = 0
428
+ max_mtime = 0.0
429
+
430
+ for thread_id in sorted(self.codex_child_thread_ids):
431
+ path = self._codex_child_session_paths.get(thread_id)
432
+ if not path:
433
+ continue
434
+ try:
435
+ stat = os.stat(path)
436
+ except OSError:
437
+ continue
438
+
439
+ total_bytes += stat.st_size
440
+ max_mtime = max(max_mtime, stat.st_mtime)
441
+ signature_parts.append(
442
+ f"{thread_id}:{stat.st_size}:{getattr(stat, 'st_mtime_ns', int(stat.st_mtime * 1_000_000_000))}"
443
+ )
444
+ files.append(
445
+ {
446
+ "thread_id": thread_id,
447
+ "path": path,
448
+ "size": stat.st_size,
449
+ "mtime": datetime.fromtimestamp(
450
+ stat.st_mtime, timezone.utc
451
+ ).strftime("%Y-%m-%dT%H:%M:%SZ"),
452
+ }
453
+ )
454
+
455
+ self.child_session_files = files
456
+ self.child_total_bytes = total_bytes
457
+ self.child_activity_signature = "|".join(signature_parts)
458
+ self.last_child_activity_at = (
459
+ datetime.fromtimestamp(max_mtime, timezone.utc).strftime(
460
+ "%Y-%m-%dT%H:%M:%SZ"
461
+ )
462
+ if max_mtime
463
+ else ""
464
+ )
465
+ return previous_signature != self.child_activity_signature
466
+
348
467
  def to_dict(self):
349
468
  """Export current state as a dictionary for JSON serialization."""
469
+ self.refresh_child_session_activity()
350
470
  tool_calls = [
351
471
  {"name": name, "count": count}
352
472
  for name, count in self.tool_call_counts.most_common()
@@ -367,6 +487,11 @@ class ProgressTracker:
367
487
  "total_tool_calls": self.total_tool_calls,
368
488
  "active_subagent_count": self.active_subagent_count,
369
489
  "subagent_states": subagent_states,
490
+ "child_thread_ids": sorted(self.codex_child_thread_ids),
491
+ "child_session_files": self.child_session_files,
492
+ "child_total_bytes": self.child_total_bytes,
493
+ "child_activity_signature": self.child_activity_signature,
494
+ "last_child_activity_at": self.last_child_activity_at,
370
495
  "last_text_snippet": self.last_text_snippet,
371
496
  "is_active": self.is_active,
372
497
  "errors": self.errors[-10:], # Keep last 10 errors
@@ -397,6 +522,15 @@ def tail_and_parse(session_log, progress_file, poll_interval=0.5):
397
522
  tracker = ProgressTracker()
398
523
  last_write_state = None
399
524
 
525
+ def state_key(state):
526
+ return (
527
+ state["message_count"],
528
+ state["current_tool"],
529
+ state["current_phase"],
530
+ state["total_tool_calls"],
531
+ state.get("child_activity_signature", ""),
532
+ )
533
+
400
534
  # Wait for log file to appear
401
535
  wait_count = 0
402
536
  while not os.path.exists(session_log):
@@ -428,22 +562,20 @@ def tail_and_parse(session_log, progress_file, poll_interval=0.5):
428
562
 
429
563
  # Write progress if state changed
430
564
  current_state = tracker.to_dict()
431
- state_key = (
432
- current_state["message_count"],
433
- current_state["current_tool"],
434
- current_state["current_phase"],
435
- current_state["total_tool_calls"],
436
- )
437
- if state_key != last_write_state:
565
+ current_state_key = state_key(current_state)
566
+ if current_state_key != last_write_state:
438
567
  atomic_write_json(current_state, progress_file)
439
- last_write_state = state_key
568
+ last_write_state = current_state_key
440
569
  else:
441
570
  idle_count += 1
442
- # After 2 seconds of no new data, write current state anyway
443
- # (ensures progress.json stays fresh)
444
- if idle_count == 4:
571
+ # Every 2 seconds of no parent log data, refresh child Codex
572
+ # transcript stats and write if child activity advanced.
573
+ if idle_count % 4 == 0:
445
574
  current_state = tracker.to_dict()
446
- atomic_write_json(current_state, progress_file)
575
+ current_state_key = state_key(current_state)
576
+ if current_state_key != last_write_state or idle_count == 4:
577
+ atomic_write_json(current_state, progress_file)
578
+ last_write_state = current_state_key
447
579
 
448
580
  # After 3600 idle cycles (30 min), mark inactive and exit
449
581
  if idle_count > 3600:
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "1.1.66",
2
+ "version": "1.1.67",
3
3
  "skills": {
4
4
  "prizm-kit": {
5
5
  "description": "Full-lifecycle dev toolkit. Covers spec-driven development, Prizm context docs, code quality, debugging, deployment, and knowledge management.",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "prizmkit",
3
- "version": "1.1.66",
3
+ "version": "1.1.67",
4
4
  "description": "Create a new PrizmKit-powered project with clean initialization — no framework dev files, just what you need.",
5
5
  "type": "module",
6
6
  "bin": {
package/src/scaffold.js CHANGED
@@ -576,7 +576,7 @@ project_doc_fallback_filenames = ["CLAUDE.md", "CODEBUDDY.md"]
576
576
 
577
577
  [agents]
578
578
  max_depth = 1
579
- job_max_runtime_seconds = 840
579
+ job_max_runtime_seconds = 3300
580
580
  `;
581
581
  await fs.writeFile(configPath, configToml);
582
582
  await fs.remove(legacySettingsPath);