npm - prizmkit - Versions diffs - 1.1.70 → 1.1.74 - Mend

prizmkit 1.1.70 → 1.1.74

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

package/bundled/VERSION.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "frameworkVersion": "1.1.70",
-  "bundledAt": "2026-06-10T03:59:21.944Z",
-  "bundledFrom": "e948b58"
+  "frameworkVersion": "1.1.74",
+  "bundledAt": "2026-06-12T13:22:51.065Z",
+  "bundledFrom": "8817522"
 }

package/bundled/agents/prizm-dev-team-dev.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 name: prizm-dev-team-dev
-description: PrizmKit-integrated module implementer (multi-instance). Follows /prizmkit-implement workflow with TDD, marks tasks [x] in plan.md Tasks section, works within assigned Git worktrees. Use when implementing specific feature modules.
+description: PrizmKit-integrated module implementer (multi-instance). Follows /prizmkit-implement workflow with TDD, marks tasks [x] in plan.md Tasks section. Works directly in the repository; uses git worktree only if explicitly instructed. Use when implementing specific feature modules.
 tools: Read, Write, Edit, Bash, Glob, Grep, TaskCreate, TaskGet, TaskUpdate, TaskList, SendMessage
 disallowedTools: Agent
 model: inherit
@@ -86,6 +86,16 @@ DEV-17: DO NOT re-read source files already listed in context-snapshot.md Sectio
 DEV-18: When tests fail, run `$TEST_CMD 2>&1 | tee /tmp/test-out.txt` ONCE, then grep `/tmp/test-out.txt` for failure details. Never re-run the full test suite just to apply a different grep filter to its output.
 DEV-19: Before writing any `.prizmkit/prizm-docs/` file, check if it exists. If it exists: only update durable fields (KEY_FILES, INTERFACES, DEPENDENCIES, file counts, RULES, TRAPS, DECISIONS) — never overwrite the full file. Never add CHANGELOG, UPDATED/date fields, or workflow metadata. Only create new L2 docs for sub-modules you are actively creating in this session.
 DEV-20: Internal tracking IDs are not product copy. Before writing UI text or UI-copy assertions, translate references like `F-003 guard` into product-language behavior such as `the high-risk guard`. Add regression coverage when a feature touches user-visible text.
+DEV-21: Before any Read with offset + limit, compute offset + limit. If the last tool_result for this file shows it has N lines, offset MUST be < N. Never request an offset >= known file length.
+DEV-22: When Edit fails with 'String to replace not found':
+        1. STOP editing immediately — do NOT retry the same Edit.
+        2. Run `grep -n` to locate the exact line of the target text.
+        3. Read with offset = max(grep_line - 20, 0), limit = 50.
+        4. Copy the exact text from the Read result into the Edit old_string.
+        5. Never guess or extrapolate an offset — grep first, then read, then edit.
+DEV-23: If 3 consecutive Read calls to the same file return 'shorter than provided offset' or 'Wasted call', STOP all work on that file. Send ESCALATION with the file path, current line count, and the offsets you attempted. The orchestrator will provide the correct content.
+DEV-24: Before editing a large file (>1000 lines), verify you know its current line count from the most recent tool_result. Old line counts from earlier turns may be stale if you have since edited the file.
+DEV-25: After every 3 successful Edit operations on a file, run the relevant test command for that file once to validate your changes compile and behave correctly. Do not defer all testing to the end.
 ```
 ### Workflow

package/bundled/dev-pipeline/lib/common.sh CHANGED Viewed

@@ -444,6 +444,433 @@ prizm_detect_infra_error() {
     return 1
 }
+# Detect AI runtime/provider request failures that are not caused by generated
+# project code. Unlike generic infra errors, these can be deterministic for the
+# current transcript (for example context_too_large), so the runner must first
+# check semantic completion before deciding whether to retry.
+#
+# Arguments:
+#   $1 - session log path (optional); only its last 65536 bytes are scanned
+#   $2 - progress.json path (optional)
+# Returns:
+#   0 - progress.json carries a truthy "fatal_error_code", or the scanned text
+#       contains both a context-window phrase and an API-error indicator
+#   1 - otherwise (including when neither file exists or both are empty)
+prizm_detect_ai_runtime_error() {
+    local session_log="${1:-}"
+    local progress_json="${2:-}"
+    # Fast path: a structured fatal_error_code in progress.json is decisive.
+    # Any Python failure (unreadable or invalid JSON, missing interpreter) is
+    # swallowed and simply leaves fatal_error_code empty.
+    if [[ -n "$progress_json" && -f "$progress_json" ]]; then
+        local fatal_error_code
+        fatal_error_code=$(python3 - "$progress_json" <<'PY' 2>/dev/null || true
+import json
+import sys
+try:
+    with open(sys.argv[1], encoding="utf-8") as fh:
+        progress = json.load(fh)
+except Exception:
+    raise SystemExit(0)
+code = progress.get("fatal_error_code")
+if code:
+    print(str(code))
+PY
+)
+        if [[ -n "$fatal_error_code" ]]; then
+            return 0
+        fi
+    fi
+    # Fallback: build one text haystack from the log tail plus the raw
+    # progress.json content and pattern-match it.
+    local haystack=""
+    if [[ -n "$session_log" && -f "$session_log" ]]; then
+        haystack="$(tail -c 65536 "$session_log" 2>/dev/null || true)"
+    fi
+    if [[ -n "$progress_json" && -f "$progress_json" ]]; then
+        haystack+=$'\n'
+        haystack+="$(cat "$progress_json" 2>/dev/null || true)"
+    fi
+    [[ -n "$haystack" ]] || return 1
+    # Two-stage match: a context-window phrase alone is not sufficient; an
+    # API-error indicator must also appear somewhere in the same haystack.
+    # NOTE(review): if the sourcing script enables `set -o pipefail`, grep -q
+    # exiting on its first match can leave printf with SIGPIPE and make the
+    # pipeline report failure for a large haystack — confirm shell options.
+    if printf '%s' "$haystack" | grep -Eiq \
+        'context_too_large|model_context_window_exceeded|input exceeds the context window|context window of this model|context window (was )?exceeded|exceeded (the )?context window|invalid_request_error.*context window|context window.*invalid_request_error'; then
+        if printf '%s' "$haystack" | grep -Eiq \
+            'api error|invalid_request_error|api_error_status|api_error_code|status[[:space:]]*[:=]?[[:space:]]*(400|413)|last_result_is_error[[:space:]"'\'':=]+true|is_error[[:space:]"'\'':=]+true'; then
+            return 0
+        fi
+    fi
+    return 1
+}
+# Print the slug for one feature: its id with every "F-"/"f-" removed and
+# zero-padded to 3 characters, followed by "-<title>" where the title is
+# lower-cased, stripped to [a-z0-9 -], and has whitespace runs turned into
+# single hyphens. Only the number is printed when the title reduces to nothing.
+#
+# Arguments:
+#   $1 - path to the feature list JSON (an object with a "features" array)
+#   $2 - feature id, compared for exact equality with each feature's "id"
+# Outputs:
+#   the slug on stdout
+# Returns:
+#   0 when the feature is found; 1 when no feature has that id. An unreadable
+#   or malformed file is not caught, so Python exits non-zero with a traceback
+#   on stderr.
+prizm_feature_slug_from_list() {
+    local feature_list="$1"
+    local feature_id="$2"
+    python3 - "$feature_list" "$feature_id" <<'PY'
+import json
+import re
+import sys
+feature_list, feature_id = sys.argv[1], sys.argv[2]
+with open(feature_list, encoding="utf-8") as fh:
+    data = json.load(fh)
+for feature in data.get("features", []):
+    if feature.get("id") == feature_id:
+        number = feature.get("id", "").replace("F-", "").replace("f-", "").zfill(3)
+        title = str(feature.get("title", "")).lower()
+        title = re.sub(r"[^a-z0-9\s-]", "", title)
+        title = re.sub(r"[\s]+", "-", title.strip())
+        title = re.sub(r"-+", "-", title).strip("-")
+        print(f"{number}-{title}" if title else number)
+        raise SystemExit(0)
+raise SystemExit(1)
+PY
+}
+# Check whether every step recorded in a workflow checkpoint file is finished.
+#
+# Arguments:
+#   $1 - path to the checkpoint JSON (expected to hold a non-empty "steps" list)
+# Returns:
+#   0 - every step is an object whose "status" is "completed" or "skipped"
+#   1 - file missing, "steps" absent/empty/not a list, or any step unfinished
+#   2 - file exists but could not be opened or parsed as JSON
+prizm_checkpoint_all_complete() {
+    local checkpoint_file="$1"
+    # A missing checkpoint is treated as "not complete" without starting Python.
+    [[ -f "$checkpoint_file" ]] || return 1
+    python3 - "$checkpoint_file" <<'PY'
+import json
+import sys
+try:
+    with open(sys.argv[1], encoding="utf-8") as fh:
+        data = json.load(fh)
+except Exception:
+    raise SystemExit(2)
+steps = data.get("steps")
+if not isinstance(steps, list) or not steps:
+    raise SystemExit(1)
+for step in steps:
+    if not isinstance(step, dict) or step.get("status") not in ("completed", "skipped"):
+        raise SystemExit(1)
+raise SystemExit(0)
+PY
+}
+# Return the successful feature commit SHA if one exists in base_ref..HEAD.
+# Prefer messages containing the feature ID. If checkpoint is complete, allow an
+# older prompt variant only when the non-WIP commit identifies the feature title.
+#
+# Arguments:
+#   $1 - project root (passed to `git -C`)
+#   $2 - base ref; when empty or unresolvable, all commits reachable from HEAD
+#        are searched instead of base_ref..HEAD
+#   $3 - feature id, matched as a literal substring of the commit subject
+#   $4 - "true" to enable the title-word fallback (default: false)
+#   $5 - feature title used by the fallback (default: empty)
+# Outputs:
+#   the full SHA of the first match in `git log` order, on stdout
+# Returns:
+#   0 when a SHA is printed; 1 when nothing matches or $1 is not a work tree
+prizm_find_feature_commit() {
+    local project_root="$1"
+    local base_ref="$2"
+    local feature_id="$3"
+    local allow_title_fallback="${4:-false}"
+    local feature_title="${5:-}"
+    git -C "$project_root" rev-parse --is-inside-work-tree >/dev/null 2>&1 || return 1
+    local range="${base_ref}..HEAD"
+    # Without a usable base ref, widen the search to everything reachable from HEAD.
+    if [[ -z "$base_ref" ]] || ! git -C "$project_root" rev-parse --verify "$base_ref" >/dev/null 2>&1; then
+        range="HEAD"
+    fi
+    local feature_commit
+    # Pass 1: "%H<TAB>%s" lets awk split SHA from subject. index() is a literal
+    # (non-regex) substring test; subjects starting with "wip(" or "wip:" are
+    # skipped (the match is case-sensitive).
+    feature_commit=$(git -C "$project_root" log "$range" --format='%H%x09%s' 2>/dev/null \
+        | awk -F '\t' -v fid="$feature_id" 'index($2, fid) > 0 && $2 !~ /^wip(\(|:)/ { print $1; exit }')
+    if [[ -n "$feature_commit" ]]; then
+        printf '%s\n' "$feature_commit"
+        return 0
+    fi
+    # Pass 2 (opt-in): match on title words instead of the id. Words are
+    # lower-cased alphanumeric runs of at least 3 characters.
+    if [[ "$allow_title_fallback" == "true" && -n "$feature_title" ]]; then
+        feature_commit=$(git -C "$project_root" log "$range" --format='%H%x09%s' 2>/dev/null \
+            | python3 -c '
+import re
+import sys
+title = sys.argv[1]
+def words(text):
+    return [w for w in re.split(r"[^a-z0-9]+", text.lower()) if len(w) >= 3]
+title_words = words(title)
+if not title_words:
+    raise SystemExit(1)
+# Require all title words for short titles, or a strong majority for longer
+# titles so punctuation/articles do not make older commits unmatchable.
+required = len(title_words) if len(title_words) <= 3 else max(3, int(len(title_words) * 0.75 + 0.999))
+for line in sys.stdin:
+    commit, sep, subject = line.rstrip("\n").partition("\t")
+    if not sep or re.match(r"^wip(\(|:)", subject):
+        continue
+    subject_words = set(words(subject))
+    if sum(1 for word in title_words if word in subject_words) >= required:
+        print(commit)
+        raise SystemExit(0)
+raise SystemExit(1)
+' "$feature_title")
+        if [[ -n "$feature_commit" ]]; then
+            printf '%s\n' "$feature_commit"
+            return 0
+        fi
+    fi
+    return 1
+}
+# Semantic completion means the durable workflow checkpoint is complete and a
+# non-WIP feature commit exists. This intentionally runs before exit-code based
+# failure classification so delayed post-commit model errors cannot strand a
+# completed feature on its dev branch.
+#
+# Globals written (cleared on entry, populated only on success):
+#   PRIZM_SEMANTIC_FEATURE_SLUG - slug returned by prizm_feature_slug_from_list
+#   PRIZM_SEMANTIC_COMMIT_SHA   - SHA returned by prizm_find_feature_commit
+# Arguments:
+#   $1 - feature list JSON path
+#   $2 - feature id
+#   $3 - project root (git work tree)
+#   $4 - base ref for the commit search
+#   $5 - directory containing specs/<slug>/workflow-checkpoint.json
+# Returns:
+#   0 when the checkpoint is complete and a feature commit is found; 1 otherwise
+PRIZM_SEMANTIC_FEATURE_SLUG=""
+PRIZM_SEMANTIC_COMMIT_SHA=""
+prizm_feature_semantically_complete() {
+    local feature_list="$1"
+    local feature_id="$2"
+    local project_root="$3"
+    local base_ref="$4"
+    local prizmkit_dir="$5"
+    PRIZM_SEMANTIC_FEATURE_SLUG=""
+    PRIZM_SEMANTIC_COMMIT_SHA=""
+    local feature_slug
+    feature_slug=$(prizm_feature_slug_from_list "$feature_list" "$feature_id" 2>/dev/null) || return 1
+    local checkpoint_file="$prizmkit_dir/specs/${feature_slug}/workflow-checkpoint.json"
+    # Any non-zero status from the checkpoint check (incomplete or unparsable)
+    # is collapsed to 1 here.
+    prizm_checkpoint_all_complete "$checkpoint_file" || return 1
+    # Title lookup is best-effort: on any failure the title stays empty, which
+    # leaves only the feature-id match available in the commit search below.
+    local feature_title
+    feature_title=$(python3 - "$feature_list" "$feature_id" <<'PY' 2>/dev/null || true
+import json
+import sys
+with open(sys.argv[1], encoding="utf-8") as fh:
+    data = json.load(fh)
+for feature in data.get("features", []):
+    if feature.get("id") == sys.argv[2]:
+        print(feature.get("title", ""))
+        break
+PY
+)
+    local commit_sha
+    # The checkpoint is known complete at this point, so the title fallback is enabled.
+    commit_sha=$(prizm_find_feature_commit "$project_root" "$base_ref" "$feature_id" true "$feature_title" 2>/dev/null) || return 1
+    PRIZM_SEMANTIC_FEATURE_SLUG="$feature_slug"
+    PRIZM_SEMANTIC_COMMIT_SHA="$commit_sha"
+    return 0
+}
+# Snapshot a dirty working tree into artifact_dir, then restore the tree to HEAD.
+# Writes the porcelain status, unstaged and staged binary patches, a manifest
+# plus copies of untracked files, and a findings note; afterwards runs
+# `git reset --hard` and deletes the listed untracked files outside .prizmkit/.
+#
+# Arguments:
+#   $1 - project root (git work tree)
+#   $2 - directory that receives the artifacts (created if needed)
+#   $3 - feature id (recorded in the findings note)
+#   $4 - session id (optional; recorded as "unknown" when empty)
+# Returns:
+#   0 - tree was already clean, or was preserved and is clean afterwards
+#       (untracked entries under .prizmkit/ are ignored by the final check)
+#   1 - not a work tree, an artifact could not be written, the reset failed,
+#       or the tree is still dirty after cleanup
+prizm_preserve_post_completion_dirty() {
+    local project_root="$1"
+    local artifact_dir="$2"
+    local feature_id="$3"
+    local session_id="${4:-}"
+    git -C "$project_root" rev-parse --is-inside-work-tree >/dev/null 2>&1 || return 1
+    local dirty_status
+    dirty_status=$(git -C "$project_root" status --porcelain --untracked-files=all 2>/dev/null || true)
+    # Nothing to preserve: succeed without creating any artifacts.
+    [[ -n "$dirty_status" ]] || return 0
+    mkdir -p "$artifact_dir" || return 1
+    local status_file="$artifact_dir/post-completion-status.txt"
+    local patch_file="$artifact_dir/post-completion-dirty.patch"
+    local staged_patch_file="$artifact_dir/post-completion-staged.patch"
+    local untracked_manifest="$artifact_dir/post-completion-untracked.txt"
+    local findings_file="$artifact_dir/post-completion-findings.md"
+    local untracked_dir="$artifact_dir/untracked"
+    printf '%s\n' "$dirty_status" > "$status_file" || return 1
+    git -C "$project_root" diff --binary > "$patch_file" 2>/dev/null || return 1
+    git -C "$project_root" diff --cached --binary > "$staged_patch_file" 2>/dev/null || return 1
+    # Start with an empty manifest so it exists even when nothing is untracked.
+    : > "$untracked_manifest" || return 1
+    local untracked_tmp
+    # The second mktemp form presumably covers implementations that require a
+    # template argument — TODO confirm the targeted platforms.
+    untracked_tmp=$(mktemp 2>/dev/null || mktemp -t prizm-untracked) || return 1
+    # NUL-delimited listing so paths with spaces or newlines survive intact.
+    git -C "$project_root" ls-files --others --exclude-standard -z > "$untracked_tmp" 2>/dev/null || {
+        rm -f "$untracked_tmp"
+        return 1
+    }
+    if [[ -s "$untracked_tmp" ]]; then
+        mkdir -p "$untracked_dir" || {
+            rm -f "$untracked_tmp"
+            return 1
+        }
+        while IFS= read -r -d '' rel_path; do
+            [[ -n "$rel_path" ]] || continue
+            printf '%s\n' "$rel_path" >> "$untracked_manifest" || {
+                rm -f "$untracked_tmp"
+                return 1
+            }
+            local source_path="$project_root/$rel_path"
+            local dest_path="$untracked_dir/$rel_path"
+            mkdir -p "$(dirname "$dest_path")" || {
+                rm -f "$untracked_tmp"
+                return 1
+            }
+            # Regular files are copied with attributes preserved; directories
+            # are recreated empty. Any other entry is only recorded in the
+            # manifest.
+            if [[ -f "$source_path" ]]; then
+                cp -p "$source_path" "$dest_path" || {
+                    rm -f "$untracked_tmp"
+                    return 1
+                }
+            elif [[ -d "$source_path" ]]; then
+                mkdir -p "$dest_path" || {
+                    rm -f "$untracked_tmp"
+                    return 1
+                }
+            fi
+        done < "$untracked_tmp"
+    fi
+    # NOTE(review): the status of this findings write is not checked, unlike
+    # the other artifact writes above — confirm that is intentional.
+    cat > "$findings_file" <<EOF
+# Post-completion dirty changes preserved
+- Feature: $feature_id
+- Session: ${session_id:-unknown}
+- Reason: workflow checkpoint and feature commit were already complete, but delayed post-commit activity left the working tree dirty.
+## Recovery guidance
+The finalized feature commit was kept unchanged for merge. Review these follow-up artifacts separately; do not assume they were merged:
+- \`post-completion-status.txt\` — original dirty working tree status
+- \`post-completion-dirty.patch\` — unstaged tracked changes
+- \`post-completion-staged.patch\` — staged changes
+- \`post-completion-untracked.txt\` and \`untracked/\` — untracked files copied before cleanup
+EOF
+    # Destructive phase: everything above has been captured, so discard
+    # tracked changes and then remove the untracked files that were listed.
+    git -C "$project_root" reset --hard >/dev/null 2>&1 || {
+        rm -f "$untracked_tmp"
+        return 1
+    }
+    while IFS= read -r -d '' rel_path; do
+        [[ -n "$rel_path" ]] || continue
+        # Untracked files under .prizmkit/ are left in place.
+        case "$rel_path" in
+            .prizmkit/*) continue ;;
+        esac
+        # NOTE(review): rm -f does not remove directories, so a listed
+        # directory entry would remain and fail the final check — verify.
+        rm -f "$project_root/$rel_path" 2>/dev/null || true
+    done < "$untracked_tmp"
+    rm -f "$untracked_tmp"
+    # Final verification: anything still dirty, other than untracked entries
+    # under .prizmkit/, means the cleanup did not fully succeed.
+    dirty_status=$(git -C "$project_root" status --porcelain --untracked-files=all 2>/dev/null | grep -v '^?? .prizmkit/' || true)
+    [[ -z "$dirty_status" ]] || return 1
+    return 0
+}
+prizm_synthesize_failure_log() {
+    local failure_log="$1"
+    local feature_id="$2"
+    local session_id="$3"
+    local session_status="$4"
+    local exit_code="$5"
+    local stale_kill_marker="$6"
+    local progress_json="$7"
+    local checkpoint_file="$8"
+    local project_root="$9"
+    local base_ref="${10:-}"
+    [[ -n "$failure_log" ]] || return 0
+    [[ -f "$failure_log" ]] && return 0
+    mkdir -p "$(dirname "$failure_log")" || return 0
+    local progress_summary="Progress data unavailable."
+    if [[ -f "$progress_json" ]]; then
+        progress_summary=$(python3 - "$progress_json" <<'PY' 2>/dev/null || true
+import json
+import sys
+try:
+    with open(sys.argv[1], encoding="utf-8") as fh:
+        data = json.load(fh)
+except Exception as exc:
+    print(f"Progress parse error: {exc}")
+    raise SystemExit(0)
+fields = [
+    ("fatal_error_code", data.get("fatal_error_code")),
+    ("api_error_status", data.get("api_error_status")),
+    ("api_error_code", data.get("api_error_code")),
+    ("current_phase", data.get("current_phase")),
+    ("current_tool", data.get("current_tool")),
+    ("last_text_snippet", data.get("last_text_snippet")),
+    ("terminal_result_text", data.get("terminal_result_text")),
+]
+for key, value in fields:
+    if value not in (None, "", []):
+        text = str(value).replace("\n", " ")
+        print(f"- {key}: {text[:500]}")
+PY
+)
+        [[ -n "$progress_summary" ]] || progress_summary="Progress data contained no terminal fields."
+    fi
+    local stale_summary="No stale-kill marker."
+    if [[ -f "$stale_kill_marker" ]]; then
+        stale_summary="$(cat "$stale_kill_marker" 2>/dev/null || true)"
+    fi
+    local checkpoint_summary="No checkpoint file found."
+    if [[ -f "$checkpoint_file" ]]; then
+        checkpoint_summary=$(python3 - "$checkpoint_file" <<'PY' 2>/dev/null || true
+import json
+import sys
+try:
+    with open(sys.argv[1], encoding="utf-8") as fh:
+        data = json.load(fh)
+except Exception as exc:
+    print(f"Checkpoint parse error: {exc}")
+    raise SystemExit(0)
+steps = data.get("steps") or []
+complete = sum(1 for step in steps if isinstance(step, dict) and step.get("status") in ("completed", "skipped"))
+print(f"{complete}/{len(steps)} steps completed_or_skipped")
+for step in steps:
+    if isinstance(step, dict) and step.get("status") not in ("completed", "skipped"):
+        print(f"- incomplete: {step.get('id')} {step.get('skill')} = {step.get('status')}")
+PY
+)
+    fi
+    local latest_commit="unavailable"
+    local feature_commit="no"
+    if git -C "$project_root" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
+        latest_commit=$(git -C "$project_root" rev-parse --short HEAD 2>/dev/null || echo unavailable)
+        if prizm_find_feature_commit "$project_root" "$base_ref" "$feature_id" false >/dev/null 2>&1; then
+            feature_commit="yes"
+        fi
+    fi
+    local dirty_summary="unavailable"
+    dirty_summary=$(git -C "$project_root" status --short 2>/dev/null || true)
+    [[ -n "$dirty_summary" ]] || dirty_summary="clean"
+    cat > "$failure_log" <<EOF
+# Runtime-synthesized failure log
+## Session
+- feature_id: $feature_id
+- session_id: ${session_id:-unknown}
+- session_status: $session_status
+- exit_code: $exit_code
+## Stale kill marker
+\`\`\`json
+$stale_summary
+\`\`\`
+## Progress
+$progress_summary
+## Checkpoint
+$checkpoint_summary
+## Git state
+- feature_commit_exists: $feature_commit
+- latest_commit: $latest_commit
+\`\`\`text
+$dirty_summary
+\`\`\`
+## Recommended recovery action
+- If this is an AI runtime/provider error before checkpoint completion, retry the session with a fresh context.
+- If checkpoint completion and a feature commit both exist, inspect post-completion artifacts and finalize manually rather than rebuilding from scratch.
+- If the working tree is dirty, preserve or review those changes before any reset or merge.
+EOF
+}
 prizm_extract_update_new_status() {
     python3 -c "
 import json, sys

package/bundled/dev-pipeline/lib/heartbeat.sh CHANGED Viewed

@@ -90,9 +90,79 @@ PY
             fi
             prev_child_activity_signature="$child_activity_signature"
+            local effective_stale_kill_threshold="$stale_kill_threshold"
+            if [[ $stale_kill_threshold -gt 0 && -f "$progress_json" ]]; then
+                local codex_wait_threshold
+                codex_wait_threshold=$(python3 - "$progress_json" "$stale_kill_threshold" <<'PY' 2>/dev/null || true
+import json
+import os
+import sys
+progress_path = sys.argv[1]
+base_threshold = int(sys.argv[2])
+with open(progress_path, "r", encoding="utf-8") as fh:
+    progress = json.load(fh)
+spawn_count = 0
+for tool in progress.get("tool_calls", []):
+    if isinstance(tool, dict) and tool.get("name") == "spawn_agent":
+        try:
+            spawn_count += int(tool.get("count", 0))
+        except (TypeError, ValueError):
+            pass
+if (
+    progress.get("event_format") == "codex-json"
+    and progress.get("current_tool") == "wait"
+    and spawn_count > 0
+):
+    configured = os.environ.get("CODEX_WAIT_STALE_KILL_THRESHOLD", "")
+    try:
+        wait_threshold = int(configured)
+    except ValueError:
+        wait_threshold = max(base_threshold * 4, 3600)
+    if wait_threshold > base_threshold:
+        print(wait_threshold)
+PY
+                )
+                if [[ "$codex_wait_threshold" =~ ^[0-9]+$ && "$codex_wait_threshold" -gt "$stale_kill_threshold" ]]; then
+                    effective_stale_kill_threshold="$codex_wait_threshold"
+                fi
+            fi
+            # Check for error-loop: agent is actively producing output but results are
+            # all read-offset errors or wasted calls. This is a stuck agent that appears
+            # "active" by log growth but is accomplishing nothing.
+            local error_loop_detected=false
+            if [[ $effective_stale_kill_threshold -gt 0 && $growth -gt 0 && -f "$progress_json" ]]; then
+                local error_loop_flag
+                error_loop_flag=$(python3 - "$progress_json" <<'PY' 2>/dev/null || true
+import json, sys
+try:
+    with open(sys.argv[1], encoding="utf-8") as fh:
+        progress = json.load(fh)
+except Exception:
+    raise SystemExit(0)
+errors = progress.get("errors", [])
+if isinstance(errors, list) and len(errors) >= 5:
+    recent = errors[-5:]
+    if all(isinstance(e, dict) and e.get("type") in ("read_offset_overflow", "wasted_call") for e in recent):
+        print("error_loop")
+PY
+                )
+                if [[ "$error_loop_flag" == "error_loop" ]]; then
+                    error_loop_detected=true
+                fi
+            fi
             # Track progress staleness. Parent sessions can sit in a wait/polling
             # tool while child transcripts keep growing, so child activity counts.
-            if [[ $growth -eq 0 && $child_growth -eq 0 ]]; then
+            # Error loops bypass normal growth-as-progress because the log is only
+            # growing with repeated failed reads or wasted calls.
+            if [[ "$error_loop_detected" == "true" ]]; then
+                stale_seconds=$effective_stale_kill_threshold
+            elif [[ $growth -eq 0 && $child_growth -eq 0 ]]; then
                 stale_seconds=$((stale_seconds + heartbeat_interval))
             else
                 stale_seconds=0
@@ -132,44 +202,39 @@ PY
                 status_icon="${YELLOW}⏸${NC}"
             fi
-            local effective_stale_kill_threshold="$stale_kill_threshold"
-            if [[ $stale_kill_threshold -gt 0 && -f "$progress_json" ]]; then
-                local codex_wait_threshold
-                codex_wait_threshold=$(python3 - "$progress_json" "$stale_kill_threshold" <<'PY' 2>/dev/null || true
+            # Fatal provider/runtime errors are terminal; do not wait for the
+            # stale window when progress.json already proves the model cannot
+            # continue (for example context_too_large).
+            if [[ -f "$progress_json" ]]; then
+                local fatal_error_code=""
+                fatal_error_code=$(python3 - "$progress_json" <<'PY' 2>/dev/null || true
 import json
-import os
 import sys
-progress_path = sys.argv[1]
-base_threshold = int(sys.argv[2])
-with open(progress_path, "r", encoding="utf-8") as fh:
-    progress = json.load(fh)
-spawn_count = 0
-for tool in progress.get("tool_calls", []):
-    if isinstance(tool, dict) and tool.get("name") == "spawn_agent":
-        try:
-            spawn_count += int(tool.get("count", 0))
-        except (TypeError, ValueError):
-            pass
-if (
-    progress.get("event_format") == "codex-json"
-    and progress.get("current_tool") == "wait"
-    and spawn_count > 0
-):
-    configured = os.environ.get("CODEX_WAIT_STALE_KILL_THRESHOLD", "")
-    try:
-        wait_threshold = int(configured)
-    except ValueError:
-        wait_threshold = max(base_threshold * 4, 3600)
-    if wait_threshold > base_threshold:
-        print(wait_threshold)
+try:
+    with open(sys.argv[1], encoding="utf-8") as fh:
+        progress = json.load(fh)
+except Exception:
+    raise SystemExit(0)
+code = progress.get("fatal_error_code") or ""
+if code:
+    print(code)
 PY
-                )
-                if [[ "$codex_wait_threshold" =~ ^[0-9]+$ && "$codex_wait_threshold" -gt "$stale_kill_threshold" ]]; then
-                    effective_stale_kill_threshold="$codex_wait_threshold"
+)
+                if [[ -n "$fatal_error_code" ]]; then
+                    echo -e "  ${RED}[HEARTBEAT]${NC} ${mins}m${secs}s | log: ${size_display} | ${RED}FATAL: ${fatal_error_code}${NC}"
+                    local _marker_dir
+                    _marker_dir="$(dirname "$session_log")"
+                    echo "{\"killed_at\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\", \"reason\": \"${fatal_error_code}\", \"fatal_error_code\": \"${fatal_error_code}\", \"stale_seconds\": $stale_seconds, \"threshold\": $effective_stale_kill_threshold}" > "$_marker_dir/fatal-error.json" 2>/dev/null || true
+                    echo "{\"killed_at\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\", \"reason\": \"${fatal_error_code}\", \"fatal_error_code\": \"${fatal_error_code}\", \"stale_seconds\": $stale_seconds, \"threshold\": $effective_stale_kill_threshold}" > "$_marker_dir/stale-kill.json" 2>/dev/null || true
+                    kill -TERM "$cli_pid" 2>/dev/null || true
+                    local fatal_kill_grace_seconds="${STALE_KILL_GRACE_SECONDS:-10}"
+                    if [[ $fatal_kill_grace_seconds -gt 0 ]]; then
+                        sleep "$fatal_kill_grace_seconds"
+                    fi
+                    if kill -0 "$cli_pid" 2>/dev/null; then
+                        kill -9 "$cli_pid" 2>/dev/null || true
+                    fi
+                    break
                 fi
             fi