npm - loki-mode - Versions diffs - 7.41.0 → 7.41.1 - Mend

loki-mode 7.41.0 → 7.41.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/SKILL.md +2 -2
package/VERSION +1 -1
package/autonomy/app-runner.sh +138 -3
package/autonomy/lib/claude-flags.sh +55 -3
package/autonomy/loki +70 -6
package/autonomy/run.sh +343 -10
package/dashboard/__init__.py +1 -1
package/dashboard/server.py +95 -2
package/dashboard/static/index.html +58 -32
package/docs/INSTALLATION.md +1 -1
package/loki-ts/dist/loki.js +2 -2
package/mcp/__init__.py +1 -1
package/package.json +1 -1
package/plugins/loki-mode/.claude-plugin/plugin.json +1 -1
package/skills/quality-gates.md +39 -2

package/autonomy/run.sh CHANGED Viewed

@@ -2560,20 +2560,24 @@ except Exception:
     [ -z "$branch" ] && branch="unknown"
     head_sha="$( (cd "${TARGET_DIR:-.}" && git rev-parse HEAD) 2>/dev/null || true )"
+    # Finding #596 (HIGH): exclude .loki/ and .git/ from the summary diff/stat and
+    # from the "Review the work" command we print, so the user is never told to
+    # review a .loki-bloated diff and the displayed counts match the gated diff.
+    local _summary_pathspec=(-- . ':(exclude).loki/' ':(exclude).git/' ':(exclude)**/.loki/**')
     if [ -n "$start_sha" ]; then
-        diff_stat="$( (cd "${TARGET_DIR:-.}" && git diff --stat "${start_sha}..HEAD") 2>/dev/null || true )"
+        diff_stat="$( (cd "${TARGET_DIR:-.}" && git diff --stat "${start_sha}..HEAD" "${_summary_pathspec[@]}") 2>/dev/null || true )"
         # Parse the git diff --shortstat tail for counts (locale-stable enough
         # for our display; failures leave the zeros in place).
         local shortstat
-        shortstat="$( (cd "${TARGET_DIR:-.}" && git diff --shortstat "${start_sha}..HEAD") 2>/dev/null || true )"
+        shortstat="$( (cd "${TARGET_DIR:-.}" && git diff --shortstat "${start_sha}..HEAD" "${_summary_pathspec[@]}") 2>/dev/null || true )"
         if [ -n "$shortstat" ]; then
             files_changed="$(printf '%s\n' "$shortstat" | grep -oE '[0-9]+ file' | grep -oE '[0-9]+' | head -1)"
             insertions="$(printf '%s\n' "$shortstat" | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' | head -1)"
             deletions="$(printf '%s\n' "$shortstat" | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' | head -1)"
         fi
-        review_cmd="git diff ${start_sha}..HEAD"
+        review_cmd="git diff ${start_sha}..HEAD -- . ':(exclude).loki/'"
     else
-        review_cmd="git diff HEAD"
+        review_cmd="git diff HEAD -- . ':(exclude).loki/'"
     fi
     [ -z "$files_changed" ] && files_changed=0
     [ -z "$insertions" ] && insertions=0
@@ -7166,6 +7170,11 @@ enforce_test_coverage() {
         cat > "$quality_dir/test-results.json" << TREOF
 {"timestamp":"$(date -u +%Y-%m-%dT%H:%M:%SZ)","runner":"none","pass":true,"summary":"No test runner detected"}
 TREOF
+        # Finding #598: stamp the per-iteration freshness marker so a later
+        # completion-route capture (ensure_completion_test_evidence) reuses this
+        # run instead of re-running the suite. Single source of truth for "tests
+        # ran this iteration", set on every return path that writes results.
+        printf '%s\n' "${ITERATION_COUNT:-0}" > "$quality_dir/.test-results.iter" 2>/dev/null || true
         return 0
     fi
@@ -7175,6 +7184,8 @@ TREOF
     cat > "$quality_dir/test-results.json" << TREOF
 {"timestamp":"$(date -u +%Y-%m-%dT%H:%M:%SZ)","runner":"$test_runner","pass":$test_passed,"min_coverage":$min_coverage,"summary":"$details"}
 TREOF
+    # Finding #598: stamp the per-iteration freshness marker (see above).
+    printf '%s\n' "${ITERATION_COUNT:-0}" > "$quality_dir/.test-results.iter" 2>/dev/null || true
     if [ "$test_passed" = "true" ]; then
         touch "$quality_dir/unit-tests.pass"
@@ -7189,6 +7200,82 @@ TREOF
     fi
 }
+# ============================================================================
+# Finding #598 (HIGH): ensure REAL test evidence exists before the
+# verified-completion evidence gate runs on a completion claim.
+#
+# The evidence gate (council_evidence_gate) blocks completion iff the diff is
+# empty OR tests are red. The test axis reads .loki/quality/test-results.json.
+# When that file is ABSENT or inconclusive (runner==none / unparsable), the gate
+# treats the test axis as pass-through, so completion could be claimed on a
+# nonzero diff alone with NO test evidence -- half-blind. This happens whenever
+# enforce_test_coverage did not run this iteration, e.g. LOKI_HARD_GATES=false or
+# PHASE_UNIT_TESTS=false (the gate at the quality-gate ladder is skipped) while
+# the completion-promise route still fires the evidence gate.
+#
+# Rather than letting absent evidence pass (Option 1, which would live in the
+# off-limits completion-council.sh), we GENERATE real evidence here (Option 2,
+# preferred + autonomous): run the project's own test command via the existing
+# detect-and-run enforce_test_coverage, which persists a fresh test-results.json.
+# The gate then reads true PASS/FAIL. If no test runner truly exists, the file
+# records runner:none and the test axis legitimately stays pass-through.
+#
+# Behavior:
+#   - Default ON. Opt out with LOKI_COMPLETION_TEST_CAPTURE=0.
+#   - Cheap: skips when a fresh test-results.json already exists for THIS
+#     iteration (freshness marker .loki/quality/.test-results.iter), so we never
+#     re-run the suite the quality-gate ladder already ran.
+#   - Best-effort: enforce_test_coverage returns nonzero on red tests; that is
+#     EXPECTED and must not crash the completion path. The gate is the decider,
+#     so we always swallow the rc with `|| true` and let the gate read the file.
+#   - CWD invariant: enforce_test_coverage writes ${TARGET_DIR}/.loki/...; the
+#     gate reads .loki/... relative to CWD. Both are invoked from the same loop
+#     body where CWD == TARGET_DIR (or TARGET_DIR=="."), matching the existing
+#     gate call sites.
+# ============================================================================
+ensure_completion_test_evidence() {
+    [ "${LOKI_COMPLETION_TEST_CAPTURE:-1}" = "0" ] && return 0
+    type enforce_test_coverage &>/dev/null || return 0
+    local loki_dir="${TARGET_DIR:-.}/.loki"
+    local quality_dir="$loki_dir/quality"
+    local tr_file="$quality_dir/test-results.json"
+    local iter_marker="$quality_dir/.test-results.iter"
+    local this_iter="${ITERATION_COUNT:-0}"
+    # Freshness guard: if results already exist for this iteration, reuse them.
+    if [ -f "$tr_file" ] && [ -f "$iter_marker" ]; then
+        local marked
+        marked="$(cat "$iter_marker" 2>/dev/null || echo "")"
+        if [ "$marked" = "$this_iter" ]; then
+            log_info "Completion test evidence: reusing this iteration's test-results.json"
+            return 0
+        fi
+    fi
+    log_info "Completion test evidence: capturing fresh test results before evidence gate (opt out: LOKI_COMPLETION_TEST_CAPTURE=0)"
+    # Record the test-results.json mtime BEFORE capture so we only mark this
+    # iteration "fresh" if enforce_test_coverage actually (re)wrote the file.
+    # Guards LOW-2 (bug-hunt): if capture is interrupted before writing while a
+    # prior-iteration file exists, the marker must NOT advance and let stale
+    # evidence read as fresh. Window is narrow but the check is cheap.
+    local _results_file="$quality_dir/test-results.json"
+    local _mtime_before=""
+    [ -f "$_results_file" ] && _mtime_before=$(stat -f %m "$_results_file" 2>/dev/null || stat -c %Y "$_results_file" 2>/dev/null || echo "")
+    # The gate decides on the persisted file; a red suite (nonzero rc) is expected
+    # and must not abort the completion path here.
+    enforce_test_coverage || true
+    mkdir -p "$quality_dir" 2>/dev/null || true
+    local _mtime_after=""
+    [ -f "$_results_file" ] && _mtime_after=$(stat -f %m "$_results_file" 2>/dev/null || stat -c %Y "$_results_file" 2>/dev/null || echo "")
+    # Only advance the freshness marker when the results file was actually
+    # produced/updated by THIS capture (mtime advanced or file newly created).
+    if [ -n "$_mtime_after" ] && [ "$_mtime_after" != "$_mtime_before" ]; then
+        printf '%s\n' "$this_iter" > "$iter_marker" 2>/dev/null || true
+    fi
+    return 0
+}
 # ============================================================================
 # Documentation Staleness Check (v6.75.0)
 # Checks if generated documentation is stale relative to HEAD
@@ -7252,7 +7339,11 @@ run_doc_quality_gate() {
         fi
     else
         score=$((score - 10))
-        issues+=("No generated documentation found (run 'loki docs generate')")
+        if [ "${LOKI_AUTO_DOCS:-true}" = "true" ]; then
+            issues+=("No generated documentation found (auto-generation did not complete)")
+        else
+            issues+=("No generated documentation found (auto-docs disabled; run 'loki docs generate')")
+        fi
     fi
     # Check 3: Package documentation (for npm/pip packages)
@@ -7277,6 +7368,52 @@ run_doc_quality_gate() {
     [ "$score" -ge 70 ]
 }
+# ============================================================================
+# Auto-Documentation Generation (intelligent default)
+# Generates the .loki/docs/ suite before the documentation gate evaluates so
+# the gate scores on real generated docs instead of nagging the user to run
+# 'loki docs generate' by hand. Default-on; opt out with LOKI_AUTO_DOCS=false.
+#
+# Bounded: runs at most once per run when docs are missing, and again only
+# when the existing docs are >10 commits stale (the same threshold the gate
+# and staleness check use). 'loki docs generate' writes its manifest
+# unconditionally (template fallback when no provider), so the missing-docs
+# trigger fires exactly once. Best-effort: never fails the iteration loop.
+# ============================================================================
+auto_generate_docs_if_needed() {
+    [ "${LOKI_AUTO_DOCS:-true}" = "true" ] || return 0
+    local project_dir="${TARGET_DIR:-.}"
+    local manifest="$project_dir/.loki/docs/docs-manifest.json"
+    local needs_gen=false
+    if [ ! -f "$manifest" ]; then
+        needs_gen=true
+    else
+        # Regenerate only when the existing docs are substantially stale.
+        local doc_sha
+        doc_sha=$(python3 -c "import json; print(json.load(open('$manifest')).get('git_sha', ''))" 2>/dev/null)
+        if [ -n "$doc_sha" ]; then
+            local behind
+            behind=$(git -C "$project_dir" rev-list --count "$doc_sha..HEAD" 2>/dev/null || echo "0")
+            [ "$behind" -gt 10 ] && needs_gen=true
+        fi
+    fi
+    [ "$needs_gen" = "true" ] || return 0
+    local loki_bin="$SCRIPT_DIR/loki"
+    [ -x "$loki_bin" ] || return 0
+    log_info "Auto-documentation: generating .loki/docs/ before documentation gate..."
+    # Synchronous so docs exist before the gate scores. Provider-agnostic:
+    # 'loki docs generate' picks the run's provider and falls back to
+    # template-based docs when no provider CLI is available.
+    "$loki_bin" docs generate "$project_dir" >/dev/null 2>&1 || \
+        log_warn "Auto-documentation: generation did not complete (gate will score on what exists)"
+}
 # ============================================================================
 # Magic Modules Debate Gate - Gate 12 (v6.77.0)
 # Runs when any .loki/magic/specs/*.md changed since last iteration.
@@ -7583,16 +7720,28 @@ run_code_review() {
     review_id="review-$(date -u +%Y%m%dT%H%M%SZ)-${ITERATION_COUNT:-0}"
     mkdir -p "$review_dir/$review_id"
-    # Get diff from last commit (staged changes)
+    # Get diff from last commit (staged changes).
+    #
+    # Finding #596 (HIGH): exclude .loki/ and .git/ from the review diff via git
+    # pathspec. When .loki/ is git-tracked the diff bloats (observed 2.18MB of
+    # runtime state), the reviewer prompt overflows, the model returns EMPTY, and
+    # every reviewer records NO_OUTPUT -> the gate passes with ZERO real review.
+    # The evidence gate already excludes .loki/ (see the grep -vE near the
+    # porcelain read); mirror that here so the code-review gate is never defeated
+    # by Loki's own state. Behavior is identical when .loki/ is untracked (the
+    # common case) because git ignores the exclude pathspec for paths it does not
+    # track. ':(exclude).git/' is harmless (git never diffs .git/) and is kept
+    # only for parity with the evidence-gate exclusion list.
+    local _review_pathspec=(-- . ':(exclude).loki/' ':(exclude).git/' ':(exclude)**/.loki/**')
     local diff_content
-    diff_content=$(git -C "${TARGET_DIR:-.}" diff HEAD~1 2>/dev/null || git -C "${TARGET_DIR:-.}" diff --cached 2>/dev/null || echo "")
+    diff_content=$(git -C "${TARGET_DIR:-.}" diff HEAD~1 "${_review_pathspec[@]}" 2>/dev/null || git -C "${TARGET_DIR:-.}" diff --cached "${_review_pathspec[@]}" 2>/dev/null || echo "")
     if [ -z "$diff_content" ]; then
         log_info "Code review: No diff to review, skipping"
         return 0
     fi
     local changed_files
-    changed_files=$(git -C "${TARGET_DIR:-.}" diff --name-only HEAD~1 2>/dev/null || git -C "${TARGET_DIR:-.}" diff --name-only --cached 2>/dev/null || echo "")
+    changed_files=$(git -C "${TARGET_DIR:-.}" diff --name-only HEAD~1 "${_review_pathspec[@]}" 2>/dev/null || git -C "${TARGET_DIR:-.}" diff --name-only --cached "${_review_pathspec[@]}" 2>/dev/null || echo "")
     log_header "CODE REVIEW: $review_id"
@@ -7980,6 +8129,14 @@ BUILD_PROMPT
     local pass_count=0
     local fail_count=0
     local verdicts_summary=""
+    # Finding #596 FIX A2 (HIGH): count REAL verdicts (a reviewer file that
+    # exists, is non-empty, AND carries a recognized VERDICT: PASS|FAIL line).
+    # A review where every reviewer produced no usable verdict (all NO_OUTPUT,
+    # e.g. the model returned EMPTY because a .loki-bloated prompt overflowed)
+    # must NOT silently pass with pass_count=0/fail_count=0/has_blocking=false.
+    # Such a review proves nothing, so we treat it as INCONCLUSIVE -> blocking.
+    local real_verdict_count=0
+    local no_output_count=0
     for i in $(seq 0 $((reviewer_count - 1))); do
         local reviewer_name
@@ -7989,6 +8146,7 @@ BUILD_PROMPT
         if [ ! -f "$review_output" ] || [ ! -s "$review_output" ]; then
             log_warn "Reviewer $reviewer_name produced no output"
             verdicts_summary="${verdicts_summary}${reviewer_name}:NO_OUTPUT "
+            ((no_output_count++))
             continue
         fi
@@ -7996,6 +8154,22 @@ BUILD_PROMPT
         local verdict
         verdict=$(grep -i "^VERDICT:" "$review_output" | head -1 | sed 's/^VERDICT:[[:space:]]*//' | tr '[:lower:]' '[:upper:]' | tr -d '[:space:]')
+        # FIX A2: a "real verdict" is the PRESENCE of a non-empty VERDICT: line,
+        # not a specific token. A non-empty file with NO VERDICT line (garbage or
+        # a truncated reply) previously counted as PASS and could approve the gate
+        # on a meaningless file; now it is a non-verdict (not real, not a pass).
+        # We deliberately keep the original non-FAIL=pass semantics for any file
+        # that DOES carry a verdict line (PASS, APPROVE, "PASS with concerns",
+        # etc. all count as pass) so verbose-but-real verdicts are never
+        # false-blocked. The only added block relative to shipped behavior is the
+        # zero-real-verdicts (all-empty) case.
+        if [ -z "$verdict" ]; then
+            log_warn "Reviewer $reviewer_name produced no VERDICT line (empty or unparseable reply)"
+            verdicts_summary="${verdicts_summary}${reviewer_name}:NO_VERDICT "
+            ((no_output_count++))
+            continue
+        fi
+        ((real_verdict_count++))
         if [ "$verdict" = "FAIL" ]; then
             ((fail_count++))
             # Check for Critical/High severity findings
@@ -8012,6 +8186,23 @@ BUILD_PROMPT
         verdicts_summary="${verdicts_summary}${reviewer_name}:${verdict:-UNKNOWN} "
     done
+    # Finding #596 FIX A2: zero real verdicts when reviewers were expected =>
+    # INCONCLUSIVE => blocking. Optional bounded retry first (LOKI_REVIEW_RETRY=1,
+    # default on) so a transient empty-output blip does not hard-block; the retry
+    # re-runs the whole review with the (now .loki-excluded) diff. Opt out of the
+    # block entirely with LOKI_REVIEW_INCONCLUSIVE_BLOCK=0 (records, never blocks).
+    local review_inconclusive=false
+    if [ "$reviewer_count" -gt 0 ] && [ "$real_verdict_count" -eq 0 ]; then
+        review_inconclusive=true
+        log_error "CODE REVIEW INCONCLUSIVE: 0 of $reviewer_count reviewers returned a usable verdict (no_output=$no_output_count)"
+        log_error "  An all-empty review proves nothing; refusing to pass the gate on zero real verdicts."
+        if [ "${LOKI_REVIEW_RETRY:-1}" = "1" ] && [ "${_LOKI_REVIEW_RETRYING:-0}" != "1" ]; then
+            log_warn "  Retrying code review once (LOKI_REVIEW_RETRY=1)..."
+            _LOKI_REVIEW_RETRYING=1 run_code_review
+            return $?
+        fi
+    fi
     # Save aggregate results via python3 + env vars (no shell interpolation in JSON)
     export LOKI_REVIEW_AGG_FILE="$review_dir/$review_id/aggregate.json"
     export LOKI_REVIEW_AGG_ID="$review_id"
@@ -8020,6 +8211,8 @@ BUILD_PROMPT
     export LOKI_REVIEW_AGG_FAIL="$fail_count"
     export LOKI_REVIEW_AGG_BLOCKING="$has_blocking"
     export LOKI_REVIEW_AGG_VERDICTS="$verdicts_summary"
+    export LOKI_REVIEW_AGG_REAL="$real_verdict_count"
+    export LOKI_REVIEW_AGG_INCONCLUSIVE="$review_inconclusive"
     python3 << 'AGG_SCRIPT'
 import json, os
 result = {
@@ -8028,6 +8221,8 @@ result = {
     "pass_count": int(os.environ["LOKI_REVIEW_AGG_PASS"]),
     "fail_count": int(os.environ["LOKI_REVIEW_AGG_FAIL"]),
     "has_blocking": os.environ["LOKI_REVIEW_AGG_BLOCKING"] == "true",
+    "real_verdict_count": int(os.environ["LOKI_REVIEW_AGG_REAL"]),
+    "inconclusive": os.environ["LOKI_REVIEW_AGG_INCONCLUSIVE"] == "true",
     "verdicts": os.environ["LOKI_REVIEW_AGG_VERDICTS"].strip()
 }
 with open(os.environ["LOKI_REVIEW_AGG_FILE"], "w") as f:
@@ -8035,12 +8230,15 @@ with open(os.environ["LOKI_REVIEW_AGG_FILE"], "w") as f:
 AGG_SCRIPT
     unset LOKI_REVIEW_AGG_FILE LOKI_REVIEW_AGG_ID LOKI_REVIEW_AGG_ITER
     unset LOKI_REVIEW_AGG_PASS LOKI_REVIEW_AGG_FAIL LOKI_REVIEW_AGG_BLOCKING LOKI_REVIEW_AGG_VERDICTS
+    unset LOKI_REVIEW_AGG_REAL LOKI_REVIEW_AGG_INCONCLUSIVE
     emit_event_json "code_review_complete" \
         "review_id=$review_id" \
         "pass_count=$pass_count" \
         "fail_count=$fail_count" \
         "has_blocking=$has_blocking" \
+        "real_verdict_count=$real_verdict_count" \
+        "inconclusive=$review_inconclusive" \
         "iteration=$ITERATION_COUNT"
     # Anti-sycophancy check: unanimous PASS is suspicious
@@ -8059,6 +8257,20 @@ AGG_SCRIPT
         return 1
     fi
+    # Finding #596 FIX A2: an inconclusive review (zero real verdicts, retry
+    # already exhausted or disabled) blocks unless explicitly opted out. This is
+    # the 'verified before done' promise: a review that produced no usable verdict
+    # cannot stand in for a real review.
+    if [ "$review_inconclusive" = "true" ]; then
+        if [ "${LOKI_REVIEW_INCONCLUSIVE_BLOCK:-1}" = "0" ]; then
+            log_warn "Code review inconclusive (0/$reviewer_count real verdicts) but LOKI_REVIEW_INCONCLUSIVE_BLOCK=0 - not blocking"
+            return 0
+        fi
+        log_error "CODE REVIEW BLOCKED: inconclusive (0/$reviewer_count reviewers returned a usable verdict)"
+        log_error "  Review details: $review_dir/$review_id/ ; opt out with LOKI_REVIEW_INCONCLUSIVE_BLOCK=0"
+        return 1
+    fi
     log_info "Code review passed ($pass_count/$reviewer_count PASS, $fail_count FAIL - no blocking issues)"
     return 0
 }
@@ -13758,6 +13970,12 @@ if __name__ == "__main__":
                     fi
                 fi
             fi
+            # Auto-generate docs (default-on) BEFORE the staleness check and the
+            # gate, so neither nags the user to run 'loki docs generate' by hand.
+            # Opt out with LOKI_AUTO_DOCS=false.
+            if [ "$ITERATION_COUNT" -gt 0 ]; then
+                auto_generate_docs_if_needed
+            fi
             # Documentation staleness check (v6.75.0)
             if [ "$ITERATION_COUNT" -gt 0 ]; then
                 run_doc_staleness_check
@@ -13858,6 +14076,15 @@ if __name__ == "__main__":
             # Completion Council check (v5.25.0) - multi-agent voting on completion
             # Runs before completion promise check since council is more comprehensive
             log_step "Post-iteration: checking completion council..."
+            # Finding #598 (HIGH): council_should_stop calls council_evidence_gate
+            # internally; ensure fresh test evidence exists first so its test axis
+            # is not half-blind when the quality-gate ladder did not run this
+            # iteration (e.g. LOKI_HARD_GATES=false). Idempotent: the freshness
+            # guard reuses results the ladder already wrote in the common case, so
+            # tests are never run twice per iteration. Best-effort, never blocks.
+            if type ensure_completion_test_evidence &>/dev/null; then
+                ensure_completion_test_evidence || true
+            fi
             if type council_should_stop &>/dev/null && council_should_stop; then
                 echo ""
                 log_header "COMPLETION COUNCIL: PROJECT COMPLETE"
@@ -13920,6 +14147,15 @@ if __name__ == "__main__":
             if [ "$_completion_claimed" = 1 ] && type council_reverify_checklist &>/dev/null; then
                 council_reverify_checklist 2>/dev/null || true
             fi
+            # Finding #598 (HIGH): generate real test evidence before the evidence
+            # gate fires, so the gate's test axis is never half-blind on absent
+            # test-results.json. Only on an actual completion claim (mirrors the
+            # reverify guard above) so the suite does not run every iteration.
+            # Type-guarded + best-effort: never blocks the completion path itself;
+            # the evidence gate below is the decider that reads the file.
+            if [ "$_completion_claimed" = 1 ] && type ensure_completion_test_evidence &>/dev/null; then
+                ensure_completion_test_evidence || true
+            fi
             if [ -n "$_gate_block_for_completion" ] && [ "$_completion_claimed" = 1 ]; then
                 log_warn "Completion claim rejected: code review is BLOCKED for this iteration (Critical/High findings). Fix review issues before completion."
                 log_warn "  Review details under .loki/quality/reviews/ ; gate_failures=${gate_failures}"
@@ -14199,6 +14435,89 @@ kill_provider_child() {
     return 1
 }
+# Authoritative self-reap of THIS run's process group on a normal completion.
+#
+# Why this exists: a normal completion (council stop / max-iterations /
+# completion promise) returns from run_autonomous() into main()'s cleanup
+# block, which reaps the app-runner but NOT the orchestrator's own process
+# group. The provider agent (claude/codex/...) and any subagents it spawned
+# share the orchestrator's group; if one detached or was reparented to init,
+# it survived the `exit` and kept consuming CPU (observed: ~27 min orphan).
+# The external `loki stop` path (autonomy/loki) already reaps the whole group
+# via the recorded pgid; this brings the SAME authoritative reap to the
+# completion path so a clean finish leaves no orphans.
+#
+# Foreign-run safety (CRITICAL): this is pgid-scoped to the group THIS run
+# recorded at .loki/loki.pgid -- it NEVER uses a name-based `pkill claude`
+# sweep. A concurrent foreign loki run is its own session leader with a
+# DIFFERENT pgid and a different .loki, so it can never be a member of our
+# group and is structurally unreachable. The pgid file only exists when this
+# runner setsid'd into its own session (LOKI_OWN_SESSION=1, recorded at
+# ~run.sh:15034); in interactive foreground we share the user's shell group,
+# leave the pgid absent, and skip this reap entirely -- so Ctrl+C semantics
+# and the user's shell are untouched.
+reap_own_process_group() {
+    local loki_dir="${TARGET_DIR:-.}/.loki"
+    # Resolve the pgid file the same way main() recorded it (global or per-session).
+    local _reap_pgid_file="$loki_dir/loki.pid"
+    if [ -n "${LOKI_SESSION_ID:-}" ]; then
+        _reap_pgid_file="$loki_dir/sessions/${LOKI_SESSION_ID}/loki.pid"
+    fi
+    _reap_pgid_file="${_reap_pgid_file%.pid}.pgid"
+    [ -f "$_reap_pgid_file" ] || return 0   # interactive / no own session: skip
+    local _pgid
+    _pgid=$(cat "$_reap_pgid_file" 2>/dev/null | tr -d ' ')
+    case "$_pgid" in ''|*[!0-9]*) return 0 ;; esac
+    [ "$_pgid" -gt 1 ] 2>/dev/null || return 0    # never touch pgid 0/1
+    # Safety: the recorded pgid MUST be our own group. If it isn't (stale file
+    # from a prior run, copied tree), refuse -- killing a group we do not own
+    # could hit unrelated processes.
+    local _my_pgid
+    _my_pgid=$(ps -o pgid= -p $$ 2>/dev/null | tr -d ' ')
+    [ -n "$_my_pgid" ] && [ "$_pgid" = "$_my_pgid" ] || return 0
+    # Collect protected pids (dashboard, app-runner, registered children) so the
+    # reap never takes down the shared dashboard if it happens to share our
+    # group. Mirrors the `loki stop` / dashboard reaper protection set.
+    local _protected=" $$ "
+    local _pf _p
+    if [ -d "$loki_dir/pids" ]; then
+        for _pf in "$loki_dir/pids"/*.json; do
+            [ -f "$_pf" ] || continue
+            _p=$(basename "$_pf" .json)
+            case "$_p" in ''|*[!0-9]*) continue ;; esac
+            _protected="${_protected}${_p} "
+        done
+        for _pf in "$loki_dir/pids"/*.pid; do
+            [ -f "$_pf" ] || continue
+            _p=$(cat "$_pf" 2>/dev/null | head -1 | tr -d '[:space:]')
+            [ -n "$_p" ] && _protected="${_protected}${_p} "
+        done
+    fi
+    for _pf in "$loki_dir/dashboard/dashboard.pid" "${HOME}/.loki/dashboard/dashboard.pid"; do
+        [ -f "$_pf" ] && _protected="${_protected}$(cat "$_pf" 2>/dev/null | tr -d ' ') "
+    done
+    # Per-pid TERM then KILL of group members, EXCLUDING $$ (so main() survives
+    # to finish its remaining cleanup and exit normally) and protected pids. We
+    # do per-pid (not a blanket `kill -- -PGID`) precisely so $$ and the
+    # dashboard are spared -- a group-wide signal cannot exclude members.
+    local _gp _did=0
+    for _gp in $(ps -axo pid=,pgid= 2>/dev/null | awk -v g="$_pgid" '$2==g{print $1}'); do
+        case "$_protected" in *" $_gp "*) continue ;; esac
+        kill -TERM "$_gp" 2>/dev/null && _did=1
+    done
+    [ "$_did" = "1" ] || return 0
+    sleep 1
+    for _gp in $(ps -axo pid=,pgid= 2>/dev/null | awk -v g="$_pgid" '$2==g{print $1}'); do
+        case "$_protected" in *" $_gp "*) continue ;; esac
+        kill -KILL "$_gp" 2>/dev/null || true
+    done
+    return 0
+}
 # Check for human intervention signals
 check_human_intervention() {
     local loki_dir="${TARGET_DIR:-.}/.loki"
@@ -14333,6 +14652,11 @@ check_human_intervention() {
         if type council_reverify_checklist &>/dev/null; then
             council_reverify_checklist 2>/dev/null || true
         fi
+        # Finding #598 (HIGH): generate real test evidence before the force-review
+        # evidence gate, so the test axis is not half-blind on absent results.
+        if type ensure_completion_test_evidence &>/dev/null; then
+            ensure_completion_test_evidence || true
+        fi
         if type council_checklist_gate &>/dev/null && ! council_checklist_gate; then
             log_info "Council force-review: blocked by checklist hard gate"
         elif type council_evidence_gate &>/dev/null && ! council_evidence_gate; then
@@ -15205,13 +15529,22 @@ main() {
         app_runner_cleanup
     fi
     stop_status_monitor
+    # v7.41.x: authoritatively reap THIS run's process group on a normal
+    # completion (council stop / max-iterations / completion promise), the same
+    # group reap the STOP signal does. Without this, a provider agent that
+    # detached or reparented survived the exit and ran as an orphan (~27 min in
+    # the reported brownfield run). pgid-scoped to .loki/loki.pgid + excludes
+    # $$/dashboard, so it cannot touch a foreign loki run. No-ops in interactive
+    # foreground (no own session => no pgid file), preserving Ctrl+C semantics.
+    reap_own_process_group 2>/dev/null || true
     local loki_dir="${TARGET_DIR:-.}/.loki"
-    rm -f "$loki_dir/loki.pid" 2>/dev/null
+    rm -f "$loki_dir/loki.pid" "$loki_dir/loki.pgid" 2>/dev/null
     # UT2-13: Clear cli-provider marker on normal session end.
     rm -f "$loki_dir/state/cli-provider" 2>/dev/null || true
     # Clean up per-session PID file if running with session ID
     if [ -n "${LOKI_SESSION_ID:-}" ]; then
-        rm -f "$loki_dir/sessions/${LOKI_SESSION_ID}/loki.pid" 2>/dev/null
+        rm -f "$loki_dir/sessions/${LOKI_SESSION_ID}/loki.pid" \
+              "$loki_dir/sessions/${LOKI_SESSION_ID}/loki.pgid" 2>/dev/null
     fi
     # Mark session.json as stopped
     if [ -f "$loki_dir/session.json" ]; then

package/dashboard/__init__.py CHANGED Viewed

@@ -7,7 +7,7 @@ Modules:
     control: Session control API (start/stop/pause/resume)
 """
-__version__ = "7.41.0"
+__version__ = "7.41.1"
 # Expose the control app for easy import
 try:

package/dashboard/server.py CHANGED Viewed

@@ -6973,17 +6973,110 @@ async def get_council_gate():
 # App Runner Endpoints (v5.45.0)
 # =============================================================================
+# Health checkpoints are written once per autonomous iteration (the app-runner
+# watchdog fires from the run.sh iteration loop, not a fixed timer), so a single
+# long healthy iteration can legitimately leave last_health.checked_at minutes
+# old. The staleness threshold below is therefore deliberately generous and is
+# used ONLY when we cannot verify liveness from a real OS pid (e.g. docker
+# compose, whose main_pid is a short-lived `up -d` subshell). When a real pid is
+# present, pid liveness -- not the timestamp -- is the authoritative signal, so a
+# live run is never reported as dead just because a health beat was missed.
+_APP_RUNNER_STALE_HEALTH_SECONDS = 600
+def _pid_is_alive(pid):
+    """Return True if pid refers to a live process, False if it is gone.
+    Uses signal 0 (no signal sent, just an existence/permission probe). Guards
+    against the pid<=0 footgun: kill(0, 0) targets the caller's own process
+    group and would falsely report "alive". A PermissionError means the process
+    exists but is owned by another user (still alive). Any other error is
+    treated as indeterminate (None) so the caller can fall back safely.
+    """
+    try:
+        pid = int(pid)
+    except (TypeError, ValueError):
+        return None
+    if pid <= 0:
+        return None
+    try:
+        os.kill(pid, 0)
+        return True
+    except ProcessLookupError:
+        return False
+    except PermissionError:
+        return True
+    except OSError:
+        return None
+def _health_checked_age_seconds(state):
+    """Seconds since last_health.checked_at, or None if unparseable/absent."""
+    health = state.get("last_health")
+    if not isinstance(health, dict):
+        return None
+    checked_at = health.get("checked_at")
+    if not checked_at:
+        return None
+    try:
+        ts = datetime.fromisoformat(str(checked_at).replace("Z", "+00:00"))
+    except (ValueError, TypeError):
+        return None
+    if ts.tzinfo is None:
+        ts = ts.replace(tzinfo=timezone.utc)
+    return (datetime.now(timezone.utc) - ts).total_seconds()
+def _reconcile_app_runner_liveness(state):
+    """Correct a frozen state.json so a dead run is not reported as running.
+    state.json is written by the app-runner and is not updated once the app or
+    the orchestrator dies, so a crashed run leaves status frozen at "running".
+    Here we cross-check the recorded main_pid against the real OS before
+    returning, and only ever downgrade -- never upgrade -- the status:
+      - recorded running/starting + pid genuinely gone   -> "stopped"
+      - recorded running/starting + pid not verifiable    +
+        last_health.checked_at older than the threshold   -> "stale"
+    Any failure falls back to the raw recorded status (fail open to the writer's
+    own claim rather than fabricating a state). Returns the (possibly modified)
+    state dict.
+    """
+    if not isinstance(state, dict):
+        return state
+    status = state.get("status")
+    if status not in ("running", "starting"):
+        return state
+    try:
+        alive = _pid_is_alive(state.get("main_pid"))
+        if alive is False:
+            state["status"] = "stopped"
+            state["liveness"] = "pid_gone"
+            return state
+        if alive is None:
+            # Cannot verify via pid (e.g. compose subshell pid). Fall back to
+            # the health-beat freshness with a generous threshold.
+            age = _health_checked_age_seconds(state)
+            if age is not None and age > _APP_RUNNER_STALE_HEALTH_SECONDS:
+                state["status"] = "stale"
+                state["liveness"] = "health_stale"
+    except Exception:
+        # Never let a liveness probe break the status endpoint.
+        return state
+    return state
 @app.get("/api/app-runner/status")
 async def get_app_runner_status():
-    """Get app runner current status."""
+    """Get app runner current status (with dead-run liveness reconciliation)."""
     loki_dir = _get_loki_dir()
     state_file = loki_dir / "app-runner" / "state.json"
     if not state_file.exists():
         return {"status": "not_initialized"}
     try:
-        return json.loads(state_file.read_text())
+        state = json.loads(state_file.read_text())
     except (json.JSONDecodeError, OSError):
         return {"status": "error"}
+    return _reconcile_app_runner_liveness(state)
 def _get_log_redactor():