npm - loki-mode - Versions diffs - 7.26.0 → 7.28.0 - Mend

loki-mode 7.26.0 → 7.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

package/README.md +15 -13
package/SKILL.md +11 -2
package/VERSION +1 -1
package/autonomy/completion-council.sh +310 -6
package/autonomy/context-tracker.py +32 -7
package/autonomy/grill.sh +321 -0
package/autonomy/lib/trust_metrics.py +636 -0
package/autonomy/loki +142 -0
package/autonomy/prd-checklist.sh +248 -14
package/autonomy/run.sh +283 -32
package/autonomy/spec.sh +646 -0
package/autonomy/verify.sh +1130 -0
package/dashboard/__init__.py +1 -1
package/dashboard/static/index.html +1 -1
package/docs/COMPARISON.md +9 -9
package/docs/COMPETITIVE-ANALYSIS.md +18 -37
package/docs/INSTALLATION.md +1 -1
package/docs/auto-claude-comparison.md +9 -6
package/docs/certification/01-core-concepts/lesson.md +3 -3
package/docs/competitive/emergence-others-analysis.md +1 -1
package/docs/competitive/replit-lovable-analysis.md +1 -1
package/docs/cursor-comparison.md +1 -1
package/docs/prd-purple-lab-platform.md +1 -1
package/docs/show-hn-post.md +2 -2
package/loki-ts/dist/loki.js +2 -2
package/mcp/__init__.py +1 -1
package/package.json +2 -1
package/providers/codex.sh +3 -2
package/references/agent-types.md +9 -9
package/references/agents.md +8 -8
package/references/business-ops.md +1 -1
package/references/competitive-analysis.md +1 -1
package/skills/agents.md +3 -3
package/skills/providers.md +3 -3
package/skills/quality-gates.md +46 -0

package/autonomy/run.sh CHANGED Viewed

@@ -1226,6 +1226,94 @@ emit_event_json() {
     log_debug "Event: $event_type - $json_data"
 }
+# Trust-layer metrics event writer (benchmark program section 3). Appends one
+# durable record per trust event to .loki/metrics/trust-events.jsonl via the
+# Python writer (single source of truth for the JSONL schema). This is ADDITIVE
+# and purely a side effect: it writes nothing to stdout, ignores all errors, and
+# never alters control flow or any caller's return value. The single-state
+# control files (evidence-block.json, gate-failure-count.json) are untouched;
+# this log exists because those files are erased on the successful-run path,
+# losing exactly the self-correction events the trust metrics publish.
+# Resolve a stable, UNIQUE-PER-RUN id for the trust event log. The cross-run
+# denominators (block rate, gate distribution) require ids that are distinct per
+# run. A persisted per-run file is the source of truth, NOT LOKI_SESSION_ID:
+#  - On `loki start ./prd.md`, LOKI_SESSION_ID is unset entirely.
+#  - On `loki run <issue>`, LOKI_SESSION_ID is the issue NUMBER, which is stable
+#    across re-runs by design (so `loki stop <n>` works); using it would merge
+#    every re-run of the same issue into one bucket and skew the rates.
+# So a fresh run always MINTS a new unique id into .loki/state/trust-run-id, and
+# every later event in that run reads it back. LOKI_SESSION_ID is only a
+# last-resort fallback when no minted file exists (e.g. an event fired before
+# any run_start, which the aggregator then treats as un-instrumented anyway).
+# Events never join to proof.json (Metrics 1-3 are events-only, Metric 4 is
+# proofs-only), so intra-log uniqueness is the only requirement.
+# Usage: _loki_trust_run_id [--new]
+_loki_trust_run_id() {
+    # Resolve the run id used to tag trust events.
+    #   --new  : mint a fresh id, persist it to <loki_dir>/state/trust-run-id
+    #            (best-effort) and print it.
+    #   (none) : print the persisted id when that file is non-empty, else
+    #            $LOKI_SESSION_ID when set, else print nothing.
+    # <loki_dir> is $LOKI_DIR, falling back to ${TARGET_DIR:-.}/.loki.
+    # Only the id is written to stdout; the function ends with status 0.
+    local loki_dir="${LOKI_DIR:-${TARGET_DIR:-.}/.loki}"
+    local id_file="$loki_dir/state/trust-run-id"
+    if [ "${1:-}" = "--new" ]; then
+        # Fresh run: mint a new unique id (UTC timestamp + shell pid + $RANDOM)
+        # and persist it as the source of truth for this run's events.
+        local new_id
+        new_id="run-$(date -u +%Y%m%d%H%M%S)-$$-${RANDOM:-0}"
+        mkdir -p "$loki_dir/state" 2>/dev/null || true
+        # Redirections are applied left to right, so a bare
+        # `printf ... > "$id_file" 2>/dev/null` tries to open $id_file BEFORE
+        # stderr is silenced and the shell's own "cannot create" diagnostic
+        # escapes when the state dir is missing or read-only. Grouping the
+        # write puts 2>/dev/null in effect first, keeping the write silent.
+        { printf '%s' "$new_id" > "$id_file"; } 2>/dev/null || true
+        printf '%s' "$new_id"
+        return 0
+    fi
+    # Read path: the minted per-run file wins over LOKI_SESSION_ID so a resume
+    # in a separate process (no exported LOKI_TRUST_RUN_ID) still resolves to
+    # the same run, and a stable issue-number session id never collapses re-runs.
+    if [ -s "$id_file" ]; then
+        cat "$id_file" 2>/dev/null || true
+        return 0
+    fi
+    if [ -n "${LOKI_SESSION_ID:-}" ]; then
+        printf '%s' "$LOKI_SESSION_ID"
+        return 0
+    fi
+    # No persisted id and no session id: empty -> writer records "unknown".
+    printf '%s' ""
+}
+# record_trust_event_bash <event_type> [key=value ...]
+# Best-effort hand-off of one trust event to lib/trust_metrics.py under
+# $SCRIPT_DIR. Quietly does nothing when that module or python3 is missing.
+# The run id is $LOKI_TRUST_RUN_ID when set (the run_start site exports the
+# freshly minted id so the first event matches), otherwise whatever
+# _loki_trust_run_id resolves. All interpreter output is discarded and the
+# function always finishes with status 0.
+record_trust_event_bash() {
+    local kind="$1"
+    shift || true
+    local writer="$SCRIPT_DIR/lib/trust_metrics.py"
+    if [ ! -f "$writer" ]; then
+        return 0
+    fi
+    if ! command -v python3 >/dev/null 2>&1; then
+        return 0
+    fi
+    local state_root="${LOKI_DIR:-${TARGET_DIR:-.}/.loki}"
+    local resolved_id="${LOKI_TRUST_RUN_ID:-$(_loki_trust_run_id)}"
+    # The remaining key=value arguments travel as argv and are split by the
+    # Python side, so no JSON is assembled in shell. Arguments without "=" are
+    # dropped; values are forwarded as strings.
+    _TM_LOKI_DIR="$state_root" \
+    _TM_MOD_PATH="$writer" \
+    _TM_EVENT_TYPE="$kind" \
+    _TM_RUN_ID="$resolved_id" \
+    _TM_ITERATION="${ITERATION_COUNT:-0}" \
+    python3 - "$@" <<'TRUST_EVENT_PY' >/dev/null 2>&1 || true
+import importlib.util
+import os
+import sys
+env = os.environ
+spec = importlib.util.spec_from_file_location("trust_metrics", env["_TM_MOD_PATH"])
+tm = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(tm)
+fields = dict(arg.split("=", 1) for arg in sys.argv[1:] if "=" in arg)
+tm.record_trust_event(
+    env["_TM_LOKI_DIR"],
+    env["_TM_EVENT_TYPE"],
+    run_id=env.get("_TM_RUN_ID", "") or None,
+    iteration=env.get("_TM_ITERATION", "0"),
+    **fields,
+)
+TRUST_EVENT_PY
+}
 # v7.0.2: Bash helper to emit a managed-agents event to the dashboard's
 # managed event log (.loki/managed/events.ndjson). Mirrors the Python
 # emit_managed_event helper so bash callers can land events in the same
@@ -2478,6 +2566,26 @@ except Exception:
     local ts
     ts="$(date -u +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date)"
+    # v7.28.0: evidence-gate inconclusive line. When the evidence gate could not
+    # establish a diff baseline (no git repo, or no run-start SHA), it records a
+    # durable .loki/state/evidence-inconclusive.json instead of silently passing.
+    # Surface one honest line so the user knows completion was not independently
+    # verified. The record is removed by the gate on any conclusive run.
+    local evidence_inconclusive_line=""
+    local _inc_file="$loki_dir/state/evidence-inconclusive.json"
+    if [ -f "$_inc_file" ]; then
+        local _inc_reason
+        _inc_reason="$(python3 -c "import json,sys
+try:
+    d=json.load(open(sys.argv[1]))
+    print(d.get('reason','') if d.get('inconclusive') else '')
+except Exception:
+    print('')" "$_inc_file" 2>/dev/null)"
+        if [ -n "$_inc_reason" ]; then
+            evidence_inconclusive_line="Evidence gate: inconclusive (${_inc_reason}) - completion not independently verified"
+        fi
+    fi
     # ---- Durable human-readable file: .loki/COMPLETION.txt --------------------
     {
         echo "Loki Mode run summary"
@@ -2508,6 +2616,10 @@ except Exception:
         fi
         echo "Tasks: pending=$pending in_progress=$in_progress completed=$completed failed=$failed"
         echo ""
+        if [ -n "$evidence_inconclusive_line" ]; then
+            echo "$evidence_inconclusive_line"
+            echo ""
+        fi
         echo "Review the work:"
         echo "  $review_cmd"
         echo ""
@@ -2916,7 +3028,7 @@ spawn_worktree_session() {
                     >> "$log_file" 2>&1 || _wt_exit=$?
                 ;;
             codex)
-                codex exec --full-auto \
+                codex exec --full-auto --skip-git-repo-check \
                     "Loki Mode: $task_prompt. Read .loki/CONTINUITY.md for context." \
                     >> "$log_file" 2>&1 || _wt_exit=$?
                 ;;
@@ -3117,7 +3229,7 @@ Output ONLY the resolved file content with no conflict markers. No explanations.
                 resolution=$(claude --dangerously-skip-permissions -p "$conflict_prompt" --output-format text 2>/dev/null)
                 ;;
             codex)
-                resolution=$(codex exec --full-auto "$conflict_prompt" 2>/dev/null)
+                resolution=$(codex exec --full-auto --skip-git-repo-check "$conflict_prompt" 2>/dev/null)
                 ;;
             cline)
                 resolution=$(invoke_cline_capture "$conflict_prompt" 2>/dev/null)
@@ -4602,6 +4714,57 @@ print_ttfv_next_steps() {
     return 0
 }
+# _read_iteration_cost <iteration>
+# Print one line "input output cost cache_read cache_creation" for the given
+# iteration. Source precedence:
+#   1. .loki/metrics/result-cost-<iteration>.json, the authoritative record
+#      written by the embedded stream parser (Claude's own total_cost_usd +
+#      usage, slug/symlink-independent);
+#   2. the entry for <iteration> in .loki/context/tracking.json (the
+#      context-tracker-derived estimate; the last matching entry wins);
+#   3. all zeros when neither file exists.
+# Best-effort: any parse failure yields "0 0 0 0 0" and never aborts.
+# Paths are relative to the current working directory.
+_read_iteration_cost() {
+    local iteration="$1"
+    local result_cost_file=".loki/metrics/result-cost-${iteration}.json"
+    local tracking_file=".loki/context/tracking.json"
+    # The file path and iteration are handed to Python through argv rather than
+    # spliced into the program text, so a quote in the path or a non-numeric
+    # iteration can neither break the script nor inject code into it.
+    if [ -f "$result_cost_file" ]; then
+        python3 -c '
+import json, sys
+try:
+    with open(sys.argv[1]) as fh:
+        d = json.load(fh)
+    print(
+        d.get("input_tokens", 0) or 0,
+        d.get("output_tokens", 0) or 0,
+        d.get("total_cost_usd", 0) or 0,
+        d.get("cache_read_tokens", 0) or 0,
+        d.get("cache_creation_tokens", 0) or 0,
+    )
+except Exception:
+    print(0, 0, 0, 0, 0)
+' "$result_cost_file" 2>/dev/null || echo "0 0 0 0 0"
+    elif [ -f "$tracking_file" ]; then
+        python3 -c '
+import json, sys
+try:
+    wanted = int(sys.argv[2])
+    with open(sys.argv[1]) as fh:
+        t = json.load(fh)
+    match = [i for i in t.get("per_iteration", []) if i.get("iteration") == wanted]
+    if match:
+        m = match[-1]
+        # "or 0" mirrors the result-cost branch: a JSON null must not be
+        # printed as the word None into the fields the caller reads.
+        print(
+            m.get("input_tokens", 0) or 0,
+            m.get("output_tokens", 0) or 0,
+            m.get("cost_usd", 0) or 0,
+            m.get("cache_read_tokens", 0) or 0,
+            m.get("cache_creation_tokens", 0) or 0,
+        )
+    else:
+        print(0, 0, 0, 0, 0)
+except Exception:
+    print(0, 0, 0, 0, 0)
+' "$tracking_file" "$iteration" 2>/dev/null || echo "0 0 0 0 0"
+    else
+        echo "0 0 0 0 0"
+    fi
+}
 track_iteration_complete() {
     local iteration="$1"
     local exit_code="${2:-0}"
@@ -4684,32 +4847,14 @@ track_iteration_complete() {
     local phase="${LAST_KNOWN_PHASE:-}"
     [ -z "$phase" ] && phase=$(python3 -c "import json; print(json.load(open('.loki/state/orchestrator.json')).get('currentPhase', 'unknown'))" 2>/dev/null || echo "unknown")
-    # Read token data from context tracker output (v5.42.0)
+    # Read token data, preferring Claude's authoritative result-cost file over
+    # the context-tracker estimate (v7.28.0 cost-capture fix). See
+    # _read_iteration_cost for precedence rationale.
     # v6.82.0: also capture cache_read_tokens / cache_creation_tokens for
     # prompt-cache hit-rate analysis (S1.1 prompt restructure).
     local iter_input=0 iter_output=0 iter_cost=0
     local iter_cache_read=0 iter_cache_creation=0
-    if [ -f ".loki/context/tracking.json" ]; then
-        read iter_input iter_output iter_cost iter_cache_read iter_cache_creation < <(python3 -c "
-import json
-try:
-    t = json.load(open('.loki/context/tracking.json'))
-    iters = t.get('per_iteration', [])
-    match = [i for i in iters if i.get('iteration') == $iteration]
-    if match:
-        m = match[-1]
-        print(
-            m.get('input_tokens', 0),
-            m.get('output_tokens', 0),
-            m.get('cost_usd', 0),
-            m.get('cache_read_tokens', 0),
-            m.get('cache_creation_tokens', 0),
-        )
-    else:
-        print(0, 0, 0, 0, 0)
-except: print(0, 0, 0, 0, 0)
-" 2>/dev/null || echo "0 0 0 0 0")
-    fi
+    read -r iter_input iter_output iter_cost iter_cache_read iter_cache_creation < <(_read_iteration_cost "$iteration")
     cat > ".loki/metrics/efficiency/iteration-${iteration}.json" << EFF_EOF
 {
@@ -6551,6 +6696,13 @@ print(counts[gate_name])
         loki_crash_friction "gate_failure" "gate=${gate_name} consecutive=${count}" >/dev/null 2>&1 || true
     fi
+    # Trust-metrics: append a durable per-failure record so the gate-failure
+    # distribution survives clear_gate_failure (which resets the running
+    # counter). CRITICAL: this function's stdout IS its return value, so the
+    # write is fully stdout-suppressed and best-effort; it cannot change the
+    # echoed count or any gate behavior.
+    record_trust_event_bash "gate_failure" "gate=${gate_name}" "consecutive=${count}" >/dev/null 2>&1 || true
     echo "$count"
 }
@@ -7500,7 +7652,7 @@ BUILD_PROMPT
                         --output-format text > "$review_output" 2>/dev/null
                     ;;
                 codex)
-                    codex exec --full-auto "$prompt_text" \
+                    codex exec --full-auto --skip-git-repo-check "$prompt_text" \
                         > "$review_output" 2>/dev/null
                     ;;
                 cline)
@@ -7715,7 +7867,7 @@ ADVERSARIAL_EOF
             ;;
         codex)
             if command -v codex &>/dev/null; then
-                codex exec --full-auto "$adversarial_prompt" \
+                codex exec --full-auto --skip-git-repo-check "$adversarial_prompt" \
                     > "$result_file" 2>/dev/null || true
             fi
             ;;
@@ -11900,6 +12052,19 @@ run_autonomous() {
     _LOKI_RUN_START_SHA="$(cat "$_start_sha_file" 2>/dev/null || echo "")"
     export _LOKI_RUN_START_SHA
+    # Trust-metrics instrumentation marker: record one run_start event per
+    # fresh run so the trust-metrics denominator counts ONLY instrumented runs.
+    # This is what lets the aggregator distinguish "0 blocks measured" from
+    # "this run predates instrumentation" (the central honesty rule). Additive,
+    # best-effort, stdout-silent; never affects control flow. Mint a fresh
+    # per-run id here and export it so every later event in this run shares it
+    # (LOKI_SESSION_ID is absent on the `loki start` path).
+    if [ "${ITERATION_COUNT:-0}" -eq 0 ]; then
+        LOKI_TRUST_RUN_ID="$(_loki_trust_run_id --new)"
+        export LOKI_TRUST_RUN_ID
+        record_trust_event_bash "run_start" "start_sha=${_LOKI_RUN_START_SHA:-}" 2>/dev/null || true
+    fi
     # Notify dashboard of active project directory (for AI Chat cross-directory usage)
     if command -v curl &>/dev/null; then
         local project_cwd
@@ -12244,8 +12409,15 @@ except Exception as exc:
             claude)
                 # Claude: Full features with stream-json output and agent tracking
                 # Uses dynamic tier for model selection based on RARV phase
-                # Pass tier to Python via environment for dashboard display
-                { LOKI_CURRENT_MODEL="$tier_param" \
+                # Pass tier + iteration to the embedded stream parser via the
+                # environment. A bare `VAR=val cmd | parser` prefix applies ONLY
+                # to `cmd` (claude) and does NOT cross the pipe to the parser
+                # subprocess, so these must be exported into the shell env first.
+                # LOKI_ITERATION lets the parser stamp the authoritative
+                # result-cost file under the correct iteration index.
+                export LOKI_CURRENT_MODEL="$tier_param"
+                export LOKI_ITERATION="$ITERATION_COUNT"
+                { \
                 claude "${_loki_claude_argv[@]}" -p "$prompt" \
             --output-format stream-json --verbose 2>&1 | \
             tee -a "$log_file" "$agent_log" "$iter_output" | \
@@ -12558,6 +12730,34 @@ def process_stream():
                     active_agents[orchestrator_id]["tasks_completed"].append(f"{tool_count} tools used")
                 save_agents()
+                # Authoritative cost capture (path/slug/symlink-independent).
+                # Claude'"'"'s result message carries its own total_cost_usd plus a
+                # full usage object. The context-tracker session-file path is
+                # brittle (slug derivation must guess Claude'"'"'s naming), so this
+                # stamps the authoritative number to a per-iteration file that
+                # the efficiency writer prefers. Best-effort: a malformed or
+                # missing field must never break the iteration loop.
+                try:
+                    _iter = os.environ.get("LOKI_ITERATION", "0")
+                    _u = data.get("usage", {}) or {}
+                    _rec = {
+                        "total_cost_usd": data.get("total_cost_usd"),
+                        "input_tokens": _u.get("input_tokens", 0),
+                        "output_tokens": _u.get("output_tokens", 0),
+                        "cache_read_tokens": _u.get("cache_read_input_tokens", 0),
+                        "cache_creation_tokens": _u.get("cache_creation_input_tokens", 0),
+                    }
+                    if _rec["total_cost_usd"] is not None:
+                        os.makedirs(".loki/metrics", exist_ok=True)
+                        _p = ".loki/metrics/result-cost-" + str(_iter) + ".json"
+                        _tmp = _p + ".tmp"
+                        with open(_tmp, "w") as _f:
+                            json.dump(_rec, _f)
+                        os.replace(_tmp, _p)
+                except Exception:
+                    pass
                 print(f"\n{GREEN}[Session complete]{NC}", flush=True)
                 is_error = data.get("is_error", False)
                 sys.exit(1 if is_error else 0)
@@ -12586,7 +12786,7 @@ if __name__ == "__main__":
                 # Uses dynamic tier from RARV phase (tier_param already set above)
                 { LOKI_CODEX_REASONING_EFFORT="$tier_param" \
                 CODEX_MODEL_REASONING_EFFORT="$tier_param" \
-                codex exec --full-auto \
+                codex exec --full-auto --skip-git-repo-check \
                     "$prompt" 2>&1 | tee -a "$log_file" "$agent_log" "$iter_output"; \
                 } && exit_code=0 || exit_code=$?
                 ;;
@@ -12990,7 +13190,36 @@ if __name__ == "__main__":
             case "${gate_failures:-}" in
                 *code_review,*|*code_review_ESCALATED*) _gate_block_for_completion="code_review" ;;
             esac
-            if [ -n "$_gate_block_for_completion" ] && check_completion_promise "$iter_output"; then
+            # DROP-FIX (v7.28): check_completion_promise -> check_task_completion_signal
+            # CONSUMES the completion signal (rm -f) on the FIRST successful call.
+            # The completion-promise chain below calls it up to five times in one
+            # iteration (reverify guard, code-review arm, evidence arm, held-out
+            # arm, success arm), so the first call consumed the claim and every
+            # later arm saw nothing -- the success arm never fired and the run
+            # iterated to max_iterations even though the agent had claimed done.
+            # Fix: evaluate the claim EXACTLY ONCE here, capture it in
+            # _completion_claimed, and have every arm test that variable. The
+            # single call discards stdout (matching the prior call sites, which
+            # also discarded it), so the task_completion_claim event still emits
+            # exactly once. Consumption semantics are preserved: the claim is
+            # consumed when evaluated; if a gate rejects it, the agent must
+            # re-claim next iteration (see internal/DEMO-CLAIM-DROP-BUG.md).
+            local _completion_claimed=0
+            if check_completion_promise "$iter_output"; then
+                _completion_claimed=1
+            fi
+            # MEDIUM-3: this completion-promise route evaluates the council hard
+            # gates (evidence + held-out) without the council_evaluate freshness
+            # step, so the held-out gate could read stale verification statuses
+            # (and a stale reservation). Re-verify the checklist ONCE here, but
+            # only when a completion claim is actually present (mirror the
+            # check_completion_promise condition used by the gate chain below) so
+            # verification does not run every iteration. Type-guarded and
+            # best-effort: failure must never block the completion path.
+            if [ "$_completion_claimed" = 1 ] && type council_reverify_checklist &>/dev/null; then
+                council_reverify_checklist 2>/dev/null || true
+            fi
+            if [ -n "$_gate_block_for_completion" ] && [ "$_completion_claimed" = 1 ]; then
                 log_warn "Completion claim rejected: code review is BLOCKED for this iteration (Critical/High findings). Fix review issues before completion."
                 log_warn "  Review details under .loki/quality/reviews/ ; gate_failures=${gate_failures}"
                 _gate_block_for_completion=""
@@ -13005,11 +13234,24 @@ if __name__ == "__main__":
             # LOKI_EVIDENCE_GATE=0 (council_evidence_gate returns 0 immediately
             # when disabled, so this branch never fires). Gate output (reason +
             # opt-out hint) is printed by council_evidence_gate itself.
-            elif check_completion_promise "$iter_output" && type council_evidence_gate &>/dev/null && ! council_evidence_gate; then
+            elif [ "$_completion_claimed" = 1 ] && type council_evidence_gate &>/dev/null && ! council_evidence_gate; then
                 log_warn "Completion claim rejected: evidence gate found no proof of completion (empty diff vs run-start SHA, or red tests)."
                 log_warn "  Details under .loki/council/evidence-block.json ; opt out with LOKI_EVIDENCE_GATE=0"
                 # Fall through; keep iterating until there is real evidence.
-            elif check_completion_promise "$iter_output"; then
+            # v7.28.0: the held-out spec-eval gate must also guard the DEFAULT
+            # completion-promise route, not only the interval-gated council path
+            # (council_evaluate). Otherwise an agent can self-assert "done" and
+            # exit as completion_promise_fulfilled while a held-out acceptance
+            # check is failing, bypassing the anti-reward-hacking gate entirely.
+            # Mirrors the evidence-gate block above. Opt-out: the gate's own
+            # LOKI_HELDOUT_GATE=0 (council_heldout_gate returns 0 immediately
+            # when disabled or when no held-out items are reserved, so this
+            # branch never fires). Gate output is printed by council_heldout_gate.
+            elif [ "$_completion_claimed" = 1 ] && type council_heldout_gate &>/dev/null && ! council_heldout_gate; then
+                log_warn "Completion claim rejected: held-out spec-eval gate found failing held-out acceptance check(s)."
+                log_warn "  Details under .loki/council/heldout-block.json ; opt out with LOKI_HELDOUT_GATE=0"
+                # Fall through; keep iterating until the held-out checks pass.
+            elif [ "$_completion_claimed" = 1 ]; then
                 echo ""
                 if [ -n "$COMPLETION_PROMISE" ]; then
                     log_header "COMPLETION PROMISE FULFILLED: $COMPLETION_PROMISE"
@@ -13383,10 +13625,19 @@ check_human_intervention() {
     if [ -f "$loki_dir/signals/COUNCIL_REVIEW_REQUESTED" ]; then
         log_info "Council force-review requested from dashboard"
         rm -f "$loki_dir/signals/COUNCIL_REVIEW_REQUESTED"
+        # MEDIUM-3: this route evaluates the council hard gates directly without
+        # the council_evaluate freshness step, so re-verify the checklist ONCE
+        # before the gate chain to restore that invariant (refreshes held-out
+        # statuses and repairs a stale reservation). Type-guarded, best-effort.
+        if type council_reverify_checklist &>/dev/null; then
+            council_reverify_checklist 2>/dev/null || true
+        fi
         if type council_checklist_gate &>/dev/null && ! council_checklist_gate; then
             log_info "Council force-review: blocked by checklist hard gate"
         elif type council_evidence_gate &>/dev/null && ! council_evidence_gate; then
             log_info "Council force-review: blocked by evidence hard gate"
+        elif type council_heldout_gate &>/dev/null && ! council_heldout_gate; then
+            log_info "Council force-review: blocked by held-out spec-eval hard gate"
         elif type council_vote &>/dev/null && council_vote; then
             log_header "COMPLETION COUNCIL: FORCE REVIEW - PROJECT COMPLETE"
             # BUG #17 fix: Write COMPLETED marker, generate council report, and