npm - loki-mode - Versions diffs - 7.27.0 → 7.28.0 - Mend

loki-mode 7.27.0 → 7.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

package/README.md +3 -2
package/SKILL.md +11 -2
package/VERSION +1 -1
package/autonomy/completion-council.sh +285 -6
package/autonomy/context-tracker.py +32 -7
package/autonomy/grill.sh +321 -0
package/autonomy/loki +49 -0
package/autonomy/prd-checklist.sh +248 -14
package/autonomy/run.sh +170 -27
package/autonomy/spec.sh +646 -0
package/autonomy/verify.sh +55 -0
package/dashboard/__init__.py +1 -1
package/docs/INSTALLATION.md +1 -1
package/loki-ts/dist/loki.js +2 -2
package/mcp/__init__.py +1 -1
package/package.json +2 -1
package/skills/quality-gates.md +46 -0

package/autonomy/prd-checklist.sh CHANGED Viewed

@@ -105,6 +105,198 @@ checklist_should_verify() {
     return 0
 }
+#===============================================================================
+# Held-out Spec Eval Selection (v7.28.0)
+#===============================================================================
+# Anti-reward-hacking: deterministically reserve ~25% of checklist items as
+# "held-out". Held-out item IDs are excluded from the prompt feed the build loop
+# sees (checklist_summary and council_checklist_gate), so a cooperative build
+# agent is not steered toward those specific acceptance checks. The completion
+# council evaluates them at the ship gate (council_heldout_gate in
+# completion-council.sh). Scope of the guarantee: this protects the prompt feed,
+# not a sandbox. .loki/checklist/held-out.json is plain on-disk JSON, so a
+# non-cooperative agent with filesystem tools can read the reservation directly.
+#
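+# Selection is idempotent and reproducible: count = clamp(round(0.25*N), 1, 5)
+# for N>=4 items; ordering by sha256 of each item's "id" (stable, not random).
+# Written to .loki/checklist/held-out.json and left untouched while every
+# reserved id still exists in the checklist; it is rewritten only to repair a
+# stale reservation (the RESELECTED / PARTIAL cases in checklist_select_heldout).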
+checklist_select_heldout() {
+    # Create or repair the held-out reservation for the current checklist.
+    # Globals: CHECKLIST_FILE (read), CHECKLIST_DIR (read; default .loki/checklist)
+    # Outputs: may write <CHECKLIST_DIR>/held-out.json; warnings go through log_warn
+    # Returns: 0 (selection is best-effort and never blocks the caller)
+    local heldout_file="${CHECKLIST_DIR:-".loki/checklist"}/held-out.json"
+    if [ ! -f "$CHECKLIST_FILE" ]; then
+        return 0
+    fi
+    # The Python below handles every case and prints a single status token so
+    # bash can log honestly and emit the right trust event:
+    #   FRESH n           - no prior reservation, selected n (file written)
+    #   IDEMPOTENT        - prior reservation fully valid vs current ids (no-op,
+    #                       file untouched: preserves the idempotency case 1 tests)
+    #   RESELECTED n      - prior reservation fully stale (zero ids survive); the
+    #                       checklist regenerated, so we deterministically re-select
+    #                       n items from the CURRENT checklist and overwrite
+    #   PARTIAL kept=k dropped=d - some prior ids survived; we keep only survivors
+    #   DUP_SKIP          - current checklist ids are not unique; the id-based
+    #                       mechanism is unsound, so we reserve nothing (MEDIUM-2)
+    #   NOOP              - n<4 with no prior file (an empty reservation is
+    #                       written), an unreadable checklist, or python3 failing
+    # Honest caveat: re-selection or partial-survival after a regen can reserve
+    # items the build loop already saw in earlier prompts (the hidden-from-loop
+    # guarantee is best-effort once the checklist ids change mid-run).
+    local status_token
+    status_token=$(_CHECKLIST_FILE="$CHECKLIST_FILE" _HELDOUT_FILE="$heldout_file" python3 -c "
+import json, os, sys, hashlib, tempfile
+cl_path = os.environ['_CHECKLIST_FILE']
+out_path = os.environ['_HELDOUT_FILE']
+try:
+    with open(cl_path) as f:
+        data = json.load(f)
+except Exception:
+    print('NOOP')
+    sys.exit(0)
+# Collect all item ids in document order.
+ids = []
+for cat in data.get('categories', []):
+    for item in cat.get('items', []):
+        iid = item.get('id', '')
+        if iid:
+            ids.append(iid)
+n = len(ids)
+id_set = set(ids)
+# MEDIUM-2: duplicate ids make the id-based hide/select mechanism unsound. Skip
+# selection entirely (no reservation written) so a held-out id can never map to
+# more than one item. Do NOT touch an existing reservation file here (a stale
+# valid file left over from before a dup-introducing regen is handled by the
+# council gate's STALE path; over-removing would be over-engineering).
+if len(id_set) != n:
+    print('DUP_SKIP')
+    sys.exit(0)
+def select_count(num_ids):
+    # Python 3 round() rounds exact halves to the nearest even integer
+    # (N=10 -> 2, N=14 -> 4); the result is then clamped to the range 1..5.
+    c = round(0.25 * num_ids)
+    if c < 1:
+        c = 1
+    if c > 5:
+        c = 5
+    return c
+def fresh_selection():
+    # Deterministic order: sort ids by sha256(id), take the first <count>.
+    count = select_count(n)
+    ranked = sorted(ids, key=lambda i: hashlib.sha256(i.encode('utf-8')).hexdigest())
+    return sorted(ranked[:count])
+def atomic_write(payload):
+    # Write to a temp file in the target directory, then rename over the target.
+    d = os.path.dirname(out_path) or '.'
+    os.makedirs(d, exist_ok=True)
+    fd, tmp = tempfile.mkstemp(dir=d, suffix='.tmp')
+    try:
+        with os.fdopen(fd, 'w') as f:
+            json.dump(payload, f, indent=2)
+            f.write('\n')
+        os.replace(tmp, out_path)
+    except BaseException:
+        # Do not leave a stray *.tmp behind when the write or rename fails.
+        try:
+            os.unlink(tmp)
+        except OSError:
+            pass
+        raise
+prior = None
+if os.path.exists(out_path):
+    try:
+        with open(out_path) as f:
+            prior = json.load(f)
+    except Exception:
+        prior = None
+    # A reservation that is valid JSON but not an object is as unusable as an
+    # unparsable one; treat both as having no prior reservation.
+    if not isinstance(prior, dict):
+        prior = None
+if prior is not None:
+    prior_ids = [i for i in prior.get('held_out', []) if i]
+    # A prior reservation of [] (e.g. an earlier n<4 run) is a valid no-op state;
+    # keep it idempotent rather than re-selecting now that n may have grown.
+    if not prior_ids:
+        print('IDEMPOTENT')
+        sys.exit(0)
+    survivors = [i for i in prior_ids if i in id_set]
+    if len(survivors) == len(prior_ids):
+        # Fully valid against the current checklist: idempotent no-op.
+        print('IDEMPOTENT')
+        sys.exit(0)
+    if not survivors:
+        # Fully stale: the checklist regenerated and orphaned the reservation.
+        # Deterministically re-select from the CURRENT checklist.
+        if n < 4:
+            atomic_write({'held_out': [], 'total_items': n,
+                          'note': 'n<4: no held-out reserved (re-selected after stale reservation)'})
+            print('RESELECTED 0')
+            sys.exit(0)
+        held = fresh_selection()
+        atomic_write({'held_out': held, 'total_items': n})
+        print('RESELECTED %d' % len(held))
+        sys.exit(0)
+    # Partial survival: keep only the surviving ids. The shrink is reported via
+    # the PARTIAL token (kept/dropped counts) rather than applied silently.
+    dropped = len(prior_ids) - len(survivors)
+    payload = {'held_out': sorted(survivors), 'total_items': n}
+    atomic_write(payload)
+    print('PARTIAL kept=%d dropped=%d' % (len(survivors), dropped))
+    sys.exit(0)
+# No prior reservation: first selection.
+if n < 4:
+    # N>=4 gate: smaller checklists get no held-out (nothing to hide reliably).
+    atomic_write({'held_out': [], 'total_items': n, 'note': 'n<4: no held-out reserved'})
+    print('NOOP')
+    sys.exit(0)
+held = fresh_selection()
+atomic_write({'held_out': held, 'total_items': n})
+print('FRESH %d' % len(held))
+" 2>/dev/null || echo "NOOP")
+    # Honest logging + trust event on any stale repair (type-guarded).
+    local tok rest
+    read -r tok rest <<< "$status_token"
+    case "$tok" in
+        RESELECTED)
+            log_warn "[checklist] held-out reservation stale (checklist regenerated); re-selected ${rest:-0} items"
+            if type record_trust_event_bash &>/dev/null; then
+                record_trust_event_bash "heldout_stale" \
+                    "detail=reselected" \
+                    "reselected=${rest:-0}" \
+                    >/dev/null 2>&1 || true
+            fi
+            ;;
+        PARTIAL)
+            log_warn "[checklist] held-out reservation partially stale (checklist regenerated); $rest"
+            if type record_trust_event_bash &>/dev/null; then
+                # $rest is "kept=<k> dropped=<d>". Split it so every key=value
+                # pair is its own argument, the same shape the RESELECTED call
+                # uses, instead of one argument with an embedded space.
+                local -a partial_fields=()
+                read -r -a partial_fields <<< "$rest"
+                record_trust_event_bash "heldout_stale" \
+                    "detail=partial" \
+                    ${partial_fields[@]+"${partial_fields[@]}"} \
+                    >/dev/null 2>&1 || true
+            fi
+            ;;
+        DUP_SKIP)
+            log_warn "[checklist] checklist ids are not unique; held-out selection skipped (id-based reservation is unsound with duplicate ids)"
+            ;;
+    esac
+    return 0
+}
+# Echo held-out item IDs (one per line) to stdout. Empty when none reserved.
+# Reads <CHECKLIST_DIR>/held-out.json (default dir: .loki/checklist). A missing,
+# unreadable, or malformed file produces no output, and the function still
+# returns 0.
+checklist_heldout_ids() {
+    local heldout_file="${CHECKLIST_DIR:-".loki/checklist"}/held-out.json"
+    if [ ! -f "$heldout_file" ]; then
+        return 0
+    fi
+    _HELDOUT_FILE="$heldout_file" python3 -c "
+import json, os
+try:
+    with open(os.environ['_HELDOUT_FILE']) as f:
+        data = json.load(f)
+    # Skip empty/null entries, the same way checklist_select_heldout reads this
+    # list, so callers never receive a blank line or the literal text None.
+    for i in data.get('held_out', []):
+        if i:
+            print(i)
+except Exception:
+    # Best-effort reader: any load or shape error means no ids are reported.
+    pass
+" 2>/dev/null || true
+}
 #===============================================================================
 # Verification
 #===============================================================================
@@ -118,6 +310,10 @@ checklist_verify() {
         return 0
     fi
+    # Held-out selection happens BEFORE the first verification so that the very
+    # first verification-results.json summary already excludes held-out items.
+    checklist_select_heldout
     local script_dir
     script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
     local verify_script="${script_dir}/checklist-verify.py"
@@ -160,16 +356,12 @@ checklist_summary() {
     _CHECKLIST_RESULTS="$CHECKLIST_RESULTS_FILE" \
     _CHECKLIST_WAIVERS="${CHECKLIST_DIR:-".loki/checklist"}/waivers.json" \
+    _CHECKLIST_HELDOUT="${CHECKLIST_DIR:-".loki/checklist"}/held-out.json" \
     python3 -c "
 import json, sys, os
 try:
     fpath = os.environ.get('_CHECKLIST_RESULTS', '')
     data = json.load(open(fpath))
-    s = data.get('summary', {})
-    total = s.get('total', 0)
-    verified = s.get('verified', 0)
-    failing = s.get('failing', 0)
-    pending = s.get('pending', 0)
     # Load waivers
     waived_ids = set()
@@ -184,26 +376,68 @@ try:
         except Exception:
             pass
-    # Count waived items and adjust failing list
-    waived_count = 0
-    if total == 0:
-        print('')
-    else:
+    # Load held-out item ids (v7.28.0). Held-out items are NEVER surfaced to the
+    # build loop: they are fully excluded from the counts and the failing list so
+    # the build agent cannot tune to them. The council evaluates them separately.
+    heldout_ids = set()
+    heldout_path = os.environ.get('_CHECKLIST_HELDOUT', '')
+    if heldout_path and os.path.exists(heldout_path):
+        try:
+            with open(heldout_path) as hf:
+                hdata = json.load(hf)
+            heldout_ids = set(hdata.get('held_out', []))
+        except Exception:
+            pass
+    # Count all checklist items first so we can detect the pathological case
+    # where hiding would empty the summary on a non-empty checklist (MEDIUM-2).
+    all_items = 0
+    for cat in data.get('categories', []):
+        all_items += len(cat.get('items', []))
+    def compute(apply_heldout):
+        total = verified = pending = failing = waived_count = 0
         failing_items = []
         for cat in data.get('categories', []):
             for item in cat.get('items', []):
                 item_id = item.get('id', '')
+                if apply_heldout and item_id in heldout_ids:
+                    continue
                 if item_id in waived_ids:
                     waived_count += 1
                     continue
-                if item.get('status') == 'failing' and item.get('priority') in ('critical', 'major'):
-                    failing_items.append(item.get('title', item.get('id', '?')))
+                total += 1
+                status = item.get('status')
+                if status == 'verified':
+                    verified += 1
+                elif status == 'failing':
+                    failing += 1
+                    if item.get('priority') in ('critical', 'major'):
+                        failing_items.append(item.get('title', item.get('id', '?')))
+                else:
+                    pending += 1
+        return total, verified, pending, failing, waived_count, failing_items
+    # Recompute counts over the VISIBLE (non-held-out) items so 'total' never
+    # leaks the existence of held-out items. Waived items are excluded too.
+    total, verified, pending, failing, waived_count, failing_items = compute(True)
+    # MEDIUM-2 guard: if hiding held-out items would empty the summary while the
+    # checklist itself is non-empty, fall back to showing all items (do not hide)
+    # and warn. Returning an empty summary on a non-empty checklist reads as 'no
+    # checklist' to the prompt feed, which is a worse failure than a small leak.
+    if total == 0 and all_items > 0:
+        print('held-out hiding would empty a non-empty checklist summary; showing all items', file=sys.stderr)
+        total, verified, pending, failing, waived_count, failing_items = compute(False)
+    if total == 0:
+        print('')
+    else:
         detail = ''
         if failing_items:
             detail = ' FAILING: ' + ', '.join(failing_items[:5])
         waived_str = f', {waived_count} waived' if waived_count > 0 else ''
-        adjusted_failing = max(0, failing - waived_count)
-        print(f'{verified}/{total} verified, {adjusted_failing} failing{waived_str}, {pending} pending.{detail}')
+        print(f'{verified}/{total} verified, {failing} failing{waived_str}, {pending} pending.{detail}')
 except Exception:
     print('', file=sys.stderr)
 " 2>/dev/null || echo ""

package/autonomy/run.sh CHANGED Viewed

@@ -2566,6 +2566,26 @@ except Exception:
     local ts
     ts="$(date -u +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date)"
+    # v7.28.0: evidence-gate inconclusive line. When the evidence gate could not
+    # establish a diff baseline (no git repo, or no run-start SHA), it records a
+    # durable .loki/state/evidence-inconclusive.json instead of silently passing.
+    # Surface one honest line so the user knows completion was not independently
+    # verified. The record is removed by the gate on any conclusive run.
+    local evidence_inconclusive_line=""
+    local _inc_file="$loki_dir/state/evidence-inconclusive.json"
+    if [ -f "$_inc_file" ]; then
+        local _inc_reason
+        _inc_reason="$(python3 -c "import json,sys
+try:
+    d=json.load(open(sys.argv[1]))
+    print(d.get('reason','') if d.get('inconclusive') else '')
+except Exception:
+    print('')" "$_inc_file" 2>/dev/null)"
+        if [ -n "$_inc_reason" ]; then
+            evidence_inconclusive_line="Evidence gate: inconclusive (${_inc_reason}) - completion not independently verified"
+        fi
+    fi
     # ---- Durable human-readable file: .loki/COMPLETION.txt --------------------
     {
         echo "Loki Mode run summary"
@@ -2596,6 +2616,10 @@ except Exception:
         fi
         echo "Tasks: pending=$pending in_progress=$in_progress completed=$completed failed=$failed"
         echo ""
+        if [ -n "$evidence_inconclusive_line" ]; then
+            echo "$evidence_inconclusive_line"
+            echo ""
+        fi
         echo "Review the work:"
         echo "  $review_cmd"
         echo ""
@@ -4690,6 +4714,57 @@ print_ttfv_next_steps() {
     return 0
 }
+# _read_iteration_cost <iteration>
+# Emit "input output cost cache_read cache_creation" for the given iteration,
+# preferring the authoritative result-cost file written by the embedded stream
+# parser (Claude's own total_cost_usd + usage, slug/symlink-independent) over
+# the context-tracker-derived estimate in tracking.json. Falls back to
+# tracking.json when no result-cost file exists, and to all zeros otherwise.
+# Best-effort: any parse failure yields "0 0 0 0 0" and never aborts.
+# Arguments: $1 - iteration index (an integer)
+# Outputs:   one line with five space-separated numbers on stdout
+_read_iteration_cost() {
+    local iteration="$1"
+    local result_cost_file=".loki/metrics/result-cost-${iteration}.json"
+    if [ -f "$result_cost_file" ]; then
+        # The path is handed over as argv[1] rather than spliced into the Python
+        # source, so an unusual iteration value cannot change the program text.
+        python3 -c "
+import json, sys
+try:
+    with open(sys.argv[1]) as fh:
+        d = json.load(fh)
+    print(
+        d.get('input_tokens', 0) or 0,
+        d.get('output_tokens', 0) or 0,
+        d.get('total_cost_usd', 0) or 0,
+        d.get('cache_read_tokens', 0) or 0,
+        d.get('cache_creation_tokens', 0) or 0,
+    )
+except Exception:
+    print(0, 0, 0, 0, 0)
+" "$result_cost_file" 2>/dev/null || echo "0 0 0 0 0"
+    elif [ -f ".loki/context/tracking.json" ]; then
+        # The iteration is parsed with int() inside the try block; a value that
+        # is not an integer falls through to the all-zeros line.
+        python3 -c "
+import json, sys
+try:
+    wanted = int(sys.argv[1])
+    with open('.loki/context/tracking.json') as fh:
+        t = json.load(fh)
+    iters = t.get('per_iteration', [])
+    match = [i for i in iters if i.get('iteration') == wanted]
+    if match:
+        # Several records may share an iteration index; the last one wins.
+        m = match[-1]
+        # 'or 0' maps null fields to 0, matching the result-cost branch, so the
+        # caller never reads the literal text None into a numeric variable.
+        print(
+            m.get('input_tokens', 0) or 0,
+            m.get('output_tokens', 0) or 0,
+            m.get('cost_usd', 0) or 0,
+            m.get('cache_read_tokens', 0) or 0,
+            m.get('cache_creation_tokens', 0) or 0,
+        )
+    else:
+        print(0, 0, 0, 0, 0)
+except Exception:
+    print(0, 0, 0, 0, 0)
+" "$iteration" 2>/dev/null || echo "0 0 0 0 0"
+    else
+        echo "0 0 0 0 0"
+    fi
+}
 track_iteration_complete() {
     local iteration="$1"
     local exit_code="${2:-0}"
@@ -4772,32 +4847,14 @@ track_iteration_complete() {
     local phase="${LAST_KNOWN_PHASE:-}"
     [ -z "$phase" ] && phase=$(python3 -c "import json; print(json.load(open('.loki/state/orchestrator.json')).get('currentPhase', 'unknown'))" 2>/dev/null || echo "unknown")
-    # Read token data from context tracker output (v5.42.0)
+    # Read token data, preferring Claude's authoritative result-cost file over
+    # the context-tracker estimate (v7.28.0 cost-capture fix). See
+    # _read_iteration_cost for precedence rationale.
     # v6.82.0: also capture cache_read_tokens / cache_creation_tokens for
     # prompt-cache hit-rate analysis (S1.1 prompt restructure).
     local iter_input=0 iter_output=0 iter_cost=0
     local iter_cache_read=0 iter_cache_creation=0
-    if [ -f ".loki/context/tracking.json" ]; then
-        read iter_input iter_output iter_cost iter_cache_read iter_cache_creation < <(python3 -c "
-import json
-try:
-    t = json.load(open('.loki/context/tracking.json'))
-    iters = t.get('per_iteration', [])
-    match = [i for i in iters if i.get('iteration') == $iteration]
-    if match:
-        m = match[-1]
-        print(
-            m.get('input_tokens', 0),
-            m.get('output_tokens', 0),
-            m.get('cost_usd', 0),
-            m.get('cache_read_tokens', 0),
-            m.get('cache_creation_tokens', 0),
-        )
-    else:
-        print(0, 0, 0, 0, 0)
-except: print(0, 0, 0, 0, 0)
-" 2>/dev/null || echo "0 0 0 0 0")
-    fi
+    read -r iter_input iter_output iter_cost iter_cache_read iter_cache_creation < <(_read_iteration_cost "$iteration")
     cat > ".loki/metrics/efficiency/iteration-${iteration}.json" << EFF_EOF
 {
@@ -12352,8 +12409,15 @@ except Exception as exc:
             claude)
                 # Claude: Full features with stream-json output and agent tracking
                 # Uses dynamic tier for model selection based on RARV phase
-                # Pass tier to Python via environment for dashboard display
-                { LOKI_CURRENT_MODEL="$tier_param" \
+                # Pass tier + iteration to the embedded stream parser via the
+                # environment. A bare `VAR=val cmd | parser` prefix applies ONLY
+                # to `cmd` (claude) and does NOT cross the pipe to the parser
+                # subprocess, so these must be exported into the shell env first.
+                # LOKI_ITERATION lets the parser stamp the authoritative
+                # result-cost file under the correct iteration index.
+                export LOKI_CURRENT_MODEL="$tier_param"
+                export LOKI_ITERATION="$ITERATION_COUNT"
+                { \
                 claude "${_loki_claude_argv[@]}" -p "$prompt" \
             --output-format stream-json --verbose 2>&1 | \
             tee -a "$log_file" "$agent_log" "$iter_output" | \
@@ -12666,6 +12730,34 @@ def process_stream():
                     active_agents[orchestrator_id]["tasks_completed"].append(f"{tool_count} tools used")
                 save_agents()
+                # Authoritative cost capture (path/slug/symlink-independent).
+                # Claude'"'"'s result message carries its own total_cost_usd plus a
+                # full usage object. The context-tracker session-file path is
+                # brittle (slug derivation must guess Claude'"'"'s naming), so this
+                # stamps the authoritative number to a per-iteration file that
+                # the efficiency writer prefers. Best-effort: a malformed or
+                # missing field must never break the iteration loop.
+                try:
+                    _iter = os.environ.get("LOKI_ITERATION", "0")
+                    _u = data.get("usage", {}) or {}
+                    _rec = {
+                        "total_cost_usd": data.get("total_cost_usd"),
+                        "input_tokens": _u.get("input_tokens", 0),
+                        "output_tokens": _u.get("output_tokens", 0),
+                        "cache_read_tokens": _u.get("cache_read_input_tokens", 0),
+                        "cache_creation_tokens": _u.get("cache_creation_input_tokens", 0),
+                    }
+                    if _rec["total_cost_usd"] is not None:
+                        os.makedirs(".loki/metrics", exist_ok=True)
+                        _p = ".loki/metrics/result-cost-" + str(_iter) + ".json"
+                        _tmp = _p + ".tmp"
+                        with open(_tmp, "w") as _f:
+                            json.dump(_rec, _f)
+                        os.replace(_tmp, _p)
+                except Exception:
+                    pass
                 print(f"\n{GREEN}[Session complete]{NC}", flush=True)
                 is_error = data.get("is_error", False)
                 sys.exit(1 if is_error else 0)
@@ -13098,7 +13190,36 @@ if __name__ == "__main__":
             case "${gate_failures:-}" in
                 *code_review,*|*code_review_ESCALATED*) _gate_block_for_completion="code_review" ;;
             esac
-            if [ -n "$_gate_block_for_completion" ] && check_completion_promise "$iter_output"; then
+            # DROP-FIX (v7.28): check_completion_promise -> check_task_completion_signal
+            # CONSUMES the completion signal (rm -f) on the FIRST successful call.
+            # The completion-promise chain below calls it up to five times in one
+            # iteration (reverify guard, code-review arm, evidence arm, held-out
+            # arm, success arm), so the first call consumed the claim and every
+            # later arm saw nothing -- the success arm never fired and the run
+            # iterated to max_iterations even though the agent had claimed done.
+            # Fix: evaluate the claim EXACTLY ONCE here, capture it in
+            # _completion_claimed, and have every arm test that variable. The
+            # single call discards stdout (matching the prior call sites, which
+            # also discarded it), so the task_completion_claim event still emits
+            # exactly once. Consumption semantics are preserved: the claim is
+            # consumed when evaluated; if a gate rejects it, the agent must
+            # re-claim next iteration (see internal/DEMO-CLAIM-DROP-BUG.md).
+            local _completion_claimed=0
+            if check_completion_promise "$iter_output"; then
+                _completion_claimed=1
+            fi
+            # MEDIUM-3: this completion-promise route evaluates the council hard
+            # gates (evidence + held-out) without the council_evaluate freshness
+            # step, so the held-out gate could read stale verification statuses
+            # (and a stale reservation). Re-verify the checklist ONCE here, but
+            # only when a completion claim is actually present (mirror the
+            # check_completion_promise condition used by the gate chain below) so
+            # verification does not run every iteration. Type-guarded and
+            # best-effort: failure must never block the completion path.
+            if [ "$_completion_claimed" = 1 ] && type council_reverify_checklist &>/dev/null; then
+                council_reverify_checklist 2>/dev/null || true
+            fi
+            if [ -n "$_gate_block_for_completion" ] && [ "$_completion_claimed" = 1 ]; then
                 log_warn "Completion claim rejected: code review is BLOCKED for this iteration (Critical/High findings). Fix review issues before completion."
                 log_warn "  Review details under .loki/quality/reviews/ ; gate_failures=${gate_failures}"
                 _gate_block_for_completion=""
@@ -13113,11 +13234,24 @@ if __name__ == "__main__":
             # LOKI_EVIDENCE_GATE=0 (council_evidence_gate returns 0 immediately
             # when disabled, so this branch never fires). Gate output (reason +
             # opt-out hint) is printed by council_evidence_gate itself.
-            elif check_completion_promise "$iter_output" && type council_evidence_gate &>/dev/null && ! council_evidence_gate; then
+            elif [ "$_completion_claimed" = 1 ] && type council_evidence_gate &>/dev/null && ! council_evidence_gate; then
                 log_warn "Completion claim rejected: evidence gate found no proof of completion (empty diff vs run-start SHA, or red tests)."
                 log_warn "  Details under .loki/council/evidence-block.json ; opt out with LOKI_EVIDENCE_GATE=0"
                 # Fall through; keep iterating until there is real evidence.
-            elif check_completion_promise "$iter_output"; then
+            # v7.28.0: the held-out spec-eval gate must also guard the DEFAULT
+            # completion-promise route, not only the interval-gated council path
+            # (council_evaluate). Otherwise an agent can self-assert "done" and
+            # exit as completion_promise_fulfilled while a held-out acceptance
+            # check is failing, bypassing the anti-reward-hacking gate entirely.
+            # Mirrors the evidence-gate block above. Opt-out: the gate's own
+            # LOKI_HELDOUT_GATE=0 (council_heldout_gate returns 0 immediately
+            # when disabled or when no held-out items are reserved, so this
+            # branch never fires). Gate output is printed by council_heldout_gate.
+            elif [ "$_completion_claimed" = 1 ] && type council_heldout_gate &>/dev/null && ! council_heldout_gate; then
+                log_warn "Completion claim rejected: held-out spec-eval gate found failing held-out acceptance check(s)."
+                log_warn "  Details under .loki/council/heldout-block.json ; opt out with LOKI_HELDOUT_GATE=0"
+                # Fall through; keep iterating until the held-out checks pass.
+            elif [ "$_completion_claimed" = 1 ]; then
                 echo ""
                 if [ -n "$COMPLETION_PROMISE" ]; then
                     log_header "COMPLETION PROMISE FULFILLED: $COMPLETION_PROMISE"
@@ -13491,10 +13625,19 @@ check_human_intervention() {
     if [ -f "$loki_dir/signals/COUNCIL_REVIEW_REQUESTED" ]; then
         log_info "Council force-review requested from dashboard"
         rm -f "$loki_dir/signals/COUNCIL_REVIEW_REQUESTED"
+        # MEDIUM-3: this route evaluates the council hard gates directly without
+        # the council_evaluate freshness step, so re-verify the checklist ONCE
+        # before the gate chain to restore that invariant (refreshes held-out
+        # statuses and repairs a stale reservation). Type-guarded, best-effort.
+        if type council_reverify_checklist &>/dev/null; then
+            council_reverify_checklist 2>/dev/null || true
+        fi
         if type council_checklist_gate &>/dev/null && ! council_checklist_gate; then
             log_info "Council force-review: blocked by checklist hard gate"
         elif type council_evidence_gate &>/dev/null && ! council_evidence_gate; then
             log_info "Council force-review: blocked by evidence hard gate"
+        elif type council_heldout_gate &>/dev/null && ! council_heldout_gate; then
+            log_info "Council force-review: blocked by held-out spec-eval hard gate"
         elif type council_vote &>/dev/null && council_vote; then
             log_header "COMPLETION COUNCIL: FORCE REVIEW - PROJECT COMPLETE"
             # BUG #17 fix: Write COMPLETED marker, generate council report, and