npm - loki-mode - Versions diffs - 7.26.0 → 7.28.0 - Mend

loki-mode 7.26.0 → 7.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

package/README.md +15 -13
package/SKILL.md +11 -2
package/VERSION +1 -1
package/autonomy/completion-council.sh +310 -6
package/autonomy/context-tracker.py +32 -7
package/autonomy/grill.sh +321 -0
package/autonomy/lib/trust_metrics.py +636 -0
package/autonomy/loki +142 -0
package/autonomy/prd-checklist.sh +248 -14
package/autonomy/run.sh +283 -32
package/autonomy/spec.sh +646 -0
package/autonomy/verify.sh +1130 -0
package/dashboard/__init__.py +1 -1
package/dashboard/static/index.html +1 -1
package/docs/COMPARISON.md +9 -9
package/docs/COMPETITIVE-ANALYSIS.md +18 -37
package/docs/INSTALLATION.md +1 -1
package/docs/auto-claude-comparison.md +9 -6
package/docs/certification/01-core-concepts/lesson.md +3 -3
package/docs/competitive/emergence-others-analysis.md +1 -1
package/docs/competitive/replit-lovable-analysis.md +1 -1
package/docs/cursor-comparison.md +1 -1
package/docs/prd-purple-lab-platform.md +1 -1
package/docs/show-hn-post.md +2 -2
package/loki-ts/dist/loki.js +2 -2
package/mcp/__init__.py +1 -1
package/package.json +2 -1
package/providers/codex.sh +3 -2
package/references/agent-types.md +9 -9
package/references/agents.md +8 -8
package/references/business-ops.md +1 -1
package/references/competitive-analysis.md +1 -1
package/skills/agents.md +3 -3
package/skills/providers.md +3 -3
package/skills/quality-gates.md +46 -0

package/autonomy/loki CHANGED Viewed

@@ -554,12 +554,16 @@ show_help() {
     echo "  projects         Multi-project registry management"
     echo "  audit [cmd]      Agent audit log and quality scanning (log|scan)"
     echo "  heal <path>      Legacy system healing (archaeology, stabilize, modernize)"
+    echo "  verify [base]    Deterministic PR verification (Autonomi Verify MVP; CI-gate exit codes)"
+    echo "  spec [cmd]       Living spec: keep the spec true (lock|status|sync; drift detection, CI-gate exit codes)"
+    echo "  grill [spec]     Interrogate a spec with the hardest questions before you build it (Devil's Advocate)"
     echo "  review [opts]    Standalone code review with quality gates (diff, staged, PR, files)"
     echo "  optimize         Optimize prompts based on session history"
     echo "  enterprise       Enterprise feature management (tokens, OIDC)"
     echo "  metrics [opts]   Session productivity report (--json, --last N, --save, --share)"
     echo "  cost [opts]      Transparent cost view: per-run/project spend + budget (--json, --last N)"
     echo "  trust [--json]   Visible trust trajectory: council/gate pass-rate + interventions over runs [R4]"
+    echo "  trust-metrics    Trust-layer metrics: evidence-block rate, gate distribution, council split, cost/verified (--json)"
     echo "  dogfood          Show self-development statistics"
     echo "  secrets [cmd]    API key status and validation (status|validate)"
     echo "  reset [target]   Reset session state (all|retries|failed)"
@@ -11355,6 +11359,70 @@ with open(manifest_path, 'w') as f:
 # Modernize legacy codebases incrementally without breaking existing behavior.
 #===============================================================================
+# ---------------------------------------------------------------------------
+# loki verify - Autonomi Verify (Verification-as-a-Service MVP)
+#
+# Thin dispatcher that sources autonomy/verify.sh and delegates to its
+# verify_main(). The verification core is deliberately standalone (it does NOT
+# enter the autonomous iteration loop): it computes a PR-style merge-base diff
+# and runs deterministic gates against the current tree, emitting a verdict and
+# a consolidated evidence document. Deterministic-only in this MVP (no LLM
+# review). Exit code is propagated to the caller so the command is CI-gate
+# usable.
+# ---------------------------------------------------------------------------
+cmd_verify() {
+    local verify_mod="$_LOKI_SCRIPT_DIR/verify.sh"
+    if [ ! -f "$verify_mod" ]; then
+        echo -e "${RED}Error: verify module not found at $verify_mod${NC}" >&2
+        return 3
+    fi
+    # shellcheck source=/dev/null
+    source "$verify_mod"
+    verify_main "$@"
+    return $?
+}
+# ---------------------------------------------------------------------------
+# loki spec - the living spec (drift detection + lock)
+#
+# Thin dispatcher that sources autonomy/spec.sh and delegates to spec_main().
+# The spec core is deliberately standalone (it does NOT enter the autonomous
+# loop): it binds spec requirements to content hashes (.loki/spec/spec.lock)
+# and detects drift deterministically, emitting .loki/spec/drift-report.json.
+# Exit codes are propagated so `loki spec status` is CI-gate usable.
+# ---------------------------------------------------------------------------
+cmd_spec() {
+    local spec_mod="$_LOKI_SCRIPT_DIR/spec.sh"
+    if [ ! -f "$spec_mod" ]; then
+        echo -e "${RED}Error: spec module not found at $spec_mod${NC}" >&2
+        return 3
+    fi
+    # shellcheck source=/dev/null
+    source "$spec_mod"
+    spec_main "$@"
+    return $?
+}
+# ---------------------------------------------------------------------------
+# loki grill - spec interrogation (Devil's-Advocate, pre-build)
+#
+# Thin dispatcher that sources autonomy/grill.sh and delegates to grill_main().
+# Invokes the provider once to produce the hardest questions exposing spec
+# weaknesses; writes .loki/grill/report.md. Requires the provider CLI and
+# fails cleanly when it is absent (no fabricated questions).
+# ---------------------------------------------------------------------------
+cmd_grill() {
+    local grill_mod="$_LOKI_SCRIPT_DIR/grill.sh"
+    if [ ! -f "$grill_mod" ]; then
+        echo -e "${RED}Error: grill module not found at $grill_mod${NC}" >&2
+        return 3
+    fi
+    # shellcheck source=/dev/null
+    source "$grill_mod"
+    grill_main "$@"
+    return $?
+}
 cmd_heal_help() {
     echo -e "${BOLD}loki heal${NC} - Legacy system healing (v6.67.0)"
     echo ""
@@ -13502,6 +13570,15 @@ main() {
         heal)
             cmd_heal "$@"
             ;;
+        verify)
+            cmd_verify "$@"
+            ;;
+        spec)
+            cmd_spec "$@"
+            ;;
+        grill)
+            cmd_grill "$@"
+            ;;
         migrate)
             cmd_migrate "$@"
             ;;
@@ -13529,6 +13606,9 @@ main() {
         trust)
             cmd_trust "$@"
             ;;
+        trust-metrics)
+            cmd_trust_metrics "$@"
+            ;;
         syslog)
             cmd_syslog "$@"
             ;;
@@ -18775,6 +18855,68 @@ cmd_trust() {
     python3 "$trust_mod" --loki-dir "$loki_dir" ${pass_args[@]+"${pass_args[@]}"}
 }
+# Trust-layer metrics (benchmark program section 3): the four AVAILABLE-TODAY
+# metrics nobody else can publish, computed for THIS project from the durable
+# trust-events.jsonl log plus the .loki/proofs/ corpus. Honest by construction:
+# each metric reports its own n= and says "not instrumented" rather than a
+# fabricated zero. Single project only.
+cmd_trust_metrics() {
+    # Parse flags, check prerequisites, then delegate to lib/trust_metrics.py.
+    # Exit codes: 0 after --help; 1 for an unknown option, missing python3, or
+    # a missing module; 2 for the out-of-scope --all-projects flag. Otherwise
+    # the function's status is that of the python3 invocation.
+    # Diagnostics are written to stderr so stdout carries only the help text or
+    # the module's own output (for example the --json document).
+    local pass_args=()
+    while [[ $# -gt 0 ]]; do
+        case "$1" in
+            --help|-h)
+                echo -e "${BOLD}loki trust-metrics${NC} - Trust-layer metrics (single project)"
+                echo ""
+                echo "Usage: loki trust-metrics [options]"
+                echo ""
+                echo "Computes the four trust-layer metrics from this project's"
+                echo ".loki artifacts and emits .loki/metrics/trust-metrics.json"
+                echo "plus a human-readable table:"
+                echo "  1. Evidence-gate block rate (runs that caught an unproven"
+                echo "     'done' claim before honoring completion)"
+                echo "  2. Gate failure distribution per run (median, p90, per-gate)"
+                echo "  3. Council rejection / split-verdict rate"
+                echo "  4. Cost-per-VERIFIED-task (local verified denominator)"
+                echo ""
+                echo "Sources: .loki/metrics/trust-events.jsonl (durable event log)"
+                echo "and .loki/proofs/<id>/proof.json. A metric with no source"
+                echo "artifact is reported 'not instrumented', never a fake 0."
+                echo ""
+                echo "Options:"
+                echo "  --json               Machine-readable JSON output"
+                echo "  --no-cache           Do not write trust-metrics.json"
+                echo "  --help, -h           Show this help"
+                echo ""
+                echo "Scope: SINGLE PROJECT only. An --all-projects registry"
+                echo "aggregator is out of scope; run this inside each project."
+                exit 0
+                ;;
+            --json) pass_args+=("--json"); shift ;;
+            --no-cache) pass_args+=("--no-cache"); shift ;;
+            --all-projects)
+                echo -e "${RED}--all-projects is out of scope (single project only).${NC}" >&2
+                echo "Run 'loki trust-metrics' inside each project directory." >&2
+                exit 2
+                ;;
+            *)
+                echo -e "${RED}Unknown option: $1${NC}" >&2
+                echo "Run 'loki trust-metrics --help' for usage." >&2
+                exit 1
+                ;;
+        esac
+    done
+    if ! command -v python3 &>/dev/null; then
+        echo -e "${RED}python3 is required for trust metrics${NC}" >&2
+        exit 1
+    fi
+    local tm_mod="$_LOKI_SCRIPT_DIR/lib/trust_metrics.py"
+    if [ ! -f "$tm_mod" ]; then
+        echo -e "${RED}trust_metrics.py not found at $tm_mod${NC}" >&2
+        exit 1
+    fi
+    # LOKI_DIR overrides the default project-local .loki directory.
+    local loki_dir="${LOKI_DIR:-.loki}"
+    # The ${arr[@]+...} form expands to nothing when pass_args is empty, which
+    # avoids an unbound-variable error under `set -u` on older bash.
+    python3 "$tm_mod" --loki-dir "$loki_dir" ${pass_args[@]+"${pass_args[@]}"}
+}
 # Transparent cost view (R3): per-run + per-project spend, model routing, and
 # budget status with the 80% warn line. Reuses efficiency_cost.collect_efficiency
 # for the current-run aggregate (single source of truth) and reads .loki/proofs/

package/autonomy/prd-checklist.sh CHANGED Viewed

@@ -105,6 +105,198 @@ checklist_should_verify() {
     return 0
 }
+#===============================================================================
+# Held-out Spec Eval Selection (v7.28.0)
+#===============================================================================
+# Anti-reward-hacking: deterministically reserve ~25% of checklist items as
+# "held-out". Held-out item IDs are excluded from the prompt feed the build loop
+# sees (checklist_summary and council_checklist_gate), so a cooperative build
+# agent is not steered toward those specific acceptance checks. The completion
+# council evaluates them at the ship gate (council_heldout_gate in
+# completion-council.sh). Scope of the guarantee: this protects the prompt feed,
+# not a sandbox. .loki/checklist/held-out.json is plain on-disk JSON, so a
+# non-cooperative agent with filesystem tools can read the reservation directly.
+#
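+# Selection is reproducible, and idempotent while the reservation stays valid:
+# count = clamp(round(0.25*N), 1, 5) for N>=4 items; ordering by sha256 of each
+# item's "id" (stable, not random). Written to .loki/checklist/held-out.json and
+# left untouched while every reserved id still exists in the checklist; it is
+# rewritten only when a checklist regeneration leaves the reservation partially
+# or fully stale.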
+checklist_select_heldout() {
+    # Reserve (or repair) the held-out item ids for the current checklist.
+    # Reads:   $CHECKLIST_FILE (checklist JSON with categories[].items[].id)
+    # Writes:  ${CHECKLIST_DIR:-.loki/checklist}/held-out.json (atomically)
+    # Returns: always 0; selection is best-effort and never blocks the caller.
+    local heldout_file="${CHECKLIST_DIR:-".loki/checklist"}/held-out.json"
+    if [ ! -f "$CHECKLIST_FILE" ]; then
+        return 0
+    fi
+    # The Python below handles every case and prints a single status token so
+    # bash can log honestly and emit the right trust event:
+    #   FRESH n           - no prior reservation, selected n (file written)
+    #   IDEMPOTENT        - prior reservation fully valid vs current ids (no-op,
+    #                       file untouched: preserves the idempotency case 1 tests)
+    #   RESELECTED n      - prior reservation fully stale (zero ids survive); the
+    #                       checklist regenerated, so we deterministically re-select
+    #                       n items from the CURRENT checklist and overwrite
+    #   PARTIAL kept=k dropped=d - some prior ids survived; we keep only survivors
+    #   DUP_SKIP          - current checklist ids are not unique; the id-based
+    #                       mechanism is unsound, so we reserve nothing (MEDIUM-2)
+    #   NOOP              - n<4 with no prior file, or other no-write outcome
+    # Honest caveat: re-selection or partial-survival after a regen can reserve
+    # items the build loop already saw in earlier prompts (the hidden-from-loop
+    # guarantee is best-effort once the checklist ids change mid-run).
+    local status_token
+    status_token=$(_CHECKLIST_FILE="$CHECKLIST_FILE" _HELDOUT_FILE="$heldout_file" python3 -c "
+import json, os, sys, hashlib, tempfile
+cl_path = os.environ['_CHECKLIST_FILE']
+out_path = os.environ['_HELDOUT_FILE']
+try:
+    with open(cl_path) as f:
+        data = json.load(f)
+except Exception:
+    print('NOOP')
+    sys.exit(0)
+# Collect all item ids in document order.
+ids = []
+for cat in data.get('categories', []):
+    for item in cat.get('items', []):
+        iid = item.get('id', '')
+        if iid:
+            ids.append(iid)
+n = len(ids)
+id_set = set(ids)
+# MEDIUM-2: duplicate ids make the id-based hide/select mechanism unsound. Skip
+# selection entirely (no reservation written) so a held-out id can never map to
+# more than one item. Do NOT touch an existing reservation file here (a stale
+# valid file left over from before a dup-introducing regen is handled by the
+# council gate's STALE path; over-removing would be over-engineering).
+if len(id_set) != n:
+    print('DUP_SKIP')
+    sys.exit(0)
+def select_count(num_ids):
+    # Python round() rounds exact halves to the nearest even integer.
+    c = round(0.25 * num_ids)
+    if c < 1:
+        c = 1
+    if c > 5:
+        c = 5
+    return c
+def fresh_selection():
+    # Deterministic order: sort ids by sha256(id), take the first <count>.
+    count = select_count(n)
+    ranked = sorted(ids, key=lambda i: hashlib.sha256(i.encode('utf-8')).hexdigest())
+    return sorted(ranked[:count])
+def atomic_write(payload):
+    # Write to a temp file in the target directory, then rename over the
+    # destination so readers never observe a half-written reservation.
+    d = os.path.dirname(out_path) or '.'
+    os.makedirs(d, exist_ok=True)
+    fd, tmp = tempfile.mkstemp(dir=d, suffix='.tmp')
+    try:
+        with os.fdopen(fd, 'w') as f:
+            json.dump(payload, f, indent=2)
+            f.write('\n')
+        os.replace(tmp, out_path)
+    except BaseException:
+        # Do not leave an orphaned temp file behind when the write fails.
+        try:
+            os.unlink(tmp)
+        except OSError:
+            pass
+        raise
+prior = None
+if os.path.exists(out_path):
+    try:
+        with open(out_path) as f:
+            prior = json.load(f)
+    except Exception:
+        prior = None
+    # Parseable JSON that is not an object is as unusable as unparseable JSON:
+    # treat it as no prior reservation instead of crashing on .get() below.
+    if not isinstance(prior, dict):
+        prior = None
+if prior is not None:
+    prior_ids = [i for i in prior.get('held_out', []) if i]
+    # A prior reservation of [] (e.g. an earlier n<4 run) is a valid no-op state;
+    # keep it idempotent rather than re-selecting now that n may have grown.
+    if not prior_ids:
+        print('IDEMPOTENT')
+        sys.exit(0)
+    survivors = [i for i in prior_ids if i in id_set]
+    if len(survivors) == len(prior_ids):
+        # Fully valid against the current checklist: idempotent no-op.
+        print('IDEMPOTENT')
+        sys.exit(0)
+    if not survivors:
+        # Fully stale: the checklist regenerated and orphaned the reservation.
+        # Deterministically re-select from the CURRENT checklist.
+        if n < 4:
+            atomic_write({'held_out': [], 'total_items': n,
+                          'note': 'n<4: no held-out reserved (re-selected after stale reservation)'})
+            print('RESELECTED 0')
+            sys.exit(0)
+        held = fresh_selection()
+        atomic_write({'held_out': held, 'total_items': n})
+        print('RESELECTED %d' % len(held))
+        sys.exit(0)
+    # Partial survival: keep only the surviving ids (do not silently shrink).
+    dropped = len(prior_ids) - len(survivors)
+    payload = {'held_out': sorted(survivors), 'total_items': n}
+    atomic_write(payload)
+    print('PARTIAL kept=%d dropped=%d' % (len(survivors), dropped))
+    sys.exit(0)
+# No prior reservation: first selection.
+if n < 4:
+    # N>=4 gate: smaller checklists get no held-out (nothing to hide reliably).
+    atomic_write({'held_out': [], 'total_items': n, 'note': 'n<4: no held-out reserved'})
+    print('NOOP')
+    sys.exit(0)
+held = fresh_selection()
+atomic_write({'held_out': held, 'total_items': n})
+print('FRESH %d' % len(held))
+" 2>/dev/null || echo "NOOP")
+    # Honest logging + trust event on any stale repair (type-guarded).
+    local tok rest kept_field dropped_field
+    read -r tok rest <<< "$status_token"
+    case "$tok" in
+        RESELECTED)
+            log_warn "[checklist] held-out reservation stale (checklist regenerated); re-selected ${rest:-0} items"
+            if type record_trust_event_bash &>/dev/null; then
+                record_trust_event_bash "heldout_stale" \
+                    "detail=reselected" \
+                    "reselected=${rest:-0}" \
+                    >/dev/null 2>&1 || true
+            fi
+            ;;
+        PARTIAL)
+            log_warn "[checklist] held-out reservation partially stale (checklist regenerated); $rest"
+            if type record_trust_event_bash &>/dev/null; then
+                # Split into kept=<k> and dropped=<d> so each pair is its own
+                # argument, matching the RESELECTED call above.
+                # NOTE(review): assumes record_trust_event_bash takes one
+                # key=value pair per argument - confirm against its definition.
+                read -r kept_field dropped_field <<< "$rest"
+                record_trust_event_bash "heldout_stale" \
+                    "detail=partial" \
+                    "$kept_field" \
+                    "$dropped_field" \
+                    >/dev/null 2>&1 || true
+            fi
+            ;;
+        DUP_SKIP)
+            log_warn "[checklist] checklist ids are not unique; held-out selection skipped (id-based reservation is unsound with duplicate ids)"
+            ;;
+    esac
+    return 0
+}
+# Echo held-out item IDs (one per line) to stdout. Empty when none reserved.
+checklist_heldout_ids() {
+    local heldout_file="${CHECKLIST_DIR:-".loki/checklist"}/held-out.json"
+    if [ ! -f "$heldout_file" ]; then
+        return 0
+    fi
+    _HELDOUT_FILE="$heldout_file" python3 -c "
+import json, os
+try:
+    with open(os.environ['_HELDOUT_FILE']) as f:
+        data = json.load(f)
+    for i in data.get('held_out', []):
+        print(i)
+except Exception:
+    pass
+" 2>/dev/null || true
+}
 #===============================================================================
 # Verification
 #===============================================================================
@@ -118,6 +310,10 @@ checklist_verify() {
         return 0
     fi
+    # Held-out selection happens BEFORE the first verification so that the very
+    # first verification-results.json summary already excludes held-out items.
+    checklist_select_heldout
     local script_dir
     script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
     local verify_script="${script_dir}/checklist-verify.py"
@@ -160,16 +356,12 @@ checklist_summary() {
     _CHECKLIST_RESULTS="$CHECKLIST_RESULTS_FILE" \
     _CHECKLIST_WAIVERS="${CHECKLIST_DIR:-".loki/checklist"}/waivers.json" \
+    _CHECKLIST_HELDOUT="${CHECKLIST_DIR:-".loki/checklist"}/held-out.json" \
     python3 -c "
 import json, sys, os
 try:
     fpath = os.environ.get('_CHECKLIST_RESULTS', '')
     data = json.load(open(fpath))
-    s = data.get('summary', {})
-    total = s.get('total', 0)
-    verified = s.get('verified', 0)
-    failing = s.get('failing', 0)
-    pending = s.get('pending', 0)
     # Load waivers
     waived_ids = set()
@@ -184,26 +376,68 @@ try:
         except Exception:
             pass
-    # Count waived items and adjust failing list
-    waived_count = 0
-    if total == 0:
-        print('')
-    else:
+    # Load held-out item ids (v7.28.0). Held-out items are NEVER surfaced to the
+    # build loop: they are fully excluded from the counts and the failing list so
+    # the build agent cannot tune to them. The council evaluates them separately.
+    heldout_ids = set()
+    heldout_path = os.environ.get('_CHECKLIST_HELDOUT', '')
+    if heldout_path and os.path.exists(heldout_path):
+        try:
+            with open(heldout_path) as hf:
+                hdata = json.load(hf)
+            heldout_ids = set(hdata.get('held_out', []))
+        except Exception:
+            pass
+    # Count all checklist items first so we can detect the pathological case
+    # where hiding would empty the summary on a non-empty checklist (MEDIUM-2).
+    all_items = 0
+    for cat in data.get('categories', []):
+        all_items += len(cat.get('items', []))
+    def compute(apply_heldout):
+        total = verified = pending = failing = waived_count = 0
         failing_items = []
         for cat in data.get('categories', []):
             for item in cat.get('items', []):
                 item_id = item.get('id', '')
+                if apply_heldout and item_id in heldout_ids:
+                    continue
                 if item_id in waived_ids:
                     waived_count += 1
                     continue
-                if item.get('status') == 'failing' and item.get('priority') in ('critical', 'major'):
-                    failing_items.append(item.get('title', item.get('id', '?')))
+                total += 1
+                status = item.get('status')
+                if status == 'verified':
+                    verified += 1
+                elif status == 'failing':
+                    failing += 1
+                    if item.get('priority') in ('critical', 'major'):
+                        failing_items.append(item.get('title', item.get('id', '?')))
+                else:
+                    pending += 1
+        return total, verified, pending, failing, waived_count, failing_items
+    # Recompute counts over the VISIBLE (non-held-out) items so 'total' never
+    # leaks the existence of held-out items. Waived items are excluded too.
+    total, verified, pending, failing, waived_count, failing_items = compute(True)
+    # MEDIUM-2 guard: if hiding held-out items would empty the summary while the
+    # checklist itself is non-empty, fall back to showing all items (do not hide)
+    # and warn. Returning an empty summary on a non-empty checklist reads as 'no
+    # checklist' to the prompt feed, which is a worse failure than a small leak.
+    if total == 0 and all_items > 0:
+        print('held-out hiding would empty a non-empty checklist summary; showing all items', file=sys.stderr)
+        total, verified, pending, failing, waived_count, failing_items = compute(False)
+    if total == 0:
+        print('')
+    else:
         detail = ''
         if failing_items:
             detail = ' FAILING: ' + ', '.join(failing_items[:5])
         waived_str = f', {waived_count} waived' if waived_count > 0 else ''
-        adjusted_failing = max(0, failing - waived_count)
-        print(f'{verified}/{total} verified, {adjusted_failing} failing{waived_str}, {pending} pending.{detail}')
+        print(f'{verified}/{total} verified, {failing} failing{waived_str}, {pending} pending.{detail}')
 except Exception:
     print('', file=sys.stderr)
 " 2>/dev/null || echo ""