loki-mode 7.26.0 → 7.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/autonomy/loki CHANGED
@@ -554,12 +554,16 @@ show_help() {
554
554
  echo " projects Multi-project registry management"
555
555
  echo " audit [cmd] Agent audit log and quality scanning (log|scan)"
556
556
  echo " heal <path> Legacy system healing (archaeology, stabilize, modernize)"
557
+ echo " verify [base] Deterministic PR verification (Autonomi Verify MVP; CI-gate exit codes)"
558
+ echo " spec [cmd] Living spec: keep the spec true (lock|status|sync; drift detection, CI-gate exit codes)"
559
+ echo " grill [spec] Interrogate a spec with the hardest questions before you build it (Devil's Advocate)"
557
560
  echo " review [opts] Standalone code review with quality gates (diff, staged, PR, files)"
558
561
  echo " optimize Optimize prompts based on session history"
559
562
  echo " enterprise Enterprise feature management (tokens, OIDC)"
560
563
  echo " metrics [opts] Session productivity report (--json, --last N, --save, --share)"
561
564
  echo " cost [opts] Transparent cost view: per-run/project spend + budget (--json, --last N)"
562
565
  echo " trust [--json] Visible trust trajectory: council/gate pass-rate + interventions over runs [R4]"
566
+ echo " trust-metrics Trust-layer metrics: evidence-block rate, gate distribution, council split, cost/verified (--json)"
563
567
  echo " dogfood Show self-development statistics"
564
568
  echo " secrets [cmd] API key status and validation (status|validate)"
565
569
  echo " reset [target] Reset session state (all|retries|failed)"
@@ -11355,6 +11359,70 @@ with open(manifest_path, 'w') as f:
11355
11359
  # Modernize legacy codebases incrementally without breaking existing behavior.
11356
11360
  #===============================================================================
11357
11361
 
11362
+ # ---------------------------------------------------------------------------
11363
+ # loki verify - Autonomi Verify (Verification-as-a-Service MVP)
11364
+ #
11365
+ # Thin dispatcher that sources autonomy/verify.sh and delegates to its
11366
+ # verify_main(). The verification core is deliberately standalone (it does NOT
11367
+ # enter the autonomous iteration loop): it computes a PR-style merge-base diff
11368
+ # and runs deterministic gates against the current tree, emitting a verdict and
11369
+ # a consolidated evidence document. Deterministic-only in this MVP (no LLM
11370
+ # review). Exit code is propagated to the caller so the command is CI-gate
11371
+ # usable.
11372
+ # ---------------------------------------------------------------------------
11373
cmd_verify() {
    # Dispatch `loki verify`: source $_LOKI_SCRIPT_DIR/verify.sh and hand all
    # arguments to its verify_main(). The exit status of verify_main is this
    # function's exit status (it is the last command), so callers such as CI
    # gates see it unchanged.
    #
    # Returns 3 when the module file is missing, or when sourcing it did not
    # define verify_main.
    local verify_mod="$_LOKI_SCRIPT_DIR/verify.sh"
    if [ ! -f "$verify_mod" ]; then
        echo -e "${RED}Error: verify module not found at $verify_mod${NC}" >&2
        return 3
    fi
    # shellcheck source=/dev/null
    source "$verify_mod"
    # Guard the entry point explicitly: without this, a module that failed to
    # define verify_main would surface as bash's generic "command not found"
    # (status 127) instead of the module-error status used above.
    if ! declare -F verify_main >/dev/null; then
        echo -e "${RED}Error: verify module at $verify_mod did not define verify_main${NC}" >&2
        return 3
    fi
    verify_main "$@"
}
11384
+
11385
+ # ---------------------------------------------------------------------------
11386
+ # loki spec - the living spec (drift detection + lock)
11387
+ #
11388
+ # Thin dispatcher that sources autonomy/spec.sh and delegates to spec_main().
11389
+ # The spec core is deliberately standalone (it does NOT enter the autonomous
11390
+ # loop): it binds spec requirements to content hashes (.loki/spec/spec.lock)
11391
+ # and detects drift deterministically, emitting .loki/spec/drift-report.json.
11392
+ # Exit codes are propagated so `loki spec status` is CI-gate usable.
11393
+ # ---------------------------------------------------------------------------
11394
cmd_spec() {
    # Dispatch `loki spec`: source $_LOKI_SCRIPT_DIR/spec.sh and hand all
    # arguments to its spec_main(). The exit status of spec_main is this
    # function's exit status (it is the last command), so it propagates to the
    # caller unchanged.
    #
    # Returns 3 when the module file is missing, or when sourcing it did not
    # define spec_main.
    local spec_mod="$_LOKI_SCRIPT_DIR/spec.sh"
    if [ ! -f "$spec_mod" ]; then
        echo -e "${RED}Error: spec module not found at $spec_mod${NC}" >&2
        return 3
    fi
    # shellcheck source=/dev/null
    source "$spec_mod"
    # Guard the entry point explicitly: without this, a module that failed to
    # define spec_main would surface as bash's generic "command not found"
    # (status 127) instead of the module-error status used above.
    if ! declare -F spec_main >/dev/null; then
        echo -e "${RED}Error: spec module at $spec_mod did not define spec_main${NC}" >&2
        return 3
    fi
    spec_main "$@"
}
11405
+
11406
+ # ---------------------------------------------------------------------------
11407
+ # loki grill - spec interrogation (Devil's-Advocate, pre-build)
11408
+ #
11409
+ # Thin dispatcher that sources autonomy/grill.sh and delegates to grill_main().
11410
+ # Invokes the provider once to produce the hardest questions exposing spec
11411
+ # weaknesses; writes .loki/grill/report.md. Requires the provider CLI and
11412
+ # fails cleanly when it is absent (no fabricated questions).
11413
+ # ---------------------------------------------------------------------------
11414
cmd_grill() {
    # Dispatch `loki grill`: source $_LOKI_SCRIPT_DIR/grill.sh and hand all
    # arguments to its grill_main(). The exit status of grill_main is this
    # function's exit status (it is the last command), so it propagates to the
    # caller unchanged.
    #
    # Returns 3 when the module file is missing, or when sourcing it did not
    # define grill_main.
    local grill_mod="$_LOKI_SCRIPT_DIR/grill.sh"
    if [ ! -f "$grill_mod" ]; then
        echo -e "${RED}Error: grill module not found at $grill_mod${NC}" >&2
        return 3
    fi
    # shellcheck source=/dev/null
    source "$grill_mod"
    # Guard the entry point explicitly: without this, a module that failed to
    # define grill_main would surface as bash's generic "command not found"
    # (status 127) instead of the module-error status used above.
    if ! declare -F grill_main >/dev/null; then
        echo -e "${RED}Error: grill module at $grill_mod did not define grill_main${NC}" >&2
        return 3
    fi
    grill_main "$@"
}
11425
+
11358
11426
  cmd_heal_help() {
11359
11427
  echo -e "${BOLD}loki heal${NC} - Legacy system healing (v6.67.0)"
11360
11428
  echo ""
@@ -13502,6 +13570,15 @@ main() {
13502
13570
  heal)
13503
13571
  cmd_heal "$@"
13504
13572
  ;;
13573
+ verify)
13574
+ cmd_verify "$@"
13575
+ ;;
13576
+ spec)
13577
+ cmd_spec "$@"
13578
+ ;;
13579
+ grill)
13580
+ cmd_grill "$@"
13581
+ ;;
13505
13582
  migrate)
13506
13583
  cmd_migrate "$@"
13507
13584
  ;;
@@ -13529,6 +13606,9 @@ main() {
13529
13606
  trust)
13530
13607
  cmd_trust "$@"
13531
13608
  ;;
13609
+ trust-metrics)
13610
+ cmd_trust_metrics "$@"
13611
+ ;;
13532
13612
  syslog)
13533
13613
  cmd_syslog "$@"
13534
13614
  ;;
@@ -18775,6 +18855,68 @@ cmd_trust() {
18775
18855
  python3 "$trust_mod" --loki-dir "$loki_dir" ${pass_args[@]+"${pass_args[@]}"}
18776
18856
  }
18777
18857
 
18858
+ # Trust-layer metrics (benchmark program section 3): the four AVAILABLE-TODAY
18859
+ # metrics nobody else can publish, computed for THIS project from the durable
18860
+ # trust-events.jsonl log plus the .loki/proofs/ corpus. Honest by construction:
18861
+ # each metric reports its own n= and says "not instrumented" rather than a
18862
+ # fabricated zero. Single project only.
18863
cmd_trust_metrics() {
    # `loki trust-metrics`: parse the CLI flags, then run
    # $_LOKI_SCRIPT_DIR/lib/trust_metrics.py against the project's loki
    # directory (${LOKI_DIR:-.loki}), forwarding --json / --no-cache.
    #
    # Exit behaviour (this function exits the shell on these paths):
    #   0  after printing --help
    #   1  unknown option, python3 missing, or trust_metrics.py missing
    #   2  --all-projects requested (explicitly out of scope)
    # Otherwise the status of the python3 invocation is the function's status.
    #
    # All diagnostics go to stderr so that stdout stays clean for --json
    # consumers; only the help text and the Python module's output use stdout.
    local pass_args=()
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --help|-h)
                echo -e "${BOLD}loki trust-metrics${NC} - Trust-layer metrics (single project)"
                echo ""
                echo "Usage: loki trust-metrics [options]"
                echo ""
                echo "Computes the four trust-layer metrics from this project's"
                echo ".loki artifacts and emits .loki/metrics/trust-metrics.json"
                echo "plus a human-readable table:"
                echo " 1. Evidence-gate block rate (runs that caught an unproven"
                echo " 'done' claim before honoring completion)"
                echo " 2. Gate failure distribution per run (median, p90, per-gate)"
                echo " 3. Council rejection / split-verdict rate"
                echo " 4. Cost-per-VERIFIED-task (local verified denominator)"
                echo ""
                echo "Sources: .loki/metrics/trust-events.jsonl (durable event log)"
                echo "and .loki/proofs/<id>/proof.json. A metric with no source"
                echo "artifact is reported 'not instrumented', never a fake 0."
                echo ""
                echo "Options:"
                echo " --json Machine-readable JSON output"
                echo " --no-cache Do not write trust-metrics.json"
                echo " --help, -h Show this help"
                echo ""
                echo "Scope: SINGLE PROJECT only. An --all-projects registry"
                echo "aggregator is out of scope; run this inside each project."
                exit 0
                ;;
            --json) pass_args+=("--json"); shift ;;
            --no-cache) pass_args+=("--no-cache"); shift ;;
            --all-projects)
                echo -e "${RED}--all-projects is out of scope (single project only).${NC}" >&2
                echo "Run 'loki trust-metrics' inside each project directory." >&2
                exit 2
                ;;
            *)
                echo -e "${RED}Unknown option: $1${NC}" >&2
                echo "Run 'loki trust-metrics --help' for usage." >&2
                exit 1
                ;;
        esac
    done

    if ! command -v python3 &>/dev/null; then
        echo -e "${RED}python3 is required for trust metrics${NC}" >&2
        exit 1
    fi

    local tm_mod="$_LOKI_SCRIPT_DIR/lib/trust_metrics.py"
    if [ ! -f "$tm_mod" ]; then
        echo -e "${RED}trust_metrics.py not found at $tm_mod${NC}" >&2
        exit 1
    fi

    local loki_dir="${LOKI_DIR:-.loki}"
    # ${arr[@]+"${arr[@]}"} expands to nothing for an empty array without
    # tripping `set -u` on older bash versions.
    python3 "$tm_mod" --loki-dir "$loki_dir" ${pass_args[@]+"${pass_args[@]}"}
}
18919
+
18778
18920
  # Transparent cost view (R3): per-run + per-project spend, model routing, and
18779
18921
  # budget status with the 80% warn line. Reuses efficiency_cost.collect_efficiency
18780
18922
  # for the current-run aggregate (single source of truth) and reads .loki/proofs/
@@ -105,6 +105,198 @@ checklist_should_verify() {
105
105
  return 0
106
106
  }
107
107
 
108
+ #===============================================================================
109
+ # Held-out Spec Eval Selection (v7.28.0)
110
+ #===============================================================================
111
+ # Anti-reward-hacking: deterministically reserve ~25% of checklist items as
112
+ # "held-out". Held-out item IDs are excluded from the prompt feed the build loop
113
+ # sees (checklist_summary and council_checklist_gate), so a cooperative build
114
+ # agent is not steered toward those specific acceptance checks. The completion
115
+ # council evaluates them at the ship gate (council_heldout_gate in
116
+ # completion-council.sh). Scope of the guarantee: this protects the prompt feed,
117
+ # not a sandbox. .loki/checklist/held-out.json is plain on-disk JSON, so a
118
+ # non-cooperative agent with filesystem tools can read the reservation directly.
119
+ #
120
+ # Selection is idempotent and reproducible: count = clamp(round(0.25*N), 1, 5)
121
+ # for N>=4 items; ordering by sha256 of each item's "id" (stable, not random).
122
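+ # Written to .loki/checklist/held-out.json and left untouched while every reserved id still exists in the current checklist; rewritten only when a regenerated checklist leaves the reservation fully or partially stale.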
123
checklist_select_heldout() {
    # Select (or repair) the held-out reservation for the current checklist.
    #
    # Reads $CHECKLIST_FILE and writes ${CHECKLIST_DIR:-.loki/checklist}/held-out.json.
    # Always returns 0: selection is best-effort and must never fail the caller.
    local heldout_file="${CHECKLIST_DIR:-".loki/checklist"}/held-out.json"

    if [ ! -f "$CHECKLIST_FILE" ]; then
        return 0
    fi

    # The Python below handles every case and prints a single status token so
    # bash can log honestly and emit the right trust event:
    #   FRESH n        - no usable prior reservation, selected n (file written)
    #   IDEMPOTENT     - prior reservation fully valid vs current ids (no-op,
    #                    file untouched)
    #   RESELECTED n   - prior reservation fully stale (zero ids survive); the
    #                    checklist regenerated, so n items are deterministically
    #                    re-selected from the CURRENT checklist and the file is
    #                    overwritten
    #   PARTIAL kept=k dropped=d - some prior ids survived; only survivors kept
    #   DUP_SKIP       - current checklist ids are not unique; the id-based
    #                    mechanism is unsound, so nothing is reserved (MEDIUM-2)
    #   NOOP           - n<4 with no prior file, or any other no-write outcome
    #                    (including a failure of the Python step itself)
    # Honest caveat: re-selection or partial survival after a regen can reserve
    # items the build loop already saw in earlier prompts (the hidden-from-loop
    # guarantee is best-effort once the checklist ids change mid-run).
    #
    # NOTE: the script lives inside a bash double-quoted string, so it must not
    # contain double quotes, dollar signs, or backticks.
    local status_token
    status_token=$(_CHECKLIST_FILE="$CHECKLIST_FILE" _HELDOUT_FILE="$heldout_file" python3 -c "
import json, os, sys, hashlib, tempfile

cl_path = os.environ['_CHECKLIST_FILE']
out_path = os.environ['_HELDOUT_FILE']
try:
    with open(cl_path) as f:
        data = json.load(f)
except Exception:
    print('NOOP')
    sys.exit(0)

# Collect all item ids in document order.
ids = []
for cat in data.get('categories', []):
    for item in cat.get('items', []):
        iid = item.get('id', '')
        if iid:
            ids.append(iid)

n = len(ids)
id_set = set(ids)

# MEDIUM-2: duplicate ids make the id-based hide/select mechanism unsound. Skip
# selection entirely (no reservation written) so a held-out id can never map to
# more than one item. An existing reservation file is deliberately left alone.
if len(id_set) != n:
    print('DUP_SKIP')
    sys.exit(0)

def select_count(num_ids):
    # clamp(round(0.25 * N), 1, 5). round() is the Python builtin, which rounds
    # exact halves to the nearest even integer (N=10 gives 2, N=6 gives 2).
    return max(1, min(5, round(0.25 * num_ids)))

def fresh_selection():
    # Deterministic order: sort ids by sha256(id), take the first <count>.
    count = select_count(n)
    ranked = sorted(ids, key=lambda i: hashlib.sha256(i.encode('utf-8')).hexdigest())
    return sorted(ranked[:count])

def atomic_write(payload):
    # Write to a temp file in the target directory, then rename over the
    # destination so readers never observe a half-written reservation.
    d = os.path.dirname(out_path) or '.'
    os.makedirs(d, exist_ok=True)
    fd, tmp = tempfile.mkstemp(dir=d, suffix='.tmp')
    try:
        with os.fdopen(fd, 'w') as f:
            json.dump(payload, f, indent=2)
            f.write('\n')
        os.replace(tmp, out_path)
    except BaseException:
        # mkstemp never deletes its file; remove it so a failed write does not
        # leave stray *.tmp files next to the reservation.
        try:
            os.unlink(tmp)
        except OSError:
            pass
        raise

prior = None
if os.path.exists(out_path):
    try:
        with open(out_path) as f:
            prior = json.load(f)
    except Exception:
        prior = None

# A reservation that parses as JSON but has the wrong shape (not an object, or
# a non-list held_out) is as unusable as an unparsable one: treat it the same
# way and fall through to a first selection instead of crashing on it forever.
if not isinstance(prior, dict) or not isinstance(prior.get('held_out', []), list):
    prior = None

if prior is not None:
    prior_ids = [i for i in prior.get('held_out', []) if i]
    # A prior reservation of [] (e.g. an earlier n<4 run) is a valid no-op state;
    # keep it idempotent rather than re-selecting now that n may have grown.
    if not prior_ids:
        print('IDEMPOTENT')
        sys.exit(0)
    survivors = [i for i in prior_ids if i in id_set]
    if len(survivors) == len(prior_ids):
        # Fully valid against the current checklist: idempotent no-op.
        print('IDEMPOTENT')
        sys.exit(0)
    if not survivors:
        # Fully stale: the checklist regenerated and orphaned the reservation.
        # Deterministically re-select from the CURRENT checklist.
        if n < 4:
            atomic_write({'held_out': [], 'total_items': n,
                          'note': 'n<4: no held-out reserved (re-selected after stale reservation)'})
            print('RESELECTED 0')
            sys.exit(0)
        held = fresh_selection()
        atomic_write({'held_out': held, 'total_items': n})
        print('RESELECTED %d' % len(held))
        sys.exit(0)
    # Partial survival: keep only the surviving ids and report how many were
    # dropped so the shrink is never silent.
    dropped = len(prior_ids) - len(survivors)
    payload = {'held_out': sorted(survivors), 'total_items': n}
    atomic_write(payload)
    print('PARTIAL kept=%d dropped=%d' % (len(survivors), dropped))
    sys.exit(0)

# No usable prior reservation: first selection.
if n < 4:
    # N>=4 gate: smaller checklists get no held-out (nothing to hide reliably).
    atomic_write({'held_out': [], 'total_items': n, 'note': 'n<4: no held-out reserved'})
    print('NOOP')
    sys.exit(0)

held = fresh_selection()
atomic_write({'held_out': held, 'total_items': n})
print('FRESH %d' % len(held))
" 2>/dev/null || echo "NOOP")

    # Honest logging + trust event on any stale repair (type-guarded so the
    # function still works when the trust-event helper is not loaded).
    local tok rest
    local -a partial_fields=()
    read -r tok rest <<< "$status_token"
    case "$tok" in
        RESELECTED)
            log_warn "[checklist] held-out reservation stale (checklist regenerated); re-selected ${rest:-0} items"
            if type record_trust_event_bash &>/dev/null; then
                record_trust_event_bash "heldout_stale" \
                    "detail=reselected" \
                    "reselected=${rest:-0}" \
                    >/dev/null 2>&1 || true
            fi
            ;;
        PARTIAL)
            log_warn "[checklist] held-out reservation partially stale (checklist regenerated); $rest"
            if type record_trust_event_bash &>/dev/null; then
                # $rest is "kept=K dropped=D". Split it so each key=value pair
                # is its own argument, matching the RESELECTED call above.
                read -r -a partial_fields <<< "$rest"
                record_trust_event_bash "heldout_stale" \
                    "detail=partial" \
                    ${partial_fields[@]+"${partial_fields[@]}"} \
                    >/dev/null 2>&1 || true
            fi
            ;;
        DUP_SKIP)
            log_warn "[checklist] checklist ids are not unique; held-out selection skipped (id-based reservation is unsound with duplicate ids)"
            ;;
    esac

    return 0
}
281
+
282
+ # Echo held-out item IDs (one per line) to stdout. Empty when none reserved.
283
checklist_heldout_ids() {
    # Print each reserved held-out item id on its own line. Prints nothing when
    # the reservation file is absent, unreadable, or malformed; always succeeds.
    local reservation="${CHECKLIST_DIR:-".loki/checklist"}/held-out.json"
    [ -f "$reservation" ] || return 0

    # Best-effort read: any Python-side error is swallowed on purpose so callers
    # simply see an empty id list.
    _HELDOUT_FILE="$reservation" python3 -c "
import json, os
try:
    with open(os.environ['_HELDOUT_FILE']) as fh:
        reserved = json.load(fh)
    for held_id in reserved.get('held_out', []):
        print(held_id)
except Exception:
    pass
" 2>/dev/null || true
}
299
+
108
300
  #===============================================================================
109
301
  # Verification
110
302
  #===============================================================================
@@ -118,6 +310,10 @@ checklist_verify() {
118
310
  return 0
119
311
  fi
120
312
 
313
+ # Held-out selection happens BEFORE the first verification so that the very
314
+ # first verification-results.json summary already excludes held-out items.
315
+ checklist_select_heldout
316
+
121
317
  local script_dir
122
318
  script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
123
319
  local verify_script="${script_dir}/checklist-verify.py"
@@ -160,16 +356,12 @@ checklist_summary() {
160
356
 
161
357
  _CHECKLIST_RESULTS="$CHECKLIST_RESULTS_FILE" \
162
358
  _CHECKLIST_WAIVERS="${CHECKLIST_DIR:-".loki/checklist"}/waivers.json" \
359
+ _CHECKLIST_HELDOUT="${CHECKLIST_DIR:-".loki/checklist"}/held-out.json" \
163
360
  python3 -c "
164
361
  import json, sys, os
165
362
  try:
166
363
  fpath = os.environ.get('_CHECKLIST_RESULTS', '')
167
364
  data = json.load(open(fpath))
168
- s = data.get('summary', {})
169
- total = s.get('total', 0)
170
- verified = s.get('verified', 0)
171
- failing = s.get('failing', 0)
172
- pending = s.get('pending', 0)
173
365
 
174
366
  # Load waivers
175
367
  waived_ids = set()
@@ -184,26 +376,68 @@ try:
184
376
  except Exception:
185
377
  pass
186
378
 
187
- # Count waived items and adjust failing list
188
- waived_count = 0
189
- if total == 0:
190
- print('')
191
- else:
379
+ # Load held-out item ids (v7.28.0). Held-out items are NEVER surfaced to the
380
+ # build loop: they are fully excluded from the counts and the failing list so
381
+ # the build agent cannot tune to them. The council evaluates them separately.
382
+ heldout_ids = set()
383
+ heldout_path = os.environ.get('_CHECKLIST_HELDOUT', '')
384
+ if heldout_path and os.path.exists(heldout_path):
385
+ try:
386
+ with open(heldout_path) as hf:
387
+ hdata = json.load(hf)
388
+ heldout_ids = set(hdata.get('held_out', []))
389
+ except Exception:
390
+ pass
391
+
392
+ # Count all checklist items first so we can detect the pathological case
393
+ # where hiding would empty the summary on a non-empty checklist (MEDIUM-2).
394
+ all_items = 0
395
+ for cat in data.get('categories', []):
396
+ all_items += len(cat.get('items', []))
397
+
398
+ def compute(apply_heldout):
399
+ total = verified = pending = failing = waived_count = 0
192
400
  failing_items = []
193
401
  for cat in data.get('categories', []):
194
402
  for item in cat.get('items', []):
195
403
  item_id = item.get('id', '')
404
+ if apply_heldout and item_id in heldout_ids:
405
+ continue
196
406
  if item_id in waived_ids:
197
407
  waived_count += 1
198
408
  continue
199
- if item.get('status') == 'failing' and item.get('priority') in ('critical', 'major'):
200
- failing_items.append(item.get('title', item.get('id', '?')))
409
+ total += 1
410
+ status = item.get('status')
411
+ if status == 'verified':
412
+ verified += 1
413
+ elif status == 'failing':
414
+ failing += 1
415
+ if item.get('priority') in ('critical', 'major'):
416
+ failing_items.append(item.get('title', item.get('id', '?')))
417
+ else:
418
+ pending += 1
419
+ return total, verified, pending, failing, waived_count, failing_items
420
+
421
+ # Recompute counts over the VISIBLE (non-held-out) items so 'total' never
422
+ # leaks the existence of held-out items. Waived items are excluded too.
423
+ total, verified, pending, failing, waived_count, failing_items = compute(True)
424
+
425
+ # MEDIUM-2 guard: if hiding held-out items would empty the summary while the
426
+ # checklist itself is non-empty, fall back to showing all items (do not hide)
427
+ # and warn. Returning an empty summary on a non-empty checklist reads as 'no
428
+ # checklist' to the prompt feed, which is a worse failure than a small leak.
429
+ if total == 0 and all_items > 0:
430
+ print('held-out hiding would empty a non-empty checklist summary; showing all items', file=sys.stderr)
431
+ total, verified, pending, failing, waived_count, failing_items = compute(False)
432
+
433
+ if total == 0:
434
+ print('')
435
+ else:
201
436
  detail = ''
202
437
  if failing_items:
203
438
  detail = ' FAILING: ' + ', '.join(failing_items[:5])
204
439
  waived_str = f', {waived_count} waived' if waived_count > 0 else ''
205
- adjusted_failing = max(0, failing - waived_count)
206
- print(f'{verified}/{total} verified, {adjusted_failing} failing{waived_str}, {pending} pending.{detail}')
440
+ print(f'{verified}/{total} verified, {failing} failing{waived_str}, {pending} pending.{detail}')
207
441
  except Exception:
208
442
  print('', file=sys.stderr)
209
443
  " 2>/dev/null || echo ""