loki-mode 7.27.0 → 7.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -105,6 +105,198 @@ checklist_should_verify() {
105
105
  return 0
106
106
  }
107
107
 
108
+ #===============================================================================
109
+ # Held-out Spec Eval Selection (v7.28.0)
110
+ #===============================================================================
111
+ # Anti-reward-hacking: deterministically reserve ~25% of checklist items as
112
+ # "held-out". Held-out item IDs are excluded from the prompt feed the build loop
113
+ # sees (checklist_summary and council_checklist_gate), so a cooperative build
114
+ # agent is not steered toward those specific acceptance checks. The completion
115
+ # council evaluates them at the ship gate (council_heldout_gate in
116
+ # completion-council.sh). Scope of the guarantee: this protects the prompt feed,
117
+ # not a sandbox. .loki/checklist/held-out.json is plain on-disk JSON, so a
118
+ # non-cooperative agent with filesystem tools can read the reservation directly.
119
+ #
120
+ # Selection is idempotent and reproducible: count = clamp(round(0.25*N), 1, 5)
121
+ # for N>=4 items; ordering by sha256 of each item's "id" (stable, not random).
122
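+ # Written to .loki/checklist/held-out.json and left untouched while every reserved id still exists in the checklist; rewritten only to repair a stale reservation (RESELECTED / PARTIAL below).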
123
checklist_select_heldout() {
    # Create or repair the held-out reservation for the current checklist.
    #
    # Globals read: CHECKLIST_FILE (checklist JSON path) and CHECKLIST_DIR
    # (reservation directory, defaults to .loki/checklist). Takes no arguments.
    # Side effects: may write <CHECKLIST_DIR>/held-out.json; logs through
    # log_warn and, when record_trust_event_bash is defined, records a
    # heldout_stale trust event after a stale reservation was repaired.
    # Returns: always 0, so a selection failure never blocks the caller.
    local heldout_file="${CHECKLIST_DIR:-".loki/checklist"}/held-out.json"

    if [ ! -f "$CHECKLIST_FILE" ]; then
        return 0
    fi

    # The Python below handles every case and prints a single status token so
    # bash can log honestly and emit the right trust event:
    #   FRESH n                  - no usable prior reservation, selected n (file written)
    #   IDEMPOTENT               - prior reservation fully valid vs current ids
    #                              (no-op, file untouched)
    #   RESELECTED n             - prior reservation fully stale (zero ids survive);
    #                              n items are deterministically re-selected from
    #                              the CURRENT checklist and the file is overwritten
    #   PARTIAL kept=k dropped=d - some prior ids survived; only survivors are kept
    #   DUP_SKIP                 - current checklist ids are not unique; the id-based
    #                              mechanism is unsound, so nothing is reserved
    #   NOOP                     - n<4 with no prior file, or any other no-write
    #                              outcome (including a crash of the Python helper)
    # Honest caveat: re-selection or partial survival after a regen can reserve
    # items the build loop already saw in earlier prompts (the hidden-from-loop
    # guarantee is best-effort once the checklist ids change mid-run).
    local status_token
    status_token=$(_CHECKLIST_FILE="$CHECKLIST_FILE" _HELDOUT_FILE="$heldout_file" python3 -c "
import json, os, sys, hashlib, tempfile

cl_path = os.environ['_CHECKLIST_FILE']
out_path = os.environ['_HELDOUT_FILE']
try:
    with open(cl_path) as f:
        data = json.load(f)
except Exception:
    print('NOOP')
    sys.exit(0)

# A checklist whose top level is not a JSON object has no categories to pick from.
if not isinstance(data, dict):
    print('NOOP')
    sys.exit(0)

# Collect all item ids in document order.
ids = []
for cat in data.get('categories', []):
    for item in cat.get('items', []):
        iid = item.get('id', '')
        if iid:
            ids.append(iid)

n = len(ids)
id_set = set(ids)

# Duplicate ids make the id-based hide/select mechanism unsound. Skip selection
# entirely (no reservation written) so a held-out id can never map to more than
# one item. An existing reservation file is deliberately left untouched here.
if len(id_set) != n:
    print('DUP_SKIP')
    sys.exit(0)

def select_count(num_ids):
    # clamp(round(0.25 * N), 1, 5). Python 3 round() rounds halves to the
    # nearest even integer, so N=10 reserves 2 items and N=6 reserves 2.
    return max(1, min(5, round(0.25 * num_ids)))

def fresh_selection():
    # Deterministic order: sort ids by sha256(id), take the first <count>.
    count = select_count(n)
    ranked = sorted(ids, key=lambda i: hashlib.sha256(i.encode('utf-8')).hexdigest())
    return sorted(ranked[:count])

def atomic_write(payload):
    # Write to a sibling temp file and rename over the target so readers never
    # observe a half-written reservation.
    d = os.path.dirname(out_path) or '.'
    os.makedirs(d, exist_ok=True)
    fd, tmp = tempfile.mkstemp(dir=d, suffix='.tmp')
    try:
        with os.fdopen(fd, 'w') as f:
            json.dump(payload, f, indent=2)
            f.write('\n')
        os.replace(tmp, out_path)
    except BaseException:
        # Never leave a stray temp file behind in the checklist directory.
        try:
            os.unlink(tmp)
        except OSError:
            pass
        raise

prior = None
if os.path.exists(out_path):
    try:
        with open(out_path) as f:
            prior = json.load(f)
    except Exception:
        prior = None
    # Valid JSON of the wrong shape is as unusable as unparseable JSON: treat it
    # as no prior reservation instead of crashing into a permanent NOOP.
    if not isinstance(prior, dict) or not isinstance(prior.get('held_out', []), list):
        prior = None

if prior is not None:
    prior_ids = [i for i in prior.get('held_out', []) if i]
    # A prior reservation of [] (e.g. an earlier n<4 run) is a valid no-op state;
    # keep it idempotent rather than re-selecting now that n may have grown.
    if not prior_ids:
        print('IDEMPOTENT')
        sys.exit(0)
    survivors = [i for i in prior_ids if i in id_set]
    if len(survivors) == len(prior_ids):
        # Fully valid against the current checklist: idempotent no-op.
        print('IDEMPOTENT')
        sys.exit(0)
    if not survivors:
        # Fully stale: the checklist regenerated and orphaned the reservation.
        # Deterministically re-select from the CURRENT checklist.
        if n < 4:
            atomic_write({'held_out': [], 'total_items': n,
                          'note': 'n<4: no held-out reserved (re-selected after stale reservation)'})
            print('RESELECTED 0')
            sys.exit(0)
        held = fresh_selection()
        atomic_write({'held_out': held, 'total_items': n})
        print('RESELECTED %d' % len(held))
        sys.exit(0)
    # Partial survival: keep only the surviving ids and report how many were
    # dropped, so the reservation never shrinks silently.
    dropped = len(prior_ids) - len(survivors)
    payload = {'held_out': sorted(survivors), 'total_items': n}
    atomic_write(payload)
    print('PARTIAL kept=%d dropped=%d' % (len(survivors), dropped))
    sys.exit(0)

# No usable prior reservation: first selection.
if n < 4:
    # N>=4 gate: smaller checklists get no held-out (nothing to hide reliably).
    atomic_write({'held_out': [], 'total_items': n, 'note': 'n<4: no held-out reserved'})
    print('NOOP')
    sys.exit(0)

held = fresh_selection()
atomic_write({'held_out': held, 'total_items': n})
print('FRESH %d' % len(held))
" 2>/dev/null || echo "NOOP")

    # Honest logging + trust event on any stale repair (type-guarded).
    # tok is the status word; rest carries its payload (a count, or the
    # kept=/dropped= pair for PARTIAL).
    local tok rest
    read -r tok rest <<< "$status_token"
    case "$tok" in
        RESELECTED)
            log_warn "[checklist] held-out reservation stale (checklist regenerated); re-selected ${rest:-0} items"
            if type record_trust_event_bash &>/dev/null; then
                record_trust_event_bash "heldout_stale" \
                    "detail=reselected" \
                    "reselected=${rest:-0}" \
                    >/dev/null 2>&1 || true
            fi
            ;;
        PARTIAL)
            log_warn "[checklist] held-out reservation partially stale (checklist regenerated); $rest"
            if type record_trust_event_bash &>/dev/null; then
                # NOTE(review): "$rest" is passed as ONE argument ("kept=k dropped=d");
                # confirm record_trust_event_bash expects that rather than two key=value args.
                record_trust_event_bash "heldout_stale" \
                    "detail=partial" \
                    "$rest" \
                    >/dev/null 2>&1 || true
            fi
            ;;
        DUP_SKIP)
            log_warn "[checklist] checklist ids are not unique; held-out selection skipped (id-based reservation is unsound with duplicate ids)"
            ;;
    esac

    return 0
}
281
+
282
+ # Echo held-out item IDs (one per line) to stdout. Empty when none reserved.
283
checklist_heldout_ids() {
    # Print each reserved held-out item id on its own line.
    # Globals read: CHECKLIST_DIR (defaults to .loki/checklist).
    # Prints nothing when the reservation file is missing, unreadable, not a
    # JSON object, or holds no ids. Always returns 0 (best-effort reader).
    local heldout_file="${CHECKLIST_DIR:-".loki/checklist"}/held-out.json"
    if [ ! -f "$heldout_file" ]; then
        return 0
    fi
    _HELDOUT_FILE="$heldout_file" python3 -c "
import json, os
try:
    with open(os.environ['_HELDOUT_FILE']) as f:
        data = json.load(f)
    held = data.get('held_out', [])
    # Only a JSON array is a reservation; iterating a string would emit its
    # characters as bogus ids.
    if isinstance(held, list):
        for i in held:
            # Skip null/empty entries (the selector filters them the same way)
            # so callers never receive a blank line or the text None as an id.
            if i:
                print(i)
except Exception:
    pass
" 2>/dev/null || true
}
299
+
108
300
  #===============================================================================
109
301
  # Verification
110
302
  #===============================================================================
@@ -118,6 +310,10 @@ checklist_verify() {
118
310
  return 0
119
311
  fi
120
312
 
313
+ # Held-out selection happens BEFORE the first verification so that the very
314
+ # first verification-results.json summary already excludes held-out items.
315
+ checklist_select_heldout
316
+
121
317
  local script_dir
122
318
  script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
123
319
  local verify_script="${script_dir}/checklist-verify.py"
@@ -160,16 +356,12 @@ checklist_summary() {
160
356
 
161
357
  _CHECKLIST_RESULTS="$CHECKLIST_RESULTS_FILE" \
162
358
  _CHECKLIST_WAIVERS="${CHECKLIST_DIR:-".loki/checklist"}/waivers.json" \
359
+ _CHECKLIST_HELDOUT="${CHECKLIST_DIR:-".loki/checklist"}/held-out.json" \
163
360
  python3 -c "
164
361
  import json, sys, os
165
362
  try:
166
363
  fpath = os.environ.get('_CHECKLIST_RESULTS', '')
167
364
  data = json.load(open(fpath))
168
- s = data.get('summary', {})
169
- total = s.get('total', 0)
170
- verified = s.get('verified', 0)
171
- failing = s.get('failing', 0)
172
- pending = s.get('pending', 0)
173
365
 
174
366
  # Load waivers
175
367
  waived_ids = set()
@@ -184,26 +376,68 @@ try:
184
376
  except Exception:
185
377
  pass
186
378
 
187
- # Count waived items and adjust failing list
188
- waived_count = 0
189
- if total == 0:
190
- print('')
191
- else:
379
+ # Load held-out item ids (v7.28.0). Held-out items are NEVER surfaced to the
380
+ # build loop: they are fully excluded from the counts and the failing list so
381
+ # the build agent cannot tune to them. The council evaluates them separately.
382
+ heldout_ids = set()
383
+ heldout_path = os.environ.get('_CHECKLIST_HELDOUT', '')
384
+ if heldout_path and os.path.exists(heldout_path):
385
+ try:
386
+ with open(heldout_path) as hf:
387
+ hdata = json.load(hf)
388
+ heldout_ids = set(hdata.get('held_out', []))
389
+ except Exception:
390
+ pass
391
+
392
+ # Count all checklist items first so we can detect the pathological case
393
+ # where hiding would empty the summary on a non-empty checklist (MEDIUM-2).
394
+ all_items = 0
395
+ for cat in data.get('categories', []):
396
+ all_items += len(cat.get('items', []))
397
+
398
+ def compute(apply_heldout):
399
+ total = verified = pending = failing = waived_count = 0
192
400
  failing_items = []
193
401
  for cat in data.get('categories', []):
194
402
  for item in cat.get('items', []):
195
403
  item_id = item.get('id', '')
404
+ if apply_heldout and item_id in heldout_ids:
405
+ continue
196
406
  if item_id in waived_ids:
197
407
  waived_count += 1
198
408
  continue
199
- if item.get('status') == 'failing' and item.get('priority') in ('critical', 'major'):
200
- failing_items.append(item.get('title', item.get('id', '?')))
409
+ total += 1
410
+ status = item.get('status')
411
+ if status == 'verified':
412
+ verified += 1
413
+ elif status == 'failing':
414
+ failing += 1
415
+ if item.get('priority') in ('critical', 'major'):
416
+ failing_items.append(item.get('title', item.get('id', '?')))
417
+ else:
418
+ pending += 1
419
+ return total, verified, pending, failing, waived_count, failing_items
420
+
421
+ # Recompute counts over the VISIBLE (non-held-out) items so 'total' never
422
+ # leaks the existence of held-out items. Waived items are excluded too.
423
+ total, verified, pending, failing, waived_count, failing_items = compute(True)
424
+
425
+ # MEDIUM-2 guard: if hiding held-out items would empty the summary while the
426
+ # checklist itself is non-empty, fall back to showing all items (do not hide)
427
+ # and warn. Returning an empty summary on a non-empty checklist reads as 'no
428
+ # checklist' to the prompt feed, which is a worse failure than a small leak.
429
+ if total == 0 and all_items > 0:
430
+ print('held-out hiding would empty a non-empty checklist summary; showing all items', file=sys.stderr)
431
+ total, verified, pending, failing, waived_count, failing_items = compute(False)
432
+
433
+ if total == 0:
434
+ print('')
435
+ else:
201
436
  detail = ''
202
437
  if failing_items:
203
438
  detail = ' FAILING: ' + ', '.join(failing_items[:5])
204
439
  waived_str = f', {waived_count} waived' if waived_count > 0 else ''
205
- adjusted_failing = max(0, failing - waived_count)
206
- print(f'{verified}/{total} verified, {adjusted_failing} failing{waived_str}, {pending} pending.{detail}')
440
+ print(f'{verified}/{total} verified, {failing} failing{waived_str}, {pending} pending.{detail}')
207
441
  except Exception:
208
442
  print('', file=sys.stderr)
209
443
  " 2>/dev/null || echo ""
package/autonomy/run.sh CHANGED
@@ -2566,6 +2566,26 @@ except Exception:
2566
2566
  local ts
2567
2567
  ts="$(date -u +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date)"
2568
2568
 
2569
+ # v7.28.0: evidence-gate inconclusive line. When the evidence gate could not
2570
+ # establish a diff baseline (no git repo, or no run-start SHA), it records a
2571
+ # durable .loki/state/evidence-inconclusive.json instead of silently passing.
2572
+ # Surface one honest line so the user knows completion was not independently
2573
+ # verified. The record is removed by the gate on any conclusive run.
2574
+ local evidence_inconclusive_line=""
2575
+ local _inc_file="$loki_dir/state/evidence-inconclusive.json"
2576
+ if [ -f "$_inc_file" ]; then
2577
+ local _inc_reason
2578
+ _inc_reason="$(python3 -c "import json,sys
2579
+ try:
2580
+ d=json.load(open(sys.argv[1]))
2581
+ print(d.get('reason','') if d.get('inconclusive') else '')
2582
+ except Exception:
2583
+ print('')" "$_inc_file" 2>/dev/null)"
2584
+ if [ -n "$_inc_reason" ]; then
2585
+ evidence_inconclusive_line="Evidence gate: inconclusive (${_inc_reason}) - completion not independently verified"
2586
+ fi
2587
+ fi
2588
+
2569
2589
  # ---- Durable human-readable file: .loki/COMPLETION.txt --------------------
2570
2590
  {
2571
2591
  echo "Loki Mode run summary"
@@ -2596,6 +2616,10 @@ except Exception:
2596
2616
  fi
2597
2617
  echo "Tasks: pending=$pending in_progress=$in_progress completed=$completed failed=$failed"
2598
2618
  echo ""
2619
+ if [ -n "$evidence_inconclusive_line" ]; then
2620
+ echo "$evidence_inconclusive_line"
2621
+ echo ""
2622
+ fi
2599
2623
  echo "Review the work:"
2600
2624
  echo " $review_cmd"
2601
2625
  echo ""
@@ -4690,6 +4714,57 @@ print_ttfv_next_steps() {
4690
4714
  return 0
4691
4715
  }
4692
4716
 
4717
+ # _read_iteration_cost <iteration>
4718
+ # Emit "input output cost cache_read cache_creation" for the given iteration,
4719
+ # preferring the authoritative result-cost file written by the embedded stream
4720
+ # parser (the Claude-reported total_cost_usd + usage, slug/symlink-independent) over
4721
+ # the context-tracker-derived estimate in tracking.json. Falls back to
4722
+ # tracking.json when no result-cost file exists, and to all zeros otherwise.
4723
+ # Best-effort: any parse failure yields "0 0 0 0 0" and never aborts.
4724
_read_iteration_cost() {
    # Usage: _read_iteration_cost <iteration>
    # Prints one line: "input output cost cache_read cache_creation".
    # Source precedence:
    #   1. .loki/metrics/result-cost-<iteration>.json (written by the stream parser)
    #   2. the matching per_iteration entry in .loki/context/tracking.json
    #   3. all zeros
    # Best-effort: any read/parse failure prints "0 0 0 0 0"; never aborts.
    #
    # The path and the iteration number are handed to Python through argv
    # instead of being spliced into the Python source, so an unexpected value
    # can only fail the lookup, never alter the code that runs.
    local iteration="$1"
    local result_cost_file=".loki/metrics/result-cost-${iteration}.json"
    if [ -f "$result_cost_file" ]; then
        python3 -c "
import json, sys
try:
    with open(sys.argv[1]) as fh:
        d = json.load(fh)
    # 'or 0' turns a JSON null into 0 so the caller always reads five numbers.
    print(
        d.get('input_tokens', 0) or 0,
        d.get('output_tokens', 0) or 0,
        d.get('total_cost_usd', 0) or 0,
        d.get('cache_read_tokens', 0) or 0,
        d.get('cache_creation_tokens', 0) or 0,
    )
except Exception:
    print(0, 0, 0, 0, 0)
" "$result_cost_file" 2>/dev/null || echo "0 0 0 0 0"
    elif [ -f ".loki/context/tracking.json" ]; then
        python3 -c "
import json, sys
try:
    wanted = int(sys.argv[1])
    with open('.loki/context/tracking.json') as fh:
        t = json.load(fh)
    match = [i for i in t.get('per_iteration', []) if i.get('iteration') == wanted]
    if match:
        # Several entries may share an iteration number; the last one wins.
        m = match[-1]
        # Same null guard as the result-cost branch above.
        print(
            m.get('input_tokens', 0) or 0,
            m.get('output_tokens', 0) or 0,
            m.get('cost_usd', 0) or 0,
            m.get('cache_read_tokens', 0) or 0,
            m.get('cache_creation_tokens', 0) or 0,
        )
    else:
        print(0, 0, 0, 0, 0)
except Exception:
    print(0, 0, 0, 0, 0)
" "$iteration" 2>/dev/null || echo "0 0 0 0 0"
    else
        echo "0 0 0 0 0"
    fi
}
4767
+
4693
4768
  track_iteration_complete() {
4694
4769
  local iteration="$1"
4695
4770
  local exit_code="${2:-0}"
@@ -4772,32 +4847,14 @@ track_iteration_complete() {
4772
4847
  local phase="${LAST_KNOWN_PHASE:-}"
4773
4848
  [ -z "$phase" ] && phase=$(python3 -c "import json; print(json.load(open('.loki/state/orchestrator.json')).get('currentPhase', 'unknown'))" 2>/dev/null || echo "unknown")
4774
4849
 
4775
- # Read token data from context tracker output (v5.42.0)
4850
+ # Read token data, preferring the Claude-reported authoritative result-cost file over
4851
+ # the context-tracker estimate (v7.28.0 cost-capture fix). See
4852
+ # _read_iteration_cost for precedence rationale.
4776
4853
  # v6.82.0: also capture cache_read_tokens / cache_creation_tokens for
4777
4854
  # prompt-cache hit-rate analysis (S1.1 prompt restructure).
4778
4855
  local iter_input=0 iter_output=0 iter_cost=0
4779
4856
  local iter_cache_read=0 iter_cache_creation=0
4780
- if [ -f ".loki/context/tracking.json" ]; then
4781
- read iter_input iter_output iter_cost iter_cache_read iter_cache_creation < <(python3 -c "
4782
- import json
4783
- try:
4784
- t = json.load(open('.loki/context/tracking.json'))
4785
- iters = t.get('per_iteration', [])
4786
- match = [i for i in iters if i.get('iteration') == $iteration]
4787
- if match:
4788
- m = match[-1]
4789
- print(
4790
- m.get('input_tokens', 0),
4791
- m.get('output_tokens', 0),
4792
- m.get('cost_usd', 0),
4793
- m.get('cache_read_tokens', 0),
4794
- m.get('cache_creation_tokens', 0),
4795
- )
4796
- else:
4797
- print(0, 0, 0, 0, 0)
4798
- except: print(0, 0, 0, 0, 0)
4799
- " 2>/dev/null || echo "0 0 0 0 0")
4800
- fi
4857
+ read -r iter_input iter_output iter_cost iter_cache_read iter_cache_creation < <(_read_iteration_cost "$iteration")
4801
4858
 
4802
4859
  cat > ".loki/metrics/efficiency/iteration-${iteration}.json" << EFF_EOF
4803
4860
  {
@@ -12352,8 +12409,15 @@ except Exception as exc:
12352
12409
  claude)
12353
12410
  # Claude: Full features with stream-json output and agent tracking
12354
12411
  # Uses dynamic tier for model selection based on RARV phase
12355
- # Pass tier to Python via environment for dashboard display
12356
- { LOKI_CURRENT_MODEL="$tier_param" \
12412
+ # Pass tier + iteration to the embedded stream parser via the
12413
+ # environment. A bare `VAR=val cmd | parser` prefix applies ONLY
12414
+ # to `cmd` (claude) and does NOT cross the pipe to the parser
12415
+ # subprocess, so these must be exported into the shell env first.
12416
+ # LOKI_ITERATION lets the parser stamp the authoritative
12417
+ # result-cost file under the correct iteration index.
12418
+ export LOKI_CURRENT_MODEL="$tier_param"
12419
+ export LOKI_ITERATION="$ITERATION_COUNT"
12420
+ { \
12357
12421
  claude "${_loki_claude_argv[@]}" -p "$prompt" \
12358
12422
  --output-format stream-json --verbose 2>&1 | \
12359
12423
  tee -a "$log_file" "$agent_log" "$iter_output" | \
@@ -12666,6 +12730,34 @@ def process_stream():
12666
12730
  active_agents[orchestrator_id]["tasks_completed"].append(f"{tool_count} tools used")
12667
12731
 
12668
12732
  save_agents()
12733
+
12734
+ # Authoritative cost capture (path/slug/symlink-independent).
12735
+ # Claude'"'"'s result message carries its own total_cost_usd plus a
12736
+ # full usage object. The context-tracker session-file path is
12737
+ # brittle (slug derivation must guess Claude'"'"'s naming), so this
12738
+ # stamps the authoritative number to a per-iteration file that
12739
+ # the efficiency writer prefers. Best-effort: a malformed or
12740
+ # missing field must never break the iteration loop.
12741
+ try:
12742
+ _iter = os.environ.get("LOKI_ITERATION", "0")
12743
+ _u = data.get("usage", {}) or {}
12744
+ _rec = {
12745
+ "total_cost_usd": data.get("total_cost_usd"),
12746
+ "input_tokens": _u.get("input_tokens", 0),
12747
+ "output_tokens": _u.get("output_tokens", 0),
12748
+ "cache_read_tokens": _u.get("cache_read_input_tokens", 0),
12749
+ "cache_creation_tokens": _u.get("cache_creation_input_tokens", 0),
12750
+ }
12751
+ if _rec["total_cost_usd"] is not None:
12752
+ os.makedirs(".loki/metrics", exist_ok=True)
12753
+ _p = ".loki/metrics/result-cost-" + str(_iter) + ".json"
12754
+ _tmp = _p + ".tmp"
12755
+ with open(_tmp, "w") as _f:
12756
+ json.dump(_rec, _f)
12757
+ os.replace(_tmp, _p)
12758
+ except Exception:
12759
+ pass
12760
+
12669
12761
  print(f"\n{GREEN}[Session complete]{NC}", flush=True)
12670
12762
  is_error = data.get("is_error", False)
12671
12763
  sys.exit(1 if is_error else 0)
@@ -13098,7 +13190,36 @@ if __name__ == "__main__":
13098
13190
  case "${gate_failures:-}" in
13099
13191
  *code_review,*|*code_review_ESCALATED*) _gate_block_for_completion="code_review" ;;
13100
13192
  esac
13101
- if [ -n "$_gate_block_for_completion" ] && check_completion_promise "$iter_output"; then
13193
+ # DROP-FIX (v7.28): check_completion_promise -> check_task_completion_signal
13194
+ # CONSUMES the completion signal (rm -f) on the FIRST successful call.
13195
+ # The completion-promise chain below calls it up to five times in one
13196
+ # iteration (reverify guard, code-review arm, evidence arm, held-out
13197
+ # arm, success arm), so the first call consumed the claim and every
13198
+ # later arm saw nothing -- the success arm never fired and the run
13199
+ # iterated to max_iterations even though the agent had claimed done.
13200
+ # Fix: evaluate the claim EXACTLY ONCE here, capture it in
13201
+ # _completion_claimed, and have every arm test that variable. The
13202
+ # single call discards stdout (matching the prior call sites, which
13203
+ # also discarded it), so the task_completion_claim event still emits
13204
+ # exactly once. Consumption semantics are preserved: the claim is
13205
+ # consumed when evaluated; if a gate rejects it, the agent must
13206
+ # re-claim next iteration (see internal/DEMO-CLAIM-DROP-BUG.md).
13207
+ local _completion_claimed=0
13208
+ if check_completion_promise "$iter_output"; then
13209
+ _completion_claimed=1
13210
+ fi
13211
+ # MEDIUM-3: this completion-promise route evaluates the council hard
13212
+ # gates (evidence + held-out) without the council_evaluate freshness
13213
+ # step, so the held-out gate could read stale verification statuses
13214
+ # (and a stale reservation). Re-verify the checklist ONCE here, but
13215
+ # only when a completion claim is actually present (mirror the
13216
+ # check_completion_promise condition used by the gate chain below) so
13217
+ # verification does not run every iteration. Type-guarded and
13218
+ # best-effort: failure must never block the completion path.
13219
+ if [ "$_completion_claimed" = 1 ] && type council_reverify_checklist &>/dev/null; then
13220
+ council_reverify_checklist 2>/dev/null || true
13221
+ fi
13222
+ if [ -n "$_gate_block_for_completion" ] && [ "$_completion_claimed" = 1 ]; then
13102
13223
  log_warn "Completion claim rejected: code review is BLOCKED for this iteration (Critical/High findings). Fix review issues before completion."
13103
13224
  log_warn " Review details under .loki/quality/reviews/ ; gate_failures=${gate_failures}"
13104
13225
  _gate_block_for_completion=""
@@ -13113,11 +13234,24 @@ if __name__ == "__main__":
13113
13234
  # LOKI_EVIDENCE_GATE=0 (council_evidence_gate returns 0 immediately
13114
13235
  # when disabled, so this branch never fires). Gate output (reason +
13115
13236
  # opt-out hint) is printed by council_evidence_gate itself.
13116
- elif check_completion_promise "$iter_output" && type council_evidence_gate &>/dev/null && ! council_evidence_gate; then
13237
+ elif [ "$_completion_claimed" = 1 ] && type council_evidence_gate &>/dev/null && ! council_evidence_gate; then
13117
13238
  log_warn "Completion claim rejected: evidence gate found no proof of completion (empty diff vs run-start SHA, or red tests)."
13118
13239
  log_warn " Details under .loki/council/evidence-block.json ; opt out with LOKI_EVIDENCE_GATE=0"
13119
13240
  # Fall through; keep iterating until there is real evidence.
13120
- elif check_completion_promise "$iter_output"; then
13241
+ # v7.28.0: the held-out spec-eval gate must also guard the DEFAULT
13242
+ # completion-promise route, not only the interval-gated council path
13243
+ # (council_evaluate). Otherwise an agent can self-assert "done" and
13244
+ # exit as completion_promise_fulfilled while a held-out acceptance
13245
+ # check is failing, bypassing the anti-reward-hacking gate entirely.
13246
+ # Mirrors the evidence-gate block above. Opt-out: the gate's own
13247
+ # LOKI_HELDOUT_GATE=0 (council_heldout_gate returns 0 immediately
13248
+ # when disabled or when no held-out items are reserved, so this
13249
+ # branch never fires). Gate output is printed by council_heldout_gate.
13250
+ elif [ "$_completion_claimed" = 1 ] && type council_heldout_gate &>/dev/null && ! council_heldout_gate; then
13251
+ log_warn "Completion claim rejected: held-out spec-eval gate found failing held-out acceptance check(s)."
13252
+ log_warn " Details under .loki/council/heldout-block.json ; opt out with LOKI_HELDOUT_GATE=0"
13253
+ # Fall through; keep iterating until the held-out checks pass.
13254
+ elif [ "$_completion_claimed" = 1 ]; then
13121
13255
  echo ""
13122
13256
  if [ -n "$COMPLETION_PROMISE" ]; then
13123
13257
  log_header "COMPLETION PROMISE FULFILLED: $COMPLETION_PROMISE"
@@ -13491,10 +13625,19 @@ check_human_intervention() {
13491
13625
  if [ -f "$loki_dir/signals/COUNCIL_REVIEW_REQUESTED" ]; then
13492
13626
  log_info "Council force-review requested from dashboard"
13493
13627
  rm -f "$loki_dir/signals/COUNCIL_REVIEW_REQUESTED"
13628
+ # MEDIUM-3: this route evaluates the council hard gates directly without
13629
+ # the council_evaluate freshness step, so re-verify the checklist ONCE
13630
+ # before the gate chain to restore that invariant (refreshes held-out
13631
+ # statuses and repairs a stale reservation). Type-guarded, best-effort.
13632
+ if type council_reverify_checklist &>/dev/null; then
13633
+ council_reverify_checklist 2>/dev/null || true
13634
+ fi
13494
13635
  if type council_checklist_gate &>/dev/null && ! council_checklist_gate; then
13495
13636
  log_info "Council force-review: blocked by checklist hard gate"
13496
13637
  elif type council_evidence_gate &>/dev/null && ! council_evidence_gate; then
13497
13638
  log_info "Council force-review: blocked by evidence hard gate"
13639
+ elif type council_heldout_gate &>/dev/null && ! council_heldout_gate; then
13640
+ log_info "Council force-review: blocked by held-out spec-eval hard gate"
13498
13641
  elif type council_vote &>/dev/null && council_vote; then
13499
13642
  log_header "COMPLETION COUNCIL: FORCE REVIEW - PROJECT COMPLETE"
13500
13643
  # BUG #17 fix: Write COMPLETED marker, generate council report, and