loki-mode 7.27.0 → 7.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/SKILL.md +11 -2
- package/VERSION +1 -1
- package/autonomy/completion-council.sh +285 -6
- package/autonomy/context-tracker.py +32 -7
- package/autonomy/grill.sh +321 -0
- package/autonomy/loki +49 -0
- package/autonomy/prd-checklist.sh +248 -14
- package/autonomy/run.sh +170 -27
- package/autonomy/spec.sh +646 -0
- package/autonomy/verify.sh +55 -0
- package/dashboard/__init__.py +1 -1
- package/docs/INSTALLATION.md +1 -1
- package/loki-ts/dist/loki.js +2 -2
- package/mcp/__init__.py +1 -1
- package/package.json +2 -1
- package/skills/quality-gates.md +46 -0
|
@@ -105,6 +105,198 @@ checklist_should_verify() {
|
|
|
105
105
|
return 0
|
|
106
106
|
}
|
|
107
107
|
|
|
108
|
+
#===============================================================================
|
|
109
|
+
# Held-out Spec Eval Selection (v7.28.0)
|
|
110
|
+
#===============================================================================
|
|
111
|
+
# Anti-reward-hacking: deterministically reserve ~25% of checklist items as
|
|
112
|
+
# "held-out". Held-out item IDs are excluded from the prompt feed the build loop
|
|
113
|
+
# sees (checklist_summary and council_checklist_gate), so a cooperative build
|
|
114
|
+
# agent is not steered toward those specific acceptance checks. The completion
|
|
115
|
+
# council evaluates them at the ship gate (council_heldout_gate in
|
|
116
|
+
# completion-council.sh). Scope of the guarantee: this protects the prompt feed,
|
|
117
|
+
# not a sandbox. .loki/checklist/held-out.json is plain on-disk JSON, so a
|
|
118
|
+
# non-cooperative agent with filesystem tools can read the reservation directly.
|
|
119
|
+
#
|
|
120
|
+
# Selection is idempotent and reproducible: count = clamp(round(0.25*N), 1, 5)
|
|
121
|
+
# for N>=4 items; ordering by sha256 of each item's "id" (stable, not random).
|
|
122
|
+
# Written to .loki/checklist/held-out.json on first selection; left untouched while still valid, and rewritten only when a regenerated checklist makes the reservation stale.
|
|
123
|
+
checklist_select_heldout() {
|
|
124
|
+
local heldout_file="${CHECKLIST_DIR:-".loki/checklist"}/held-out.json"
|
|
125
|
+
|
|
126
|
+
if [ ! -f "$CHECKLIST_FILE" ]; then
|
|
127
|
+
return 0
|
|
128
|
+
fi
|
|
129
|
+
|
|
130
|
+
# The Python below handles every case listed here and prints a single status token so
|
|
131
|
+
# bash can log honestly and emit the right trust event:
|
|
132
|
+
# FRESH n - no prior reservation, selected n (file written)
|
|
133
|
+
# IDEMPOTENT - prior reservation fully valid vs current ids (no-op,
|
|
134
|
+
# file untouched: preserves the idempotency case 1 tests)
|
|
135
|
+
# RESELECTED n - prior reservation fully stale (zero ids survive); the
|
|
136
|
+
# checklist regenerated, so we deterministically re-select
|
|
137
|
+
# n items from the CURRENT checklist and overwrite
|
|
138
|
+
# PARTIAL kept=k dropped=d - some prior ids survived; we keep only survivors
|
|
139
|
+
# DUP_SKIP - current checklist ids are not unique; the id-based
|
|
140
|
+
# mechanism is unsound, so we reserve nothing (MEDIUM-2)
|
|
141
|
+
# NOOP - n<4 with no prior file, or other no-write outcome
|
|
142
|
+
# Honest caveat: re-selection or partial-survival after a regen can reserve
|
|
143
|
+
# items the build loop already saw in earlier prompts (the hidden-from-loop
|
|
144
|
+
# guarantee is best-effort once the checklist ids change mid-run).
|
|
145
|
+
local status_token
|
|
146
|
+
status_token=$(_CHECKLIST_FILE="$CHECKLIST_FILE" _HELDOUT_FILE="$heldout_file" python3 -c "
|
|
147
|
+
import json, os, sys, hashlib, tempfile
|
|
148
|
+
|
|
149
|
+
cl_path = os.environ['_CHECKLIST_FILE']
|
|
150
|
+
out_path = os.environ['_HELDOUT_FILE']
|
|
151
|
+
try:
|
|
152
|
+
with open(cl_path) as f:
|
|
153
|
+
data = json.load(f)
|
|
154
|
+
except Exception:
|
|
155
|
+
print('NOOP')
|
|
156
|
+
sys.exit(0)
|
|
157
|
+
|
|
158
|
+
# Collect all item ids in document order.
|
|
159
|
+
ids = []
|
|
160
|
+
for cat in data.get('categories', []):
|
|
161
|
+
for item in cat.get('items', []):
|
|
162
|
+
iid = item.get('id', '')
|
|
163
|
+
if iid:
|
|
164
|
+
ids.append(iid)
|
|
165
|
+
|
|
166
|
+
n = len(ids)
|
|
167
|
+
id_set = set(ids)
|
|
168
|
+
|
|
169
|
+
# MEDIUM-2: duplicate ids make the id-based hide/select mechanism unsound. Skip
|
|
170
|
+
# selection entirely (no reservation written) so a held-out id can never map to
|
|
171
|
+
# more than one item. Do NOT touch an existing reservation file here (a stale
|
|
172
|
+
# valid file left over from before a dup-introducing regen is handled by the
|
|
173
|
+
# council gate's STALE path; over-removing would be over-engineering).
|
|
174
|
+
if len(id_set) != n:
|
|
175
|
+
print('DUP_SKIP')
|
|
176
|
+
sys.exit(0)
|
|
177
|
+
|
|
178
|
+
def select_count(num_ids):
|
|
179
|
+
c = round(0.25 * num_ids)
|
|
180
|
+
if c < 1:
|
|
181
|
+
c = 1
|
|
182
|
+
if c > 5:
|
|
183
|
+
c = 5
|
|
184
|
+
return c
|
|
185
|
+
|
|
186
|
+
def fresh_selection():
|
|
187
|
+
# Deterministic order: sort ids by sha256(id), take the first <count>.
|
|
188
|
+
count = select_count(n)
|
|
189
|
+
ranked = sorted(ids, key=lambda i: hashlib.sha256(i.encode('utf-8')).hexdigest())
|
|
190
|
+
return sorted(ranked[:count])
|
|
191
|
+
|
|
192
|
+
def atomic_write(payload):
|
|
193
|
+
d = os.path.dirname(out_path) or '.'
|
|
194
|
+
os.makedirs(d, exist_ok=True)
|
|
195
|
+
fd, tmp = tempfile.mkstemp(dir=d, suffix='.tmp')
|
|
196
|
+
with os.fdopen(fd, 'w') as f:
|
|
197
|
+
json.dump(payload, f, indent=2)
|
|
198
|
+
f.write('\n')
|
|
199
|
+
os.replace(tmp, out_path)
|
|
200
|
+
|
|
201
|
+
prior = None
|
|
202
|
+
if os.path.exists(out_path):
|
|
203
|
+
try:
|
|
204
|
+
with open(out_path) as f:
|
|
205
|
+
prior = json.load(f)
|
|
206
|
+
except Exception:
|
|
207
|
+
prior = None
|
|
208
|
+
|
|
209
|
+
if prior is not None:
|
|
210
|
+
prior_ids = [i for i in prior.get('held_out', []) if i]
|
|
211
|
+
# A prior reservation of [] (e.g. an earlier n<4 run) is a valid no-op state;
|
|
212
|
+
# keep it idempotent rather than re-selecting now that n may have grown.
|
|
213
|
+
if not prior_ids:
|
|
214
|
+
print('IDEMPOTENT')
|
|
215
|
+
sys.exit(0)
|
|
216
|
+
survivors = [i for i in prior_ids if i in id_set]
|
|
217
|
+
if len(survivors) == len(prior_ids):
|
|
218
|
+
# Fully valid against the current checklist: idempotent no-op.
|
|
219
|
+
print('IDEMPOTENT')
|
|
220
|
+
sys.exit(0)
|
|
221
|
+
if not survivors:
|
|
222
|
+
# Fully stale: the checklist regenerated and orphaned the reservation.
|
|
223
|
+
# Deterministically re-select from the CURRENT checklist.
|
|
224
|
+
if n < 4:
|
|
225
|
+
atomic_write({'held_out': [], 'total_items': n,
|
|
226
|
+
'note': 'n<4: no held-out reserved (re-selected after stale reservation)'})
|
|
227
|
+
print('RESELECTED 0')
|
|
228
|
+
sys.exit(0)
|
|
229
|
+
held = fresh_selection()
|
|
230
|
+
atomic_write({'held_out': held, 'total_items': n})
|
|
231
|
+
print('RESELECTED %d' % len(held))
|
|
232
|
+
sys.exit(0)
|
|
233
|
+
# Partial survival: keep only the surviving ids (do not silently shrink).
|
|
234
|
+
dropped = len(prior_ids) - len(survivors)
|
|
235
|
+
payload = {'held_out': sorted(survivors), 'total_items': n}
|
|
236
|
+
atomic_write(payload)
|
|
237
|
+
print('PARTIAL kept=%d dropped=%d' % (len(survivors), dropped))
|
|
238
|
+
sys.exit(0)
|
|
239
|
+
|
|
240
|
+
# No prior reservation: first selection.
|
|
241
|
+
if n < 4:
|
|
242
|
+
# N>=4 gate: smaller checklists get no held-out (nothing to hide reliably).
|
|
243
|
+
atomic_write({'held_out': [], 'total_items': n, 'note': 'n<4: no held-out reserved'})
|
|
244
|
+
print('NOOP')
|
|
245
|
+
sys.exit(0)
|
|
246
|
+
|
|
247
|
+
held = fresh_selection()
|
|
248
|
+
atomic_write({'held_out': held, 'total_items': n})
|
|
249
|
+
print('FRESH %d' % len(held))
|
|
250
|
+
" 2>/dev/null || echo "NOOP")
|
|
251
|
+
|
|
252
|
+
# Honest logging + trust event on any stale repair (type-guarded).
|
|
253
|
+
local tok rest
|
|
254
|
+
read -r tok rest <<< "$status_token"
|
|
255
|
+
case "$tok" in
|
|
256
|
+
RESELECTED)
|
|
257
|
+
log_warn "[checklist] held-out reservation stale (checklist regenerated); re-selected ${rest:-0} items"
|
|
258
|
+
if type record_trust_event_bash &>/dev/null; then
|
|
259
|
+
record_trust_event_bash "heldout_stale" \
|
|
260
|
+
"detail=reselected" \
|
|
261
|
+
"reselected=${rest:-0}" \
|
|
262
|
+
>/dev/null 2>&1 || true
|
|
263
|
+
fi
|
|
264
|
+
;;
|
|
265
|
+
PARTIAL)
|
|
266
|
+
log_warn "[checklist] held-out reservation partially stale (checklist regenerated); $rest"
|
|
267
|
+
if type record_trust_event_bash &>/dev/null; then
|
|
268
|
+
record_trust_event_bash "heldout_stale" \
|
|
269
|
+
"detail=partial" \
|
|
270
|
+
"$rest" \
|
|
271
|
+
>/dev/null 2>&1 || true
|
|
272
|
+
fi
|
|
273
|
+
;;
|
|
274
|
+
DUP_SKIP)
|
|
275
|
+
log_warn "[checklist] checklist ids are not unique; held-out selection skipped (id-based reservation is unsound with duplicate ids)"
|
|
276
|
+
;;
|
|
277
|
+
esac
|
|
278
|
+
|
|
279
|
+
return 0
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
# Echo held-out item IDs (one per line) to stdout. Empty when none reserved.
|
|
283
|
+
checklist_heldout_ids() {
|
|
284
|
+
local heldout_file="${CHECKLIST_DIR:-".loki/checklist"}/held-out.json"
|
|
285
|
+
if [ ! -f "$heldout_file" ]; then
|
|
286
|
+
return 0
|
|
287
|
+
fi
|
|
288
|
+
_HELDOUT_FILE="$heldout_file" python3 -c "
|
|
289
|
+
import json, os
|
|
290
|
+
try:
|
|
291
|
+
with open(os.environ['_HELDOUT_FILE']) as f:
|
|
292
|
+
data = json.load(f)
|
|
293
|
+
for i in data.get('held_out', []):
|
|
294
|
+
print(i)
|
|
295
|
+
except Exception:
|
|
296
|
+
pass
|
|
297
|
+
" 2>/dev/null || true
|
|
298
|
+
}
|
|
299
|
+
|
|
108
300
|
#===============================================================================
|
|
109
301
|
# Verification
|
|
110
302
|
#===============================================================================
|
|
@@ -118,6 +310,10 @@ checklist_verify() {
|
|
|
118
310
|
return 0
|
|
119
311
|
fi
|
|
120
312
|
|
|
313
|
+
# Held-out selection happens BEFORE the first verification so that the very
|
|
314
|
+
# first verification-results.json summary already excludes held-out items.
|
|
315
|
+
checklist_select_heldout
|
|
316
|
+
|
|
121
317
|
local script_dir
|
|
122
318
|
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
123
319
|
local verify_script="${script_dir}/checklist-verify.py"
|
|
@@ -160,16 +356,12 @@ checklist_summary() {
|
|
|
160
356
|
|
|
161
357
|
_CHECKLIST_RESULTS="$CHECKLIST_RESULTS_FILE" \
|
|
162
358
|
_CHECKLIST_WAIVERS="${CHECKLIST_DIR:-".loki/checklist"}/waivers.json" \
|
|
359
|
+
_CHECKLIST_HELDOUT="${CHECKLIST_DIR:-".loki/checklist"}/held-out.json" \
|
|
163
360
|
python3 -c "
|
|
164
361
|
import json, sys, os
|
|
165
362
|
try:
|
|
166
363
|
fpath = os.environ.get('_CHECKLIST_RESULTS', '')
|
|
167
364
|
data = json.load(open(fpath))
|
|
168
|
-
s = data.get('summary', {})
|
|
169
|
-
total = s.get('total', 0)
|
|
170
|
-
verified = s.get('verified', 0)
|
|
171
|
-
failing = s.get('failing', 0)
|
|
172
|
-
pending = s.get('pending', 0)
|
|
173
365
|
|
|
174
366
|
# Load waivers
|
|
175
367
|
waived_ids = set()
|
|
@@ -184,26 +376,68 @@ try:
|
|
|
184
376
|
except Exception:
|
|
185
377
|
pass
|
|
186
378
|
|
|
187
|
-
#
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
379
|
+
# Load held-out item ids (v7.28.0). Held-out items are NEVER surfaced to the
|
|
380
|
+
# build loop: they are fully excluded from the counts and the failing list so
|
|
381
|
+
# the build agent cannot tune to them. The council evaluates them separately.
|
|
382
|
+
heldout_ids = set()
|
|
383
|
+
heldout_path = os.environ.get('_CHECKLIST_HELDOUT', '')
|
|
384
|
+
if heldout_path and os.path.exists(heldout_path):
|
|
385
|
+
try:
|
|
386
|
+
with open(heldout_path) as hf:
|
|
387
|
+
hdata = json.load(hf)
|
|
388
|
+
heldout_ids = set(hdata.get('held_out', []))
|
|
389
|
+
except Exception:
|
|
390
|
+
pass
|
|
391
|
+
|
|
392
|
+
# Count all checklist items first so we can detect the pathological case
|
|
393
|
+
# where hiding would empty the summary on a non-empty checklist (MEDIUM-2).
|
|
394
|
+
all_items = 0
|
|
395
|
+
for cat in data.get('categories', []):
|
|
396
|
+
all_items += len(cat.get('items', []))
|
|
397
|
+
|
|
398
|
+
def compute(apply_heldout):
|
|
399
|
+
total = verified = pending = failing = waived_count = 0
|
|
192
400
|
failing_items = []
|
|
193
401
|
for cat in data.get('categories', []):
|
|
194
402
|
for item in cat.get('items', []):
|
|
195
403
|
item_id = item.get('id', '')
|
|
404
|
+
if apply_heldout and item_id in heldout_ids:
|
|
405
|
+
continue
|
|
196
406
|
if item_id in waived_ids:
|
|
197
407
|
waived_count += 1
|
|
198
408
|
continue
|
|
199
|
-
|
|
200
|
-
|
|
409
|
+
total += 1
|
|
410
|
+
status = item.get('status')
|
|
411
|
+
if status == 'verified':
|
|
412
|
+
verified += 1
|
|
413
|
+
elif status == 'failing':
|
|
414
|
+
failing += 1
|
|
415
|
+
if item.get('priority') in ('critical', 'major'):
|
|
416
|
+
failing_items.append(item.get('title', item.get('id', '?')))
|
|
417
|
+
else:
|
|
418
|
+
pending += 1
|
|
419
|
+
return total, verified, pending, failing, waived_count, failing_items
|
|
420
|
+
|
|
421
|
+
# Recompute counts over the VISIBLE (non-held-out) items so 'total' never
|
|
422
|
+
# leaks the existence of held-out items. Waived items are excluded too.
|
|
423
|
+
total, verified, pending, failing, waived_count, failing_items = compute(True)
|
|
424
|
+
|
|
425
|
+
# MEDIUM-2 guard: if hiding held-out items would empty the summary while the
|
|
426
|
+
# checklist itself is non-empty, fall back to showing all items (do not hide)
|
|
427
|
+
# and warn. Returning an empty summary on a non-empty checklist reads as 'no
|
|
428
|
+
# checklist' to the prompt feed, which is a worse failure than a small leak.
|
|
429
|
+
if total == 0 and all_items > 0:
|
|
430
|
+
print('held-out hiding would empty a non-empty checklist summary; showing all items', file=sys.stderr)
|
|
431
|
+
total, verified, pending, failing, waived_count, failing_items = compute(False)
|
|
432
|
+
|
|
433
|
+
if total == 0:
|
|
434
|
+
print('')
|
|
435
|
+
else:
|
|
201
436
|
detail = ''
|
|
202
437
|
if failing_items:
|
|
203
438
|
detail = ' FAILING: ' + ', '.join(failing_items[:5])
|
|
204
439
|
waived_str = f', {waived_count} waived' if waived_count > 0 else ''
|
|
205
|
-
|
|
206
|
-
print(f'{verified}/{total} verified, {adjusted_failing} failing{waived_str}, {pending} pending.{detail}')
|
|
440
|
+
print(f'{verified}/{total} verified, {failing} failing{waived_str}, {pending} pending.{detail}')
|
|
207
441
|
except Exception:
|
|
208
442
|
print('', file=sys.stderr)
|
|
209
443
|
" 2>/dev/null || echo ""
|
package/autonomy/run.sh
CHANGED
|
@@ -2566,6 +2566,26 @@ except Exception:
|
|
|
2566
2566
|
local ts
|
|
2567
2567
|
ts="$(date -u +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date)"
|
|
2568
2568
|
|
|
2569
|
+
# v7.28.0: evidence-gate inconclusive line. When the evidence gate could not
|
|
2570
|
+
# establish a diff baseline (no git repo, or no run-start SHA), it records a
|
|
2571
|
+
# durable .loki/state/evidence-inconclusive.json instead of silently passing.
|
|
2572
|
+
# Surface one honest line so the user knows completion was not independently
|
|
2573
|
+
# verified. The record is removed by the gate on any conclusive run.
|
|
2574
|
+
local evidence_inconclusive_line=""
|
|
2575
|
+
local _inc_file="$loki_dir/state/evidence-inconclusive.json"
|
|
2576
|
+
if [ -f "$_inc_file" ]; then
|
|
2577
|
+
local _inc_reason
|
|
2578
|
+
_inc_reason="$(python3 -c "import json,sys
|
|
2579
|
+
try:
|
|
2580
|
+
d=json.load(open(sys.argv[1]))
|
|
2581
|
+
print(d.get('reason','') if d.get('inconclusive') else '')
|
|
2582
|
+
except Exception:
|
|
2583
|
+
print('')" "$_inc_file" 2>/dev/null)"
|
|
2584
|
+
if [ -n "$_inc_reason" ]; then
|
|
2585
|
+
evidence_inconclusive_line="Evidence gate: inconclusive (${_inc_reason}) - completion not independently verified"
|
|
2586
|
+
fi
|
|
2587
|
+
fi
|
|
2588
|
+
|
|
2569
2589
|
# ---- Durable human-readable file: .loki/COMPLETION.txt --------------------
|
|
2570
2590
|
{
|
|
2571
2591
|
echo "Loki Mode run summary"
|
|
@@ -2596,6 +2616,10 @@ except Exception:
|
|
|
2596
2616
|
fi
|
|
2597
2617
|
echo "Tasks: pending=$pending in_progress=$in_progress completed=$completed failed=$failed"
|
|
2598
2618
|
echo ""
|
|
2619
|
+
if [ -n "$evidence_inconclusive_line" ]; then
|
|
2620
|
+
echo "$evidence_inconclusive_line"
|
|
2621
|
+
echo ""
|
|
2622
|
+
fi
|
|
2599
2623
|
echo "Review the work:"
|
|
2600
2624
|
echo " $review_cmd"
|
|
2601
2625
|
echo ""
|
|
@@ -4690,6 +4714,57 @@ print_ttfv_next_steps() {
|
|
|
4690
4714
|
return 0
|
|
4691
4715
|
}
|
|
4692
4716
|
|
|
4717
|
+
# _read_iteration_cost <iteration>
|
|
4718
|
+
# Emit "input output cost cache_read cache_creation" for the given iteration,
|
|
4719
|
+
# preferring the authoritative result-cost file written by the embedded stream
|
|
4720
|
+
# parser (Claude's own total_cost_usd + usage, slug/symlink-independent) over
|
|
4721
|
+
# the context-tracker-derived estimate in tracking.json. Falls back to
|
|
4722
|
+
# tracking.json when no result-cost file exists, and to all zeros otherwise.
|
|
4723
|
+
# Best-effort: any parse failure yields "0 0 0 0 0" and never aborts.
|
|
4724
|
+
_read_iteration_cost() {
|
|
4725
|
+
local iteration="$1"
|
|
4726
|
+
local result_cost_file=".loki/metrics/result-cost-${iteration}.json"
|
|
4727
|
+
if [ -f "$result_cost_file" ]; then
|
|
4728
|
+
python3 -c "
|
|
4729
|
+
import json
|
|
4730
|
+
try:
|
|
4731
|
+
d = json.load(open('$result_cost_file'))
|
|
4732
|
+
print(
|
|
4733
|
+
d.get('input_tokens', 0) or 0,
|
|
4734
|
+
d.get('output_tokens', 0) or 0,
|
|
4735
|
+
d.get('total_cost_usd', 0) or 0,
|
|
4736
|
+
d.get('cache_read_tokens', 0) or 0,
|
|
4737
|
+
d.get('cache_creation_tokens', 0) or 0,
|
|
4738
|
+
)
|
|
4739
|
+
except Exception:
|
|
4740
|
+
print(0, 0, 0, 0, 0)
|
|
4741
|
+
" 2>/dev/null || echo "0 0 0 0 0"
|
|
4742
|
+
elif [ -f ".loki/context/tracking.json" ]; then
|
|
4743
|
+
python3 -c "
|
|
4744
|
+
import json
|
|
4745
|
+
try:
|
|
4746
|
+
t = json.load(open('.loki/context/tracking.json'))
|
|
4747
|
+
iters = t.get('per_iteration', [])
|
|
4748
|
+
match = [i for i in iters if i.get('iteration') == $iteration]
|
|
4749
|
+
if match:
|
|
4750
|
+
m = match[-1]
|
|
4751
|
+
print(
|
|
4752
|
+
m.get('input_tokens', 0),
|
|
4753
|
+
m.get('output_tokens', 0),
|
|
4754
|
+
m.get('cost_usd', 0),
|
|
4755
|
+
m.get('cache_read_tokens', 0),
|
|
4756
|
+
m.get('cache_creation_tokens', 0),
|
|
4757
|
+
)
|
|
4758
|
+
else:
|
|
4759
|
+
print(0, 0, 0, 0, 0)
|
|
4760
|
+
except Exception:
|
|
4761
|
+
print(0, 0, 0, 0, 0)
|
|
4762
|
+
" 2>/dev/null || echo "0 0 0 0 0"
|
|
4763
|
+
else
|
|
4764
|
+
echo "0 0 0 0 0"
|
|
4765
|
+
fi
|
|
4766
|
+
}
|
|
4767
|
+
|
|
4693
4768
|
track_iteration_complete() {
|
|
4694
4769
|
local iteration="$1"
|
|
4695
4770
|
local exit_code="${2:-0}"
|
|
@@ -4772,32 +4847,14 @@ track_iteration_complete() {
|
|
|
4772
4847
|
local phase="${LAST_KNOWN_PHASE:-}"
|
|
4773
4848
|
[ -z "$phase" ] && phase=$(python3 -c "import json; print(json.load(open('.loki/state/orchestrator.json')).get('currentPhase', 'unknown'))" 2>/dev/null || echo "unknown")
|
|
4774
4849
|
|
|
4775
|
-
# Read token data
|
|
4850
|
+
# Read token data, preferring Claude's authoritative result-cost file over
|
|
4851
|
+
# the context-tracker estimate (v7.28.0 cost-capture fix). See
|
|
4852
|
+
# _read_iteration_cost for precedence rationale.
|
|
4776
4853
|
# v6.82.0: also capture cache_read_tokens / cache_creation_tokens for
|
|
4777
4854
|
# prompt-cache hit-rate analysis (S1.1 prompt restructure).
|
|
4778
4855
|
local iter_input=0 iter_output=0 iter_cost=0
|
|
4779
4856
|
local iter_cache_read=0 iter_cache_creation=0
|
|
4780
|
-
|
|
4781
|
-
read iter_input iter_output iter_cost iter_cache_read iter_cache_creation < <(python3 -c "
|
|
4782
|
-
import json
|
|
4783
|
-
try:
|
|
4784
|
-
t = json.load(open('.loki/context/tracking.json'))
|
|
4785
|
-
iters = t.get('per_iteration', [])
|
|
4786
|
-
match = [i for i in iters if i.get('iteration') == $iteration]
|
|
4787
|
-
if match:
|
|
4788
|
-
m = match[-1]
|
|
4789
|
-
print(
|
|
4790
|
-
m.get('input_tokens', 0),
|
|
4791
|
-
m.get('output_tokens', 0),
|
|
4792
|
-
m.get('cost_usd', 0),
|
|
4793
|
-
m.get('cache_read_tokens', 0),
|
|
4794
|
-
m.get('cache_creation_tokens', 0),
|
|
4795
|
-
)
|
|
4796
|
-
else:
|
|
4797
|
-
print(0, 0, 0, 0, 0)
|
|
4798
|
-
except: print(0, 0, 0, 0, 0)
|
|
4799
|
-
" 2>/dev/null || echo "0 0 0 0 0")
|
|
4800
|
-
fi
|
|
4857
|
+
read -r iter_input iter_output iter_cost iter_cache_read iter_cache_creation < <(_read_iteration_cost "$iteration")
|
|
4801
4858
|
|
|
4802
4859
|
cat > ".loki/metrics/efficiency/iteration-${iteration}.json" << EFF_EOF
|
|
4803
4860
|
{
|
|
@@ -12352,8 +12409,15 @@ except Exception as exc:
|
|
|
12352
12409
|
claude)
|
|
12353
12410
|
# Claude: Full features with stream-json output and agent tracking
|
|
12354
12411
|
# Uses dynamic tier for model selection based on RARV phase
|
|
12355
|
-
# Pass tier to
|
|
12356
|
-
|
|
12412
|
+
# Pass tier + iteration to the embedded stream parser via the
|
|
12413
|
+
# environment. A bare `VAR=val cmd | parser` prefix applies ONLY
|
|
12414
|
+
# to `cmd` (claude) and does NOT cross the pipe to the parser
|
|
12415
|
+
# subprocess, so these must be exported into the shell env first.
|
|
12416
|
+
# LOKI_ITERATION lets the parser stamp the authoritative
|
|
12417
|
+
# result-cost file under the correct iteration index.
|
|
12418
|
+
export LOKI_CURRENT_MODEL="$tier_param"
|
|
12419
|
+
export LOKI_ITERATION="$ITERATION_COUNT"
|
|
12420
|
+
{ \
|
|
12357
12421
|
claude "${_loki_claude_argv[@]}" -p "$prompt" \
|
|
12358
12422
|
--output-format stream-json --verbose 2>&1 | \
|
|
12359
12423
|
tee -a "$log_file" "$agent_log" "$iter_output" | \
|
|
@@ -12666,6 +12730,34 @@ def process_stream():
|
|
|
12666
12730
|
active_agents[orchestrator_id]["tasks_completed"].append(f"{tool_count} tools used")
|
|
12667
12731
|
|
|
12668
12732
|
save_agents()
|
|
12733
|
+
|
|
12734
|
+
# Authoritative cost capture (path/slug/symlink-independent).
|
|
12735
|
+
# Claude'"'"'s result message carries its own total_cost_usd plus a
|
|
12736
|
+
# full usage object. The context-tracker session-file path is
|
|
12737
|
+
# brittle (slug derivation must guess Claude'"'"'s naming), so this
|
|
12738
|
+
# stamps the authoritative number to a per-iteration file that
|
|
12739
|
+
# the efficiency writer prefers. Best-effort: a malformed or
|
|
12740
|
+
# missing field must never break the iteration loop.
|
|
12741
|
+
try:
|
|
12742
|
+
_iter = os.environ.get("LOKI_ITERATION", "0")
|
|
12743
|
+
_u = data.get("usage", {}) or {}
|
|
12744
|
+
_rec = {
|
|
12745
|
+
"total_cost_usd": data.get("total_cost_usd"),
|
|
12746
|
+
"input_tokens": _u.get("input_tokens", 0),
|
|
12747
|
+
"output_tokens": _u.get("output_tokens", 0),
|
|
12748
|
+
"cache_read_tokens": _u.get("cache_read_input_tokens", 0),
|
|
12749
|
+
"cache_creation_tokens": _u.get("cache_creation_input_tokens", 0),
|
|
12750
|
+
}
|
|
12751
|
+
if _rec["total_cost_usd"] is not None:
|
|
12752
|
+
os.makedirs(".loki/metrics", exist_ok=True)
|
|
12753
|
+
_p = ".loki/metrics/result-cost-" + str(_iter) + ".json"
|
|
12754
|
+
_tmp = _p + ".tmp"
|
|
12755
|
+
with open(_tmp, "w") as _f:
|
|
12756
|
+
json.dump(_rec, _f)
|
|
12757
|
+
os.replace(_tmp, _p)
|
|
12758
|
+
except Exception:
|
|
12759
|
+
pass
|
|
12760
|
+
|
|
12669
12761
|
print(f"\n{GREEN}[Session complete]{NC}", flush=True)
|
|
12670
12762
|
is_error = data.get("is_error", False)
|
|
12671
12763
|
sys.exit(1 if is_error else 0)
|
|
@@ -13098,7 +13190,36 @@ if __name__ == "__main__":
|
|
|
13098
13190
|
case "${gate_failures:-}" in
|
|
13099
13191
|
*code_review,*|*code_review_ESCALATED*) _gate_block_for_completion="code_review" ;;
|
|
13100
13192
|
esac
|
|
13101
|
-
|
|
13193
|
+
# DROP-FIX (v7.28): check_completion_promise -> check_task_completion_signal
|
|
13194
|
+
# CONSUMES the completion signal (rm -f) on the FIRST successful call.
|
|
13195
|
+
# The completion-promise chain below calls it up to five times in one
|
|
13196
|
+
# iteration (reverify guard, code-review arm, evidence arm, held-out
|
|
13197
|
+
# arm, success arm), so the first call consumed the claim and every
|
|
13198
|
+
# later arm saw nothing -- the success arm never fired and the run
|
|
13199
|
+
# iterated to max_iterations even though the agent had claimed done.
|
|
13200
|
+
# Fix: evaluate the claim EXACTLY ONCE here, capture it in
|
|
13201
|
+
# _completion_claimed, and have every arm test that variable. The
|
|
13202
|
+
# single call discards stdout (matching the prior call sites, which
|
|
13203
|
+
# also discarded it), so the task_completion_claim event still emits
|
|
13204
|
+
# exactly once. Consumption semantics are preserved: the claim is
|
|
13205
|
+
# consumed when evaluated; if a gate rejects it, the agent must
|
|
13206
|
+
# re-claim next iteration (see internal/DEMO-CLAIM-DROP-BUG.md).
|
|
13207
|
+
local _completion_claimed=0
|
|
13208
|
+
if check_completion_promise "$iter_output"; then
|
|
13209
|
+
_completion_claimed=1
|
|
13210
|
+
fi
|
|
13211
|
+
# MEDIUM-3: this completion-promise route evaluates the council hard
|
|
13212
|
+
# gates (evidence + held-out) without the council_evaluate freshness
|
|
13213
|
+
# step, so the held-out gate could read stale verification statuses
|
|
13214
|
+
# (and a stale reservation). Re-verify the checklist ONCE here, but
|
|
13215
|
+
# only when a completion claim is actually present (mirror the
|
|
13216
|
+
# check_completion_promise condition used by the gate chain below) so
|
|
13217
|
+
# verification does not run every iteration. Type-guarded and
|
|
13218
|
+
# best-effort: failure must never block the completion path.
|
|
13219
|
+
if [ "$_completion_claimed" = 1 ] && type council_reverify_checklist &>/dev/null; then
|
|
13220
|
+
council_reverify_checklist 2>/dev/null || true
|
|
13221
|
+
fi
|
|
13222
|
+
if [ -n "$_gate_block_for_completion" ] && [ "$_completion_claimed" = 1 ]; then
|
|
13102
13223
|
log_warn "Completion claim rejected: code review is BLOCKED for this iteration (Critical/High findings). Fix review issues before completion."
|
|
13103
13224
|
log_warn " Review details under .loki/quality/reviews/ ; gate_failures=${gate_failures}"
|
|
13104
13225
|
_gate_block_for_completion=""
|
|
@@ -13113,11 +13234,24 @@ if __name__ == "__main__":
|
|
|
13113
13234
|
# LOKI_EVIDENCE_GATE=0 (council_evidence_gate returns 0 immediately
|
|
13114
13235
|
# when disabled, so this branch never fires). Gate output (reason +
|
|
13115
13236
|
# opt-out hint) is printed by council_evidence_gate itself.
|
|
13116
|
-
elif
|
|
13237
|
+
elif [ "$_completion_claimed" = 1 ] && type council_evidence_gate &>/dev/null && ! council_evidence_gate; then
|
|
13117
13238
|
log_warn "Completion claim rejected: evidence gate found no proof of completion (empty diff vs run-start SHA, or red tests)."
|
|
13118
13239
|
log_warn " Details under .loki/council/evidence-block.json ; opt out with LOKI_EVIDENCE_GATE=0"
|
|
13119
13240
|
# Fall through; keep iterating until there is real evidence.
|
|
13120
|
-
|
|
13241
|
+
# v7.28.0: the held-out spec-eval gate must also guard the DEFAULT
|
|
13242
|
+
# completion-promise route, not only the interval-gated council path
|
|
13243
|
+
# (council_evaluate). Otherwise an agent can self-assert "done" and
|
|
13244
|
+
# exit as completion_promise_fulfilled while a held-out acceptance
|
|
13245
|
+
# check is failing, bypassing the anti-reward-hacking gate entirely.
|
|
13246
|
+
# Mirrors the evidence-gate block above. Opt-out: the gate's own
|
|
13247
|
+
# LOKI_HELDOUT_GATE=0 (council_heldout_gate returns 0 immediately
|
|
13248
|
+
# when disabled or when no held-out items are reserved, so this
|
|
13249
|
+
# branch never fires). Gate output is printed by council_heldout_gate.
|
|
13250
|
+
elif [ "$_completion_claimed" = 1 ] && type council_heldout_gate &>/dev/null && ! council_heldout_gate; then
|
|
13251
|
+
log_warn "Completion claim rejected: held-out spec-eval gate found failing held-out acceptance check(s)."
|
|
13252
|
+
log_warn " Details under .loki/council/heldout-block.json ; opt out with LOKI_HELDOUT_GATE=0"
|
|
13253
|
+
# Fall through; keep iterating until the held-out checks pass.
|
|
13254
|
+
elif [ "$_completion_claimed" = 1 ]; then
|
|
13121
13255
|
echo ""
|
|
13122
13256
|
if [ -n "$COMPLETION_PROMISE" ]; then
|
|
13123
13257
|
log_header "COMPLETION PROMISE FULFILLED: $COMPLETION_PROMISE"
|
|
@@ -13491,10 +13625,19 @@ check_human_intervention() {
|
|
|
13491
13625
|
if [ -f "$loki_dir/signals/COUNCIL_REVIEW_REQUESTED" ]; then
|
|
13492
13626
|
log_info "Council force-review requested from dashboard"
|
|
13493
13627
|
rm -f "$loki_dir/signals/COUNCIL_REVIEW_REQUESTED"
|
|
13628
|
+
# MEDIUM-3: this route evaluates the council hard gates directly without
|
|
13629
|
+
# the council_evaluate freshness step, so re-verify the checklist ONCE
|
|
13630
|
+
# before the gate chain to restore that invariant (refreshes held-out
|
|
13631
|
+
# statuses and repairs a stale reservation). Type-guarded, best-effort.
|
|
13632
|
+
if type council_reverify_checklist &>/dev/null; then
|
|
13633
|
+
council_reverify_checklist 2>/dev/null || true
|
|
13634
|
+
fi
|
|
13494
13635
|
if type council_checklist_gate &>/dev/null && ! council_checklist_gate; then
|
|
13495
13636
|
log_info "Council force-review: blocked by checklist hard gate"
|
|
13496
13637
|
elif type council_evidence_gate &>/dev/null && ! council_evidence_gate; then
|
|
13497
13638
|
log_info "Council force-review: blocked by evidence hard gate"
|
|
13639
|
+
elif type council_heldout_gate &>/dev/null && ! council_heldout_gate; then
|
|
13640
|
+
log_info "Council force-review: blocked by held-out spec-eval hard gate"
|
|
13498
13641
|
elif type council_vote &>/dev/null && council_vote; then
|
|
13499
13642
|
log_header "COMPLETION COUNCIL: FORCE REVIEW - PROJECT COMPLETE"
|
|
13500
13643
|
# BUG #17 fix: Write COMPLETED marker, generate council report, and
|