loki-mode 7.26.0 → 7.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -13
- package/SKILL.md +11 -2
- package/VERSION +1 -1
- package/autonomy/completion-council.sh +310 -6
- package/autonomy/context-tracker.py +32 -7
- package/autonomy/grill.sh +321 -0
- package/autonomy/lib/trust_metrics.py +636 -0
- package/autonomy/loki +142 -0
- package/autonomy/prd-checklist.sh +248 -14
- package/autonomy/run.sh +283 -32
- package/autonomy/spec.sh +646 -0
- package/autonomy/verify.sh +1130 -0
- package/dashboard/__init__.py +1 -1
- package/dashboard/static/index.html +1 -1
- package/docs/COMPARISON.md +9 -9
- package/docs/COMPETITIVE-ANALYSIS.md +18 -37
- package/docs/INSTALLATION.md +1 -1
- package/docs/auto-claude-comparison.md +9 -6
- package/docs/certification/01-core-concepts/lesson.md +3 -3
- package/docs/competitive/emergence-others-analysis.md +1 -1
- package/docs/competitive/replit-lovable-analysis.md +1 -1
- package/docs/cursor-comparison.md +1 -1
- package/docs/prd-purple-lab-platform.md +1 -1
- package/docs/show-hn-post.md +2 -2
- package/loki-ts/dist/loki.js +2 -2
- package/mcp/__init__.py +1 -1
- package/package.json +2 -1
- package/providers/codex.sh +3 -2
- package/references/agent-types.md +9 -9
- package/references/agents.md +8 -8
- package/references/business-ops.md +1 -1
- package/references/competitive-analysis.md +1 -1
- package/skills/agents.md +3 -3
- package/skills/providers.md +3 -3
- package/skills/quality-gates.md +46 -0
package/autonomy/loki
CHANGED
|
@@ -554,12 +554,16 @@ show_help() {
|
|
|
554
554
|
echo " projects Multi-project registry management"
|
|
555
555
|
echo " audit [cmd] Agent audit log and quality scanning (log|scan)"
|
|
556
556
|
echo " heal <path> Legacy system healing (archaeology, stabilize, modernize)"
|
|
557
|
+
echo " verify [base] Deterministic PR verification (Autonomi Verify MVP; CI-gate exit codes)"
|
|
558
|
+
echo " spec [cmd] Living spec: keep the spec true (lock|status|sync; drift detection, CI-gate exit codes)"
|
|
559
|
+
echo " grill [spec] Interrogate a spec with the hardest questions before you build it (Devil's Advocate)"
|
|
557
560
|
echo " review [opts] Standalone code review with quality gates (diff, staged, PR, files)"
|
|
558
561
|
echo " optimize Optimize prompts based on session history"
|
|
559
562
|
echo " enterprise Enterprise feature management (tokens, OIDC)"
|
|
560
563
|
echo " metrics [opts] Session productivity report (--json, --last N, --save, --share)"
|
|
561
564
|
echo " cost [opts] Transparent cost view: per-run/project spend + budget (--json, --last N)"
|
|
562
565
|
echo " trust [--json] Visible trust trajectory: council/gate pass-rate + interventions over runs [R4]"
|
|
566
|
+
echo " trust-metrics Trust-layer metrics: evidence-block rate, gate distribution, council split, cost/verified (--json)"
|
|
563
567
|
echo " dogfood Show self-development statistics"
|
|
564
568
|
echo " secrets [cmd] API key status and validation (status|validate)"
|
|
565
569
|
echo " reset [target] Reset session state (all|retries|failed)"
|
|
@@ -11355,6 +11359,70 @@ with open(manifest_path, 'w') as f:
|
|
|
11355
11359
|
# Modernize legacy codebases incrementally without breaking existing behavior.
|
|
11356
11360
|
#===============================================================================
|
|
11357
11361
|
|
|
11362
|
+
# ---------------------------------------------------------------------------
|
|
11363
|
+
# loki verify - Autonomi Verify (Verification-as-a-Service MVP)
|
|
11364
|
+
#
|
|
11365
|
+
# Thin dispatcher that sources autonomy/verify.sh and delegates to its
|
|
11366
|
+
# verify_main(). The verification core is deliberately standalone (it does NOT
|
|
11367
|
+
# enter the autonomous iteration loop): it computes a PR-style merge-base diff
|
|
11368
|
+
# and runs deterministic gates against the current tree, emitting a verdict and
|
|
11369
|
+
# a consolidated evidence document. Deterministic-only in this MVP (no LLM
|
|
11370
|
+
# review). Exit code is propagated to the caller so the command is CI-gate
|
|
11371
|
+
# usable.
|
|
11372
|
+
# ---------------------------------------------------------------------------
|
|
11373
|
+
cmd_verify() {
    # Entry point for `loki verify`: load verify.sh from the script directory
    # and hand every argument to its verify_main().
    #
    # Returns 3 (with a message on stderr) when the module file is missing;
    # otherwise returns whatever verify_main returns.
    #
    # NOTE: the local keeps its name on purpose - the sourced module runs
    # inside this function's scope and can see it.
    local verify_mod="$_LOKI_SCRIPT_DIR/verify.sh"

    # Guard clause: bail out early when the module is not installed.
    [ -f "$verify_mod" ] || {
        echo -e "${RED}Error: verify module not found at $verify_mod${NC}" >&2
        return 3
    }

    # shellcheck source=/dev/null
    source "$verify_mod"

    # Last command in the function: its exit status is the function's status.
    verify_main "$@"
}
|
|
11384
|
+
|
|
11385
|
+
# ---------------------------------------------------------------------------
|
|
11386
|
+
# loki spec - the living spec (drift detection + lock)
|
|
11387
|
+
#
|
|
11388
|
+
# Thin dispatcher that sources autonomy/spec.sh and delegates to spec_main().
|
|
11389
|
+
# The spec core is deliberately standalone (it does NOT enter the autonomous
|
|
11390
|
+
# loop): it binds spec requirements to content hashes (.loki/spec/spec.lock)
|
|
11391
|
+
# and detects drift deterministically, emitting .loki/spec/drift-report.json.
|
|
11392
|
+
# Exit codes are propagated so `loki spec status` is CI-gate usable.
|
|
11393
|
+
# ---------------------------------------------------------------------------
|
|
11394
|
+
cmd_spec() {
    # Entry point for `loki spec`: load spec.sh from the script directory and
    # hand every argument to its spec_main().
    #
    # Returns 3 (with a message on stderr) when the module file is missing;
    # otherwise returns whatever spec_main returns.
    #
    # NOTE: the local keeps its name on purpose - the sourced module runs
    # inside this function's scope and can see it.
    local spec_mod="$_LOKI_SCRIPT_DIR/spec.sh"

    # Guard clause: bail out early when the module is not installed.
    [ -f "$spec_mod" ] || {
        echo -e "${RED}Error: spec module not found at $spec_mod${NC}" >&2
        return 3
    }

    # shellcheck source=/dev/null
    source "$spec_mod"

    # Last command in the function: its exit status is the function's status.
    spec_main "$@"
}
|
|
11405
|
+
|
|
11406
|
+
# ---------------------------------------------------------------------------
|
|
11407
|
+
# loki grill - spec interrogation (Devil's-Advocate, pre-build)
|
|
11408
|
+
#
|
|
11409
|
+
# Thin dispatcher that sources autonomy/grill.sh and delegates to grill_main().
|
|
11410
|
+
# Invokes the provider once to produce the hardest questions exposing spec
|
|
11411
|
+
# weaknesses; writes .loki/grill/report.md. Requires the provider CLI and
|
|
11412
|
+
# fails cleanly when it is absent (no fabricated questions).
|
|
11413
|
+
# ---------------------------------------------------------------------------
|
|
11414
|
+
cmd_grill() {
    # Entry point for `loki grill`: load grill.sh from the script directory and
    # hand every argument to its grill_main().
    #
    # Returns 3 (with a message on stderr) when the module file is missing;
    # otherwise returns whatever grill_main returns.
    #
    # NOTE: the local keeps its name on purpose - the sourced module runs
    # inside this function's scope and can see it.
    local grill_mod="$_LOKI_SCRIPT_DIR/grill.sh"

    # Guard clause: bail out early when the module is not installed.
    [ -f "$grill_mod" ] || {
        echo -e "${RED}Error: grill module not found at $grill_mod${NC}" >&2
        return 3
    }

    # shellcheck source=/dev/null
    source "$grill_mod"

    # Last command in the function: its exit status is the function's status.
    grill_main "$@"
}
|
|
11425
|
+
|
|
11358
11426
|
cmd_heal_help() {
|
|
11359
11427
|
echo -e "${BOLD}loki heal${NC} - Legacy system healing (v6.67.0)"
|
|
11360
11428
|
echo ""
|
|
@@ -13502,6 +13570,15 @@ main() {
|
|
|
13502
13570
|
heal)
|
|
13503
13571
|
cmd_heal "$@"
|
|
13504
13572
|
;;
|
|
13573
|
+
verify)
|
|
13574
|
+
cmd_verify "$@"
|
|
13575
|
+
;;
|
|
13576
|
+
spec)
|
|
13577
|
+
cmd_spec "$@"
|
|
13578
|
+
;;
|
|
13579
|
+
grill)
|
|
13580
|
+
cmd_grill "$@"
|
|
13581
|
+
;;
|
|
13505
13582
|
migrate)
|
|
13506
13583
|
cmd_migrate "$@"
|
|
13507
13584
|
;;
|
|
@@ -13529,6 +13606,9 @@ main() {
|
|
|
13529
13606
|
trust)
|
|
13530
13607
|
cmd_trust "$@"
|
|
13531
13608
|
;;
|
|
13609
|
+
trust-metrics)
|
|
13610
|
+
cmd_trust_metrics "$@"
|
|
13611
|
+
;;
|
|
13532
13612
|
syslog)
|
|
13533
13613
|
cmd_syslog "$@"
|
|
13534
13614
|
;;
|
|
@@ -18775,6 +18855,68 @@ cmd_trust() {
|
|
|
18775
18855
|
python3 "$trust_mod" --loki-dir "$loki_dir" ${pass_args[@]+"${pass_args[@]}"}
|
|
18776
18856
|
}
|
|
18777
18857
|
|
|
18858
|
+
# Trust-layer metrics (benchmark program section 3): the four AVAILABLE-TODAY
|
|
18859
|
+
# metrics nobody else can publish, computed for THIS project from the durable
|
|
18860
|
+
# trust-events.jsonl log plus the .loki/proofs/ corpus. Honest by construction:
|
|
18861
|
+
# each metric reports its own n= and says "not instrumented" rather than a
|
|
18862
|
+
# fabricated zero. Single project only.
|
|
18863
|
+
cmd_trust_metrics() {
    # Entry point for `loki trust-metrics`.
    #
    # Parses the CLI flags, then runs lib/trust_metrics.py (resolved against
    # $_LOKI_SCRIPT_DIR) with --loki-dir set to $LOKI_DIR (default: .loki) and
    # any pass-through flags.
    #
    # Flags:
    #   --json        forwarded to the Python module
    #   --no-cache    forwarded to the Python module
    #   --help, -h    print usage on stdout and exit 0
    #   --all-projects  rejected: exit 2
    #   anything else   rejected: exit 1
    #
    # Exits 1 when python3 or the module file is missing. On the success path
    # the function's status is the Python process's exit status.
    #
    # All diagnostics go to stderr so that stdout carries only the report
    # (important for --json consumers) and so the behavior matches the other
    # dispatchers in this file that report errors with >&2.
    local pass_args=()
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --help|-h)
                echo -e "${BOLD}loki trust-metrics${NC} - Trust-layer metrics (single project)"
                echo ""
                echo "Usage: loki trust-metrics [options]"
                echo ""
                echo "Computes the four trust-layer metrics from this project's"
                echo ".loki artifacts and emits .loki/metrics/trust-metrics.json"
                echo "plus a human-readable table:"
                echo " 1. Evidence-gate block rate (runs that caught an unproven"
                echo " 'done' claim before honoring completion)"
                echo " 2. Gate failure distribution per run (median, p90, per-gate)"
                echo " 3. Council rejection / split-verdict rate"
                echo " 4. Cost-per-VERIFIED-task (local verified denominator)"
                echo ""
                echo "Sources: .loki/metrics/trust-events.jsonl (durable event log)"
                echo "and .loki/proofs/<id>/proof.json. A metric with no source"
                echo "artifact is reported 'not instrumented', never a fake 0."
                echo ""
                echo "Options:"
                echo " --json Machine-readable JSON output"
                echo " --no-cache Do not write trust-metrics.json"
                echo " --help, -h Show this help"
                echo ""
                echo "Scope: SINGLE PROJECT only. An --all-projects registry"
                echo "aggregator is out of scope; run this inside each project."
                exit 0
                ;;
            --json) pass_args+=("--json"); shift ;;
            --no-cache) pass_args+=("--no-cache"); shift ;;
            --all-projects)
                # Explicitly unsupported; distinct exit code from a plain typo.
                echo -e "${RED}--all-projects is out of scope (single project only).${NC}" >&2
                echo "Run 'loki trust-metrics' inside each project directory." >&2
                exit 2
                ;;
            *)
                echo -e "${RED}Unknown option: $1${NC}" >&2
                echo "Run 'loki trust-metrics --help' for usage." >&2
                exit 1
                ;;
        esac
    done

    if ! command -v python3 &>/dev/null; then
        echo -e "${RED}python3 is required for trust metrics${NC}" >&2
        exit 1
    fi

    local tm_mod="$_LOKI_SCRIPT_DIR/lib/trust_metrics.py"
    if [ ! -f "$tm_mod" ]; then
        echo -e "${RED}trust_metrics.py not found at $tm_mod${NC}" >&2
        exit 1
    fi

    local loki_dir="${LOKI_DIR:-.loki}"
    # The ${arr[@]+"${arr[@]}"} form expands to nothing for an empty array
    # instead of tripping "unbound variable" under `set -u` on older bash.
    python3 "$tm_mod" --loki-dir "$loki_dir" ${pass_args[@]+"${pass_args[@]}"}
}
|
|
18919
|
+
|
|
18778
18920
|
# Transparent cost view (R3): per-run + per-project spend, model routing, and
|
|
18779
18921
|
# budget status with the 80% warn line. Reuses efficiency_cost.collect_efficiency
|
|
18780
18922
|
# for the current-run aggregate (single source of truth) and reads .loki/proofs/
|
|
@@ -105,6 +105,198 @@ checklist_should_verify() {
|
|
|
105
105
|
return 0
|
|
106
106
|
}
|
|
107
107
|
|
|
108
|
+
#===============================================================================
|
|
109
|
+
# Held-out Spec Eval Selection (v7.28.0)
|
|
110
|
+
#===============================================================================
|
|
111
|
+
# Anti-reward-hacking: deterministically reserve ~25% of checklist items as
|
|
112
|
+
# "held-out". Held-out item IDs are excluded from the prompt feed the build loop
|
|
113
|
+
# sees (checklist_summary and council_checklist_gate), so a cooperative build
|
|
114
|
+
# agent is not steered toward those specific acceptance checks. The completion
|
|
115
|
+
# council evaluates them at the ship gate (council_heldout_gate in
|
|
116
|
+
# completion-council.sh). Scope of the guarantee: this protects the prompt feed,
|
|
117
|
+
# not a sandbox. .loki/checklist/held-out.json is plain on-disk JSON, so a
|
|
118
|
+
# non-cooperative agent with filesystem tools can read the reservation directly.
|
|
119
|
+
#
|
|
120
|
+
# Selection is idempotent and reproducible: count = clamp(round(0.25*N), 1, 5)
|
|
121
|
+
# for N>=4 items; ordering by sha256 of each item's "id" (stable, not random).
|
|
122
|
+
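# Written to .loki/checklist/held-out.json; left untouched while the reservation is still valid against the current checklist, and rewritten only to repair a stale or partially stale reservation.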
|
|
123
|
+
checklist_select_heldout() {
    # Reserve (or repair) the held-out subset of checklist item ids.
    #
    # Reads:  $CHECKLIST_FILE (JSON with categories[].items[].id)
    # Writes: ${CHECKLIST_DIR:-.loki/checklist}/held-out.json
    # Always returns 0; does nothing when $CHECKLIST_FILE does not exist.
    local heldout_file="${CHECKLIST_DIR:-".loki/checklist"}/held-out.json"

    if [ ! -f "$CHECKLIST_FILE" ]; then
        return 0
    fi

    # The Python below decides the outcome and prints a single status token so
    # bash can log honestly and emit the right trust event:
    #   FRESH n       - no prior reservation, selected n (file written)
    #   IDEMPOTENT    - prior reservation is empty or fully valid vs current
    #                   ids (no-op, file untouched)
    #   RESELECTED n  - prior reservation fully stale (zero ids survive); n
    #                   items are re-selected from the CURRENT checklist and
    #                   the file is overwritten (n is 0 when fewer than 4 ids)
    #   PARTIAL kept=k dropped=d - some prior ids survived; only the survivors
    #                   are kept and the file is rewritten
    #   DUP_SKIP      - current checklist ids are not unique; the id-based
    #                   mechanism is unsound, so nothing is reserved (MEDIUM-2)
    #   NOOP          - nothing reserved: the checklist could not be parsed,
    #                   python failed, or there are fewer than 4 ids and no
    #                   prior file (an empty reservation is recorded then)
    # Honest caveat: re-selection or partial-survival after a regen can reserve
    # items the build loop already saw in earlier prompts (the hidden-from-loop
    # guarantee is best-effort once the checklist ids change mid-run).
    local status_token
    status_token=$(_CHECKLIST_FILE="$CHECKLIST_FILE" _HELDOUT_FILE="$heldout_file" python3 -c "
import json, os, sys, hashlib, tempfile

cl_path = os.environ['_CHECKLIST_FILE']
out_path = os.environ['_HELDOUT_FILE']
try:
    with open(cl_path) as f:
        data = json.load(f)
except Exception:
    print('NOOP')
    sys.exit(0)

# Collect all item ids in document order (items without an id are ignored).
ids = []
for cat in data.get('categories', []):
    for item in cat.get('items', []):
        iid = item.get('id', '')
        if iid:
            ids.append(iid)

n = len(ids)
id_set = set(ids)

# MEDIUM-2: duplicate ids make the id-based hide/select mechanism unsound. Skip
# selection entirely (no reservation written) so a held-out id can never map to
# more than one item. An existing reservation file is deliberately left alone.
if len(id_set) != n:
    print('DUP_SKIP')
    sys.exit(0)

def select_count(num_ids):
    # About 25% of the items, clamped to the range 1..5. Python 3 round()
    # rounds exact halves to the nearest even integer, so 10 ids give 2.
    c = round(0.25 * num_ids)
    if c < 1:
        c = 1
    if c > 5:
        c = 5
    return c

def fresh_selection():
    # Deterministic order: sort ids by sha256(id), take the first <count>.
    count = select_count(n)
    ranked = sorted(ids, key=lambda i: hashlib.sha256(i.encode('utf-8')).hexdigest())
    return sorted(ranked[:count])

def atomic_write(payload):
    # Write to a temp file in the target directory, then rename over the
    # destination so readers never observe a half-written file.
    d = os.path.dirname(out_path) or '.'
    os.makedirs(d, exist_ok=True)
    fd, tmp = tempfile.mkstemp(dir=d, suffix='.tmp')
    try:
        with os.fdopen(fd, 'w') as f:
            json.dump(payload, f, indent=2)
            f.write('\n')
        os.replace(tmp, out_path)
    except BaseException:
        # Do not leave a stray temp file behind when the write or the rename
        # fails; the failure itself still propagates (bash then sees NOOP).
        try:
            os.unlink(tmp)
        except OSError:
            pass
        raise

prior = None
if os.path.exists(out_path):
    try:
        with open(out_path) as f:
            prior = json.load(f)
    except Exception:
        # An unreadable reservation is treated like no reservation at all.
        prior = None

if prior is not None:
    prior_ids = [i for i in prior.get('held_out', []) if i]
    # A prior reservation of [] (e.g. an earlier n<4 run) is a valid no-op state;
    # keep it idempotent rather than re-selecting now that n may have grown.
    if not prior_ids:
        print('IDEMPOTENT')
        sys.exit(0)
    survivors = [i for i in prior_ids if i in id_set]
    if len(survivors) == len(prior_ids):
        # Fully valid against the current checklist: idempotent no-op.
        print('IDEMPOTENT')
        sys.exit(0)
    if not survivors:
        # Fully stale: the checklist regenerated and orphaned the reservation.
        # Deterministically re-select from the CURRENT checklist.
        if n < 4:
            atomic_write({'held_out': [], 'total_items': n,
                          'note': 'n<4: no held-out reserved (re-selected after stale reservation)'})
            print('RESELECTED 0')
            sys.exit(0)
        held = fresh_selection()
        atomic_write({'held_out': held, 'total_items': n})
        print('RESELECTED %d' % len(held))
        sys.exit(0)
    # Partial survival: keep only the surviving ids. The shrink is reported
    # through the PARTIAL token rather than happening silently.
    dropped = len(prior_ids) - len(survivors)
    payload = {'held_out': sorted(survivors), 'total_items': n}
    atomic_write(payload)
    print('PARTIAL kept=%d dropped=%d' % (len(survivors), dropped))
    sys.exit(0)

# No prior reservation: first selection.
if n < 4:
    # N>=4 gate: smaller checklists get no held-out (nothing to hide reliably).
    atomic_write({'held_out': [], 'total_items': n, 'note': 'n<4: no held-out reserved'})
    print('NOOP')
    sys.exit(0)

held = fresh_selection()
atomic_write({'held_out': held, 'total_items': n})
print('FRESH %d' % len(held))
" 2>/dev/null || echo "NOOP")

    # Honest logging + trust event on any stale repair (type-guarded, so a
    # missing record_trust_event_bash helper is simply skipped).
    local tok rest
    read -r tok rest <<< "$status_token"
    case "$tok" in
        RESELECTED)
            log_warn "[checklist] held-out reservation stale (checklist regenerated); re-selected ${rest:-0} items"
            if type record_trust_event_bash &>/dev/null; then
                record_trust_event_bash "heldout_stale" \
                    "detail=reselected" \
                    "reselected=${rest:-0}" \
                    >/dev/null 2>&1 || true
            fi
            ;;
        PARTIAL)
            log_warn "[checklist] held-out reservation partially stale (checklist regenerated); $rest"
            if type record_trust_event_bash &>/dev/null; then
                # $rest is "kept=<k> dropped=<d>". Split it so each key=value
                # pair is its own argument, matching the RESELECTED branch.
                # NOTE(review): record_trust_event_bash is defined outside this
                # block - confirm it expects one key=value pair per argument.
                local -a partial_fields=()
                read -r -a partial_fields <<< "$rest"
                record_trust_event_bash "heldout_stale" \
                    "detail=partial" \
                    ${partial_fields[@]+"${partial_fields[@]}"} \
                    >/dev/null 2>&1 || true
            fi
            ;;
        DUP_SKIP)
            log_warn "[checklist] checklist ids are not unique; held-out selection skipped (id-based reservation is unsound with duplicate ids)"
            ;;
    esac

    return 0
}
|
|
281
|
+
|
|
282
|
+
# Echo held-out item IDs (one per line) to stdout. Empty when none reserved.
|
|
283
|
+
checklist_heldout_ids() {
    # Print the reserved held-out ids, one per line, from
    # ${CHECKLIST_DIR:-.loki/checklist}/held-out.json.
    #
    # Prints nothing when the file is absent or cannot be parsed. Always
    # returns 0: python errors are swallowed and its stderr is discarded.
    local heldout_file="${CHECKLIST_DIR:-".loki/checklist"}/held-out.json"

    # Nothing reserved yet: succeed silently.
    [ -f "$heldout_file" ] || return 0

    _HELDOUT_FILE="$heldout_file" python3 -c "
import json, os
try:
    with open(os.environ['_HELDOUT_FILE']) as handle:
        reservation = json.load(handle)
    for held_id in reservation.get('held_out', []):
        print(held_id)
except Exception:
    pass
" 2>/dev/null || true
}
|
|
299
|
+
|
|
108
300
|
#===============================================================================
|
|
109
301
|
# Verification
|
|
110
302
|
#===============================================================================
|
|
@@ -118,6 +310,10 @@ checklist_verify() {
|
|
|
118
310
|
return 0
|
|
119
311
|
fi
|
|
120
312
|
|
|
313
|
+
# Held-out selection happens BEFORE the first verification so that the very
|
|
314
|
+
# first verification-results.json summary already excludes held-out items.
|
|
315
|
+
checklist_select_heldout
|
|
316
|
+
|
|
121
317
|
local script_dir
|
|
122
318
|
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
123
319
|
local verify_script="${script_dir}/checklist-verify.py"
|
|
@@ -160,16 +356,12 @@ checklist_summary() {
|
|
|
160
356
|
|
|
161
357
|
_CHECKLIST_RESULTS="$CHECKLIST_RESULTS_FILE" \
|
|
162
358
|
_CHECKLIST_WAIVERS="${CHECKLIST_DIR:-".loki/checklist"}/waivers.json" \
|
|
359
|
+
_CHECKLIST_HELDOUT="${CHECKLIST_DIR:-".loki/checklist"}/held-out.json" \
|
|
163
360
|
python3 -c "
|
|
164
361
|
import json, sys, os
|
|
165
362
|
try:
|
|
166
363
|
fpath = os.environ.get('_CHECKLIST_RESULTS', '')
|
|
167
364
|
data = json.load(open(fpath))
|
|
168
|
-
s = data.get('summary', {})
|
|
169
|
-
total = s.get('total', 0)
|
|
170
|
-
verified = s.get('verified', 0)
|
|
171
|
-
failing = s.get('failing', 0)
|
|
172
|
-
pending = s.get('pending', 0)
|
|
173
365
|
|
|
174
366
|
# Load waivers
|
|
175
367
|
waived_ids = set()
|
|
@@ -184,26 +376,68 @@ try:
|
|
|
184
376
|
except Exception:
|
|
185
377
|
pass
|
|
186
378
|
|
|
187
|
-
#
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
379
|
+
# Load held-out item ids (v7.28.0). Held-out items are NEVER surfaced to the
|
|
380
|
+
# build loop: they are fully excluded from the counts and the failing list so
|
|
381
|
+
# the build agent cannot tune to them. The council evaluates them separately.
|
|
382
|
+
heldout_ids = set()
|
|
383
|
+
heldout_path = os.environ.get('_CHECKLIST_HELDOUT', '')
|
|
384
|
+
if heldout_path and os.path.exists(heldout_path):
|
|
385
|
+
try:
|
|
386
|
+
with open(heldout_path) as hf:
|
|
387
|
+
hdata = json.load(hf)
|
|
388
|
+
heldout_ids = set(hdata.get('held_out', []))
|
|
389
|
+
except Exception:
|
|
390
|
+
pass
|
|
391
|
+
|
|
392
|
+
# Count all checklist items first so we can detect the pathological case
|
|
393
|
+
# where hiding would empty the summary on a non-empty checklist (MEDIUM-2).
|
|
394
|
+
all_items = 0
|
|
395
|
+
for cat in data.get('categories', []):
|
|
396
|
+
all_items += len(cat.get('items', []))
|
|
397
|
+
|
|
398
|
+
def compute(apply_heldout):
|
|
399
|
+
total = verified = pending = failing = waived_count = 0
|
|
192
400
|
failing_items = []
|
|
193
401
|
for cat in data.get('categories', []):
|
|
194
402
|
for item in cat.get('items', []):
|
|
195
403
|
item_id = item.get('id', '')
|
|
404
|
+
if apply_heldout and item_id in heldout_ids:
|
|
405
|
+
continue
|
|
196
406
|
if item_id in waived_ids:
|
|
197
407
|
waived_count += 1
|
|
198
408
|
continue
|
|
199
|
-
|
|
200
|
-
|
|
409
|
+
total += 1
|
|
410
|
+
status = item.get('status')
|
|
411
|
+
if status == 'verified':
|
|
412
|
+
verified += 1
|
|
413
|
+
elif status == 'failing':
|
|
414
|
+
failing += 1
|
|
415
|
+
if item.get('priority') in ('critical', 'major'):
|
|
416
|
+
failing_items.append(item.get('title', item.get('id', '?')))
|
|
417
|
+
else:
|
|
418
|
+
pending += 1
|
|
419
|
+
return total, verified, pending, failing, waived_count, failing_items
|
|
420
|
+
|
|
421
|
+
# Recompute counts over the VISIBLE (non-held-out) items so 'total' never
|
|
422
|
+
# leaks the existence of held-out items. Waived items are excluded too.
|
|
423
|
+
total, verified, pending, failing, waived_count, failing_items = compute(True)
|
|
424
|
+
|
|
425
|
+
# MEDIUM-2 guard: if hiding held-out items would empty the summary while the
|
|
426
|
+
# checklist itself is non-empty, fall back to showing all items (do not hide)
|
|
427
|
+
# and warn. Returning an empty summary on a non-empty checklist reads as 'no
|
|
428
|
+
# checklist' to the prompt feed, which is a worse failure than a small leak.
|
|
429
|
+
if total == 0 and all_items > 0:
|
|
430
|
+
print('held-out hiding would empty a non-empty checklist summary; showing all items', file=sys.stderr)
|
|
431
|
+
total, verified, pending, failing, waived_count, failing_items = compute(False)
|
|
432
|
+
|
|
433
|
+
if total == 0:
|
|
434
|
+
print('')
|
|
435
|
+
else:
|
|
201
436
|
detail = ''
|
|
202
437
|
if failing_items:
|
|
203
438
|
detail = ' FAILING: ' + ', '.join(failing_items[:5])
|
|
204
439
|
waived_str = f', {waived_count} waived' if waived_count > 0 else ''
|
|
205
|
-
|
|
206
|
-
print(f'{verified}/{total} verified, {adjusted_failing} failing{waived_str}, {pending} pending.{detail}')
|
|
440
|
+
print(f'{verified}/{total} verified, {failing} failing{waived_str}, {pending} pending.{detail}')
|
|
207
441
|
except Exception:
|
|
208
442
|
print('', file=sys.stderr)
|
|
209
443
|
" 2>/dev/null || echo ""
|