atris 3.1.0 → 3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/GETTING_STARTED.md +65 -131
  2. package/README.md +29 -4
  3. package/atris/GETTING_STARTED.md +65 -131
  4. package/atris/PERSONA.md +5 -1
  5. package/atris/atris.md +122 -153
  6. package/atris/skills/aeo/SKILL.md +117 -0
  7. package/atris/skills/atris/SKILL.md +49 -25
  8. package/atris/skills/create-member/SKILL.md +29 -9
  9. package/atris/skills/endgame/SKILL.md +9 -0
  10. package/atris/skills/improve/SKILL.md +2 -2
  11. package/atris/skills/research-search/SKILL.md +167 -0
  12. package/atris/skills/research-search/arxiv_search.py +157 -0
  13. package/atris/skills/research-search/program.md +48 -0
  14. package/atris/skills/research-search/results.tsv +6 -0
  15. package/atris/skills/research-search/scholar_search.py +154 -0
  16. package/atris/skills/tidy/SKILL.md +36 -21
  17. package/atris/team/_template/MEMBER.md +2 -0
  18. package/atris/team/validator/MEMBER.md +35 -1
  19. package/atris.md +118 -178
  20. package/bin/atris.js +37 -6
  21. package/cli/__pycache__/atris_code.cpython-314.pyc +0 -0
  22. package/cli/__pycache__/runtime_guard.cpython-312.pyc +0 -0
  23. package/cli/__pycache__/runtime_guard.cpython-314.pyc +0 -0
  24. package/cli/atris_code.py +889 -0
  25. package/cli/runtime_guard.py +693 -0
  26. package/commands/align.js +15 -0
  27. package/commands/app.js +316 -0
  28. package/commands/autopilot.js +948 -42
  29. package/commands/business.js +691 -11
  30. package/commands/computer.js +1979 -43
  31. package/commands/context-sync.js +5 -0
  32. package/commands/experiments.js +1 -1
  33. package/commands/lifecycle.js +12 -0
  34. package/commands/plugin.js +24 -0
  35. package/commands/pull.js +40 -1
  36. package/commands/push.js +44 -0
  37. package/commands/release.js +183 -0
  38. package/commands/research.js +52 -0
  39. package/commands/serve.js +1 -0
  40. package/commands/sync.js +372 -87
  41. package/commands/verify.js +53 -4
  42. package/commands/wiki.js +71 -26
  43. package/lib/file-ops.js +13 -1
  44. package/lib/journal.js +23 -0
  45. package/lib/reward-config.js +24 -0
  46. package/lib/scorecard.js +58 -6
  47. package/lib/sync-telemetry.js +59 -0
  48. package/lib/todo.js +6 -0
  49. package/lib/wiki.js +235 -60
  50. package/package.json +4 -2
  51. package/utils/api.js +19 -0
  52. package/utils/auth.js +25 -1
  53. package/utils/config.js +24 -0
  54. package/utils/update-check.js +16 -0
@@ -8,7 +8,7 @@
8
8
 
9
9
  const fs = require('fs');
10
10
  const path = require('path');
11
- const { execSync } = require('child_process');
11
+ const { execSync, execFileSync, spawnSync } = require('child_process');
12
12
  const readline = require('readline');
13
13
  const { getLogPath, ensureLogDirectory, createLogFile } = require('../lib/journal');
14
14
  const { parseTodo } = require('../lib/todo');
@@ -19,6 +19,7 @@ const {
19
19
  writeScorecard,
20
20
  detectEndgameCompletion
21
21
  } = require('../lib/scorecard');
22
+ const { REWARD_CONFIG, REWARD_CHECKSUM } = require('../lib/reward-config');
22
23
 
23
24
  const pkg = require('../package.json');
24
25
 
@@ -28,7 +29,7 @@ const PHASE_TIMEOUT = 600000; // 10 min per phase
28
29
  * Scan workspace for the next thing worth doing.
29
30
  * Returns { task, why, kind } or null.
30
31
  */
31
- async function suggestNextTask(cwd, skipped = new Set()) {
32
+ async function suggestNextTask(cwd, skipped = new Set(), { auto = false } = {}) {
32
33
  const atrisDir = path.join(cwd, 'atris');
33
34
  const suggestions = [];
34
35
 
@@ -37,6 +38,7 @@ async function suggestNextTask(cwd, skipped = new Set()) {
37
38
  const todo = parseTodo(todoPath);
38
39
 
39
40
  for (const t of todo.backlog) {
41
+ if (t.tags && t.tags.includes('unverified')) continue;
40
42
  if (t.tag === 'endgame' && !skipped.has(t.title)) {
41
43
  suggestions.push({
42
44
  task: t.title,
@@ -51,7 +53,7 @@ async function suggestNextTask(cwd, skipped = new Set()) {
51
53
  // --- Resume interrupted work ---
52
54
  if (todo.inProgress.length > 0) {
53
55
  const t = todo.inProgress[0];
54
- if (!skipped.has(t.title)) {
56
+ if (!(t.tags && t.tags.includes('unverified')) && !skipped.has(t.title)) {
55
57
  suggestions.push({
56
58
  task: t.title,
57
59
  why: `This was already started${t.claimed ? ` by ${t.claimed}` : ''} but never finished.`,
@@ -102,15 +104,17 @@ async function suggestNextTask(cwd, skipped = new Set()) {
102
104
  }
103
105
 
104
106
  // --- Backlog tasks ---
105
- for (const t of todo.backlog.slice(0, 1)) {
107
+ for (const t of todo.backlog) {
108
+ if (t.tags && t.tags.includes('unverified')) continue;
106
109
  if (skipped.has(t.title)) continue;
107
- const remaining = todo.backlog.length;
110
+ const remaining = todo.backlog.filter(b => !(b.tags && b.tags.includes('unverified'))).length;
108
111
  suggestions.push({
109
112
  task: t.title,
110
113
  why: `Next in the backlog${t.tag ? ` (${t.tag})` : ''}. ${remaining} task${remaining > 1 ? 's' : ''} waiting.`,
111
114
  kind: 'backlog',
112
115
  priority: 5
113
116
  });
117
+ break;
114
118
  }
115
119
 
116
120
  // --- Unprocessed inbox items ---
@@ -223,7 +227,64 @@ async function suggestNextTask(cwd, skipped = new Set()) {
223
227
  }
224
228
 
225
229
  suggestions.sort((a, b) => a.priority - b.priority);
226
- return suggestions[0];
230
+
231
+ // Staleness gate: filter out unverified/stale suggestions
232
+ const staleSkipped = [];
233
+ const fresh = [];
234
+ for (const s of suggestions) {
235
+ const fakeTask = { title: s.task, tag: s.kind === 'endgame' ? 'endgame' : null, claimed: null };
236
+ if (s.kind === 'resume' && todo.inProgress.length > 0) {
237
+ fakeTask.claimed = todo.inProgress[0].claimed;
238
+ }
239
+ const age = getTaskAgeDays(fakeTask, todoPath);
240
+ const status = isStillTrue({ title: s.task, age, source: null }, cwd);
241
+ if (status === 'stale') {
242
+ staleSkipped.push({ task: s.task, status, reasoning: null });
243
+ continue;
244
+ }
245
+ if (status === 'unverified') {
246
+ if (auto) {
247
+ // Auto mode: use model check
248
+ const result = askModel({ title: s.task, age, source: null }, cwd);
249
+ if (!result.fresh) {
250
+ staleSkipped.push({ task: s.task, status: 'unverified (model: not fresh)', reasoning: result.reasoning });
251
+ continue;
252
+ }
253
+ } else {
254
+ // Interactive mode: ask the human
255
+ const result = await askHuman(s.task);
256
+ if (!result.fresh) {
257
+ staleSkipped.push({ task: s.task, status: 'unverified (human: not relevant)', reasoning: null });
258
+ continue;
259
+ }
260
+ }
261
+ }
262
+ fresh.push(s);
263
+ }
264
+
265
+ // Log skipped items to journal
266
+ if (staleSkipped.length > 0) {
267
+ try {
268
+ const { logFile } = getLogPath();
269
+ const now = new Date();
270
+ const hhmm = `${String(now.getHours()).padStart(2, '0')}:${String(now.getMinutes()).padStart(2, '0')}`;
271
+ const lines = staleSkipped.map(s => `- ${s.task} (${s.status})${s.reasoning ? ` — ${s.reasoning}` : ''}`);
272
+ const note = `\n### Staleness skip — ${hhmm}\n${lines.join('\n')}\n`;
273
+ if (fs.existsSync(logFile)) {
274
+ const content = fs.readFileSync(logFile, 'utf8');
275
+ const notesIdx = content.indexOf('## Notes');
276
+ if (notesIdx !== -1) {
277
+ const insertAt = content.indexOf('\n', notesIdx) + 1;
278
+ const updated = content.slice(0, insertAt) + note + content.slice(insertAt);
279
+ fs.writeFileSync(logFile, updated);
280
+ } else {
281
+ fs.appendFileSync(logFile, `\n## Notes\n${note}`);
282
+ }
283
+ }
284
+ } catch {}
285
+ }
286
+
287
+ return fresh[0] || null;
227
288
  }
228
289
 
229
290
  /**
@@ -242,6 +303,22 @@ function askApproval() {
242
303
  });
243
304
  }
244
305
 
306
/**
 * Ask the human whether an unverified task is still relevant.
 *
 * Interactive use only: the question is asked through a readline interface
 * bound to process.stdin / process.stdout.
 *
 * @param {string} taskTitle - Task title shown inside the prompt.
 * @returns {Promise<{fresh: boolean}>} `fresh` is true only for an explicit
 *   "y" / "yes" (case-insensitive, surrounding whitespace ignored). Any other
 *   answer, or stdin closing before an answer arrives, yields `{ fresh: false }`.
 */
function askHuman(taskTitle) {
  return new Promise((resolve) => {
    const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
    let settled = false;

    const settle = (answer) => {
      if (settled) return;
      settled = true;
      const a = String(answer || '').trim().toLowerCase();
      resolve({ fresh: a === 'y' || a === 'yes' });
    };

    // If stdin ends (EOF, closed pipe, Ctrl+D) the question callback never
    // fires. Treat that as "not confirmed" so the awaiting caller is not left
    // hanging on a promise that can never settle.
    rl.once('close', () => settle(''));

    rl.question(` is "${taskTitle}" still relevant? y/n → `, (answer) => {
      settle(answer);
      rl.close();
    });
  });
}
321
+
245
322
  /**
246
323
  * Run a phase via claude -p subprocess.
247
324
  */
@@ -555,35 +632,526 @@ function writeLesson(cwd, slug, status, explanation) {
555
632
  fs.writeFileSync(lessonsPath, content);
556
633
  }
557
634
 
635
/**
 * Record a tick's commit hash and verify command in atris/tick-registry.json.
 *
 * The registry is a JSON array; each entry is
 * `{ hash, verifyCmd, slug, timestamp }` where `timestamp` is the ISO-8601
 * time of this call. A missing, unparseable, or non-array registry file is
 * treated as empty and overwritten with a fresh array.
 *
 * @param {string} cwd - Workspace root (the directory containing `atris/`).
 * @param {string} hash - Commit hash produced by the tick.
 * @param {string} verifyCmd - Verify command associated with the tick.
 * @param {string} slug - Short identifier of the tick/task.
 * @returns {void}
 */
function recordTickCommit(cwd, hash, verifyCmd, slug) {
  const registryPath = path.join(cwd, 'atris', 'tick-registry.json');
  let registry = [];
  if (fs.existsSync(registryPath)) {
    try {
      const parsed = JSON.parse(fs.readFileSync(registryPath, 'utf8'));
      // Valid JSON that is not an array (e.g. `{}`) would make push() throw.
      if (Array.isArray(parsed)) registry = parsed;
    } catch {
      registry = [];
    }
  }
  registry.push({ hash, verifyCmd, slug, timestamp: new Date().toISOString() });
  // Make sure atris/ exists so the first recorded tick cannot fail with ENOENT.
  fs.mkdirSync(path.dirname(registryPath), { recursive: true });
  fs.writeFileSync(registryPath, JSON.stringify(registry, null, 2) + '\n');
}
648
+
649
/**
 * Retroactive regression check.
 *
 * Reads the last 10 entries of atris/tick-registry.json and, for each one,
 * checks the recorded commit out into a temporary detached git worktree
 * (`<cwd>/.regression-worktree-<first 8 chars of hash>`) and re-runs the
 * recorded verify command inside it (60 s timeout). The worktree is removed
 * afterwards, whether or not the command succeeded.
 *
 * Entries that cannot be checked (missing/non-string hash or verify command,
 * a hash that looks like a CLI option, or a commit git cannot check out) are
 * reported as `{ pass: true, skipped: true }`.
 *
 * On failure a lesson is written via writeLesson(). This function itself only
 * records the lesson; it does not adjust any reward or score.
 *
 * @param {string} cwd - Workspace root (a git repository containing `atris/`).
 * @returns {Array<{hash: *, slug: *, pass: boolean, skipped?: boolean}>}
 *   One result per inspected registry entry, in registry order. Empty when
 *   the registry is missing, unparseable, not an array, or empty.
 */
function regressionCheck(cwd) {
  const registryPath = path.join(cwd, 'atris', 'tick-registry.json');
  if (!fs.existsSync(registryPath)) return [];

  let registry;
  try {
    registry = JSON.parse(fs.readFileSync(registryPath, 'utf8'));
  } catch {
    return [];
  }
  if (!Array.isArray(registry) || registry.length === 0) return [];

  const results = [];

  for (const entry of registry.slice(-10)) {
    const hash = entry ? entry.hash : undefined;
    const slug = entry ? entry.slug : undefined;
    const verifyCmd = entry ? entry.verifyCmd : undefined;

    // The registry is a plain file on disk: never trust its shape. A hash
    // starting with "-" would be parsed by git as an option, so skip it.
    const checkable =
      typeof hash === 'string' && hash !== '' && !hash.startsWith('-') &&
      typeof verifyCmd === 'string' && verifyCmd !== '';
    if (!checkable) {
      results.push({ hash, slug, pass: true, skipped: true });
      continue;
    }

    const worktreePath = path.join(cwd, '.regression-worktree-' + hash.slice(0, 8));
    let pass = false;
    let skipped = false;
    try {
      // execFileSync passes argv directly (no shell), so neither the hash nor
      // the worktree path can be interpreted as shell syntax.
      execFileSync('git', ['worktree', 'add', '--detach', worktreePath, hash], { cwd, stdio: 'pipe' });
      try {
        // The verify command is a shell command line by design.
        execSync(verifyCmd, { cwd: worktreePath, stdio: 'pipe', timeout: 60000 });
        pass = true;
      } catch {
        pass = false;
      }
    } catch {
      // Worktree creation failed (e.g. the commit no longer exists): skip.
      skipped = true;
    } finally {
      // Best-effort cleanup; also runs when creation failed part-way.
      try {
        execFileSync('git', ['worktree', 'remove', '--force', worktreePath], { cwd, stdio: 'pipe' });
      } catch {
        // nothing to clean up, or git refused; do not fail the check over it
      }
    }

    if (skipped) {
      results.push({ hash, slug, pass: true, skipped: true });
      continue;
    }

    if (!pass) {
      writeLesson(cwd, `regression-${slug || 'unknown'}`, 'fail',
        `Retroactive regression: verify command for tick ${hash.slice(0, 7)} (${slug}) now fails. -5 retroactive penalty applied.`);
    }

    results.push({ hash, slug, pass });
  }

  return results;
}
702
+
558
703
/**
 * Look up the verify command declared for a task in atris/TODO.md.
 *
 * The task is matched by exact title across the in-progress, backlog and
 * completed lists returned by parseTodo().
 *
 * @param {string} cwd - Workspace root (the directory containing `atris/`).
 * @param {string} taskTitle - Exact title of the task to look up.
 * @returns {{cmd: (string|null), explicit: boolean}} `{ cmd, explicit: true }`
 *   when the task exists and has a verify field; otherwise
 *   `{ cmd: null, explicit: false }` (no TODO.md, no such task, or no verify).
 */
function getVerifyCommand(cwd, taskTitle) {
  const todoFile = path.join(cwd, 'atris', 'TODO.md');
  if (!fs.existsSync(todoFile)) {
    return { cmd: null, explicit: false };
  }

  const { inProgress, backlog, completed } = parseTodo(todoFile);
  const match = [...inProgress, ...backlog, ...completed]
    .find((candidate) => candidate.title === taskTitle);

  if (match && match.verify) {
    return { cmd: match.verify, explicit: true };
  }
  return { cmd: null, explicit: false };
}
720
+
721
/**
 * Check the reward judge against the checksum recorded in reward-config.
 *
 * Computes a SHA-256 over the JSON form of REWARD_CONFIG followed by the
 * source text of computeTickReward, and compares the hex digest with
 * REWARD_CHECKSUM.
 *
 * @returns {{ok: boolean, expected: string, actual: string}} `ok` is true
 *   when the computed digest (`actual`) equals REWARD_CHECKSUM (`expected`).
 */
function verifyJudgeIntegrity() {
  const { createHash } = require('crypto');
  const actual = createHash('sha256')
    .update(JSON.stringify(REWARD_CONFIG))
    .update(computeTickReward.toString())
    .digest('hex');
  const ok = actual === REWARD_CHECKSUM;
  return { ok, expected: REWARD_CHECKSUM, actual };
}
733
+
734
+ /**
735
+ * Build the validator's plan-review prompt. Fresh context — the validator
736
+ * reads the plan output and the contract fields as if it has never seen them.
+ *
+ * The prompt embeds the task title, kind and declared files, the navigator's
+ * plan output, the list of files the validator should read from disk, the
+ * review checklist, and the exact SIGNOFF / REJECT (+ FIX, PROPOSED) output
+ * format that parseVerdict() later scans for.
+ *
+ * @param {object} context - Task context; this function reads `task`, `kind`
+ *   (rendered as 'unknown' when falsy) and `files` (rendered as
+ *   'none declared in context' when not a non-empty array).
+ * @param {string} planOutput - Plan text from the planning phase; a
+ *   placeholder is used when falsy.
+ * @returns {string} The complete prompt text.
737
+ */
738
+ function buildPlanReviewPrompt(context, planOutput) {
739
+ const files = Array.isArray(context.files) && context.files.length
740
+ ? context.files.join(', ')
741
+ : 'none declared in context';
742
+ return `You are the validator in plan-review mode. You have NOT seen the planning context — read everything fresh.
743
+
744
+ Task: "${context.task}"
745
+ Kind: ${context.kind || 'unknown'}
746
+ Files declared in context: ${files}
747
+
748
+ Plan output from the navigator:
749
+ ---
750
+ ${planOutput || '(no plan output captured)'}
751
+ ---
752
+
753
+ Read from disk:
754
+ - atris/atris.md (the workspace protocol — operating rules and task shape)
755
+ - atris/TODO.md (find this task; inspect Files, Exit, Verify, After, Rollback)
756
+ - atris/lessons.md (recent failures — last 20 lines)
757
+
758
+ Decide if the plan is safe to execute. Check:
759
+ 1. Verify points at a falsifiable rubric or test (not \`true\`, \`echo ok\`, or similar).
760
+ Prefer \`atris verify <slug> --section <name>\`.
761
+ 2. Files are explicitly declared (not empty, not vague).
762
+ 3. Rollback is named (commit, checkpoint, or \`git revert\`).
763
+ 4. The plan's claims match the declared Task fields.
764
+ 5. Nothing in lessons.md contradicts this plan.
765
+
766
+ Output EXACTLY one of these two formats as the LAST thing in your response. No preamble before the verdict line.
767
+
768
+ SIGNOFF: <one sentence on why the plan is safe>
769
+
770
+ or
771
+
772
+ REJECT: <one sentence on what is wrong>
773
+ FIX: <one sentence on what must change>
774
+ PROPOSED:
775
+ Files: <concrete path list, or omit this line if original is fine>
776
+ Exit: <sharp observable done condition, or omit this line if original is fine>
777
+ Verify: <falsifiable shell command, or omit this line if original is fine>
778
+ Rollback: <git revert <sha> or concrete checkpoint, or omit this line if original is fine>
779
+
780
+ Be a drafting partner, not just a critic. When you REJECT, write the PROPOSED block as a concrete draft the human can accept as-is, edit, or reject. Include each PROPOSED line only for fields that need changing; skip a line if the original is correct. Omit the entire PROPOSED block only if the rejection is about scope or intent rather than a draftable field.
781
+ `;
782
+ }
783
+
784
/**
 * Parse the validator's verdict from its raw output.
 *
 * The output is scanned from the last line backwards, because the verdict is
 * supposed to be the last thing the validator prints. The first line (from
 * the end) that starts with `SIGNOFF:` or `REJECT:` (case-insensitive)
 * decides the verdict.
 *
 * For a REJECT, the `FIX:` line and the `PROPOSED:` block are looked up only
 * in the lines AFTER the REJECT line, so drafts quoted earlier in the output
 * are not mistaken for the final proposal. Both LF and CRLF line endings are
 * accepted.
 *
 * @param {*} output - Raw validator output (coerced to a string).
 * @returns {{verdict: 'SIGNOFF'|'REJECT', reason: string, fix: string,
 *   proposed: (object|null)}} One of:
 *   - `{ verdict: 'SIGNOFF', reason, fix: '', proposed: null }`
 *   - `{ verdict: 'REJECT', reason, fix, proposed }` where `proposed` is the
 *     parsed PROPOSED block or null
 *   - a REJECT with a fixed parse-failure reason when no verdict line exists.
 */
function parseVerdict(output) {
  const rawLines = String(output || '').split(/\r?\n/);

  for (let i = rawLines.length - 1; i >= 0; i--) {
    const line = rawLines[i].trim();
    if (!line) continue;

    if (/^SIGNOFF\s*:/i.test(line)) {
      return { verdict: 'SIGNOFF', reason: line.replace(/^SIGNOFF\s*:\s*/i, ''), fix: '', proposed: null };
    }

    if (/^REJECT\s*:/i.test(line)) {
      const reason = line.replace(/^REJECT\s*:\s*/i, '');
      // Everything that belongs to this verdict follows the REJECT line.
      const tail = rawLines.slice(i + 1);
      const fixLine = tail.map((l) => l.trim()).find((l) => /^FIX\s*:/i.test(l));
      const fix = fixLine ? fixLine.replace(/^FIX\s*:\s*/i, '') : '';
      const proposedAt = tail.findIndex((l) => /^\W*PROPOSED\s*:/i.test(l));
      const proposed = proposedAt === -1 ? null : parseProposedBlock(tail.slice(proposedAt));
      return { verdict: 'REJECT', reason, fix, proposed };
    }
  }

  return {
    verdict: 'REJECT',
    reason: 'validator output did not contain SIGNOFF or REJECT',
    fix: 'ensure validator emits machine-parseable verdict as the last line',
    proposed: null,
  };
}

/**
 * Parse a PROPOSED block: up to four optional fields (Files, Exit, Verify,
 * Rollback), normally indented under a `PROPOSED:` header line.
 *
 * Scanning starts on the line after the header and stops at the first line
 * that begins in column 0 and is not itself one of the four fields. Blank and
 * indented lines are scanned through. If a field appears more than once, the
 * last occurrence wins.
 *
 * @param {string[]} lines - Raw (untrimmed) lines; `lines[0]` must be the
 *   `PROPOSED:` header (leading whitespace/markdown punctuation tolerated).
 * @returns {(object|null)} Object with any of the keys `files`, `exit`,
 *   `verify`, `rollback`; null when there is no header or no field was found.
 */
function parseProposedBlock(lines) {
  if (!lines || !lines.length || !/^\W*PROPOSED\s*:/i.test(lines[0] || '')) return null;

  const proposed = {};
  const fieldMatchers = {
    files: /^\s*Files\s*:\s*(.+)$/i,
    exit: /^\s*Exit\s*:\s*(.+)$/i,
    verify: /^\s*Verify\s*:\s*(.+)$/i,
    rollback: /^\s*Rollback\s*:\s*(.+)$/i,
  };

  for (let j = 1; j < lines.length; j++) {
    // Drop trailing whitespace first: a stray "\r" from CRLF output would
    // otherwise stop `.+$` from matching, since `.` does not match "\r".
    const raw = String(lines[j]).replace(/\s+$/, '');
    // A new top-level line that is not one of our fields ends the block.
    if (/^\S/.test(raw) && !/^(Files|Exit|Verify|Rollback)\s*:/i.test(raw)) break;
    for (const [key, matcher] of Object.entries(fieldMatchers)) {
      const m = raw.match(matcher);
      if (m) proposed[key] = m[1].trim();
    }
  }

  return Object.keys(proposed).length ? proposed : null;
}
844
+
845
/**
 * Default executor for plan-review: run a fresh `claude -p <prompt>` call
 * restricted to the Bash, Read, Grep and Glob tools.
 * Kept thin so tests can inject a stub via options.planReviewExec.
 *
 * The prompt is passed as a single argv entry (no shell, no temp file), so
 * nothing in the prompt can be interpreted as shell syntax and concurrent
 * runs in the same workspace cannot clobber each other's prompt.
 *
 * @param {string} prompt - Full plan-review prompt.
 * @param {object} [opts]
 * @param {string} [opts.cwd=process.cwd()] - Working directory for the child.
 * @param {number} [opts.timeout=180000] - Timeout in milliseconds.
 * @returns {string} The child's stdout. If the child fails but produced
 *   stdout, that (possibly partial) stdout is returned instead of throwing.
 * @throws {Error} The child_process error when the call fails without stdout.
 */
function defaultPlanReviewExecutor(prompt, { cwd = process.cwd(), timeout = 180000 } = {}) {
  const env = { ...process.env };
  // NOTE(review): presumably removed so the child does not treat itself as
  // nested inside an existing Claude Code session — confirm.
  delete env.CLAUDECODE;

  try {
    const output = execFileSync(
      'claude',
      ['-p', String(prompt), '--allowedTools', 'Bash,Read,Grep,Glob'],
      {
        cwd,
        encoding: 'utf8',
        timeout,
        stdio: 'pipe',
        maxBuffer: 10 * 1024 * 1024,
        env,
      }
    );
    return output || '';
  } catch (err) {
    // A non-zero exit or timeout may still carry a usable verdict on stdout.
    if (err.stdout) return err.stdout;
    throw err;
  }
}
872
+
873
/**
 * Default executor for codex: run `codex -p <prompt>` synchronously.
 * The prompt is passed as a command-line argument (not via stdin).
 * Users can override the binary with the ATRIS_CODEX_CMD env var; tests
 * inject a stub via options.codexExec.
 *
 * @param {string} prompt - Full plan-review prompt.
 * @param {object} [opts]
 * @param {string} [opts.cwd] - Working directory for the child process.
 * @param {number} [opts.timeout=180000] - Timeout in milliseconds.
 * @returns {string} The child's stdout ('' when it printed nothing). Output
 *   is returned even on a non-zero exit, as long as there is some.
 * @throws {Error} When the process could not be run, or exited non-zero,
 *   and produced no stdout.
 */
function defaultCodexExecutor(prompt, { cwd, timeout = 180000 } = {}) {
  const cmd = process.env.ATRIS_CODEX_CMD || 'codex';
  const proc = spawnSync(cmd, ['-p', prompt], {
    cwd,
    encoding: 'utf8',
    timeout,
    stdio: 'pipe',
    maxBuffer: 10 * 1024 * 1024,
  });

  // spawnSync reports launch failures (ENOENT, EACCES, timeout) via
  // proc.error with status === null. Surface the real cause instead of the
  // misleading "exited with status null".
  if (proc.error && !proc.stdout) {
    throw new Error(`codex failed to run (${cmd}): ${proc.error.message}`);
  }
  if (proc.status !== 0 && !proc.stdout) {
    throw new Error(`codex exited with status ${proc.status}: ${proc.stderr || 'no output'}`);
  }
  return proc.stdout || '';
}
891
+
892
/**
 * Check whether the codex command is runnable.
 *
 * The command is ATRIS_CODEX_CMD when set, otherwise `codex`.
 * - If it is a path (absolute, or containing a path separator), the file
 *   itself is checked: it must be a regular file the process may execute.
 * - Otherwise it is looked up on PATH with `which` (`where` on Windows).
 *
 * Tests override via options.hasCodex.
 *
 * @returns {boolean} true when the command was found, false otherwise
 *   (including when the lookup tool itself is unavailable).
 */
function hasCodex() {
  const cmd = process.env.ATRIS_CODEX_CMD || 'codex';

  const looksLikePath = path.isAbsolute(cmd) || cmd.includes('/') || cmd.includes(path.sep);
  if (looksLikePath) {
    try {
      if (!fs.statSync(cmd).isFile()) return false;
      fs.accessSync(cmd, fs.constants.X_OK);
      return true;
    } catch {
      return false;
    }
  }

  // `which` does not exist on Windows; `where` is the equivalent probe.
  const probe = process.platform === 'win32' ? 'where' : 'which';
  try {
    const r = spawnSync(probe, [cmd], { stdio: 'pipe' });
    return r.status === 0;
  } catch {
    return false;
  }
}
905
+
906
/**
 * Run plan-review: the validator (and optionally codex) read the plan and
 * decide if it is safe to execute.
 *
 * Codex is invoked only when explicitly opted in, by any of:
 *   - env ATRIS_USE_CODEX=1
 *   - context.tags includes 'codex', 'gray' or 'high-risk'
 *     (the task kind is not consulted)
 *
 * Outcomes:
 *   - Codex not opted in: the validator's verdict, signers ['validator'].
 *   - Codex opted in but unavailable, or its invocation throws: the
 *     validator's verdict plus a `notes` string explaining the fallback.
 *   - Both sign off: SIGNOFF with signers ['validator', 'codex'].
 *   - Anything else (disagreement or joint reject): REJECT with both
 *     opinions in `reason`, `split: true`, and the validator's PROPOSED
 *     draft if it wrote one, otherwise codex's.
 *
 * An exception thrown by the validator executor is not caught here and
 * propagates to the caller.
 *
 * @param {object} args
 * @param {string} args.cwd - Workspace root passed to the executors.
 * @param {object} args.context - Task context (`task`, `kind`, `files`, `tags`).
 * @param {string} args.planOutput - Output of the plan phase.
 * @param {object} [args.options] - Injection points: `planReviewExec`,
 *   `codexExec` (functions `(prompt, { cwd, role }) => string`) and
 *   `hasCodex` (boolean, or a function returning one).
 * @returns {{verdict: string, reason: string, fix: string,
 *   proposed: (object|null), signers: string[], notes?: string,
 *   split?: boolean}}
 */
function runPlanReview({ cwd, context, planOutput, options = {} }) {
  const prompt = buildPlanReviewPrompt(context, planOutput);
  const tags = Array.isArray(context.tags) ? context.tags : [];

  // Primary signer: validator.
  const validatorExec = options.planReviewExec || defaultPlanReviewExecutor;
  const validatorOutput = validatorExec(prompt, { cwd, role: 'validator' });
  const primary = parseVerdict(validatorOutput);

  // Codex: opted in explicitly, not inferred.
  const codexOptIn =
    process.env.ATRIS_USE_CODEX === '1' ||
    tags.includes('codex') ||
    tags.includes('gray') ||
    tags.includes('high-risk');

  if (!codexOptIn) {
    return { ...primary, signers: ['validator'], proposed: primary.proposed || null };
  }

  // options.hasCodex may be a plain boolean or a probe function. A function
  // must be called — as a value it would always be truthy.
  const override = options.hasCodex;
  let codexAvailable;
  if (override == null) {
    codexAvailable = hasCodex();
  } else if (typeof override === 'function') {
    codexAvailable = Boolean(override());
  } else {
    codexAvailable = Boolean(override);
  }

  if (!codexAvailable) {
    return {
      ...primary,
      signers: ['validator'],
      proposed: primary.proposed || null,
      notes: 'codex was requested but not on PATH; skipped gracefully',
    };
  }

  const codexExec = options.codexExec || defaultCodexExecutor;
  let codexOutput;
  try {
    codexOutput = codexExec(prompt, { cwd, role: 'codex' });
  } catch (err) {
    return {
      ...primary,
      signers: ['validator'],
      notes: `codex invocation failed: ${err.message}; falling back to single signer`,
    };
  }
  const codex = parseVerdict(codexOutput);

  if (primary.verdict === 'SIGNOFF' && codex.verdict === 'SIGNOFF') {
    return {
      verdict: 'SIGNOFF',
      reason: primary.reason,
      fix: '',
      proposed: null,
      signers: ['validator', 'codex'],
    };
  }

  // Any disagreement or joint reject → halt with both opinions surfaced.
  // If either signer wrote a PROPOSED draft, surface the validator's first
  // (or codex's if validator didn't propose one).
  return {
    verdict: 'REJECT',
    reason: `Split verdict. validator=${primary.verdict} (${primary.reason || 'no reason'}); codex=${codex.verdict} (${codex.reason || 'no reason'}).`,
    fix: primary.fix || codex.fix || 'reconcile the two signers before re-planning',
    proposed: primary.proposed || codex.proposed || null,
    signers: ['validator', 'codex'],
    split: true,
  };
}
983
+
984
/**
 * Append a plan-review rejection to today's journal under ## Notes.
 * Intentionally does NOT write to lessons.md — rejections only become lessons
 * if a human spots a reusable failure pattern.
 *
 * The journal file is `<cwd>/atris/logs/<YYYY>/<YYYY-MM-DD>.md` (local date).
 * If that file does not exist, nothing is written. The block is inserted
 * directly below the first "## Notes" heading; when there is no such heading
 * a new one is appended at the end of the file. The heading timestamp uses
 * local time so it always agrees with the file's date.
 *
 * Any error (I/O or otherwise) is swallowed on purpose: journaling must never
 * crash the tick.
 *
 * @param {string} cwd - Workspace root (the directory containing `atris/`).
 * @param {object} context - Task context; `context.task` is journaled.
 * @param {object} review - Review result; reads `reason`, `fix`, `signers`,
 *   `proposed` ({ files, exit, verify, rollback }) and `notes`.
 * @returns {void}
 */
function appendPlanRejection(cwd, context, review) {
  try {
    // Compute the journal path from the passed cwd so tests and isolated
    // workspaces both work. getLogPath() resolves against process.cwd()
    // which isn't always the task's workspace.
    const stamp = new Date();
    const two = (n) => String(n).padStart(2, '0');
    const year = stamp.getFullYear();
    const ymd = `${year}-${two(stamp.getMonth() + 1)}-${two(stamp.getDate())}`;
    const logFile = path.join(cwd, 'atris', 'logs', String(year), `${ymd}.md`);
    if (!fs.existsSync(logFile)) return;

    const now = `${ymd} ${two(stamp.getHours())}:${two(stamp.getMinutes())}`;
    const signers = (review.signers || []).join(' + ');
    const proposedBlock = review.proposed
      ? `**Proposed draft:**\n` +
        (review.proposed.files ? `- Files: ${review.proposed.files}\n` : '') +
        (review.proposed.exit ? `- Exit: ${review.proposed.exit}\n` : '') +
        (review.proposed.verify ? `- Verify: ${review.proposed.verify}\n` : '') +
        (review.proposed.rollback ? `- Rollback: ${review.proposed.rollback}\n` : '')
      : '';
    const block =
      `\n### Plan rejected — ${now}\n\n` +
      `**Task:** ${context.task}\n` +
      `**Signers:** ${signers}\n` +
      `**Reason:** ${review.reason}\n` +
      (review.fix ? `**Fix:** ${review.fix}\n` : '') +
      (proposedBlock ? `${proposedBlock}` : '') +
      (review.notes ? `**Notes:** ${review.notes}\n` : '');

    const content = fs.readFileSync(logFile, 'utf8');
    const notesIdx = content.indexOf('## Notes');
    let updated;
    if (notesIdx === -1) {
      updated = content.replace(/\s*$/, '') + `\n\n## Notes\n${block}\n`;
    } else {
      const eol = content.indexOf('\n', notesIdx);
      if (eol === -1) {
        // "## Notes" is the last line and has no trailing newline. Slicing at
        // eol + 1 === 0 would put the block at the very top of the file.
        updated = `${content}\n${block}`;
      } else {
        updated = content.slice(0, eol + 1) + block + content.slice(eol + 1);
      }
    }
    fs.writeFileSync(logFile, updated);
  } catch {
    // journaling must never crash the tick
  }
}
576
1030
 
577
1031
  function runTaskOnce(context, options = {}) {
578
1032
  const { verbose = false, cwd = process.cwd() } = options;
1033
+
1034
+ // Judge integrity check — halt if computeTickReward was tampered with
1035
+ const integrity = verifyJudgeIntegrity();
1036
+ if (!integrity.ok) {
1037
+ writeLesson(cwd, 'judge-corruption', 'fail',
1038
+ `computeTickReward checksum mismatch. Expected ${integrity.expected}, got ${integrity.actual}. Tick halted.`);
1039
+ return {
1040
+ outcome: 'halted',
1041
+ reason: 'judge-corruption',
1042
+ phaseResults: {},
1043
+ elapsedSeconds: 0,
1044
+ verifyRan: false,
1045
+ verifyPass: false,
1046
+ };
1047
+ }
1048
+
579
1049
  const phaseResults = {};
580
1050
  const startedAt = Date.now();
581
- const verifyCmd = getVerifyCommand(cwd, context.task);
1051
+ const verifyResult = getVerifyCommand(cwd, context.task);
1052
+ const verifyCmd = verifyResult.cmd;
1053
+
1054
+ // Guard: endgame tasks must have an explicit Verify field.
1055
+ // Reactive signals (inbox, staleness, imagined) use npm test as default.
1056
+ if (!verifyResult.explicit && context.kind === 'endgame') {
1057
+ writeLesson(cwd, 'no-verify-field', 'fail',
1058
+ `Task "${context.task}" has no explicit **Verify:** field in TODO.md. Tick halted — every endgame task must declare how to verify it.`);
1059
+ return {
1060
+ outcome: 'halted',
1061
+ reason: 'no-verify-field',
1062
+ phaseResults: {},
1063
+ elapsedSeconds: 0,
1064
+ verifyRan: false,
1065
+ verifyPass: false,
1066
+ };
1067
+ }
1068
+
1069
+ // Falsifiability gate (endgame + explicit Verify only).
1070
+ // Run Verify BEFORE the work. If it passes, the rubric is trivial or the
1071
+ // task is already done — either way, halt. This is the keystone that makes
1072
+ // Verify load-bearing. The cmd is captured here and reused post-execute so
1073
+ // an agent cannot swap the rubric mid-tick.
1074
+ const skipFalsifiability = options.skipFalsifiability === true;
1075
+ if (!skipFalsifiability && verifyResult.explicit && context.kind === 'endgame' && verifyCmd) {
1076
+ try {
1077
+ execSync(verifyCmd, { cwd, stdio: 'pipe', timeout: 60000 });
1078
+ writeLesson(cwd, 'verify-not-falsifiable', 'fail',
1079
+ `Verify \`${verifyCmd}\` passed before work started on "${context.task}". Either the rubric is trivial or the task is already done. Tick halted.`);
1080
+ return {
1081
+ outcome: 'halted',
1082
+ reason: 'verify-not-falsifiable',
1083
+ phaseResults: {},
1084
+ elapsedSeconds: 0,
1085
+ verifyRan: true,
1086
+ verifyPass: false,
1087
+ };
1088
+ } catch {
1089
+ // Pre-verify failed — good, the rubric is falsifiable. Proceed.
1090
+ }
1091
+ }
582
1092
 
583
- for (const phase of ['plan', 'do', 'review']) {
1093
+ // Phase: plan
1094
+ {
584
1095
  const t0 = Date.now();
585
- const result = executePhaseDetailed(phase, context, options);
586
- phaseResults[phase] = {
1096
+ const result = (options.phaseExec || executePhaseDetailed)('plan', context, options);
1097
+ phaseResults.plan = {
1098
+ prompt: result.prompt,
1099
+ output: result.output || '',
1100
+ elapsedSeconds: Math.round((Date.now() - t0) / 1000),
1101
+ };
1102
+ }
1103
+
1104
+ // Phase: plan-review — validator reads the plan fresh and signs off or rejects.
1105
+ // Can be skipped via options.skipPlanReview (tests only). Codex is optional,
1106
+ // opt-in via env var / tags. On REJECT, the tick halts and the rejection is
1107
+ // journaled; lessons.md is NOT touched (only promoted lessons go there).
1108
+ if (!options.skipPlanReview) {
1109
+ const t0 = Date.now();
1110
+ const review = runPlanReview({
1111
+ cwd,
1112
+ context,
1113
+ planOutput: phaseResults.plan.output,
1114
+ options,
1115
+ });
1116
+ const elapsed = Math.round((Date.now() - t0) / 1000);
1117
+ phaseResults['plan-review'] = {
1118
+ output:
1119
+ `${review.verdict}: ${review.reason || ''}` +
1120
+ (review.fix ? `\nFIX: ${review.fix}` : '') +
1121
+ (review.notes ? `\n(${review.notes})` : ''),
1122
+ signers: review.signers,
1123
+ elapsedSeconds: elapsed,
1124
+ };
1125
+
1126
+ if (review.verdict === 'REJECT') {
1127
+ appendPlanRejection(cwd, context, review);
1128
+ return {
1129
+ outcome: 'halted',
1130
+ reason: 'plan-rejected-at-review',
1131
+ phaseResults,
1132
+ elapsedSeconds: Math.round((Date.now() - startedAt) / 1000),
1133
+ verifyRan: false,
1134
+ verifyPass: false,
1135
+ };
1136
+ }
1137
+ }
1138
+
1139
+ // Phase: do
1140
+ {
1141
+ const t0 = Date.now();
1142
+ const result = (options.phaseExec || executePhaseDetailed)('do', context, options);
1143
+ phaseResults.do = {
1144
+ prompt: result.prompt,
1145
+ output: result.output || '',
1146
+ elapsedSeconds: Math.round((Date.now() - t0) / 1000),
1147
+ };
1148
+ }
1149
+
1150
+ // Phase: review
1151
+ {
1152
+ const t0 = Date.now();
1153
+ const result = (options.phaseExec || executePhaseDetailed)('review', context, options);
1154
+ phaseResults.review = {
587
1155
  prompt: result.prompt,
588
1156
  output: result.output || '',
589
1157
  elapsedSeconds: Math.round((Date.now() - t0) / 1000),
@@ -595,7 +1163,7 @@ function runTaskOnce(context, options = {}) {
595
1163
  // After review succeeds, run verify command if present
596
1164
  let verifyPass = false;
597
1165
  let verifyRan = false;
598
- if (!reviewOutput.includes('failed') && verifyCmd) {
1166
+ if (verifyCmd) {
599
1167
  verifyRan = true;
600
1168
  let t0 = Date.now();
601
1169
  try {
@@ -620,7 +1188,7 @@ function runTaskOnce(context, options = {}) {
620
1188
  }
621
1189
 
622
1190
  return {
623
- success: !reviewOutput.includes('failed') && (!verifyRan || verifyPass),
1191
+ success: verifyRan && verifyPass,
624
1192
  elapsedSeconds: Math.round((Date.now() - startedAt) / 1000),
625
1193
  phaseResults,
626
1194
  reviewOutput,
@@ -673,28 +1241,28 @@ function computeTickReward(execution, tickOutcome, verifyCmd) {
673
1241
 
674
1242
  // Validator clean: review passed without 'failed'
675
1243
  if (!execution.reviewOutput || !execution.reviewOutput.includes('failed')) {
676
- reward += 1;
1244
+ reward += REWARD_CONFIG.REVIEW_CLEAN;
677
1245
  }
678
1246
 
679
- // Verify passed: +3
1247
+ // Verify passed
680
1248
  if (execution.verifyRan && execution.verifyPass) {
681
- reward += 3;
1249
+ reward += REWARD_CONFIG.VERIFY_PASS;
682
1250
  }
683
1251
 
684
- // npm test passed: +2
1252
+ // npm test passed
685
1253
  if (execution.verifyRan && execution.verifyPass && verifyCmd === 'npm test') {
686
- reward += 2;
1254
+ reward += REWARD_CONFIG.NPM_TEST_BONUS;
687
1255
  }
688
1256
 
689
1257
  // Commit landed: check do phase output for git commit patterns
690
1258
  const doOutput = execution.phaseResults.do.output || '';
691
1259
  if (doOutput.match(/\[.*\s\d+\sfile.*changed/i) || doOutput.includes('git commit') || doOutput.includes('committed')) {
692
- reward += 1;
1260
+ reward += REWARD_CONFIG.COMMIT_LANDED;
693
1261
  }
694
1262
 
695
- // Halt caught hallucination: -3
1263
+ // Halt caught hallucination
696
1264
  if (tickOutcome === 'halted') {
697
- reward -= 3;
1265
+ reward += REWARD_CONFIG.HALT_PENALTY;
698
1266
  }
699
1267
 
700
1268
  return reward;
@@ -1070,7 +1638,15 @@ function getRecentSignals(cwd) {
1070
1638
  /**
1071
1639
  * Score endgame candidates by historical reward of similar horizon types.
1072
1640
  * Reads last 10 scorecards, infers type from slug prefix, calculates mean
1073
- * reward per type, scores candidates by expected value, applies 80/20 exploit/explore.
1641
+ * reward per type, scores candidates by expected value.
1642
+ *
1643
+ * Adaptive explore rate: if the last 5 endgames are all the same type,
1644
+ * explore rate boosts to 50%. Otherwise scales between 20%-50% based on
1645
+ * type repetition in the last 5.
1646
+ *
1647
+ * Difficulty floor: candidates whose inferred type has >80% success rate
1648
+ * AND mean reward >5 are filtered out when harder candidates exist, so
1649
+ * easy wins don't starve hard work.
1074
1650
  *
1075
1651
  * @param {string} cwd - Current working directory
1076
1652
  * @param {array} candidates - Array of { title, confidence, rationale }
@@ -1094,10 +1670,14 @@ function scoreEndgameCandidates(cwd, candidates) {
1094
1670
 
1095
1671
  // Infer type from slug/title by taking prefix before first dash
1096
1672
  const typeToRewards = {};
1673
+ const typeToAttempts = {}; // track shipped/attempted per type
1097
1674
  for (const sc of scorecards) {
1098
1675
  const type = sc.slug.split('-')[0];
1099
1676
  if (!typeToRewards[type]) typeToRewards[type] = [];
1100
1677
  typeToRewards[type].push(sc.totalReward);
1678
+ if (!typeToAttempts[type]) typeToAttempts[type] = { shipped: 0, attempted: 0 };
1679
+ typeToAttempts[type].shipped += sc.tasksShipped;
1680
+ typeToAttempts[type].attempted += sc.tasksAttempted;
1101
1681
  }
1102
1682
 
1103
1683
  // Calculate mean reward per type
@@ -1107,45 +1687,70 @@ function scoreEndgameCandidates(cwd, candidates) {
1107
1687
  typeMeans[type] = mean;
1108
1688
  }
1109
1689
 
1690
+ // Calculate success rate per type
1691
+ const typeSuccessRate = {};
1692
+ for (const [type, counts] of Object.entries(typeToAttempts)) {
1693
+ typeSuccessRate[type] = counts.attempted > 0 ? counts.shipped / counts.attempted : 0;
1694
+ }
1695
+
1696
+ // Adaptive explore rate based on diversity of last 5 scorecards
1697
+ const last5 = scorecards.slice(-5);
1698
+ const last5Types = last5.map(sc => sc.slug.split('-')[0]);
1699
+ const uniqueTypes = new Set(last5Types).size;
1700
+ // All same type → exploreRate=0.5; all different → exploreRate=0.2
1701
+ // Linear interpolation: exploreRate = 0.5 - (uniqueTypes - 1) * 0.3 / (last5Types.length - 1 || 1)
1702
+ const maxTypes = last5Types.length;
1703
+ const exploreRate = maxTypes <= 1
1704
+ ? 0.2
1705
+ : 0.5 - (uniqueTypes - 1) * 0.3 / (maxTypes - 1);
1706
+
1110
1707
  // Score each candidate by expected value based on historical type mean
1111
1708
  const scored = candidates.map(c => {
1112
1709
  // Infer type from title keywords that match scorecard slug prefixes
1113
1710
  const titleLower = (c.title || '').toLowerCase();
1114
1711
  const cType = Object.keys(typeMeans).find(t => titleLower.includes(t)) || titleLower.split(/[\s\-]+/)[0];
1115
1712
  const historicalMean = typeMeans[cType] !== undefined ? typeMeans[cType] : 0;
1713
+ const successRate = typeSuccessRate[cType] !== undefined ? typeSuccessRate[cType] : 0;
1116
1714
  const expectedValue = historicalMean * c.confidence;
1117
1715
  return {
1118
1716
  ...c,
1119
1717
  expectedValue,
1120
1718
  type: cType,
1121
- historicalMean
1719
+ historicalMean,
1720
+ successRate
1122
1721
  };
1123
1722
  });
1124
1723
 
1724
+ // Difficulty floor: filter out easy-win candidates (>80% success rate AND
1725
+ // mean reward >5) when harder candidates exist
1726
+ const hardCandidates = scored.filter(c => !(c.successRate > 0.8 && c.historicalMean > 5));
1727
+ const pool = hardCandidates.length > 0 ? hardCandidates : scored;
1728
+
1125
1729
  // Sort by expected value (descending)
1126
- scored.sort((a, b) => b.expectedValue - a.expectedValue);
1730
+ pool.sort((a, b) => b.expectedValue - a.expectedValue);
1127
1731
 
1128
- // 80/20 split: 80% exploit (best), 20% explore (random)
1732
+ // Adaptive exploit/explore split
1129
1733
  const choice = Math.random();
1130
1734
  let selected;
1131
- if (choice < 0.8) {
1735
+ if (choice < (1 - exploreRate)) {
1132
1736
  // Exploit: return highest expected value
1133
- selected = scored[0];
1737
+ selected = pool[0];
1134
1738
  } else {
1135
- // Explore: return random candidate
1739
+ // Explore: return random candidate from full scored list (not filtered)
1136
1740
  selected = scored[Math.floor(Math.random() * scored.length)];
1137
1741
  }
1138
1742
 
1139
- const reason = choice < 0.8
1140
- ? `exploit: type=${selected.type} mean-reward=${selected.historicalMean.toFixed(1)} expected-value=${selected.expectedValue.toFixed(1)}`
1141
- : `explore: random-candidate type=${selected.type}`;
1743
+ const reason = choice < (1 - exploreRate)
1744
+ ? `exploit: type=${selected.type} mean-reward=${selected.historicalMean.toFixed(1)} expected-value=${selected.expectedValue.toFixed(1)} explore-rate=${exploreRate.toFixed(2)}`
1745
+ : `explore: random-candidate type=${selected.type} explore-rate=${exploreRate.toFixed(2)}`;
1142
1746
 
1143
1747
  return {
1144
1748
  title: selected.title,
1145
1749
  confidence: selected.confidence,
1146
1750
  rationale: selected.rationale,
1147
1751
  scored: true,
1148
- reason
1752
+ reason,
1753
+ exploreRate
1149
1754
  };
1150
1755
  } catch (err) {
1151
1756
  // If scoring fails, fall back to best by confidence
@@ -1154,14 +1759,71 @@ function scoreEndgameCandidates(cwd, candidates) {
1154
1759
  }
1155
1760
  }
1156
1761
 
1762
/**
 * Check whether a lesson's bug pattern is still present in the named files.
 *
 * Parses the lesson line for a slug (the bold text after the date, e.g.
 * `**[2026-04-08] inbox-parser-eats-hr-separator**`) and for backticked file
 * references (e.g. `commands/autopilot.js:116`). Every referenced file that
 * still exists is searched, case-insensitively, for the slug's keywords.
 *
 * The search runs in-process instead of spawning `grep`: with a subprocess,
 * a missing `grep` binary or a timeout is indistinguishable from "no match"
 * and would wrongly report the lesson as resolved.
 *
 * @param {string} lessonLine - A single line from lessons.md
 * @param {string} cwd - Current working directory; relative refs resolve against it
 * @returns {boolean} true if no slug keyword appears in any named file that
 *   still exists (resolved). false if a keyword is found, if the line has no
 *   slug / file refs / usable keywords, or if a named file cannot be read.
 */
function isLessonResolved(lessonLine, cwd) {
  // Extract slug: bold text after date, e.g. **[2026-04-08] inbox-parser-eats-hr-separator**
  const slugMatch = lessonLine.match(/\*\*\[\d{4}-\d{2}-\d{2}\]\s+([\w-]+)\*\*/);
  if (!slugMatch) return false;
  const slug = slugMatch[1];

  // Extract file paths: patterns like `commands/autopilot.js:116` or `commands/run.js:157`
  const fileRefs = [];
  const filePattern = /`([a-zA-Z0-9_/./-]+\.[a-zA-Z]+(?::\d+(?:-\d+)?)?)`/g;
  let m;
  while ((m = filePattern.exec(lessonLine)) !== null) {
    const ref = m[1].replace(/:\d+(-\d+)?$/, ''); // strip line numbers
    if (ref.includes('/') || ref.endsWith('.js') || ref.endsWith('.md') || ref.endsWith('.ts')) {
      fileRefs.push(ref);
    }
  }

  if (fileRefs.length === 0) return false;

  // Derive keywords from slug (split on dashes, drop short words). Lowercased
  // once so the per-file comparison below is case-insensitive.
  const keywords = slug
    .split('-')
    .filter(w => w.length > 2)
    .map(w => w.toLowerCase());
  if (keywords.length === 0) return false;

  // If at least one named file still mentions a keyword → not resolved.
  for (const ref of fileRefs) {
    const absPath = path.isAbsolute(ref) ? ref : path.join(cwd, ref);
    if (!fs.existsSync(absPath)) continue; // file deleted = pattern gone

    let haystack;
    try {
      haystack = fs.readFileSync(absPath, 'utf8').toLowerCase();
    } catch {
      // The file exists but cannot be inspected; we have no evidence the
      // pattern is gone, so keep the lesson active.
      return false;
    }

    if (keywords.some(kw => haystack.includes(kw))) {
      return false;
    }
  }

  // No keyword matched in any named file → lesson is resolved
  return true;
}
1817
+
1157
1818
  /**
1158
1819
  * Propose 3 candidate next horizons for the autopilot loop. Combines
1159
1820
  * `getIdleTickCount` + `getRecentSignals` into a prompt asking the LLM
1160
1821
  * to imagine what to work on next, spawns `claude -p`, and parses the
1161
1822
  * JSON response into `[{ title, confidence, rationale }]`.
1162
1823
  *
1163
- * Throws on subprocess failure or when fewer than 3 valid candidates
1164
- * come back. Callers are responsible for catching and falling back.
1824
+ * Filters out candidates derived from resolved lessons (bug pattern no
1825
+ * longer present in named files). Resolved lessons get tagged `[resolved]`
1826
+ * in lessons.md. Requires at least 1 valid candidate after filtering.
1165
1827
  */
1166
1828
  async function proposeCandidateHorizons(cwd) {
1167
1829
  const idleTicks = getIdleTickCount(cwd);
@@ -1257,11 +1919,49 @@ Reply with the JSON array and nothing else.`;
1257
1919
  c.rationale.length > 0
1258
1920
  );
1259
1921
 
1260
- if (candidates.length < 3) {
1261
- throw new Error(`proposeCandidateHorizons: expected 3 valid candidates, got ${candidates.length}`);
1922
+ if (candidates.length < 1) {
1923
+ throw new Error(`proposeCandidateHorizons: expected at least 1 valid candidate, got ${candidates.length}`);
1924
+ }
1925
+
1926
+ // Filter out candidates derived from resolved lessons
1927
+ const lessonsPath = path.join(cwd, 'atris', 'lessons.md');
1928
+ const filtered = [];
1929
+ for (const c of candidates) {
1930
+ const combinedText = `${c.title} ${c.rationale}`.toLowerCase();
1931
+ let droppedByLesson = false;
1932
+ for (const lessonLine of signals.recentLessons) {
1933
+ const slugMatch = lessonLine.match(/\*\*\[\d{4}-\d{2}-\d{2}\]\s+([\w-]+)\*\*/);
1934
+ if (!slugMatch) continue;
1935
+ if (lessonLine.includes('[resolved]')) continue;
1936
+ const slug = slugMatch[1];
1937
+ // Fuzzy match: check if slug keywords appear in the candidate text
1938
+ const slugWords = slug.split('-').filter(w => w.length > 2);
1939
+ const matchCount = slugWords.filter(w => combinedText.includes(w)).length;
1940
+ if (matchCount < Math.ceil(slugWords.length * 0.5)) continue;
1941
+ // Candidate matches this lesson — check if the lesson is resolved
1942
+ if (isLessonResolved(lessonLine, cwd)) {
1943
+ // Tag lesson [resolved] in lessons.md
1944
+ try {
1945
+ let content = fs.readFileSync(lessonsPath, 'utf8');
1946
+ const taggedLine = lessonLine.replace(
1947
+ /\*\*\[(\d{4}-\d{2}-\d{2})\]\s+([\w-]+)\*\*/,
1948
+ '**[$1] $2** [resolved]'
1949
+ );
1950
+ content = content.replace(lessonLine.trim(), taggedLine.trim());
1951
+ fs.writeFileSync(lessonsPath, content);
1952
+ } catch {}
1953
+ droppedByLesson = true;
1954
+ break;
1955
+ }
1956
+ }
1957
+ if (!droppedByLesson) filtered.push(c);
1262
1958
  }
1263
1959
 
1264
- return candidates.slice(0, 3);
1960
+ if (filtered.length < 1) {
1961
+ throw new Error('proposeCandidateHorizons: all candidates were from resolved lessons');
1962
+ }
1963
+
1964
+ return filtered.slice(0, 3);
1265
1965
  }
1266
1966
 
1267
1967
  async function autopilotAtris(description, options = {}) {
@@ -1361,7 +2061,7 @@ async function autopilotAtris(description, options = {}) {
1361
2061
  break;
1362
2062
  }
1363
2063
 
1364
- const suggestion = await suggestNextTask(cwd, skipped);
2064
+ const suggestion = await suggestNextTask(cwd, skipped, { auto });
1365
2065
 
1366
2066
  if (!suggestion) {
1367
2067
  tickOutcome = 'idle';
@@ -1472,6 +2172,22 @@ async function autopilotAtris(description, options = {}) {
1472
2172
  const execution = runTaskOnce(context, { verbose, cwd });
1473
2173
  lastExecution = execution;
1474
2174
  lastVerifyCmd = execution.verifyCmd;
2175
+
2176
+ // Early halt — judge corruption or no verify field
2177
+ if (execution.outcome === 'halted') {
2178
+ tickOutcome = 'halted';
2179
+ tickOutcomeText = `I halted before running "${lastTaskTitle}": ${execution.reason}.`;
2180
+ tickNextStep = 'stop until a human looks at the error';
2181
+ if (!verbose) {
2182
+ printPlainBlock([
2183
+ `I halted: ${execution.reason}.`,
2184
+ '',
2185
+ 'Next I stopped the loop.'
2186
+ ].join('\n'));
2187
+ }
2188
+ break;
2189
+ }
2190
+
1475
2191
  const planTime = execution.phaseResults.plan.elapsedSeconds;
1476
2192
  if (verbose) console.log(` planned (${planTime}s)`);
1477
2193
 
@@ -1523,6 +2239,39 @@ async function autopilotAtris(description, options = {}) {
1523
2239
  tickOutcomeText = `I planned, built, and reviewed "${suggestion.task}".`;
1524
2240
  tickNextStep = 'pick the next endgame task';
1525
2241
  logCompletion(suggestion.task);
2242
+
2243
+ // Record commit hash + verify command for retroactive regression checks
2244
+ try {
2245
+ const commitHash = execSync('git rev-parse HEAD', { cwd, encoding: 'utf8' }).trim();
2246
+ const taskSlug = (suggestion.task || 'unknown').replace(/\s+/g, '-').toLowerCase().slice(0, 40);
2247
+ recordTickCommit(cwd, commitHash, execution.verifyCmd || '', taskSlug);
2248
+
2249
+ // Every 10th tick, run retroactive regression check
2250
+ const registryPath = path.join(cwd, 'atris', 'tick-registry.json');
2251
+ if (fs.existsSync(registryPath)) {
2252
+ try {
2253
+ const registry = JSON.parse(fs.readFileSync(registryPath, 'utf8'));
2254
+ if (Array.isArray(registry) && registry.length % 10 === 0) {
2255
+ const regressionResults = regressionCheck(cwd);
2256
+ const failures = regressionResults.filter(r => !r.pass && !r.skipped);
2257
+ if (failures.length > 0) {
2258
+ // Apply -5 retroactive penalty per failure via journal note
2259
+ for (const f of failures) {
2260
+ appendTickSummary(cwd, {
2261
+ outcome: `Retroactive regression failure: tick ${f.hash.slice(0, 7)} (${f.slug}) verify now fails. -5 penalty.`,
2262
+ horizon: readHorizonSlug(cwd),
2263
+ nextStep: 'investigate regression',
2264
+ reward: -5,
2265
+ });
2266
+ }
2267
+ if (verbose) console.log(` regression check: ${failures.length} failure(s) found`);
2268
+ } else if (verbose) {
2269
+ console.log(` regression check: all ${regressionResults.length} entries pass`);
2270
+ }
2271
+ }
2272
+ } catch { /* registry read failure must not crash */ }
2273
+ }
2274
+ } catch { /* commit recording failure must not crash the tick */ }
1526
2275
  if (maybeWriteCompletedEndgameScorecard(cwd, startingEndgame)) {
1527
2276
  tickNextStep = 'pick the next horizon';
1528
2277
  }
@@ -1602,6 +2351,152 @@ async function autopilotAtris(description, options = {}) {
1602
2351
  return { success: completed > 0, completed };
1603
2352
  }
1604
2353
 
2354
/**
 * Compute a task's age in whole days.
 *
 * Resolution order:
 *   1. A YYYY-MM-DD timestamp found inside `task.claimed`.
 *   2. For tasks tagged `endgame`, the `**Picked:**` date in the TODO file.
 *   3. Otherwise 0 (treated as fresh).
 *
 * An unparseable date at one step falls through to the next step.
 *
 * @param {{ claimed?: string, tag?: string }} task - task record
 * @param {string} [todoPath] - path to TODO.md, consulted only for endgame tasks
 * @returns {number} whole days elapsed since the resolved date, or 0
 */
function getTaskAgeDays(task, todoPath) {
  const MS_PER_DAY = 1000 * 60 * 60 * 24;

  // Days elapsed since an ISO date string, or null when it does not parse.
  const daysSince = (isoDate) => {
    const parsed = new Date(isoDate);
    if (isNaN(parsed)) return null;
    return Math.floor((Date.now() - parsed.getTime()) / MS_PER_DAY);
  };

  if (task.claimed) {
    const stamp = task.claimed.match(/\d{4}-\d{2}-\d{2}/);
    const claimedAge = stamp ? daysSince(stamp[0]) : null;
    if (claimedAge !== null) return claimedAge;
  }

  const canReadTodo = task.tag === 'endgame' && todoPath && fs.existsSync(todoPath);
  if (canReadTodo) {
    const todoText = fs.readFileSync(todoPath, 'utf8');
    const picked = todoText.match(/\*\*Picked:\*\*\s*(\d{4}-\d{2}-\d{2})/);
    const pickedAge = picked ? daysSince(picked[1]) : null;
    if (pickedAge !== null) return pickedAge;
  }

  return 0;
}
2378
+
2379
/**
 * Check whether a task/fact is still actionable using mechanical evidence only.
 *
 * Decision ladder:
 *   - age <= 7 days                          → 'actionable'
 *   - title yields no usable keywords        → 'unverified'
 *   - `source` given but no longer on disk   → 'stale'
 *   - no keyword found in any *.js / *.md    → 'stale'
 *   - a keyword appears in a commit message
 *     from the last 30 days                  → 'actionable'
 *   - otherwise                              → 'unverified'
 *
 * Keywords are matched as literal strings (never as regexes or options), and
 * `node_modules` / `.git` are excluded from the search so vendored code does
 * not make every task look alive.
 *
 * @param {{ title: string, age: number, source?: string }} fact
 *   - title: the task or fact description
 *   - age: age in days since the task was created/last verified
 *   - source: optional file path or identifier where the fact originated
 * @param {string} cwd - workspace root
 * @returns {'actionable'|'unverified'|'stale'}
 */
function isStillTrue(fact, cwd) {
  const { title, age, source } = fact;

  // Fresh tasks are always actionable
  if (age <= 7) return 'actionable';

  // Extract searchable keywords from the title (skip short/common words).
  // A missing title yields no keywords instead of throwing.
  const keywords = String(title || '')
    .replace(/[`\[\](){}]/g, '')
    .split(/[\s/\\.:,;]+/)
    .filter(w => w.length > 3)
    .slice(0, 5);

  if (keywords.length === 0) return 'unverified';

  // Strategy 1: If source file is given, check it still exists
  if (source) {
    const sourcePath = path.isAbsolute(source) ? source : path.join(cwd, source);
    if (!fs.existsSync(sourcePath)) return 'stale';
  }

  // Strategy 2: grep the codebase for key terms from the title. Only "any hit
  // vs. none" matters, so stop at the first keyword that matches.
  const foundInCode = keywords.some(kw => {
    try {
      execFileSync(
        'grep',
        [
          '-r', '-q', '-F',
          '--include=*.js', '--include=*.md',
          '--exclude-dir=node_modules', '--exclude-dir=.git',
          '-e', kw, // -e keeps a keyword that starts with "-" from being parsed as an option
          '.'
        ],
        { cwd, stdio: ['ignore', 'ignore', 'ignore'], timeout: 10000 }
      );
      return true;
    } catch {
      // grep returns non-zero when no match — that's fine
      return false;
    }
  });

  // If none of the keywords appear in the codebase, it's stale
  if (!foundInCode) return 'stale';

  // Strategy 3: check git log for recent activity related to the keywords
  const recentlyTouched = keywords.slice(0, 3).some(kw => {
    try {
      const out = execFileSync(
        'git',
        ['log', '--oneline', '--since=30 days ago', '--all', '--fixed-strings', `--grep=${kw}`, '-1'],
        { cwd, stdio: ['ignore', 'pipe', 'ignore'], timeout: 10000 }
      ).toString().trim();
      return out.length > 0;
    } catch {
      // git-log failure is non-fatal
      return false;
    }
  });

  // Strong mechanical evidence: grep found terms AND recent git activity
  if (recentlyTouched) return 'actionable';

  // Grep found terms but no recent git activity — can't fully verify
  return 'unverified';
}
2448
+
2449
/**
 * Ask a local model whether a task/fact is still relevant.
 * Called when isStillTrue returns 'unverified' — the mechanical check
 * couldn't confirm or deny, so we ask `claude -p` to inspect the codebase.
 *
 * The prompt is handed to the CLI directly as an argv element. No shell is
 * involved and no temp file is written into the workspace, so the task title
 * cannot be interpreted by a shell and concurrent checks cannot clobber each
 * other's prompt.
 *
 * @param {{ title: string, age: number, source?: string }} fact
 * @param {string} cwd - workspace root (the CLI runs here)
 * @returns {{ fresh: boolean, reasoning: string }} `fresh` is true only when
 *   the model's answer line starts with YES. Any failure (CLI missing,
 *   timeout, non-zero exit) yields `fresh: false` with a
 *   "Model check failed: ..." reasoning; this function does not throw for
 *   subprocess errors.
 */
function askModel(fact, cwd) {
  const { title, source } = fact;
  const sourceHint = source ? `\nOriginal source file: ${source}` : '';
  const prompt = `You are a staleness checker. Answer with exactly one line: YES or NO, followed by a short reason (under 30 words).

Is this task still relevant to the codebase? Check for the mentioned files, functions, or patterns.

Task: "${title}"${sourceHint}

Search the codebase to verify. Reply: YES <reason> or NO <reason>`;

  try {
    // Drop CLAUDECODE from the child's environment (copy; never mutate process.env).
    const env = { ...process.env };
    delete env.CLAUDECODE;

    const output = execFileSync(
      'claude',
      ['-p', prompt, '--allowedTools', 'Bash,Read,Glob,Grep'],
      {
        cwd,
        encoding: 'utf8',
        timeout: 60000,
        stdio: 'pipe',
        maxBuffer: 2 * 1024 * 1024,
        env
      }
    ).trim();

    // Prefer the first line that starts with YES/NO; fall back to line one.
    const lines = output.split('\n');
    const firstLine = lines.find(l => /^\s*(YES|NO)\b/i.test(l)) || lines[0] || '';
    const fresh = /^\s*YES\b/i.test(firstLine);
    const reasoning = firstLine.replace(/^\s*(YES|NO)\s*/i, '').trim() || output.slice(0, 200);

    return { fresh, reasoning };
  } catch (err) {
    // On timeout or crash, treat as unverifiable — conservative default
    const detail = ((err && err.message) || '').slice(0, 100);
    return { fresh: false, reasoning: `Model check failed: ${detail}` };
  }
}
2499
+
1605
2500
  /**
1606
2501
  * Entry point when called without a description.
1607
2502
  */
@@ -1611,19 +2506,30 @@ async function autopilotFromTodo(options = {}) {
1611
2506
 
1612
2507
  module.exports = {
1613
2508
  appendTickSummary,
2509
+ askHuman,
2510
+ askModel,
1614
2511
  autopilotAtris,
1615
2512
  autopilotFromTodo,
1616
2513
  buildPrompt,
2514
+ isLessonResolved,
2515
+ isStillTrue,
2516
+ getTaskAgeDays,
1617
2517
  getIdleTickCount,
1618
2518
  getRecentSignals,
1619
2519
  getTickStatus,
1620
2520
  getVerifyCommand,
1621
2521
  computeTickReward,
2522
+ verifyJudgeIntegrity,
1622
2523
  maybeWriteCompletedEndgameScorecard,
1623
2524
  renderHumanSuggestion,
1624
2525
  renderHumanTickIntro,
1625
2526
  proposeCandidateHorizons,
2527
+ recordTickCommit,
2528
+ regressionCheck,
2529
+ runPlanReview,
1626
2530
  runTaskOnce,
2531
+ buildPlanReviewPrompt,
2532
+ parseVerdict,
1627
2533
  scoreEndgameCandidates,
1628
2534
  suggestNextTask,
1629
2535
  writeLesson