atris 3.1.0 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,7 @@
8
8
 
9
9
  const fs = require('fs');
10
10
  const path = require('path');
11
- const { execSync } = require('child_process');
11
+ const { execSync, execFileSync } = require('child_process');
12
12
  const readline = require('readline');
13
13
  const { getLogPath, ensureLogDirectory, createLogFile } = require('../lib/journal');
14
14
  const { parseTodo } = require('../lib/todo');
@@ -19,6 +19,7 @@ const {
19
19
  writeScorecard,
20
20
  detectEndgameCompletion
21
21
  } = require('../lib/scorecard');
22
+ const { REWARD_CONFIG, REWARD_CHECKSUM } = require('../lib/reward-config');
22
23
 
23
24
  const pkg = require('../package.json');
24
25
 
@@ -28,7 +29,7 @@ const PHASE_TIMEOUT = 600000; // 10 min per phase
28
29
  * Scan workspace for the next thing worth doing.
29
30
  * Returns { task, why, kind } or null.
30
31
  */
31
- async function suggestNextTask(cwd, skipped = new Set()) {
32
+ async function suggestNextTask(cwd, skipped = new Set(), { auto = false } = {}) {
32
33
  const atrisDir = path.join(cwd, 'atris');
33
34
  const suggestions = [];
34
35
 
@@ -37,6 +38,7 @@ async function suggestNextTask(cwd, skipped = new Set()) {
37
38
  const todo = parseTodo(todoPath);
38
39
 
39
40
  for (const t of todo.backlog) {
41
+ if (t.tags && t.tags.includes('unverified')) continue;
40
42
  if (t.tag === 'endgame' && !skipped.has(t.title)) {
41
43
  suggestions.push({
42
44
  task: t.title,
@@ -51,7 +53,7 @@ async function suggestNextTask(cwd, skipped = new Set()) {
51
53
  // --- Resume interrupted work ---
52
54
  if (todo.inProgress.length > 0) {
53
55
  const t = todo.inProgress[0];
54
- if (!skipped.has(t.title)) {
56
+ if (!(t.tags && t.tags.includes('unverified')) && !skipped.has(t.title)) {
55
57
  suggestions.push({
56
58
  task: t.title,
57
59
  why: `This was already started${t.claimed ? ` by ${t.claimed}` : ''} but never finished.`,
@@ -102,15 +104,17 @@ async function suggestNextTask(cwd, skipped = new Set()) {
102
104
  }
103
105
 
104
106
  // --- Backlog tasks ---
105
- for (const t of todo.backlog.slice(0, 1)) {
107
+ for (const t of todo.backlog) {
108
+ if (t.tags && t.tags.includes('unverified')) continue;
106
109
  if (skipped.has(t.title)) continue;
107
- const remaining = todo.backlog.length;
110
+ const remaining = todo.backlog.filter(b => !(b.tags && b.tags.includes('unverified'))).length;
108
111
  suggestions.push({
109
112
  task: t.title,
110
113
  why: `Next in the backlog${t.tag ? ` (${t.tag})` : ''}. ${remaining} task${remaining > 1 ? 's' : ''} waiting.`,
111
114
  kind: 'backlog',
112
115
  priority: 5
113
116
  });
117
+ break;
114
118
  }
115
119
 
116
120
  // --- Unprocessed inbox items ---
@@ -223,7 +227,64 @@ async function suggestNextTask(cwd, skipped = new Set()) {
223
227
  }
224
228
 
225
229
  suggestions.sort((a, b) => a.priority - b.priority);
226
- return suggestions[0];
230
+
231
+ // Staleness gate: filter out unverified/stale suggestions
232
+ const staleSkipped = [];
233
+ const fresh = [];
234
+ for (const s of suggestions) {
235
+ const fakeTask = { title: s.task, tag: s.kind === 'endgame' ? 'endgame' : null, claimed: null };
236
+ if (s.kind === 'resume' && todo.inProgress.length > 0) {
237
+ fakeTask.claimed = todo.inProgress[0].claimed;
238
+ }
239
+ const age = getTaskAgeDays(fakeTask, todoPath);
240
+ const status = isStillTrue({ title: s.task, age, source: null }, cwd);
241
+ if (status === 'stale') {
242
+ staleSkipped.push({ task: s.task, status, reasoning: null });
243
+ continue;
244
+ }
245
+ if (status === 'unverified') {
246
+ if (auto) {
247
+ // Auto mode: use model check
248
+ const result = askModel({ title: s.task, age, source: null }, cwd);
249
+ if (!result.fresh) {
250
+ staleSkipped.push({ task: s.task, status: 'unverified (model: not fresh)', reasoning: result.reasoning });
251
+ continue;
252
+ }
253
+ } else {
254
+ // Interactive mode: ask the human
255
+ const result = await askHuman(s.task);
256
+ if (!result.fresh) {
257
+ staleSkipped.push({ task: s.task, status: 'unverified (human: not relevant)', reasoning: null });
258
+ continue;
259
+ }
260
+ }
261
+ }
262
+ fresh.push(s);
263
+ }
264
+
265
+ // Log skipped items to journal
266
+ if (staleSkipped.length > 0) {
267
+ try {
268
+ const { logFile } = getLogPath();
269
+ const now = new Date();
270
+ const hhmm = `${String(now.getHours()).padStart(2, '0')}:${String(now.getMinutes()).padStart(2, '0')}`;
271
+ const lines = staleSkipped.map(s => `- ${s.task} (${s.status})${s.reasoning ? ` — ${s.reasoning}` : ''}`);
272
+ const note = `\n### Staleness skip — ${hhmm}\n${lines.join('\n')}\n`;
273
+ if (fs.existsSync(logFile)) {
274
+ const content = fs.readFileSync(logFile, 'utf8');
275
+ const notesIdx = content.indexOf('## Notes');
276
+ if (notesIdx !== -1) {
277
+ const insertAt = content.indexOf('\n', notesIdx) + 1;
278
+ const updated = content.slice(0, insertAt) + note + content.slice(insertAt);
279
+ fs.writeFileSync(logFile, updated);
280
+ } else {
281
+ fs.appendFileSync(logFile, `\n## Notes\n${note}`);
282
+ }
283
+ }
284
+ } catch {}
285
+ }
286
+
287
+ return fresh[0] || null;
227
288
  }
228
289
 
229
290
  /**
@@ -242,6 +303,22 @@ function askApproval() {
242
303
  });
243
304
  }
244
305
 
306
/**
 * Ask the human whether an unverified task is still relevant.
 *
 * Prompts on process.stdin/process.stdout and resolves once a line is entered.
 * Only "y" or "yes" (case-insensitive, surrounding whitespace ignored) counts
 * as relevant. If the input stream closes before an answer arrives (EOF,
 * piped or otherwise non-interactive stdin), the promise resolves as "not
 * relevant" instead of staying pending forever.
 *
 * @param {string} taskTitle - Task title shown in the prompt.
 * @returns {Promise<{ fresh: boolean }>} fresh is true only on an explicit yes.
 */
function askHuman(taskTitle) {
  return new Promise((resolve) => {
    const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
    let answered = false;

    // Without this, an EOF on stdin closes the interface without ever
    // invoking the question callback, and the caller would await forever.
    rl.once('close', () => {
      if (!answered) resolve({ fresh: false });
    });

    rl.question(` is "${taskTitle}" still relevant? y/n → `, (answer) => {
      // Mark first: rl.close() emits 'close' synchronously.
      answered = true;
      rl.close();
      const normalized = (answer || '').trim().toLowerCase();
      resolve({ fresh: normalized === 'y' || normalized === 'yes' });
    });
  });
}
321
+
245
322
  /**
246
323
  * Run a phase via claude -p subprocess.
247
324
  */
@@ -555,30 +632,141 @@ function writeLesson(cwd, slug, status, explanation) {
555
632
  fs.writeFileSync(lessonsPath, content);
556
633
  }
557
634
 
635
/**
 * Record a tick's commit hash and verify command in atris/tick-registry.json.
 * Each entry: { hash, verifyCmd, slug, timestamp }.
 *
 * An unreadable, unparsable, or non-array registry file is treated as empty
 * and overwritten with a fresh array containing only the new entry. The
 * `atris` directory is created if it does not exist yet.
 *
 * @param {string} cwd - Workspace root.
 * @param {string} hash - Commit hash produced by the tick.
 * @param {string} verifyCmd - Verify command that was run for the tick.
 * @param {string} slug - Short identifier for the task.
 */
function recordTickCommit(cwd, hash, verifyCmd, slug) {
  const registryDir = path.join(cwd, 'atris');
  const registryPath = path.join(registryDir, 'tick-registry.json');

  let registry = [];
  if (fs.existsSync(registryPath)) {
    try {
      const parsed = JSON.parse(fs.readFileSync(registryPath, 'utf8'));
      // Valid JSON that is not an array (e.g. `{}`) would break push() below.
      if (Array.isArray(parsed)) registry = parsed;
    } catch {
      // Corrupt registry: start over rather than fail the tick.
      registry = [];
    }
  }

  registry.push({ hash, verifyCmd, slug, timestamp: new Date().toISOString() });
  fs.mkdirSync(registryDir, { recursive: true });
  fs.writeFileSync(registryPath, JSON.stringify(registry, null, 2) + '\n');
}
648
+
649
/**
 * Retroactive regression check. Reads the last 10 entries from
 * atris/tick-registry.json, checks out each entry's commit into a temporary
 * git worktree under `cwd`, and runs the entry's verify command there
 * (60 s timeout). On failure a lesson is written with retroactive context.
 *
 * Entries are reported as `{ pass: true, skipped: true }` when they cannot be
 * checked: missing/invalid hash, missing verify command, or the worktree
 * could not be created (e.g. the commit no longer exists).
 *
 * The hash comes from a file on disk, so it is validated as hexadecimal and
 * passed to git as an argument vector — never interpolated into a shell
 * string. The verify command itself is intentionally run through the shell.
 *
 * @param {string} cwd - Workspace root (must be inside a git repository for
 *   any entry to be checked).
 * @returns {Array<{ hash: *, slug: *, pass: boolean, skipped?: boolean }>}
 */
function regressionCheck(cwd) {
  const registryPath = path.join(cwd, 'atris', 'tick-registry.json');
  if (!fs.existsSync(registryPath)) return [];

  let registry;
  try {
    registry = JSON.parse(fs.readFileSync(registryPath, 'utf8'));
  } catch {
    return [];
  }
  if (!Array.isArray(registry) || registry.length === 0) return [];

  const results = [];

  for (const entry of registry.slice(-10)) {
    // Tolerate null / non-object entries instead of throwing on property access.
    const record = entry && typeof entry === 'object' ? entry : {};
    const { hash, slug, verifyCmd } = record;

    const checkable =
      typeof hash === 'string' &&
      /^[0-9a-f]{4,64}$/i.test(hash) &&
      typeof verifyCmd === 'string' &&
      verifyCmd.length > 0;
    if (!checkable) {
      results.push({ hash, slug, pass: true, skipped: true });
      continue;
    }

    const worktreePath = path.join(cwd, '.regression-worktree-' + hash.slice(0, 8));
    let pass = false;
    try {
      try {
        execFileSync('git', ['worktree', 'add', '--detach', worktreePath, hash], { cwd, stdio: 'pipe' });
      } catch {
        // Worktree creation failed (e.g. commit doesn't exist) — skip the entry.
        results.push({ hash, slug, pass: true, skipped: true });
        continue;
      }
      try {
        execSync(verifyCmd, { cwd: worktreePath, stdio: 'pipe', timeout: 60000 });
        pass = true;
      } catch {
        pass = false;
      }
    } finally {
      // Best-effort cleanup; runs on the skip path too, where it is a no-op failure.
      try {
        execFileSync('git', ['worktree', 'remove', '--force', worktreePath], { cwd, stdio: 'pipe' });
      } catch {
        // Nothing to remove or git unavailable — ignore.
      }
    }

    if (!pass) {
      writeLesson(cwd, `regression-${slug || 'unknown'}`, 'fail',
        `Retroactive regression: verify command for tick ${hash.slice(0, 7)} (${slug}) now fails. -5 retroactive penalty applied.`);
    }

    results.push({ hash, slug, pass });
  }

  return results;
}
702
+
558
703
/**
 * Look up the verify command declared for a task in atris/TODO.md.
 *
 * The task is matched by exact title against the in-progress, backlog and
 * completed lists (in that order) returned by parseTodo.
 *
 * @param {string} cwd - Workspace root.
 * @param {string} taskTitle - Title of the task to look up.
 * @returns {{ cmd: (string|null), explicit: boolean }} `explicit` is true only
 *   when the task was found and carries a verify field; otherwise
 *   `{ cmd: null, explicit: false }`.
 */
function getVerifyCommand(cwd, taskTitle) {
  const todoFile = path.join(cwd, 'atris', 'TODO.md');
  if (!fs.existsSync(todoFile)) {
    return { cmd: null, explicit: false };
  }

  const parsed = parseTodo(todoFile);
  const everyTask = [...parsed.inProgress, ...parsed.backlog, ...parsed.completed];
  const match = everyTask.find((candidate) => candidate.title === taskTitle);

  if (match && match.verify) {
    return { cmd: match.verify, explicit: true };
  }
  return { cmd: null, explicit: false };
}
720
+
721
/**
 * Check the reward judge against the shipped checksum.
 *
 * Computes a SHA-256 hex digest over the JSON serialization of REWARD_CONFIG
 * followed by the source text of computeTickReward, and compares it with
 * REWARD_CHECKSUM.
 *
 * @returns {{ ok: boolean, expected: string, actual: string }}
 */
function verifyJudgeIntegrity() {
  const { createHash } = require('crypto');
  const actual = createHash('sha256')
    .update(JSON.stringify(REWARD_CONFIG))
    .update(computeTickReward.toString())
    .digest('hex');
  const expected = REWARD_CHECKSUM;
  return { ok: actual === expected, expected, actual };
}
576
733
 
577
734
  function runTaskOnce(context, options = {}) {
578
735
  const { verbose = false, cwd = process.cwd() } = options;
736
+
737
+ // Judge integrity check — halt if computeTickReward was tampered with
738
+ const integrity = verifyJudgeIntegrity();
739
+ if (!integrity.ok) {
740
+ writeLesson(cwd, 'judge-corruption', 'fail',
741
+ `computeTickReward checksum mismatch. Expected ${integrity.expected}, got ${integrity.actual}. Tick halted.`);
742
+ return {
743
+ outcome: 'halted',
744
+ reason: 'judge-corruption',
745
+ phaseResults: {},
746
+ elapsedSeconds: 0,
747
+ verifyRan: false,
748
+ verifyPass: false,
749
+ };
750
+ }
751
+
579
752
  const phaseResults = {};
580
753
  const startedAt = Date.now();
581
- const verifyCmd = getVerifyCommand(cwd, context.task);
754
+ const verifyResult = getVerifyCommand(cwd, context.task);
755
+ const verifyCmd = verifyResult.cmd;
756
+
757
+ // Guard: refuse to run ticks without an explicit Verify field
758
+ if (!verifyResult.explicit) {
759
+ writeLesson(cwd, 'no-verify-field', 'fail',
760
+ `Task "${context.task}" has no explicit **Verify:** field in TODO.md. Tick halted — every task must declare how to verify it.`);
761
+ return {
762
+ outcome: 'halted',
763
+ reason: 'no-verify-field',
764
+ phaseResults: {},
765
+ elapsedSeconds: 0,
766
+ verifyRan: false,
767
+ verifyPass: false,
768
+ };
769
+ }
582
770
 
583
771
  for (const phase of ['plan', 'do', 'review']) {
584
772
  const t0 = Date.now();
@@ -595,7 +783,7 @@ function runTaskOnce(context, options = {}) {
595
783
  // After review succeeds, run verify command if present
596
784
  let verifyPass = false;
597
785
  let verifyRan = false;
598
- if (!reviewOutput.includes('failed') && verifyCmd) {
786
+ if (verifyCmd) {
599
787
  verifyRan = true;
600
788
  let t0 = Date.now();
601
789
  try {
@@ -620,7 +808,7 @@ function runTaskOnce(context, options = {}) {
620
808
  }
621
809
 
622
810
  return {
623
- success: !reviewOutput.includes('failed') && (!verifyRan || verifyPass),
811
+ success: verifyRan && verifyPass,
624
812
  elapsedSeconds: Math.round((Date.now() - startedAt) / 1000),
625
813
  phaseResults,
626
814
  reviewOutput,
@@ -673,28 +861,28 @@ function computeTickReward(execution, tickOutcome, verifyCmd) {
673
861
 
674
862
  // Validator clean: review passed without 'failed'
675
863
  if (!execution.reviewOutput || !execution.reviewOutput.includes('failed')) {
676
- reward += 1;
864
+ reward += REWARD_CONFIG.REVIEW_CLEAN;
677
865
  }
678
866
 
679
- // Verify passed: +3
867
+ // Verify passed
680
868
  if (execution.verifyRan && execution.verifyPass) {
681
- reward += 3;
869
+ reward += REWARD_CONFIG.VERIFY_PASS;
682
870
  }
683
871
 
684
- // npm test passed: +2
872
+ // npm test passed
685
873
  if (execution.verifyRan && execution.verifyPass && verifyCmd === 'npm test') {
686
- reward += 2;
874
+ reward += REWARD_CONFIG.NPM_TEST_BONUS;
687
875
  }
688
876
 
689
877
  // Commit landed: check do phase output for git commit patterns
690
878
  const doOutput = execution.phaseResults.do.output || '';
691
879
  if (doOutput.match(/\[.*\s\d+\sfile.*changed/i) || doOutput.includes('git commit') || doOutput.includes('committed')) {
692
- reward += 1;
880
+ reward += REWARD_CONFIG.COMMIT_LANDED;
693
881
  }
694
882
 
695
- // Halt caught hallucination: -3
883
+ // Halt caught hallucination
696
884
  if (tickOutcome === 'halted') {
697
- reward -= 3;
885
+ reward += REWARD_CONFIG.HALT_PENALTY;
698
886
  }
699
887
 
700
888
  return reward;
@@ -1070,7 +1258,15 @@ function getRecentSignals(cwd) {
1070
1258
  /**
1071
1259
  * Score endgame candidates by historical reward of similar horizon types.
1072
1260
  * Reads last 10 scorecards, infers type from slug prefix, calculates mean
1073
- * reward per type, scores candidates by expected value, applies 80/20 exploit/explore.
1261
+ * reward per type, scores candidates by expected value.
1262
+ *
1263
+ * Adaptive explore rate: if the last 5 endgames are all the same type,
1264
+ * explore rate boosts to 50%. Otherwise scales between 20%-50% based on
1265
+ * type repetition in the last 5.
1266
+ *
1267
+ * Difficulty floor: candidates whose inferred type has >80% success rate
1268
+ * AND mean reward >5 are filtered out when harder candidates exist, so
1269
+ * easy wins don't starve hard work.
1074
1270
  *
1075
1271
  * @param {string} cwd - Current working directory
1076
1272
  * @param {array} candidates - Array of { title, confidence, rationale }
@@ -1094,10 +1290,14 @@ function scoreEndgameCandidates(cwd, candidates) {
1094
1290
 
1095
1291
  // Infer type from slug/title by taking prefix before first dash
1096
1292
  const typeToRewards = {};
1293
+ const typeToAttempts = {}; // track shipped/attempted per type
1097
1294
  for (const sc of scorecards) {
1098
1295
  const type = sc.slug.split('-')[0];
1099
1296
  if (!typeToRewards[type]) typeToRewards[type] = [];
1100
1297
  typeToRewards[type].push(sc.totalReward);
1298
+ if (!typeToAttempts[type]) typeToAttempts[type] = { shipped: 0, attempted: 0 };
1299
+ typeToAttempts[type].shipped += sc.tasksShipped;
1300
+ typeToAttempts[type].attempted += sc.tasksAttempted;
1101
1301
  }
1102
1302
 
1103
1303
  // Calculate mean reward per type
@@ -1107,45 +1307,70 @@ function scoreEndgameCandidates(cwd, candidates) {
1107
1307
  typeMeans[type] = mean;
1108
1308
  }
1109
1309
 
1310
+ // Calculate success rate per type
1311
+ const typeSuccessRate = {};
1312
+ for (const [type, counts] of Object.entries(typeToAttempts)) {
1313
+ typeSuccessRate[type] = counts.attempted > 0 ? counts.shipped / counts.attempted : 0;
1314
+ }
1315
+
1316
+ // Adaptive explore rate based on diversity of last 5 scorecards
1317
+ const last5 = scorecards.slice(-5);
1318
+ const last5Types = last5.map(sc => sc.slug.split('-')[0]);
1319
+ const uniqueTypes = new Set(last5Types).size;
1320
+ // All same type → exploreRate=0.5; all different → exploreRate=0.2
1321
+ // Linear interpolation: exploreRate = 0.5 - (uniqueTypes - 1) * 0.3 / (last5Types.length - 1 || 1)
1322
+ const maxTypes = last5Types.length;
1323
+ const exploreRate = maxTypes <= 1
1324
+ ? 0.2
1325
+ : 0.5 - (uniqueTypes - 1) * 0.3 / (maxTypes - 1);
1326
+
1110
1327
  // Score each candidate by expected value based on historical type mean
1111
1328
  const scored = candidates.map(c => {
1112
1329
  // Infer type from title keywords that match scorecard slug prefixes
1113
1330
  const titleLower = (c.title || '').toLowerCase();
1114
1331
  const cType = Object.keys(typeMeans).find(t => titleLower.includes(t)) || titleLower.split(/[\s\-]+/)[0];
1115
1332
  const historicalMean = typeMeans[cType] !== undefined ? typeMeans[cType] : 0;
1333
+ const successRate = typeSuccessRate[cType] !== undefined ? typeSuccessRate[cType] : 0;
1116
1334
  const expectedValue = historicalMean * c.confidence;
1117
1335
  return {
1118
1336
  ...c,
1119
1337
  expectedValue,
1120
1338
  type: cType,
1121
- historicalMean
1339
+ historicalMean,
1340
+ successRate
1122
1341
  };
1123
1342
  });
1124
1343
 
1344
+ // Difficulty floor: filter out easy-win candidates (>80% success rate AND
1345
+ // mean reward >5) when harder candidates exist
1346
+ const hardCandidates = scored.filter(c => !(c.successRate > 0.8 && c.historicalMean > 5));
1347
+ const pool = hardCandidates.length > 0 ? hardCandidates : scored;
1348
+
1125
1349
  // Sort by expected value (descending)
1126
- scored.sort((a, b) => b.expectedValue - a.expectedValue);
1350
+ pool.sort((a, b) => b.expectedValue - a.expectedValue);
1127
1351
 
1128
- // 80/20 split: 80% exploit (best), 20% explore (random)
1352
+ // Adaptive exploit/explore split
1129
1353
  const choice = Math.random();
1130
1354
  let selected;
1131
- if (choice < 0.8) {
1355
+ if (choice < (1 - exploreRate)) {
1132
1356
  // Exploit: return highest expected value
1133
- selected = scored[0];
1357
+ selected = pool[0];
1134
1358
  } else {
1135
- // Explore: return random candidate
1359
+ // Explore: return random candidate from full scored list (not filtered)
1136
1360
  selected = scored[Math.floor(Math.random() * scored.length)];
1137
1361
  }
1138
1362
 
1139
- const reason = choice < 0.8
1140
- ? `exploit: type=${selected.type} mean-reward=${selected.historicalMean.toFixed(1)} expected-value=${selected.expectedValue.toFixed(1)}`
1141
- : `explore: random-candidate type=${selected.type}`;
1363
+ const reason = choice < (1 - exploreRate)
1364
+ ? `exploit: type=${selected.type} mean-reward=${selected.historicalMean.toFixed(1)} expected-value=${selected.expectedValue.toFixed(1)} explore-rate=${exploreRate.toFixed(2)}`
1365
+ : `explore: random-candidate type=${selected.type} explore-rate=${exploreRate.toFixed(2)}`;
1142
1366
 
1143
1367
  return {
1144
1368
  title: selected.title,
1145
1369
  confidence: selected.confidence,
1146
1370
  rationale: selected.rationale,
1147
1371
  scored: true,
1148
- reason
1372
+ reason,
1373
+ exploreRate
1149
1374
  };
1150
1375
  } catch (err) {
1151
1376
  // If scoring fails, fall back to best by confidence
@@ -1154,14 +1379,71 @@ function scoreEndgameCandidates(cwd, candidates) {
1154
1379
  }
1155
1380
  }
1156
1381
 
1382
/**
 * Check whether a lesson's bug pattern is still present in the named files.
 *
 * Parses the lesson line for the slug (bold text after the date, e.g.
 * `**[2026-04-08] inbox-parser-eats-hr-separator**`) and for backtick-quoted
 * file references (e.g. `commands/autopilot.js:116`). Slug words longer than
 * two characters become keywords; each referenced file that still exists is
 * searched case-insensitively for any keyword.
 *
 * The search is done in-process. A file that exists but cannot be read makes
 * the lesson count as NOT resolved: failing to look is not evidence that the
 * pattern is gone.
 *
 * @param {string} lessonLine - A single line from lessons.md
 * @param {string} cwd - Current working directory
 * @returns {boolean} true if no keyword is found in any named file (resolved);
 *   false if a keyword is found, a file is unreadable, or the line has no
 *   slug, no file references, or no usable keywords.
 */
function isLessonResolved(lessonLine, cwd) {
  const slugMatch = lessonLine.match(/\*\*\[\d{4}-\d{2}-\d{2}\]\s+([\w-]+)\*\*/);
  if (!slugMatch) return false;
  const slug = slugMatch[1];

  // Collect file paths such as `commands/autopilot.js:116` or `commands/run.js:157-160`.
  const fileRefs = [];
  const filePattern = /`([a-zA-Z0-9_/./-]+\.[a-zA-Z]+(?::\d+(?:-\d+)?)?)`/g;
  let m;
  while ((m = filePattern.exec(lessonLine)) !== null) {
    const ref = m[1].replace(/:\d+(-\d+)?$/, ''); // strip line numbers
    if (ref.includes('/') || ref.endsWith('.js') || ref.endsWith('.md') || ref.endsWith('.ts')) {
      fileRefs.push(ref);
    }
  }
  if (fileRefs.length === 0) return false;

  // Keywords come from the slug: split on dashes, drop short words.
  const keywords = slug
    .split('-')
    .filter((w) => w.length > 2)
    .map((w) => w.toLowerCase());
  if (keywords.length === 0) return false;

  for (const ref of fileRefs) {
    const absPath = path.isAbsolute(ref) ? ref : path.join(cwd, ref);
    if (!fs.existsSync(absPath)) continue; // file deleted = pattern gone

    let haystack;
    try {
      haystack = fs.readFileSync(absPath, 'utf8').toLowerCase();
    } catch {
      // Exists but unreadable: cannot prove the pattern is gone.
      return false;
    }
    // Any keyword still present → lesson still applies.
    if (keywords.some((kw) => haystack.includes(kw))) return false;
  }

  // No keyword matched in any named file → lesson is resolved
  return true;
}
1437
+
1157
1438
  /**
1158
1439
  * Propose 3 candidate next horizons for the autopilot loop. Combines
1159
1440
  * `getIdleTickCount` + `getRecentSignals` into a prompt asking the LLM
1160
1441
  * to imagine what to work on next, spawns `claude -p`, and parses the
1161
1442
  * JSON response into `[{ title, confidence, rationale }]`.
1162
1443
  *
1163
- * Throws on subprocess failure or when fewer than 3 valid candidates
1164
- * come back. Callers are responsible for catching and falling back.
1444
+ * Filters out candidates derived from resolved lessons (bug pattern no
1445
+ * longer present in named files). Resolved lessons get tagged `[resolved]`
1446
+ * in lessons.md. Requires at least 1 valid candidate after filtering.
1165
1447
  */
1166
1448
  async function proposeCandidateHorizons(cwd) {
1167
1449
  const idleTicks = getIdleTickCount(cwd);
@@ -1257,11 +1539,49 @@ Reply with the JSON array and nothing else.`;
1257
1539
  c.rationale.length > 0
1258
1540
  );
1259
1541
 
1260
- if (candidates.length < 3) {
1261
- throw new Error(`proposeCandidateHorizons: expected 3 valid candidates, got ${candidates.length}`);
1542
+ if (candidates.length < 1) {
1543
+ throw new Error(`proposeCandidateHorizons: expected at least 1 valid candidate, got ${candidates.length}`);
1544
+ }
1545
+
1546
+ // Filter out candidates derived from resolved lessons
1547
+ const lessonsPath = path.join(cwd, 'atris', 'lessons.md');
1548
+ const filtered = [];
1549
+ for (const c of candidates) {
1550
+ const combinedText = `${c.title} ${c.rationale}`.toLowerCase();
1551
+ let droppedByLesson = false;
1552
+ for (const lessonLine of signals.recentLessons) {
1553
+ const slugMatch = lessonLine.match(/\*\*\[\d{4}-\d{2}-\d{2}\]\s+([\w-]+)\*\*/);
1554
+ if (!slugMatch) continue;
1555
+ if (lessonLine.includes('[resolved]')) continue;
1556
+ const slug = slugMatch[1];
1557
+ // Fuzzy match: check if slug keywords appear in the candidate text
1558
+ const slugWords = slug.split('-').filter(w => w.length > 2);
1559
+ const matchCount = slugWords.filter(w => combinedText.includes(w)).length;
1560
+ if (matchCount < Math.ceil(slugWords.length * 0.5)) continue;
1561
+ // Candidate matches this lesson — check if the lesson is resolved
1562
+ if (isLessonResolved(lessonLine, cwd)) {
1563
+ // Tag lesson [resolved] in lessons.md
1564
+ try {
1565
+ let content = fs.readFileSync(lessonsPath, 'utf8');
1566
+ const taggedLine = lessonLine.replace(
1567
+ /\*\*\[(\d{4}-\d{2}-\d{2})\]\s+([\w-]+)\*\*/,
1568
+ '**[$1] $2** [resolved]'
1569
+ );
1570
+ content = content.replace(lessonLine.trim(), taggedLine.trim());
1571
+ fs.writeFileSync(lessonsPath, content);
1572
+ } catch {}
1573
+ droppedByLesson = true;
1574
+ break;
1575
+ }
1576
+ }
1577
+ if (!droppedByLesson) filtered.push(c);
1578
+ }
1579
+
1580
+ if (filtered.length < 1) {
1581
+ throw new Error('proposeCandidateHorizons: all candidates were from resolved lessons');
1262
1582
  }
1263
1583
 
1264
- return candidates.slice(0, 3);
1584
+ return filtered.slice(0, 3);
1265
1585
  }
1266
1586
 
1267
1587
  async function autopilotAtris(description, options = {}) {
@@ -1361,7 +1681,7 @@ async function autopilotAtris(description, options = {}) {
1361
1681
  break;
1362
1682
  }
1363
1683
 
1364
- const suggestion = await suggestNextTask(cwd, skipped);
1684
+ const suggestion = await suggestNextTask(cwd, skipped, { auto });
1365
1685
 
1366
1686
  if (!suggestion) {
1367
1687
  tickOutcome = 'idle';
@@ -1472,6 +1792,22 @@ async function autopilotAtris(description, options = {}) {
1472
1792
  const execution = runTaskOnce(context, { verbose, cwd });
1473
1793
  lastExecution = execution;
1474
1794
  lastVerifyCmd = execution.verifyCmd;
1795
+
1796
+ // Early halt — judge corruption or no verify field
1797
+ if (execution.outcome === 'halted') {
1798
+ tickOutcome = 'halted';
1799
+ tickOutcomeText = `I halted before running "${lastTaskTitle}": ${execution.reason}.`;
1800
+ tickNextStep = 'stop until a human looks at the error';
1801
+ if (!verbose) {
1802
+ printPlainBlock([
1803
+ `I halted: ${execution.reason}.`,
1804
+ '',
1805
+ 'Next I stopped the loop.'
1806
+ ].join('\n'));
1807
+ }
1808
+ break;
1809
+ }
1810
+
1475
1811
  const planTime = execution.phaseResults.plan.elapsedSeconds;
1476
1812
  if (verbose) console.log(` planned (${planTime}s)`);
1477
1813
 
@@ -1523,6 +1859,39 @@ async function autopilotAtris(description, options = {}) {
1523
1859
  tickOutcomeText = `I planned, built, and reviewed "${suggestion.task}".`;
1524
1860
  tickNextStep = 'pick the next endgame task';
1525
1861
  logCompletion(suggestion.task);
1862
+
1863
+ // Record commit hash + verify command for retroactive regression checks
1864
+ try {
1865
+ const commitHash = execSync('git rev-parse HEAD', { cwd, encoding: 'utf8' }).trim();
1866
+ const taskSlug = (suggestion.task || 'unknown').replace(/\s+/g, '-').toLowerCase().slice(0, 40);
1867
+ recordTickCommit(cwd, commitHash, execution.verifyCmd || '', taskSlug);
1868
+
1869
+ // Every 10th tick, run retroactive regression check
1870
+ const registryPath = path.join(cwd, 'atris', 'tick-registry.json');
1871
+ if (fs.existsSync(registryPath)) {
1872
+ try {
1873
+ const registry = JSON.parse(fs.readFileSync(registryPath, 'utf8'));
1874
+ if (Array.isArray(registry) && registry.length % 10 === 0) {
1875
+ const regressionResults = regressionCheck(cwd);
1876
+ const failures = regressionResults.filter(r => !r.pass && !r.skipped);
1877
+ if (failures.length > 0) {
1878
+ // Apply -5 retroactive penalty per failure via journal note
1879
+ for (const f of failures) {
1880
+ appendTickSummary(cwd, {
1881
+ outcome: `Retroactive regression failure: tick ${f.hash.slice(0, 7)} (${f.slug}) verify now fails. -5 penalty.`,
1882
+ horizon: readHorizonSlug(cwd),
1883
+ nextStep: 'investigate regression',
1884
+ reward: -5,
1885
+ });
1886
+ }
1887
+ if (verbose) console.log(` regression check: ${failures.length} failure(s) found`);
1888
+ } else if (verbose) {
1889
+ console.log(` regression check: all ${regressionResults.length} entries pass`);
1890
+ }
1891
+ }
1892
+ } catch { /* registry read failure must not crash */ }
1893
+ }
1894
+ } catch { /* commit recording failure must not crash the tick */ }
1526
1895
  if (maybeWriteCompletedEndgameScorecard(cwd, startingEndgame)) {
1527
1896
  tickNextStep = 'pick the next horizon';
1528
1897
  }
@@ -1602,6 +1971,152 @@ async function autopilotAtris(description, options = {}) {
1602
1971
  return { success: completed > 0, completed };
1603
1972
  }
1604
1973
 
1974
/**
 * Compute age in whole days for a task.
 *
 * Resolution order:
 *  1. If `task.claimed` contains a YYYY-MM-DD date, age is measured from it.
 *  2. Otherwise, if `task.tag === 'endgame'` and `todoPath` exists, age is
 *     measured from the first `**Picked:** YYYY-MM-DD` found in that file.
 *  3. Otherwise 0 (fresh).
 *
 * Dates are parsed as ISO dates (UTC midnight). Dates in the future — which
 * also happens when a local calendar date is ahead of UTC — yield 0 rather
 * than a negative age. An unreadable TODO file falls through to 0.
 *
 * @param {{ claimed?: (string|null), tag?: (string|null) }} task
 * @param {string|null} todoPath - Path to TODO.md, used for endgame tasks.
 * @returns {number} Non-negative age in days.
 */
function getTaskAgeDays(task, todoPath) {
  const MS_PER_DAY = 1000 * 60 * 60 * 24;

  // Returns the clamped age for a YYYY-MM-DD string, or null if it is not a real date.
  const ageFrom = (isoDate) => {
    const ts = new Date(isoDate).getTime();
    if (Number.isNaN(ts)) return null;
    return Math.max(0, Math.floor((Date.now() - ts) / MS_PER_DAY));
  };

  if (task.claimed) {
    const claimedDate = String(task.claimed).match(/\d{4}-\d{2}-\d{2}/);
    const age = claimedDate ? ageFrom(claimedDate[0]) : null;
    if (age !== null) return age;
  }

  if (task.tag === 'endgame' && todoPath && fs.existsSync(todoPath)) {
    let content = '';
    try {
      content = fs.readFileSync(todoPath, 'utf8');
    } catch {
      // Unreadable TODO: fall back to "fresh" below.
    }
    const picked = content.match(/\*\*Picked:\*\*\s*(\d{4}-\d{2}-\d{2})/);
    const age = picked ? ageFrom(picked[1]) : null;
    if (age !== null) return age;
  }

  return 0;
}
1998
+
1999
/**
 * Check whether a task/fact is still actionable.
 *
 * Decision steps, in order:
 *  1. age <= 7 days → 'actionable' (no further checks).
 *  2. No keyword longer than 3 characters in the title → 'unverified'.
 *  3. `source` given but the file no longer exists → 'stale'.
 *  4. grep *.js / *.md files under cwd for each keyword (up to 5). No hit at
 *     all → 'stale' — unless grep itself could not run, in which case the
 *     absence of hits proves nothing and the result is 'unverified'.
 *  5. A commit from the last 30 days whose message mentions one of the first
 *     three keywords → 'actionable'; otherwise 'unverified'.
 *
 * Keywords are matched as fixed strings and passed after `-e`, so titles
 * containing regex metacharacters or leading dashes cannot be misread as
 * patterns or options. `node_modules` and `.git` are excluded from the scan.
 *
 * NOTE(review): the workspace's own atris/TODO.md is a *.md file under cwd and
 * presumably contains the task title, so step 4 may rarely yield 'stale' for
 * TODO-derived tasks — confirm whether the atris directory should be excluded.
 *
 * @param {{ title: string, age: number, source?: string }} fact
 *   - title: the task or fact description
 *   - age: age in days since the task was created/last verified
 *   - source: optional file path or identifier where the fact originated
 * @param {string} cwd - workspace root
 * @returns {'actionable'|'unverified'|'stale'}
 */
function isStillTrue(fact, cwd) {
  const { title, age, source } = fact;

  // Fresh tasks are always actionable
  if (age <= 7) return 'actionable';

  // Extract searchable keywords from the title (skip short/common words)
  const keywords = title
    .replace(/[`\[\](){}]/g, '')
    .split(/[\s/\\.:,;]+/)
    .filter((w) => w.length > 3)
    .slice(0, 5);

  if (keywords.length === 0) return 'unverified';

  // Strategy 1: if a source file is given, it must still exist
  if (source) {
    const sourcePath = path.isAbsolute(source) ? source : path.join(cwd, source);
    if (!fs.existsSync(sourcePath)) return 'stale';
  }

  // Strategy 2: grep the codebase for key terms from the title
  let grepHits = 0;
  let grepFailures = 0;
  for (const kw of keywords) {
    try {
      execFileSync(
        'grep',
        [
          '-r', '-l', '-F',
          '--include=*.js', '--include=*.md',
          '--exclude-dir=node_modules', '--exclude-dir=.git',
          '-m', '1',
          '-e', kw,
          '.'
        ],
        { cwd, stdio: ['ignore', 'pipe', 'ignore'], timeout: 10000 }
      );
      grepHits++;
    } catch (err) {
      // grep can list matching files and still exit non-zero (e.g. one
      // unreadable directory, or a timeout after the first hit).
      const listed = err && err.stdout ? String(err.stdout).trim() : '';
      if (listed.length > 0) {
        grepHits++;
      } else if (!err || err.status !== 1) {
        // Exit status 1 is a clean "no match"; anything else means grep
        // could not do its job (missing binary, bad cwd, timeout).
        grepFailures++;
      }
    }
  }

  if (grepHits === 0) {
    // Only call it stale when every search actually ran and found nothing.
    return grepFailures > 0 ? 'unverified' : 'stale';
  }

  // Strategy 3: check git log for recent activity related to the keywords
  let gitHits = 0;
  for (const kw of keywords.slice(0, 3)) {
    try {
      const out = execFileSync(
        'git',
        ['log', '--oneline', '--since=30 days ago', '--all', '--fixed-strings', `--grep=${kw}`, '-1'],
        { cwd, stdio: ['ignore', 'pipe', 'ignore'], timeout: 10000 }
      ).toString().trim();
      if (out.length > 0) gitHits++;
    } catch {
      // git-log failure is non-fatal
    }
  }

  // Strong mechanical evidence: grep found terms AND recent git activity
  if (gitHits > 0) return 'actionable';

  // Grep found terms but no recent git activity — can't fully verify
  return 'unverified';
}
2068
+
2069
/**
 * Ask the `claude` CLI whether a task/fact is still relevant.
 * Intended for when isStillTrue returns 'unverified' — the mechanical check
 * couldn't confirm or deny, so `claude -p` is asked to inspect the codebase.
 *
 * The prompt is passed directly as an argument (no shell, no temp file in the
 * workspace). The CLAUDECODE environment variable is removed for the child
 * process. Any failure — missing binary, bad cwd, timeout (60 s), non-zero
 * exit — yields `{ fresh: false }` with the error message as reasoning; this
 * function does not throw for subprocess problems.
 *
 * @param {{ title: string, age: number, source?: string }} fact
 * @param {string} cwd - workspace root
 * @returns {{ fresh: boolean, reasoning: string }} fresh is true only when the
 *   reply line starts with YES.
 */
function askModel(fact, cwd) {
  const { title, source } = fact;
  const sourceHint = source ? `\nOriginal source file: ${source}` : '';
  const prompt = `You are a staleness checker. Answer with exactly one line: YES or NO, followed by a short reason (under 30 words).

Is this task still relevant to the codebase? Check for the mentioned files, functions, or patterns.

Task: "${title}"${sourceHint}

Search the codebase to verify. Reply: YES <reason> or NO <reason>`;

  try {
    const env = { ...process.env };
    delete env.CLAUDECODE;

    const output = execFileSync(
      'claude',
      ['-p', prompt, '--allowedTools', 'Bash,Read,Glob,Grep'],
      {
        cwd,
        encoding: 'utf8',
        timeout: 60000,
        stdio: 'pipe',
        maxBuffer: 2 * 1024 * 1024,
        env
      }
    ).trim();

    // Prefer the first line that starts with YES/NO; otherwise use the first line.
    const lines = output.split('\n');
    const verdictLine = lines.find((l) => /^\s*(YES|NO)\b/i.test(l)) || lines[0] || '';
    const fresh = /^\s*YES\b/i.test(verdictLine);
    const reasoning = verdictLine.replace(/^\s*(YES|NO)\s*/i, '').trim() || output.slice(0, 200);

    return { fresh, reasoning };
  } catch (err) {
    // On timeout or crash, treat as unverifiable — conservative default
    const message = (err && err.message) || '';
    return { fresh: false, reasoning: `Model check failed: ${message.slice(0, 100)}` };
  }
}
2119
+
1605
2120
  /**
1606
2121
  * Entry point when called without a description.
1607
2122
  */
@@ -1611,18 +2126,26 @@ async function autopilotFromTodo(options = {}) {
1611
2126
 
1612
2127
  module.exports = {
1613
2128
  appendTickSummary,
2129
+ askHuman,
2130
+ askModel,
1614
2131
  autopilotAtris,
1615
2132
  autopilotFromTodo,
1616
2133
  buildPrompt,
2134
+ isLessonResolved,
2135
+ isStillTrue,
2136
+ getTaskAgeDays,
1617
2137
  getIdleTickCount,
1618
2138
  getRecentSignals,
1619
2139
  getTickStatus,
1620
2140
  getVerifyCommand,
1621
2141
  computeTickReward,
2142
+ verifyJudgeIntegrity,
1622
2143
  maybeWriteCompletedEndgameScorecard,
1623
2144
  renderHumanSuggestion,
1624
2145
  renderHumanTickIntro,
1625
2146
  proposeCandidateHorizons,
2147
+ recordTickCommit,
2148
+ regressionCheck,
1626
2149
  runTaskOnce,
1627
2150
  scoreEndgameCandidates,
1628
2151
  suggestNextTask,