atris 3.0.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,6 +13,12 @@ const readline = require('readline');
13
13
  const { getLogPath, ensureLogDirectory, createLogFile } = require('../lib/journal');
14
14
  const { parseTodo } = require('../lib/todo');
15
15
  const { findStalePages, findStaleTasks, healBrokenMapRefs } = require('./clean');
16
+ const {
17
+ buildScorecardData,
18
+ readScorecards,
19
+ writeScorecard,
20
+ detectEndgameCompletion
21
+ } = require('../lib/scorecard');
16
22
 
17
23
  const pkg = require('../package.json');
18
24
 
@@ -204,7 +210,7 @@ async function suggestNextTask(cwd, skipped = new Set()) {
204
210
  if (suggestions.length === 0) {
205
211
  try {
206
212
  const candidates = await proposeCandidateHorizons(cwd);
207
- const top = candidates.reduce((best, c) => (c.confidence > best.confidence ? c : best), candidates[0]);
213
+ const top = scoreEndgameCandidates(cwd, candidates);
208
214
  return {
209
215
  task: top.title,
210
216
  why: top.rationale,
@@ -525,10 +531,54 @@ If broken beyond quick fix, reply: failed — [reason].`;
525
531
  return '';
526
532
  }
527
533
 
/**
 * Record a lesson in atris/lessons.md.
 *
 * Line format: - **[YYYY-MM-DD] slug** — pass/fail — explanation
 * The date is the UTC calendar day (taken from `toISOString`).
 *
 * Placement:
 *  - file missing: a new file is created with the standard header followed
 *    by the lesson;
 *  - file contains a `---` separator line: the lesson is inserted directly
 *    beneath the FIRST separator, so the newest lesson sits at the top of
 *    the list;
 *  - otherwise: the lesson is appended to the end of the file.
 *
 * @param {string} cwd - Project root that contains the `atris` folder.
 * @param {string} slug - Short identifier for the lesson.
 * @param {string} status - Outcome label, e.g. 'pass' or 'fail'.
 * @param {string} explanation - Free-form description of what was learned.
 * @returns {void}
 */
function writeLesson(cwd, slug, status, explanation) {
  const lessonsPath = path.join(cwd, 'atris', 'lessons.md');
  const today = new Date().toISOString().split('T')[0]; // YYYY-MM-DD
  // The file promises one line per lesson, so fold any embedded line breaks
  // (multi-line error messages, for example) into single spaces.
  const lessonLine = `- **[${today}] ${slug}** — ${status} — ${explanation}`
    .replace(/\s*[\r\n]+\s*/g, ' ')
    .trimEnd();

  if (!fs.existsSync(lessonsPath)) {
    fs.writeFileSync(lessonsPath, `# lessons.md — What We Learned\n\n> Append-only. One line per lesson.\n\n---\n\n${lessonLine}\n`);
    return;
  }

  let content = fs.readFileSync(lessonsPath, 'utf8');
  if (content.includes('---\n')) {
    // Use a replacer function: a replacement *string* would expand `$&`,
    // `$'`, `$$` etc., which routinely appear in shell commands and error
    // text and would corrupt the file.
    content = content.replace(/---\n/, () => `---\n\n${lessonLine}\n`);
  } else {
    content += `\n${lessonLine}\n`;
  }
  fs.writeFileSync(lessonsPath, content);
}
557
+
/**
 * Resolve the shell command used to verify a task.
 *
 * Parses atris/TODO.md, looks for a task whose title equals `taskTitle`
 * exactly (searching in-progress, then backlog, then completed entries) and
 * returns that task's `verify` field. Falls back to 'npm test' when TODO.md
 * does not exist, no task matches, or the matched task has no verify value.
 *
 * @param {string} cwd - Project root that contains the `atris` folder.
 * @param {string} taskTitle - Title to match against the parsed tasks.
 * @returns {string} The verify command.
 */
function getVerifyCommand(cwd, taskTitle) {
  const fallbackCommand = 'npm test';
  const todoFile = path.join(cwd, 'atris', 'TODO.md');

  if (fs.existsSync(todoFile)) {
    const { inProgress, backlog, completed } = parseTodo(todoFile);
    const match = [...inProgress, ...backlog, ...completed].find((entry) => entry.title === taskTitle);
    if (match && match.verify) {
      return match.verify;
    }
  }

  return fallbackCommand;
}
576
+
528
577
  function runTaskOnce(context, options = {}) {
529
- const { verbose = false } = options;
578
+ const { verbose = false, cwd = process.cwd() } = options;
530
579
  const phaseResults = {};
531
580
  const startedAt = Date.now();
581
+ const verifyCmd = getVerifyCommand(cwd, context.task);
532
582
 
533
583
  for (const phase of ['plan', 'do', 'review']) {
534
584
  const t0 = Date.now();
@@ -542,11 +592,41 @@ function runTaskOnce(context, options = {}) {
542
592
 
543
593
  const reviewOutput = phaseResults.review.output || '';
544
594
 
595
+ // After review succeeds, run verify command if present
596
+ let verifyPass = false;
597
+ let verifyRan = false;
598
+ if (!reviewOutput.includes('failed') && verifyCmd) {
599
+ verifyRan = true;
600
+ let t0 = Date.now();
601
+ try {
602
+ execSync(verifyCmd, { cwd, stdio: 'pipe' });
603
+ verifyPass = true;
604
+ const verifyTime = Math.round((Date.now() - t0) / 1000);
605
+ phaseResults.verify = {
606
+ output: `Verify passed (${verifyTime}s)`,
607
+ elapsedSeconds: verifyTime,
608
+ };
609
+ } catch (e) {
610
+ const verifyTime = Math.round((Date.now() - t0) / 1000);
611
+ phaseResults.verify = {
612
+ output: `Verify failed: ${e.message}`,
613
+ elapsedSeconds: verifyTime,
614
+ };
615
+ try {
616
+ const slug = (context.task || 'unknown').replace(/\s+/g, '-').toLowerCase().slice(0, 40);
617
+ writeLesson(cwd, `verify-fail-${slug}`, 'fail', `Verify command \`${verifyCmd}\` failed: ${e.message.split('\n')[0]}`);
618
+ } catch { /* lesson write must not crash the tick */ }
619
+ }
620
+ }
621
+
545
622
  return {
546
- success: !reviewOutput.includes('failed'),
623
+ success: !reviewOutput.includes('failed') && (!verifyRan || verifyPass),
547
624
  elapsedSeconds: Math.round((Date.now() - startedAt) / 1000),
548
625
  phaseResults,
549
626
  reviewOutput,
627
+ verifyCmd,
628
+ verifyPass,
629
+ verifyRan,
550
630
  };
551
631
  }
552
632
 
@@ -579,6 +659,47 @@ function logCompletion(description) {
579
659
  fs.writeFileSync(logFile, content);
580
660
  }
581
661
 
/**
 * Compute the per-tick reward score from execution signals.
 *
 * Contributions (summed):
 *  - review output is empty or does not contain 'failed': +1
 *  - verify ran and passed: +3
 *  - verify ran and passed AND the verify command is exactly 'npm test': +2
 *  - do-phase output looks like a commit landed: +1
 *  - tick outcome is 'halted': -3
 *
 * @param {object} execution - Result object shaped like runTaskOnce's return
 *   value: { reviewOutput, verifyRan, verifyPass, phaseResults: { do: { output } } }.
 *   Missing pieces are treated as empty rather than throwing.
 * @param {string} tickOutcome - Outcome label of the tick (e.g. 'built', 'halted').
 * @param {string} verifyCmd - Verify command that was used for the task.
 * @returns {number} The reward total (may be negative).
 */
function computeTickReward(execution, tickOutcome, verifyCmd) {
  const run = execution || {};
  let reward = 0;

  // Validator clean: review produced nothing, or nothing mentioning 'failed'.
  const reviewOutput = String(run.reviewOutput || '');
  if (!reviewOutput.includes('failed')) {
    reward += 1;
  }

  const verified = Boolean(run.verifyRan && run.verifyPass);
  if (verified) {
    reward += 3;
    // Extra credit when the check that passed was the project's test suite.
    if (verifyCmd === 'npm test') {
      reward += 2;
    }
  }

  // Commit landed. A phase that never ran simply contributes no output.
  const doPhase = run.phaseResults && run.phaseResults.do;
  const doOutput = String((doPhase && doPhase.output) || '');
  // git prints "[branch sha] subject" and then " N file(s) changed" on the
  // NEXT line, so the gap after '[' must be allowed to span newlines.
  const commitSummary = /\[[\s\S]*?\s\d+\sfile.*changed/i;
  if (commitSummary.test(doOutput) || doOutput.includes('git commit') || doOutput.includes('committed')) {
    reward += 1;
  }

  // Halt caught a problem: penalise the tick.
  if (tickOutcome === 'halted') {
    reward -= 3;
  }

  return reward;
}
702
+
582
703
  /**
583
704
  * Append a plain-language tick summary block to today's journal `## Notes`.
584
705
  * Fields:
@@ -588,9 +709,10 @@ function logCompletion(description) {
588
709
  * - nextStep: what the next tick will do
589
710
  * - idle: when true, block must contain literal "0 tasks in 0s"
590
711
  * so getIdleTickCount still works.
712
+ * - reward: optional tick reward score (from computeTickReward)
591
713
  * Safe to call inside a try/catch — a write failure must never crash a tick.
592
714
  */
593
- function appendTickSummary(cwd, { time, outcome, horizon, nextStep, idle } = {}) {
715
+ function appendTickSummary(cwd, { time, outcome, horizon, nextStep, idle, reward } = {}) {
594
716
  const now = new Date();
595
717
  const yyyy = now.getFullYear();
596
718
  const mm = String(now.getMonth() + 1).padStart(2, '0');
@@ -623,6 +745,10 @@ function appendTickSummary(cwd, { time, outcome, horizon, nextStep, idle } = {})
623
745
  ` ${horizonLine}`,
624
746
  ` ${nextLine}`,
625
747
  ];
748
+ // Add reward score if present
749
+ if (reward !== undefined && reward !== null) {
750
+ blockLines.push(` Reward: ${reward}`);
751
+ }
626
752
  // Idle marker must be the last non-empty line so getIdleTickCount, which
627
753
  // scans bottom-up, counts this block when idle=true.
628
754
  if (idleLine) blockLines.push(` ${idleLine}`);
@@ -644,20 +770,62 @@ function appendTickSummary(cwd, { time, outcome, horizon, nextStep, idle } = {})
644
770
  }
645
771
 
646
772
  /**
647
- * Read the current endgame slug from atris/TODO.md. Returns 'unset' on miss.
773
+ * Read the current endgame state from atris/TODO.md.
648
774
  */
649
- function readHorizonSlug(cwd) {
775
+ function readEndgameState(cwd) {
650
776
  try {
651
777
  const todoPath = path.join(cwd, 'atris', 'TODO.md');
652
- if (!fs.existsSync(todoPath)) return 'unset';
778
+ if (!fs.existsSync(todoPath)) {
779
+ return { slug: 'unset', pickedAt: null, horizon: '', remaining: 0, completed: 0 };
780
+ }
781
+
782
+ const todo = parseTodo(todoPath);
653
783
  const content = fs.readFileSync(todoPath, 'utf8');
654
- const match = content.match(/\*\*Slug:\*\*\s*(\S+)/);
655
- return match ? match[1].trim() : 'unset';
784
+ const endgameMatch = content.match(/##\s+Endgame\s*\n([\s\S]*?)(?=\n##|$)/);
785
+ const section = endgameMatch ? endgameMatch[1] : '';
786
+ const slugMatch = section.match(/\*\*Slug:\*\*\s*(\S+)/);
787
+ const pickedMatch = section.match(/\*\*Picked:\*\*\s*(.+)/);
788
+ const horizonMatch = section.match(/\*\*Horizon:\*\*\s*(.+)/);
789
+
790
+ return {
791
+ slug: slugMatch ? slugMatch[1].trim() : 'unset',
792
+ pickedAt: pickedMatch ? pickedMatch[1].trim() : null,
793
+ horizon: horizonMatch ? horizonMatch[1].trim() : '',
794
+ remaining: todo.backlog.filter(t => t.tag === 'endgame').length
795
+ + todo.inProgress.filter(t => t.tag === 'endgame').length,
796
+ completed: todo.completed.filter(t => t.tag === 'endgame').length,
797
+ };
656
798
  } catch {
657
- return 'unset';
799
+ return { slug: 'unset', pickedAt: null, horizon: '', remaining: 0, completed: 0 };
658
800
  }
659
801
  }
660
802
 
/**
 * Return only the slug of the current endgame, as reported by
 * readEndgameState for the given project root.
 */
function readHorizonSlug(cwd) {
  const { slug } = readEndgameState(cwd);
  return slug;
}
806
+
/**
 * Write a scorecard for an endgame that was open at the start of a task and
 * is now reported complete.
 *
 * Nothing is written (and false is returned) when:
 *  - there was no starting endgame, its slug was 'unset', or it had 0 tasks
 *    remaining when the snapshot was taken;
 *  - the `atris` folder does not exist under `cwd`;
 *  - detectEndgameCompletion does not report completion, or reports a slug
 *    different from the starting one;
 *  - readScorecards already returns an entry with the same slug.
 *
 * @param {string} cwd - Project root that contains the `atris` folder.
 * @param {{slug: string, pickedAt: ?string, remaining: number}} startingEndgame -
 *   Endgame snapshot captured before the task ran.
 * @returns {boolean} true when a scorecard was written, false otherwise.
 */
function maybeWriteCompletedEndgameScorecard(cwd, startingEndgame) {
  const hadOpenEndgame = Boolean(startingEndgame)
    && startingEndgame.slug !== 'unset'
    && startingEndgame.remaining !== 0;
  if (!hadOpenEndgame) {
    return false;
  }

  const atrisDir = path.join(cwd, 'atris');
  if (!fs.existsSync(atrisDir)) {
    return false;
  }

  const completion = detectEndgameCompletion(atrisDir);
  const finishedSlug = completion.endgameSlug;
  const finishedSameEndgame = completion.complete && finishedSlug === startingEndgame.slug;
  if (!finishedSameEndgame) {
    return false;
  }

  // Only one scorecard per endgame slug.
  const existing = readScorecards(atrisDir);
  if (existing.some((card) => card.slug === finishedSlug)) {
    return false;
  }

  const scorecard = buildScorecardData(atrisDir, {
    slug: finishedSlug,
    pickedAt: startingEndgame.pickedAt,
  });
  writeScorecard(atrisDir, scorecard);
  return true;
}
828
+
661
829
  /**
662
830
  * Main loop. Suggest → justify → approve → execute, one at a time.
663
831
  */
@@ -737,24 +905,11 @@ function getTickStatus(cwd) {
737
905
  }
738
906
  }
739
907
 
740
- let slug = '(no endgame active — feed inbox or /endgame)';
741
- let horizon = '';
742
- const todoPath = path.join(atrisDir, 'TODO.md');
743
- let remaining = 0;
744
- let completedEndgame = 0;
745
- if (fs.existsSync(todoPath)) {
746
- const todoContent = fs.readFileSync(todoPath, 'utf8');
747
- const endgameMatch = todoContent.match(/##\s+Endgame\s*\n([\s\S]*?)(?=\n##|$)/);
748
- if (endgameMatch) {
749
- const slugMatch = endgameMatch[1].match(/\*\*Slug:\*\*\s*(.+)/);
750
- const horizonMatch = endgameMatch[1].match(/\*\*Horizon:\*\*\s*(.+)/);
751
- if (slugMatch) slug = slugMatch[1].trim();
752
- if (horizonMatch) horizon = horizonMatch[1].trim();
753
- }
754
- const todo = parseTodo(todoPath);
755
- remaining = todo.backlog.filter(t => t.tag === 'endgame').length;
756
- completedEndgame = todo.completed.filter(t => /^[A-Z]\d+[a-z]?[:\s]/.test((t.title || '').trim())).length;
757
- }
908
+ const endgame = readEndgameState(cwd);
909
+ const slug = endgame.slug === 'unset' ? '(no endgame active — feed inbox or /endgame)' : endgame.slug;
910
+ const horizon = endgame.horizon;
911
+ const remaining = endgame.remaining;
912
+ const completedEndgame = endgame.completed;
758
913
 
759
914
  const total = remaining + completedEndgame;
760
915
  const done = completedEndgame;
@@ -912,6 +1067,93 @@ function getRecentSignals(cwd) {
912
1067
  return { recentCommits, wikiHealth, recentLessons };
913
1068
  }
914
1069
 
/**
 * Choose the next endgame candidate using rewards recorded in past scorecards.
 *
 * Steps:
 *  1. Read the last 10 scorecards and group their `totalReward` by type,
 *     where type is the (lower-cased) slug prefix before the first dash.
 *     Scorecards without a usable slug or numeric reward are skipped.
 *  2. Compute the mean reward per type.
 *  3. Give each candidate a type (first known type contained in its title,
 *     else the title's first word) and an expected value of
 *     mean(type) * confidence. Unknown types have mean 0.
 *  4. Rank by expected value, breaking ties by confidence so that
 *     uninformative history still prefers the most confident candidate.
 *  5. With probability 0.8 take the top-ranked candidate (exploit); otherwise
 *     take a uniformly random one (explore).
 *
 * When there is no atris folder, no (usable) scorecards, no candidates, or
 * scoring throws, the most confident candidate is returned with
 * `scored: false` and a `reason` describing the fallback.
 *
 * @param {string} cwd - Current working directory.
 * @param {Array<{title: string, confidence: number, rationale: string}>} candidates
 * @param {function(): number} [random=Math.random] - Source of numbers in
 *   [0, 1); injectable so the explore/exploit split can be made deterministic.
 * @returns {object} Scored pick: { title, confidence, rationale, scored: true, reason }.
 *   Fallback: the candidate's own fields plus { scored: false, reason }; for an
 *   empty candidate list only { scored: false, reason } is present.
 */
function scoreEndgameCandidates(cwd, candidates, random = Math.random) {
  const HISTORY_SIZE = 10;
  const EXPLOIT_RATE = 0.8;

  const pool = Array.isArray(candidates) ? candidates : [];

  // Most confident candidate (ties resolve to the later entry); undefined for
  // an empty pool, which spreads to no fields.
  const bestByConfidence = () =>
    pool.reduce((a, b) => (a.confidence > b.confidence ? a : b), pool[0]);
  const fallback = (reason) => ({ ...bestByConfidence(), scored: false, reason });

  if (pool.length === 0) return fallback('no candidates');

  const atrisDir = path.join(cwd, 'atris');
  if (!fs.existsSync(atrisDir)) return fallback('no atris folder');

  try {
    const scorecards = readScorecards(atrisDir).slice(-HISTORY_SIZE);
    if (scorecards.length === 0) return fallback('no scorecards');

    // A Map keeps lookups free of Object.prototype keys such as "constructor".
    const rewardsByType = new Map();
    for (const sc of scorecards) {
      const type = sc && typeof sc.slug === 'string' ? sc.slug.split('-')[0].toLowerCase() : '';
      const reward = Number.parseFloat(sc && sc.totalReward);
      // One malformed scorecard should not poison the means or abort scoring.
      if (!type || !Number.isFinite(reward)) continue;
      if (!rewardsByType.has(type)) rewardsByType.set(type, []);
      rewardsByType.get(type).push(reward);
    }
    if (rewardsByType.size === 0) return fallback('no usable scorecards');

    const meanByType = new Map();
    for (const [type, rewards] of rewardsByType) {
      meanByType.set(type, rewards.reduce((sum, r) => sum + r, 0) / rewards.length);
    }
    const knownTypes = [...meanByType.keys()];

    const ranked = pool.map((candidate) => {
      const titleLower = String(candidate.title || '').toLowerCase();
      const type = knownTypes.find((t) => titleLower.includes(t)) || titleLower.split(/[\s-]+/)[0];
      const historicalMean = meanByType.has(type) ? meanByType.get(type) : 0;
      const confidence = Number.isFinite(candidate.confidence) ? candidate.confidence : 0;
      return { candidate, type, historicalMean, confidence, expectedValue: historicalMean * confidence };
    });

    // Highest expected value first; equal expected values (typically all 0
    // when history matches nothing) fall back to confidence.
    ranked.sort((a, b) => (b.expectedValue - a.expectedValue) || (b.confidence - a.confidence));

    const exploit = random() < EXPLOIT_RATE;
    const pick = exploit
      ? ranked[0]
      : ranked[Math.min(ranked.length - 1, Math.floor(random() * ranked.length))];

    const reason = exploit
      ? `exploit: type=${pick.type} mean-reward=${pick.historicalMean.toFixed(1)} expected-value=${pick.expectedValue.toFixed(1)}`
      : `explore: random-candidate type=${pick.type}`;

    return {
      title: pick.candidate.title,
      confidence: pick.candidate.confidence,
      rationale: pick.candidate.rationale,
      scored: true,
      reason,
    };
  } catch (err) {
    // Scoring is best-effort: any failure degrades to the confidence pick.
    return fallback(`scoring error: ${err.message}`);
  }
}
1156
+
915
1157
  /**
916
1158
  * Propose 3 candidate next horizons for the autopilot loop. Combines
917
1159
  * `getIdleTickCount` + `getRecentSignals` into a prompt asking the LLM
@@ -1100,6 +1342,8 @@ async function autopilotAtris(description, options = {}) {
1100
1342
  let tickOutcomeText = 'I stopped for a manual check.';
1101
1343
  let tickNextStep = 'look for new work';
1102
1344
  let lastTaskTitle = null;
1345
+ let lastExecution = null;
1346
+ let lastVerifyCmd = null;
1103
1347
 
1104
1348
  for (let i = 0; i < maxIterations; i++) {
1105
1349
  // Check time budget
@@ -1212,6 +1456,7 @@ async function autopilotAtris(description, options = {}) {
1212
1456
  // Execute: plan → do → review
1213
1457
  lastTaskTitle = suggestion.task;
1214
1458
  const context = { task: suggestion.task, kind: suggestion.kind };
1459
+ const startingEndgame = readEndgameState(cwd);
1215
1460
 
1216
1461
  try {
1217
1462
  if (verbose) {
@@ -1224,7 +1469,9 @@ async function autopilotAtris(description, options = {}) {
1224
1469
  'Next I will report what happened and whether review passed.'
1225
1470
  ].join('\n'));
1226
1471
  }
1227
- const execution = runTaskOnce(context, { verbose });
1472
+ const execution = runTaskOnce(context, { verbose, cwd });
1473
+ lastExecution = execution;
1474
+ lastVerifyCmd = execution.verifyCmd;
1228
1475
  const planTime = execution.phaseResults.plan.elapsedSeconds;
1229
1476
  if (verbose) console.log(` planned (${planTime}s)`);
1230
1477
 
@@ -1253,11 +1500,32 @@ async function autopilotAtris(description, options = {}) {
1253
1500
  }
1254
1501
  if (verbose) console.log(` reviewed (${reviewTime}s)`);
1255
1502
 
1503
+ // Handle verify failure
1504
+ if (!execution.verifyPass) {
1505
+ tickOutcome = 'halted';
1506
+ tickOutcomeText = `I planned, built, and reviewed "${lastTaskTitle}" but verify failed.`;
1507
+ tickNextStep = 'verify failed, halting';
1508
+ writeLesson(cwd, 'verify-failed', 'fail', `Task "${lastTaskTitle}" passed review but failed verify command.`);
1509
+ if (verbose) {
1510
+ console.log(` verify failed. stopping for manual check.`);
1511
+ } else {
1512
+ printPlainBlock([
1513
+ `I planned, built, and reviewed the task, but the verify check failed.`,
1514
+ '',
1515
+ 'Next I stopped for a manual check.'
1516
+ ].join('\n'));
1517
+ }
1518
+ break;
1519
+ }
1520
+
1256
1521
  completed++;
1257
1522
  tickOutcome = 'built';
1258
1523
  tickOutcomeText = `I planned, built, and reviewed "${suggestion.task}".`;
1259
1524
  tickNextStep = 'pick the next endgame task';
1260
1525
  logCompletion(suggestion.task);
1526
+ if (maybeWriteCompletedEndgameScorecard(cwd, startingEndgame)) {
1527
+ tickNextStep = 'pick the next horizon';
1528
+ }
1261
1529
  if (verbose) {
1262
1530
  console.log(` done. ${completed} task${completed > 1 ? 's' : ''} completed.`);
1263
1531
  console.log('');
@@ -1301,12 +1569,20 @@ async function autopilotAtris(description, options = {}) {
1301
1569
  minute: '2-digit'
1302
1570
  }).toLowerCase();
1303
1571
  const idle = tickOutcome === 'idle' || (completed === 0 && tickOutcome !== 'halted');
1572
+
1573
+ // Compute reward score if we had an execution
1574
+ let tickReward = undefined;
1575
+ if (lastExecution && lastVerifyCmd) {
1576
+ tickReward = computeTickReward(lastExecution, tickOutcome, lastVerifyCmd);
1577
+ }
1578
+
1304
1579
  appendTickSummary(cwd, {
1305
1580
  time,
1306
1581
  outcome: tickOutcomeText,
1307
1582
  horizon: horizonSlug === 'unset' ? null : horizonSlug,
1308
1583
  nextStep: tickNextStep,
1309
- idle
1584
+ idle,
1585
+ reward: tickReward
1310
1586
  });
1311
1587
  } catch {
1312
1588
  /* journal write failure must not crash the tick */
@@ -1341,9 +1617,14 @@ module.exports = {
1341
1617
  getIdleTickCount,
1342
1618
  getRecentSignals,
1343
1619
  getTickStatus,
1620
+ getVerifyCommand,
1621
+ computeTickReward,
1622
+ maybeWriteCompletedEndgameScorecard,
1344
1623
  renderHumanSuggestion,
1345
1624
  renderHumanTickIntro,
1346
1625
  proposeCandidateHorizons,
1347
1626
  runTaskOnce,
1348
- suggestNextTask
1627
+ scoreEndgameCandidates,
1628
+ suggestNextTask,
1629
+ writeLesson
1349
1630
  };