@interleavelove/keating 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/README.md +274 -0
  2. package/bin/keating.js +31 -0
  3. package/dist/src/cli/main.js +165 -0
  4. package/dist/src/core/animation.js +372 -0
  5. package/dist/src/core/benchmark.js +238 -0
  6. package/dist/src/core/config.js +81 -0
  7. package/dist/src/core/evolution.js +224 -0
  8. package/dist/src/core/learner-state.js +88 -0
  9. package/dist/src/core/lesson-plan.js +155 -0
  10. package/dist/src/core/map.js +89 -0
  11. package/dist/src/core/paths.js +69 -0
  12. package/dist/src/core/pi-agent.js +58 -0
  13. package/dist/src/core/policy.js +53 -0
  14. package/dist/src/core/project.js +189 -0
  15. package/dist/src/core/prompt-evolution.js +337 -0
  16. package/dist/src/core/random.js +19 -0
  17. package/dist/src/core/self-improve.js +419 -0
  18. package/dist/src/core/topics.js +620 -0
  19. package/dist/src/core/types.js +1 -0
  20. package/dist/src/core/util.js +28 -0
  21. package/dist/src/core/verification.js +162 -0
  22. package/dist/src/pi/hyperteacher-extension.js +180 -0
  23. package/dist/src/runtime/pi.js +118 -0
  24. package/dist/test/animation.test.js +43 -0
  25. package/dist/test/config.test.js +36 -0
  26. package/dist/test/evolution.test.js +39 -0
  27. package/dist/test/fuzz.test.js +37 -0
  28. package/dist/test/hyperteacher-extension.test.js +122 -0
  29. package/dist/test/lesson-plan.test.js +35 -0
  30. package/dist/test/pipeline.test.js +57 -0
  31. package/dist/test/prompt-evolution.test.js +89 -0
  32. package/package.json +58 -0
  33. package/pi/prompts/bridge.md +14 -0
  34. package/pi/prompts/diagnose.md +15 -0
  35. package/pi/prompts/improve.md +39 -0
  36. package/pi/prompts/learn.md +21 -0
  37. package/pi/prompts/quiz.md +14 -0
  38. package/pi/skills/adaptive-teaching/SKILL.md +33 -0
  39. package/scripts/install/install.sh +307 -0
@@ -0,0 +1,419 @@
1
+ import { readFile, writeFile, mkdir } from "node:fs/promises";
2
+ import { join } from "node:path";
3
+ import { runBenchmarkSuite } from "./benchmark.js";
4
+ import { loadPolicy } from "./policy.js";
5
+ import { currentPolicyPath, stateDir, outputsDir } from "./paths.js";
6
+ import { mean } from "./util.js";
7
+ // ---------------------------------------------------------------------------
8
+ // Mutable source files the meta-evolution loop is allowed to touch
9
+ // ---------------------------------------------------------------------------
10
/**
 * Allow-list of source files the meta-evolution loop may modify, keyed by the
 * short label used when a diagnosed weakness is mapped to a file.
 *
 * Frozen so the allow-list cannot be widened or repointed at runtime.
 */
const MUTABLE_SOURCES = Object.freeze({
    "lesson-plan": "src/core/lesson-plan.ts",
    "benchmark-weights": "src/core/benchmark.ts",
    "animation": "src/core/animation.ts",
    "topics": "src/core/topics.ts",
    "map": "src/core/map.ts",
    "policy-defaults": "src/core/policy.ts"
});
18
/**
 * Repository-relative paths of the source files that self-improvement must
 * never modify. Exported as a Set so membership can be tested with `.has()`.
 */
export const IMMUTABLE_SOURCES = new Set(
    ["self-improve", "types", "config", "paths", "random"].map((moduleName) => `src/core/${moduleName}.ts`)
);
26
+ // ---------------------------------------------------------------------------
27
+ // Archive persistence
28
+ // ---------------------------------------------------------------------------
29
/**
 * Path of the improvement archive file.
 *
 * @param cwd Project directory handed to `stateDir`.
 * @returns `improvement-archive.json` joined onto the directory returned by `stateDir(cwd)`.
 */
export function improvementArchivePath(cwd) {
    const stateRoot = stateDir(cwd);
    return join(stateRoot, "improvement-archive.json");
}
32
/**
 * Directory that holds generated improvement proposal artifacts.
 *
 * @param cwd Project directory handed to `outputsDir`.
 * @returns `improvements` joined onto the directory returned by `outputsDir(cwd)`.
 */
export function improvementsDir(cwd) {
    const outputsRoot = outputsDir(cwd);
    return join(outputsRoot, "improvements");
}
35
/**
 * Directory under which per-proposal source snapshots are stored.
 *
 * @param cwd Project directory handed to `stateDir`.
 * @returns `snapshots` joined onto the directory returned by `stateDir(cwd)`.
 */
export function snapshotsDir(cwd) {
    const stateRoot = stateDir(cwd);
    return join(stateRoot, "snapshots");
}
38
/**
 * Loads the improvement archive for a project.
 *
 * A missing archive file is the normal first-run state and yields a fresh,
 * empty archive. Any other failure (unreadable file, invalid JSON) is raised
 * instead of being treated as "no history": callers load, mutate and save the
 * archive, so silently returning an empty one would overwrite the existing
 * history on the next save.
 *
 * @param cwd Project directory used to locate the archive file.
 * @returns The parsed archive, or an empty archive when the file does not exist.
 * @throws {Error} When the archive exists but cannot be read or parsed; the
 *   underlying error is attached as `cause`.
 */
export async function loadImprovementArchive(cwd) {
    const archivePath = improvementArchivePath(cwd);
    let raw;
    try {
        raw = await readFile(archivePath, "utf8");
    }
    catch (err) {
        if (err?.code === "ENOENT") {
            return { attempts: [], totalAccepted: 0, totalRejected: 0, cumulativeImprovement: 0 };
        }
        throw new Error(`Failed to read improvement archive at ${archivePath}`, { cause: err });
    }
    try {
        return JSON.parse(raw);
    }
    catch (err) {
        throw new Error(`Improvement archive at ${archivePath} is not valid JSON`, { cause: err });
    }
}
47
/**
 * Writes the archive to disk as pretty-printed JSON.
 *
 * The parent directory is created first so that recording an attempt works
 * even when nothing has created the state directory yet.
 *
 * @param cwd Project directory used to locate the archive file.
 * @param archive Archive object to serialize.
 */
async function saveImprovementArchive(cwd, archive) {
    const archivePath = improvementArchivePath(cwd);
    // join(path, "..") resolves to the containing directory of the archive file.
    await mkdir(join(archivePath, ".."), { recursive: true });
    await writeFile(archivePath, JSON.stringify(archive, null, 2), "utf8");
}
50
/**
 * Turns a benchmark suite result into a list of weakness records.
 *
 * Checks, in order: the lowest-scoring topic (flagged below a mean score of
 * 55), the suite-wide means of confusion (> 0.3), transfer (< 0.35) and
 * engagement (< 0.45), and then each topic's `dominantWeakness`.
 *
 * @param result Benchmark result exposing `topicBenchmarks` and `overallScore`.
 * @returns Array of `{ area, metric, value, file, region, explanation }` records.
 */
function diagnoseFromBenchmark(result) {
    const { topicBenchmarks } = result;
    const findings = [];
    const averageOf = (metricName) => mean(topicBenchmarks.map((entry) => entry[metricName]));

    // Lowest-scoring topic: sort a copy ascending and take the head.
    const [lowest] = [...topicBenchmarks].sort((left, right) => left.meanScore - right.meanScore);
    if (lowest && lowest.meanScore < 55) {
        const { topic, meanScore } = lowest;
        findings.push({
            area: `topic:${topic.slug}`,
            metric: "meanScore",
            value: meanScore,
            file: MUTABLE_SOURCES["topics"],
            region: `Topic definition for "${topic.slug}"`,
            explanation: `Topic "${topic.title}" scores ${meanScore.toFixed(1)}, well below the suite average of ${result.overallScore.toFixed(1)}. Its topic definition may need richer intuition, better misconceptions, or more targeted exercises.`
        });
    }

    // Suite-wide metrics that are weak across all topics.
    const suiteConfusion = averageOf("meanConfusion");
    if (suiteConfusion > 0.3) {
        findings.push({
            area: "simulation:overload",
            metric: "meanConfusion",
            value: suiteConfusion,
            file: MUTABLE_SOURCES["benchmark-weights"],
            region: "simulateTeaching overload calculation",
            explanation: `Mean confusion across all topics is ${suiteConfusion.toFixed(2)} (target < 0.3). The overload formula or its interaction with policy parameters may be miscalibrated.`
        });
    }

    const suiteTransfer = averageOf("meanTransfer");
    if (suiteTransfer < 0.35) {
        findings.push({
            area: "simulation:transfer",
            metric: "meanTransfer",
            value: suiteTransfer,
            file: MUTABLE_SOURCES["lesson-plan"],
            region: "Transfer and Reflection phase",
            explanation: `Mean transfer is ${suiteTransfer.toFixed(2)} (target > 0.35). The lesson plan's transfer phase or interdisciplinary hooks may need strengthening.`
        });
    }

    const suiteEngagement = averageOf("meanEngagement");
    if (suiteEngagement < 0.45) {
        findings.push({
            area: "simulation:engagement",
            metric: "meanEngagement",
            value: suiteEngagement,
            file: MUTABLE_SOURCES["lesson-plan"],
            region: "Socratic and practice phases",
            explanation: `Mean engagement is ${suiteEngagement.toFixed(2)} (target > 0.45). Lesson phases may need more interactive elements, stronger Socratic scaffolding, or better diagram integration.`
        });
    }

    // Per-topic dominant weaknesses.
    topicBenchmarks.forEach((entry) => {
        const { topic, dominantWeakness } = entry;
        const overloaded = dominantWeakness === "overload" && entry.meanConfusion > 0.35;
        if (overloaded) {
            findings.push({
                area: `topic-overload:${topic.slug}`,
                metric: "confusion",
                value: entry.meanConfusion,
                file: MUTABLE_SOURCES["lesson-plan"],
                region: `Domain-specific guidance for "${topic.domain}" topics`,
                explanation: `Topic "${topic.title}" (${topic.domain}) causes excessive overload (confusion ${entry.meanConfusion.toFixed(2)}). The domain-specific lesson customization may need adjustment.`
            });
        }
        if (dominantWeakness === "diagramFit") {
            findings.push({
                area: `visual:${topic.slug}`,
                metric: "diagramFit",
                // The recorded value for a diagram-fit weakness is the topic's mean score.
                value: entry.meanScore,
                file: MUTABLE_SOURCES["animation"],
                region: `Scene generator for ${topic.domain} domain`,
                explanation: `Topic "${topic.title}" has weak diagram fit. The animation scene for this domain may need richer visual representation.`
            });
        }
    });

    return findings;
}
123
+ // ---------------------------------------------------------------------------
124
+ // Proposal generation
125
+ // ---------------------------------------------------------------------------
126
// Module-level sequence number; incremented once per generated proposal id.
let proposalCounter = 0;
/**
 * Builds a proposal id of the form `improve-<base36 timestamp>-<sequence>`.
 *
 * @returns The new id; the sequence part increases by one on every call.
 */
function generateProposalId() {
    const sequence = ++proposalCounter;
    const stamp = Date.now().toString(36);
    return ["improve", stamp, sequence].join("-");
}
132
/**
 * Target value and direction for each weakness metric, used to put weaknesses
 * measured on different scales onto one comparable severity scale.
 *
 * The targets mirror the thresholds quoted in the weakness explanations
 * (mean score 55, suite confusion 0.3, per-topic confusion 0.35, transfer
 * 0.35, engagement 0.45). A "diagramFit" weakness carries the topic's mean
 * score as its value, so it is ranked on the mean-score scale.
 */
const SEVERITY_TARGETS = new Map([
    ["meanScore", { target: 55, higherIsWorse: false }],
    ["diagramFit", { target: 55, higherIsWorse: false }],
    ["meanConfusion", { target: 0.3, higherIsWorse: true }],
    ["confusion", { target: 0.35, higherIsWorse: true }],
    ["meanTransfer", { target: 0.35, higherIsWorse: false }],
    ["meanEngagement", { target: 0.45, higherIsWorse: false }]
]);
/**
 * Severity of one weakness: how far its value is on the wrong side of the
 * metric's target, as a fraction of that target.
 *
 * @param weakness Record with `metric` and `value`.
 * @returns A number >= 0; 0 for unknown metrics, non-finite values, or values
 *   that already meet the target.
 */
function weaknessSeverity(weakness) {
    const spec = SEVERITY_TARGETS.get(weakness.metric);
    if (!spec || !Number.isFinite(weakness.value)) {
        return 0;
    }
    const gap = spec.higherIsWorse ? weakness.value - spec.target : spec.target - weakness.value;
    return Math.max(0, gap / spec.target);
}
/**
 * Returns a copy of `weaknesses` ordered most severe first. The sort is
 * stable, so equally severe weaknesses keep their diagnosis order.
 */
function rankWeaknessesBySeverity(weaknesses) {
    return [...weaknesses].sort((a, b) => weaknessSeverity(b) - weaknessSeverity(a));
}
/**
 * Runs the benchmark suite under the current policy, diagnoses weaknesses and
 * builds a pending improvement proposal targeting the three most severe ones.
 *
 * Severity is the normalized distance from each metric's target rather than
 * the raw value: raw values mix 0-100 scores with fractional metrics, and for
 * confusion a higher value is worse, so comparing them directly mis-orders
 * the targets. Weaknesses pointing at a file in IMMUTABLE_SOURCES are dropped
 * before targets are chosen.
 *
 * @param cwd Project directory used to locate the current policy.
 * @returns Proposal with `id`, `timestamp`, `targets`, `hypothesis`,
 *   `instructions`, `baselineScore` and `status: "pending"`.
 */
export async function generateImprovementProposal(cwd) {
    const policy = await loadPolicy(currentPolicyPath(cwd));
    const benchmark = runBenchmarkSuite(policy);
    const weaknesses = diagnoseFromBenchmark(benchmark)
        .filter(w => !IMMUTABLE_SOURCES.has(w.file));
    // Prioritize: pick the top 3 weaknesses by severity
    const targets = rankWeaknessesBySeverity(weaknesses).slice(0, 3).map(w => ({
        file: w.file,
        region: w.region,
        weakness: w.area,
        metric: w.metric,
        currentValue: w.value,
        rationale: w.explanation
    }));
    const hypothesis = targets.length > 0
        ? `Improving ${targets.map(t => t.weakness).join(", ")} should raise the overall benchmark score from ${benchmark.overallScore.toFixed(2)} by addressing the identified weak areas.`
        : `The benchmark score is ${benchmark.overallScore.toFixed(2)} with no severe weaknesses detected. Consider exploring novel teaching strategies.`;
    const instructions = buildImprovementInstructions(cwd, targets, benchmark);
    return {
        id: generateProposalId(),
        timestamp: new Date().toISOString(),
        targets,
        hypothesis,
        instructions,
        baselineScore: benchmark.overallScore,
        status: "pending"
    };
}
160
/**
 * Renders the markdown instructions handed to the meta-evolution agent.
 *
 * The document has three parts: a fixed preamble (baseline and safety rules),
 * one section per target, and the evaluation protocol quoting the baseline
 * score as the acceptance threshold.
 *
 * @param _cwd Unused.
 * @param targets Records with `weakness`, `file`, `region`, `metric`, `currentValue`, `rationale`.
 * @param benchmark Object exposing `overallScore` and `weakestTopic`.
 * @returns The markdown text, lines joined with "\n".
 */
function buildImprovementInstructions(_cwd, targets, benchmark) {
    const preamble = [
        "# Self-Improvement Instructions",
        "",
        "You are Keating's meta-evolution agent. Your task is to modify Keating's own source code",
        "to improve teaching effectiveness as measured by the benchmark suite.",
        "",
        "## Current Baseline",
        "",
        `- Overall score: ${benchmark.overallScore.toFixed(2)}`,
        `- Weakest topic: ${benchmark.weakestTopic}`,
        "",
        "## Safety Rules",
        "",
        "1. ONLY modify files listed in the targets below. Do not touch types, config, paths, or this self-improvement module.",
        "2. After making changes, run `bun test ./test/*.test.ts` to verify no tests break.",
        "3. Run `bun src/cli/main.ts bench` to measure the impact.",
        "4. If the benchmark score decreases or tests fail, ROLLBACK all changes using the snapshots.",
        "5. Keep changes small and focused. One logical change per target.",
        "6. Do not change function signatures that are imported by other modules.",
        "7. Add a comment `// [self-improve] <proposal-id>` near each changed region.",
        "",
        "## Targets",
        ""
    ];
    // One markdown section per target, numbered from 1.
    const targetSections = targets.flatMap((target, index) => [
        `### Target ${index + 1}: ${target.weakness}`,
        "",
        `- **File**: ${target.file}`,
        `- **Region**: ${target.region}`,
        `- **Metric**: ${target.metric} = ${target.currentValue.toFixed(2)}`,
        `- **Rationale**: ${target.rationale}`,
        "",
        "**Suggested approach**: Read the file, understand the region, and make a targeted change",
        "that addresses the diagnosed weakness. Think about what the benchmark simulation actually",
        "measures and how your code change will flow through to improve the metric.",
        ""
    ]);
    const protocol = [
        "## Evaluation Protocol",
        "",
        "After applying changes:",
        "1. Run `bun x tsc -p tsconfig.json` — must compile clean",
        "2. Run `bun test ./test/*.test.ts` — all tests must pass",
        "3. Run `bun src/cli/main.ts bench` — record the new overall score",
        `4. If new score > ${benchmark.overallScore.toFixed(2)}, the change is accepted`,
        `5. If new score <= ${benchmark.overallScore.toFixed(2)}, rollback using the snapshots`,
        "6. Record the result using `/improve accept` or `/improve reject`",
        ""
    ];
    return [...preamble, ...targetSections, ...protocol].join("\n");
}
211
+ // ---------------------------------------------------------------------------
212
+ // Snapshot: save current state of mutable files before changes
213
+ // ---------------------------------------------------------------------------
214
/**
 * Captures the current content of every file in MUTABLE_SOURCES before a
 * proposal is applied, both in memory (returned) and on disk under
 * `<snapshotsDir>/<proposalId>/`, with "/" in the relative path replaced by "__".
 *
 * Only a missing source file is skipped. Any other read or write failure is
 * raised, because a silently incomplete snapshot would leave that file
 * without a restore point for a later rollback.
 *
 * @param cwd Project directory the relative source paths are resolved against.
 * @param proposalId Id used as the snapshot sub-directory name.
 * @returns Array of `{ file, relativePath, content, snapshotAt }` records.
 * @throws {Error} When a source file exists but cannot be read (underlying
 *   error attached as `cause`), or when writing a snapshot copy fails.
 */
export async function snapshotMutableSources(cwd, proposalId) {
    const snapDir = join(snapshotsDir(cwd), proposalId);
    await mkdir(snapDir, { recursive: true });
    const snapshots = [];
    for (const relativePath of Object.values(MUTABLE_SOURCES)) {
        const fullPath = join(cwd, relativePath);
        let content;
        try {
            content = await readFile(fullPath, "utf8");
        }
        catch (err) {
            if (err?.code === "ENOENT") {
                // file doesn't exist, skip
                continue;
            }
            throw new Error(`Failed to snapshot ${relativePath}`, { cause: err });
        }
        snapshots.push({
            file: fullPath,
            relativePath,
            content,
            snapshotAt: new Date().toISOString()
        });
        await writeFile(join(snapDir, relativePath.replace(/\//g, "__")), content, "utf8");
    }
    return snapshots;
}
237
+ // ---------------------------------------------------------------------------
238
+ // Rollback: restore files from snapshot
239
+ // ---------------------------------------------------------------------------
240
/**
 * Restores files from in-memory snapshots by writing each snapshot's
 * `content` back to its `file` path, one file at a time in array order.
 *
 * @param snapshots Iterable of `{ file, content }` records.
 */
export async function rollbackFromSnapshots(snapshots) {
    for (const { file, content } of snapshots) {
        await writeFile(file, content, "utf8");
    }
}
245
+ // ---------------------------------------------------------------------------
246
+ // Evaluate: compare before/after benchmark scores
247
+ // ---------------------------------------------------------------------------
248
/**
 * Re-runs the benchmark suite under the current policy and compares the
 * overall score with a baseline.
 *
 * @param cwd Project directory used to locate the current policy.
 * @param baselineScore Score measured before the change.
 * @returns `{ afterScore, improved, delta }`, where `delta` is
 *   `afterScore - baselineScore` and `improved` is true only when `delta > 0`.
 */
export async function evaluateImprovement(cwd, baselineScore) {
    const policyPath = currentPolicyPath(cwd);
    const activePolicy = await loadPolicy(policyPath);
    const { overallScore: afterScore } = runBenchmarkSuite(activePolicy);
    const delta = afterScore - baselineScore;
    const improved = delta > 0;
    return { afterScore, improved, delta };
}
258
+ // ---------------------------------------------------------------------------
259
+ // Record: persist an improvement attempt to the archive
260
+ // ---------------------------------------------------------------------------
261
/**
 * Appends an attempt to the archive, updates the running totals and saves.
 *
 * An accepted attempt increments `totalAccepted` and adds its `scoreDelta`
 * (treated as 0 when null or undefined) to `cumulativeImprovement`; any other
 * attempt increments `totalRejected`.
 *
 * @param cwd Project directory used to locate the archive.
 * @param attempt Attempt record to append.
 */
export async function recordAttempt(cwd, attempt) {
    const archive = await loadImprovementArchive(cwd);
    archive.attempts.push(attempt);
    if (!attempt.accepted) {
        archive.totalRejected += 1;
    }
    else {
        archive.totalAccepted += 1;
        const gained = attempt.scoreDelta ?? 0;
        archive.cumulativeImprovement += gained;
    }
    await saveImprovementArchive(cwd, archive);
}
273
/**
 * Generates a proposal, snapshots the mutable sources, and writes the
 * proposal as a markdown artifact the agent can read and execute.
 *
 * The artifact lists the snapshotted files with their UTF-8 size in bytes,
 * embeds the proposal instructions, and ends with summaries of the last five
 * archived attempts (or a first-run note when there are none).
 *
 * @param cwd Project directory.
 * @returns `{ proposalPath, proposal, snapshots }`.
 */
export async function generateImprovementArtifact(cwd) {
    const dir = improvementsDir(cwd);
    await mkdir(dir, { recursive: true });
    const proposal = await generateImprovementProposal(cwd);
    const snapshots = await snapshotMutableSources(cwd, proposal.id);
    // Write the proposal as a markdown artifact the agent can read and execute
    const proposalPath = join(dir, `${proposal.id}.md`);
    const content = [
        `# Improvement Proposal: ${proposal.id}`,
        "",
        `**Timestamp**: ${proposal.timestamp}`,
        `**Baseline score**: ${proposal.baselineScore.toFixed(2)}`,
        `**Status**: ${proposal.status}`,
        "",
        `## Hypothesis`,
        "",
        proposal.hypothesis,
        "",
        `## Snapshotted Files`,
        "",
        // String length counts UTF-16 code units; report the real encoded size
        // since the line is labelled "bytes".
        ...snapshots.map(s => `- ${s.relativePath} (${Buffer.byteLength(s.content, "utf8")} bytes)`),
        "",
        proposal.instructions,
        "",
        "## Archive Context",
        ""
    ].join("\n");
    // Append prior attempt summaries for the agent's learning
    const archive = await loadImprovementArchive(cwd);
    const history = archive.attempts.slice(-5).map(a => {
        const status = a.accepted ? "ACCEPTED" : "REJECTED";
        const delta = a.scoreDelta != null ? ` (delta: ${a.scoreDelta.toFixed(2)})` : "";
        const targets = a.proposal.targets.map(t => t.weakness).join(", ");
        return `- ${a.proposal.id}: ${status}${delta} — targeted ${targets}`;
    });
    const fullContent = history.length > 0
        ? `${content}Recent attempts (learn from these):\n\n${history.join("\n")}\n`
        : `${content}No prior improvement attempts. This is the first run.\n`;
    await writeFile(proposalPath, fullContent, "utf8");
    return { proposalPath, proposal, snapshots };
}
314
+ // ---------------------------------------------------------------------------
315
+ // Accept / Reject helpers
316
+ // ---------------------------------------------------------------------------
317
/**
 * Marks a proposal as accepted in the archive.
 *
 * When the proposal already has an archived attempt, that attempt is updated
 * in place. The update is idempotent: accepting an already-accepted attempt
 * does not increment `totalAccepted` again, and `cumulativeImprovement` is
 * adjusted by the change in delta rather than having the full delta re-added.
 *
 * When no attempt exists, a new accepted attempt is recorded. The baseline is
 * unknown in that case, so the after score is used as the baseline on both
 * the proposal and the attempt, and `scoreDelta` stays null.
 *
 * @param cwd Project directory used to locate the archive.
 * @param proposalId Id of the proposal being accepted.
 * @param afterScore Benchmark score measured after the change.
 */
export async function acceptImprovement(cwd, proposalId, afterScore) {
    const archive = await loadImprovementArchive(cwd);
    const existing = archive.attempts.find(a => a.proposal.id === proposalId);
    if (existing) {
        const wasAccepted = existing.accepted === true;
        // Delta already counted in the totals by an earlier acceptance, if any.
        const previousDelta = wasAccepted ? existing.scoreDelta ?? 0 : 0;
        existing.accepted = true;
        existing.afterScore = afterScore;
        existing.scoreDelta = afterScore - existing.baselineScore;
        existing.proposal.status = "accepted";
        existing.completedAt = new Date().toISOString();
        if (!wasAccepted) {
            archive.totalAccepted += 1;
        }
        archive.cumulativeImprovement += existing.scoreDelta - previousDelta;
        await saveImprovementArchive(cwd, archive);
        return;
    }
    // If not found in archive, create a new entry
    const now = new Date().toISOString();
    const proposal = {
        id: proposalId,
        timestamp: now,
        targets: [],
        hypothesis: "Accepted externally",
        instructions: "",
        baselineScore: afterScore,
        status: "accepted"
    };
    await recordAttempt(cwd, {
        proposal,
        snapshots: [],
        // Keep the attempt's baseline consistent with proposal.baselineScore.
        baselineScore: afterScore,
        afterScore,
        scoreDelta: null,
        accepted: true,
        rollbackPerformed: false,
        completedAt: now
    });
}
352
/**
 * Rolls back the given snapshots and marks a proposal as rejected in the
 * archive.
 *
 * The rollback always runs first. When the proposal already has an archived
 * attempt, it is updated in place; `totalRejected` is incremented only if the
 * attempt was not already marked "rejected" or "rolled-back", so rejecting
 * the same proposal twice does not double-count. When no attempt exists, a
 * new rolled-back attempt is recorded.
 *
 * @param cwd Project directory used to locate the archive.
 * @param proposalId Id of the proposal being rejected.
 * @param snapshots Snapshot records to restore and to store on a newly created attempt.
 */
export async function rejectImprovement(cwd, proposalId, snapshots) {
    await rollbackFromSnapshots(snapshots);
    const archive = await loadImprovementArchive(cwd);
    const existing = archive.attempts.find(a => a.proposal.id === proposalId);
    if (existing) {
        const previousStatus = existing.proposal.status;
        const alreadyRejected = previousStatus === "rejected" || previousStatus === "rolled-back";
        existing.accepted = false;
        existing.proposal.status = "rejected";
        existing.rollbackPerformed = true;
        existing.completedAt = new Date().toISOString();
        if (!alreadyRejected) {
            archive.totalRejected += 1;
        }
        await saveImprovementArchive(cwd, archive);
        return;
    }
    const now = new Date().toISOString();
    await recordAttempt(cwd, {
        proposal: {
            id: proposalId,
            timestamp: now,
            targets: [],
            hypothesis: "Rejected and rolled back",
            instructions: "",
            baselineScore: 0,
            status: "rolled-back"
        },
        snapshots,
        baselineScore: 0,
        afterScore: null,
        scoreDelta: null,
        accepted: false,
        rollbackPerformed: true,
        completedAt: now
    });
}
384
+ // ---------------------------------------------------------------------------
385
+ // Markdown report of improvement history
386
+ // ---------------------------------------------------------------------------
387
/**
 * Renders the improvement archive as a markdown report.
 *
 * The report opens with the archive totals. With no attempts it ends with a
 * hint to run `/improve`; otherwise each attempt gets a section with its
 * status, baseline, optional after-score and signed delta, hypothesis, and
 * targets (when it has any).
 *
 * @param archive Object with `attempts`, `totalAccepted`, `totalRejected`, `cumulativeImprovement`.
 * @returns The markdown text, lines joined with "\n".
 */
export function improvementHistoryToMarkdown(archive) {
    const { attempts } = archive;
    const summary = [
        "# Self-Improvement History",
        "",
        `- Total attempts: ${attempts.length}`,
        `- Accepted: ${archive.totalAccepted}`,
        `- Rejected: ${archive.totalRejected}`,
        `- Cumulative score improvement: ${archive.cumulativeImprovement.toFixed(2)}`,
        ""
    ];
    if (attempts.length === 0) {
        return [
            ...summary,
            "No improvement attempts yet. Run `/improve` to start the self-improvement loop."
        ].join("\n");
    }
    // Markdown lines for a single attempt, ending with a blank separator line.
    const describeAttempt = (attempt) => {
        let status = "REJECTED";
        if (attempt.accepted) {
            status = "ACCEPTED";
        }
        else if (attempt.rollbackPerformed) {
            status = "ROLLED BACK";
        }
        const section = [
            `### ${attempt.proposal.id} — ${status}`,
            "",
            `- Baseline: ${attempt.baselineScore.toFixed(2)}`
        ];
        if (attempt.afterScore != null) {
            const delta = attempt.scoreDelta ?? 0;
            const sign = delta >= 0 ? "+" : "";
            section.push(`- After: ${attempt.afterScore.toFixed(2)}`);
            section.push(`- Delta: ${sign}${delta.toFixed(2)}`);
        }
        section.push(`- Hypothesis: ${attempt.proposal.hypothesis}`);
        const { targets } = attempt.proposal;
        if (targets.length > 0) {
            const rendered = targets.map((target) => `${target.file}:${target.region}`).join(", ");
            section.push(`- Targets: ${rendered}`);
        }
        section.push("");
        return section;
    };
    const sections = attempts.flatMap((attempt) => describeAttempt(attempt));
    return [...summary, "## Attempts", "", ...sections].join("\n");
}