@gianfrancopiana/openclaw-autoresearch 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Autonomous experiment loop for any optimization target.
4
4
 
5
- Faithful OpenClaw port of [`davebcn87/pi-autoresearch`](https://github.com/davebcn87/pi-autoresearch).
5
+ Faithful OpenClaw port of [`davebcn87/pi-autoresearch`](https://github.com/davebcn87/pi-autoresearch), including upstream statistical confidence scoring.
6
6
 
7
7
  ## How it works
8
8
 
@@ -13,19 +13,20 @@ Three tools drive the loop:
13
13
  | Tool | What it does |
14
14
  |---|---|
15
15
  | `init_experiment` | Configures the session: name, primary metric, unit, direction (lower/higher). Re-calling starts a new segment. |
16
- | `run_experiment` | Executes a shell command, times it, captures stdout/stderr, returns pass/fail via exit code. |
17
- | `log_experiment` | Records the result. `keep` auto-commits to git. `discard`/`crash` log without committing. Tracks secondary metrics alongside the primary. |
16
+ | `run_experiment` | Executes a shell command, times it, captures stdout/stderr, parses `METRIC name=number` lines, and opens a pending experiment window that must be logged before another run can start. |
17
+ | `log_experiment` | Records the pending run. `keep` auto-commits to git. `discard`/`crash` log without committing. If the prior `run_experiment` captured the primary metric, `log_experiment` can infer `commit` and `metric` automatically. After 3+ positive-metric runs in a segment, it also reports a confidence score for the best improvement versus noise. |
18
18
 
19
19
  Each tool also accepts an optional `cwd` so callers can target a nested repo explicitly instead of relying on the current session working directory.
20
20
 
21
- All state lives in four repo-root files:
21
+ All state lives in five repo-root files:
22
22
 
23
23
  | File | Purpose |
24
24
  |---|---|
25
- | `autoresearch.md` | Session doc: objective, metrics, files in scope, constraints, what's been tried. A fresh agent reads this to resume. |
25
+ | `autoresearch.md` | Session doc. The plugin keeps the Metrics, How to Run, What's Been Tried, and Plugin Checkpoint sections synchronized so resumes are less agent-dependent. |
26
26
  | `autoresearch.sh` | Benchmark script. Outputs `METRIC name=number` lines. |
27
27
  | `autoresearch.jsonl` | Structured log: config headers + experiment entries (metric, status, timestamp, segment, commit hash). |
28
28
  | `autoresearch.ideas.md` | Backlog of promising ideas not yet tried. Optional. |
29
+ | `autoresearch.checkpoint.json` | Plugin-managed checkpoint: latest logged state, recent runs, and any pending unlogged run. |
29
30
 
30
31
  The design is file-first: any agent can pick up the repo-root files and continue the loop without prior context.
31
32
 
@@ -72,6 +73,15 @@ Verify:
72
73
 
73
74
  Prefer the explicit `/autoresearch` command surface in OpenClaw. The auto-generated native skill alias `/autoresearch_create` may not trigger reliably on some hosts, so use `/skill autoresearch-create` if you need to invoke the skill directly.
74
75
 
76
+ ## Workflow Guarantees
77
+
78
+ - `run_experiment` refuses to start a second run until the previous one is logged.
79
+ - `run_experiment` parses `METRIC name=number` lines and stores a pending run so `log_experiment` can default from the actual benchmark output.
80
+ - During active autoresearch mode, raw benchmark execution through OpenClaw `exec`/`bash` is blocked. Use `run_experiment` instead.
81
+ - `autoresearch_status` warns when a pending run is unlogged or git history has moved ahead of the last logged experiment.
82
+ - After 3+ positive-metric runs in a segment, `log_experiment`, `autoresearch_status`, and the synced session doc report a confidence score based on the median absolute deviation (MAD) of the runs, so the agent can distinguish likely wins from noise.
83
+ - The plugin updates `autoresearch.checkpoint.json` and refreshes plugin-managed sections in `autoresearch.md` after init, run, and log transitions.
84
+
75
85
  ## Use
76
86
 
77
87
  In the repo you want to optimize:
@@ -80,7 +90,7 @@ In the repo you want to optimize:
80
90
  2. Run `/autoresearch` or `/autoresearch setup <goal>`.
81
91
  3. Send a normal message with the goal, command, metric (+ direction), files in scope, and constraints.
82
92
  4. If you need the raw skill invocation, use `/skill autoresearch-create`.
83
- 5. The agent writes `autoresearch.md` and `autoresearch.sh`, runs a baseline, then starts looping.
93
+ 5. The agent writes `autoresearch.md` and `autoresearch.sh`, runs a baseline with `run_experiment`, then records it with `log_experiment`.
84
94
  6. Use `/autoresearch` or `/autoresearch status` to re-prime context on a later turn.
85
95
 
86
96
  To resume an existing session, a new agent reads the repo-root files and continues from where the last one stopped.
@@ -99,6 +109,7 @@ This port preserves upstream semantics, names, and file contracts while adapting
99
109
 
100
110
  - upstream repo: `https://github.com/davebcn87/pi-autoresearch`
101
111
  - pinned upstream commit: `2227029fa5712944a36938b5fe59f709cb30ed22` (`2227029f`)
112
+ - later upstream parity cherry-pick: confidence scoring from `cf1bbf03debca8f3fb2cca2c3e799b9e23320f87` (`cf1bbf0`, March 19, 2026)
102
113
 
103
114
  ## Validation
104
115
 
@@ -6,6 +6,7 @@ Pinned upstream reference:
6
6
 
7
7
  - Repo: `https://github.com/davebcn87/pi-autoresearch`
8
8
  - Commit: `2227029fa5712944a36938b5fe59f709cb30ed22` (`2227029f`)
9
+ - Later upstream semantics also ported: confidence scoring from `cf1bbf03debca8f3fb2cca2c3e799b9e23320f87` (`cf1bbf0`)
9
10
 
10
11
  ## Principle
11
12
 
@@ -0,0 +1,82 @@
1
+ import * as fs from "node:fs";
2
+ import { AUTORESEARCH_ROOT_FILES, getAutoresearchRootFilePath } from "./files.js";
3
+ import type {
4
+ AutoresearchRunSnapshot,
5
+ AutoresearchStateSnapshot,
6
+ } from "./state.js";
7
+ import type { PendingExperimentRun } from "./runtime-state.js";
8
+
9
/**
 * Shape of the plugin-managed checkpoint that is serialized to the
 * `checkpoint` repo-root file.
 *
 * `readAutoresearchCheckpoint` only accepts payloads whose `version` is `1`.
 */
export type AutoresearchCheckpoint = {
  /** Schema version of the payload; always the literal `1`. */
  readonly version: 1;
  /** Epoch milliseconds at which the checkpoint was produced (`Date.now()`). */
  readonly updatedAt: number;
  /** Commit recorded for the start of the session, or `null` when unknown. */
  readonly sessionStartCommit: string | null;
  /** Summary of the session, copied from the reconstructed state snapshot. */
  readonly session: {
    readonly name: string | null;
    readonly metricName: string;
    readonly metricUnit: string;
    /** Whether a lower or a higher primary metric counts as an improvement. */
    readonly bestDirection: "lower" | "higher";
    readonly currentSegment: number;
    readonly currentRunCount: number;
    readonly totalRunCount: number;
    readonly currentBaselineMetric: number | null;
    readonly currentBestMetric: number | null;
    /** Confidence score for the session, or `null` when none is available. */
    readonly confidence: number | null;
  };
  /** Most recent logged run, or `null` when nothing has been logged. */
  readonly lastLoggedRun: AutoresearchRunSnapshot | null;
  /** Logged runs supplied by the caller for display in the session doc. */
  readonly recentLoggedRuns: readonly AutoresearchRunSnapshot[];
  /** Run that has been captured but not logged yet, or `null`. */
  readonly pendingRun: PendingExperimentRun | null;
};
29
+
30
/**
 * Loads the checkpoint file for `cwd`.
 *
 * @param cwd Directory whose `checkpoint` root file should be read.
 * @returns The parsed checkpoint, or `null` when the file is missing, cannot
 *   be read or parsed, or does not carry `version === 1`.
 */
export function readAutoresearchCheckpoint(cwd: string): AutoresearchCheckpoint | null {
  const file = getAutoresearchRootFilePath(cwd, "checkpoint");

  if (fs.existsSync(file)) {
    try {
      const raw = fs.readFileSync(file, "utf8");
      // Only the version tag is verified; the rest of the shape is trusted.
      const candidate = JSON.parse(raw) as AutoresearchCheckpoint;
      if (candidate?.version === 1) {
        return candidate;
      }
    } catch {
      // Unreadable or malformed checkpoints are treated as absent.
    }
  }

  return null;
}
43
+
44
/**
 * Builds a version-1 checkpoint from the supplied state and writes it, as
 * two-space-indented JSON followed by a newline, to the `checkpoint` root file.
 *
 * @param options.cwd Directory that owns the checkpoint file.
 * @param options.state State snapshot the session summary is copied from.
 * @param options.sessionStartCommit Commit recorded for the session start.
 * @param options.recentLoggedRuns Logged runs to embed in the checkpoint.
 * @param options.pendingRun Captured-but-unlogged run, if any.
 * @returns The checkpoint object that was written.
 */
export function writeAutoresearchCheckpoint(options: {
  cwd: string;
  state: AutoresearchStateSnapshot;
  sessionStartCommit: string | null;
  recentLoggedRuns: readonly AutoresearchRunSnapshot[];
  pendingRun: PendingExperimentRun | null;
}): AutoresearchCheckpoint {
  const { cwd, state, sessionStartCommit, recentLoggedRuns, pendingRun } = options;

  // Key order is significant: it determines the layout of the JSON on disk.
  const session: AutoresearchCheckpoint["session"] = {
    name: state.name,
    metricName: state.metricName,
    metricUnit: state.metricUnit,
    bestDirection: state.bestDirection,
    currentSegment: state.currentSegment,
    currentRunCount: state.currentRunCount,
    totalRunCount: state.totalRunCount,
    currentBaselineMetric: state.currentBaselineMetric,
    currentBestMetric: state.currentBestMetric,
    confidence: state.confidence,
  };

  const checkpoint: AutoresearchCheckpoint = {
    version: 1,
    updatedAt: Date.now(),
    sessionStartCommit,
    session,
    lastLoggedRun: state.lastRun,
    recentLoggedRuns,
    pendingRun,
  };

  const serialized = JSON.stringify(checkpoint, null, 2);
  fs.writeFileSync(getAutoresearchRootFilePath(cwd, "checkpoint"), `${serialized}\n`);
  return checkpoint;
}
76
+
77
/**
 * Removes the checkpoint file for `cwd`; a missing file is not an error.
 *
 * Uses `fs.rmSync` with `force: true` instead of an `existsSync` check
 * followed by `unlinkSync`, so a file that disappears between the check and
 * the removal can no longer make this function throw.
 *
 * @param cwd Directory whose `checkpoint` root file should be removed.
 */
export function deleteAutoresearchCheckpoint(cwd: string): void {
  const checkpointPath = getAutoresearchRootFilePath(cwd, "checkpoint");
  fs.rmSync(checkpointPath, { force: true });
}
@@ -0,0 +1,82 @@
1
/** Minimal view of a logged run that `computeConfidence` needs. */
export type ConfidenceRun = {
  /** Primary metric value recorded for the run. */
  readonly metric: number;
  /** Run status; only the value `"keep"` is treated as a kept run. */
  readonly status: string;
};
5
+
6
/**
 * Scores how far the best kept run sits from the baseline, measured in units
 * of the median absolute deviation (MAD) of the usable runs.
 *
 * A run is usable when its metric is finite and strictly positive. The
 * baseline is the first run with a finite metric, taken from the unfiltered
 * list, so it may be a run that is not itself usable.
 *
 * @param runs Runs in the order they were logged.
 * @param direction Whether lower or higher metrics are better.
 * @returns `|bestKept - baseline| / MAD`, or `null` when there are fewer than
 *   three usable runs, the MAD is zero, no usable run has status `"keep"`, or
 *   the best kept metric equals the baseline.
 */
export function computeConfidence(
  runs: readonly ConfidenceRun[],
  direction: "lower" | "higher",
): number | null {
  const scored = runs.filter(({ metric }) => Number.isFinite(metric) && metric > 0);
  if (scored.length < 3) {
    return null;
  }

  const reference = runs.find(({ metric }) => Number.isFinite(metric));
  if (reference === undefined) {
    return null;
  }

  const samples = scored.map(({ metric }) => metric);
  const center = sortedMedian(samples);
  const noise = sortedMedian(samples.map((sample) => Math.abs(sample - center)));
  if (noise === 0) {
    return null;
  }

  const bestKept = scored
    .filter(({ status }) => status === "keep")
    .reduce<number | null>(
      (best, { metric }) =>
        best === null || isBetter(metric, best, direction) ? metric : best,
      null,
    );

  if (bestKept === null || bestKept === reference.metric) {
    return null;
  }

  return Math.abs(bestKept - reference.metric) / noise;
}
45
+
46
/**
 * Renders a one-line, labelled confidence summary.
 *
 * @param confidence Confidence score, or `null` when none is available.
 * @param label Text placed before the colon; defaults to `"Confidence"`.
 * @returns `"<label>: n/a"` for `null`, otherwise the label followed by the
 *   text produced by `describeConfidence`.
 */
export function formatConfidenceLine(
  confidence: number | null,
  label = "Confidence",
): string {
  if (confidence === null) {
    return `${label}: n/a`;
  }

  return `${label}: ${describeConfidence(confidence)}`;
}
52
+
53
/**
 * Turns a confidence score into a human-readable verdict.
 *
 * The score is shown with one decimal place. The verdict is chosen from the
 * unrounded value: `>= 2.0` is "likely real", `>= 1.0` is "above noise but
 * marginal", and anything else is "within noise".
 *
 * @param confidence Confidence score expressed as a multiple of the noise floor.
 * @returns The rendered score followed by the verdict text.
 */
export function describeConfidence(confidence: number): string {
  let verdict: string;
  if (confidence >= 2.0) {
    verdict = "improvement is likely real";
  } else if (confidence >= 1.0) {
    verdict = "improvement is above noise but marginal";
  } else {
    verdict = "improvement is within noise. Consider re-running to confirm before keeping";
  }

  return `${confidence.toFixed(1)}x noise floor - ${verdict}`;
}
63
+
64
/**
 * Median of `values`, computed on a sorted copy so the input is not mutated.
 *
 * @param values Numbers to summarise.
 * @returns `0` for an empty list, the middle element for an odd count, and
 *   the mean of the two middle elements for an even count.
 */
function sortedMedian(values: readonly number[]): number {
  const count = values.length;
  if (count === 0) {
    return 0;
  }

  const ordered = values.slice().sort((a, b) => a - b);
  const upper = Math.floor(count / 2);
  if (count % 2 !== 0) {
    return ordered[upper];
  }

  return (ordered[upper - 1] + ordered[upper]) / 2;
}
75
+
76
/**
 * Tells whether `current` strictly beats `best` for the given direction.
 *
 * @param current Candidate metric.
 * @param best Metric to beat.
 * @param direction `"lower"` means smaller wins; `"higher"` means larger wins.
 * @returns `true` only for a strict improvement; ties are not better.
 */
function isBetter(
  current: number,
  best: number,
  direction: "lower" | "higher",
): boolean {
  if (direction === "lower") {
    return current < best;
  }

  return current > best;
}
@@ -5,6 +5,7 @@ export const AUTORESEARCH_ROOT_FILES = {
5
5
  runnerScript: "autoresearch.sh",
6
6
  resultsLog: "autoresearch.jsonl",
7
7
  ideasBacklog: "autoresearch.ideas.md",
8
+ checkpoint: "autoresearch.checkpoint.json",
8
9
  } as const;
9
10
 
10
11
  export type AutoresearchRootFileKey = keyof typeof AUTORESEARCH_ROOT_FILES;
@@ -19,6 +19,11 @@ export type GitKeepResult = {
19
19
  readonly command: GitCommandResult;
20
20
  };
21
21
 
22
/** Inputs shared by the git helpers that only need to run a command in a repo. */
export type GitRuntimeOptions = {
  /** Command runner that is handed to `runGitCommand`. */
  runCommandWithTimeout: RunCommandWithTimeout;
  /** Working directory in which the git command is executed. */
  cwd: string;
};
26
+
22
27
  async function runGitCommand(
23
28
  runCommandWithTimeout: RunCommandWithTimeout,
24
29
  cwd: string,
@@ -40,6 +45,31 @@ async function runGitCommand(
40
45
  };
41
46
  }
42
47
 
48
/**
 * Resolves the 7-character abbreviated hash of `HEAD` by running
 * `git rev-parse --short=7 HEAD`.
 *
 * @param options Command runner and working directory.
 * @returns The trimmed hash, or `null` when git exits non-zero or prints
 *   nothing but whitespace.
 */
export async function readShortHeadCommit(options: GitRuntimeOptions): Promise<string | null> {
  const { code, stdout } = await runGitCommand(options.runCommandWithTimeout, options.cwd, [
    "rev-parse",
    "--short=7",
    "HEAD",
  ]);
  if (code !== 0) {
    return null;
  }

  const head = stdout.trim();
  return head.length > 0 ? head : null;
}
56
+
57
/**
 * Counts the commits reachable from `HEAD` but not from `sinceCommit`, using
 * `git rev-list --count <sinceCommit>..HEAD`.
 *
 * @param options Command runner, working directory and the starting commit.
 * @returns The commit count, or `null` when git exits non-zero or its output
 *   does not parse as a finite integer.
 */
export async function countCommitsSince(
  options: GitRuntimeOptions & { sinceCommit: string },
): Promise<number | null> {
  const range = `${options.sinceCommit}..HEAD`;
  const { code, stdout } = await runGitCommand(options.runCommandWithTimeout, options.cwd, [
    "rev-list",
    "--count",
    range,
  ]);
  if (code !== 0) {
    return null;
  }

  const parsed = Number.parseInt(stdout.trim(), 10);
  if (!Number.isFinite(parsed)) {
    return null;
  }

  return parsed;
}
72
+
43
73
  export async function commitKeptExperiment(options: {
44
74
  runCommandWithTimeout: RunCommandWithTimeout;
45
75
  cwd: string;
@@ -1,6 +1,7 @@
1
1
  import type { OpenClawPluginApi } from "openclaw/plugin-sdk";
2
2
  import { AUTORESEARCH_ROOT_FILES } from "./files.js";
3
3
  import { reconstructStateFromJsonl } from "./state.js";
4
+ import { formatConfidenceLine } from "./confidence.js";
4
5
  import {
5
6
  clearAutoresearchRuntimeState,
6
7
  consumeAutoresearchContinuationReminder,
@@ -64,6 +65,30 @@ export function registerAutoresearchHooks(api: OpenClawPluginApi): void {
64
65
  queueAutoresearchSteer(cwd, messageText);
65
66
  });
66
67
 
68
+ hookApi.on("before_tool_call", (event, ctx) => {
69
+ const cwd = resolveHookCwd(api, ctx);
70
+ if (cwd === null || !shouldEnforceAutoresearchMode(cwd)) {
71
+ return;
72
+ }
73
+
74
+ const record = event as Record<string, unknown>;
75
+ const toolName = typeof record.toolName === "string" ? record.toolName : "";
76
+ if (toolName !== "exec" && toolName !== "bash") {
77
+ return;
78
+ }
79
+
80
+ const command = extractToolCommand(record.params);
81
+ if (!command || !looksLikeExperimentCommand(command)) {
82
+ return;
83
+ }
84
+
85
+ return {
86
+ block: true,
87
+ blockReason:
88
+ "Autoresearch mode blocks raw benchmark execution through exec/bash. Use run_experiment so the result is captured and log_experiment can enforce the experiment lifecycle.",
89
+ };
90
+ });
91
+
67
92
  hookApi.on("agent_end", (_event, ctx) => {
68
93
  const cwd = resolveHookCwd(api, ctx);
69
94
  if (cwd === null) {
@@ -112,11 +137,7 @@ export function registerAutoresearchHooks(api: OpenClawPluginApi): void {
112
137
  export function buildBeforePromptBuildContext(cwd: string): string | null {
113
138
  const state = reconstructStateFromJsonl(cwd);
114
139
  const runtimeState = getAutoresearchRuntimeState(cwd);
115
- const modeEnabled =
116
- runtimeState.mode === "on" ||
117
- (runtimeState.mode !== "off" && (state.mode === "active" || state.hasSessionDoc));
118
-
119
- if (!modeEnabled) {
140
+ if (!shouldEnforceAutoresearchMode(cwd, state, runtimeState)) {
120
141
  return null;
121
142
  }
122
143
 
@@ -144,7 +165,14 @@ export function buildBeforePromptBuildContext(cwd: string): string | null {
144
165
  `Read ${AUTORESEARCH_ROOT_FILES.sessionDoc} before resuming or changing the experiment loop, and re-read it after compaction.`,
145
166
  "Resume the autonomous upstream loop: edit, run_experiment, log_experiment, keep/discard/crash, repeat.",
146
167
  "Use init_experiment, run_experiment, and log_experiment for experiment state changes. Never stop unless the user explicitly interrupts the loop.",
168
+ "Never run benchmark or test commands through raw exec/bash during autoresearch mode. Use run_experiment so the plugin can capture metrics, enforce logging, and preserve resumable state.",
169
+ "After every run_experiment, call log_experiment before starting another run. If METRIC lines were captured, log_experiment can infer commit and metric from the pending run.",
147
170
  );
171
+ if (state.confidence !== null) {
172
+ lines.push(
173
+ `${formatConfidenceLine(state.confidence, "Current confidence")}. Treat low-confidence wins as provisional and re-run before keeping when the score is below 1.0x.`,
174
+ );
175
+ }
148
176
  if (pendingCommand?.args) {
149
177
  lines.push(`Additional resume instruction from /autoresearch: ${pendingCommand.args}`);
150
178
  }
@@ -232,3 +260,49 @@ function firstString(...values: unknown[]): string | null {
232
260
  function isCommandLikeMessage(text: string): boolean {
233
261
  return /^[\/!]/.test(text.trim());
234
262
  }
263
+
264
/**
 * Decides whether autoresearch rules apply to `cwd` right now.
 *
 * Enforcement is on when the runtime mode is `"on"`, a run is in flight, or a
 * pending run exists. Otherwise it is on only if the runtime mode is not
 * `"off"` and the persisted state is active or a session doc exists.
 *
 * @param cwd Repository directory being checked.
 * @param state Persisted state; defaults to a fresh reconstruction from the log.
 * @param runtimeState In-memory runtime snapshot; defaults to the current one.
 * @returns `true` when autoresearch mode should be enforced.
 */
function shouldEnforceAutoresearchMode(
  cwd: string,
  state = reconstructStateFromJsonl(cwd),
  runtimeState = getAutoresearchRuntimeState(cwd),
): boolean {
  const { mode, runInFlight, pendingRun } = runtimeState;

  // Explicit opt-in or an open experiment window always enforces.
  if (mode === "on" || runInFlight || pendingRun !== null) {
    return true;
  }

  // Explicit opt-out overrides whatever the persisted state says.
  if (mode === "off") {
    return false;
  }

  return state.mode === "active" || state.hasSessionDoc;
}
276
+
277
/**
 * Pulls the command text out of a tool-call parameter bag.
 *
 * The keys `command`, `cmd` and `args` are checked in that order; the first
 * one holding a non-blank string wins.
 *
 * @param params Raw tool parameters of unknown shape.
 * @returns The trimmed command, or `null` when `params` is not an object or
 *   none of the keys holds a non-blank string.
 */
function extractToolCommand(params: unknown): string | null {
  if (typeof params !== "object" || params === null) {
    return null;
  }

  const fields = params as Record<string, unknown>;
  for (const key of ["command", "cmd", "args"]) {
    const candidate = fields[key];
    if (typeof candidate !== "string") {
      continue;
    }

    const trimmed = candidate.trim();
    if (trimmed.length > 0) {
      return trimmed;
    }
  }

  return null;
}
292
+
293
/**
 * Decides whether a raw shell command should be treated as a possible
 * experiment or benchmark run, which the caller then blocks.
 *
 * A command is considered harmless only when every unquoted shell segment
 * (split on `;`, `|`, `&` and newlines) starts with one of the read-only
 * allowlist entries. Checking only the prefix of the whole string would let
 * `ls && ./autoresearch.sh` through. Commands that use command substitution
 * or an unquoted `(` are always treated as experiments, because the nested
 * command cannot be vetted.
 *
 * Known limitation: allowlisted tools that can themselves run programs
 * (for example `find -exec`) are still accepted.
 *
 * @param command Raw command text.
 * @returns `false` for blank or purely read-only commands, `true` otherwise.
 */
function looksLikeExperimentCommand(command: string): boolean {
  const normalized = command.trim();
  if (!normalized) {
    return false;
  }

  const readOnlyPatterns = [
    /^(pwd|ls|find|rg|grep|sed|cat|head|tail|wc|stat)\b/,
    /^git\s+(status|diff|show|log|rev-parse|branch|remote)\b/,
  ];

  const segments = splitUnquotedShellSegments(normalized);
  if (segments === null || segments.length === 0) {
    return true;
  }

  return !segments.every((segment) =>
    readOnlyPatterns.some((pattern) => pattern.test(segment)),
  );
}

/**
 * Splits a command line into the simple commands separated by unquoted
 * `;`, `|`, `&` or newline characters. Returns `null` when the command uses
 * command substitution (backticks or `$(`) or an unquoted `(`.
 */
function splitUnquotedShellSegments(command: string): string[] | null {
  const segments: string[] = [];
  let current = "";
  let inSingle = false;
  let inDouble = false;

  const flush = (): void => {
    const segment = current.trim();
    if (segment.length > 0) {
      segments.push(segment);
    }
    current = "";
  };

  for (let index = 0; index < command.length; index += 1) {
    const char = command.charAt(index);
    const next = command.charAt(index + 1);

    // Inside single quotes everything is literal until the closing quote.
    if (inSingle) {
      if (char === "'") {
        inSingle = false;
      }
      current += char;
      continue;
    }

    // A backslash escapes the following character (also inside double quotes).
    if (char === "\\") {
      current += char + next;
      index += 1;
      continue;
    }

    // Command substitution is expanded even inside double quotes.
    if (char === "`" || (char === "$" && next === "(")) {
      return null;
    }

    if (inDouble) {
      if (char === '"') {
        inDouble = false;
      }
      current += char;
      continue;
    }

    if (char === "'") {
      inSingle = true;
      current += char;
      continue;
    }

    if (char === '"') {
      inDouble = true;
      current += char;
      continue;
    }

    // Subshells and process substitution hide a nested command.
    if (char === "(") {
      return null;
    }

    if (char === "&") {
      const previous = command.charAt(index - 1);
      // `2>&1`, `<&3` and `&>file` are redirections, not command separators.
      if (previous === ">" || previous === "<" || next === ">") {
        current += char;
      } else {
        flush();
      }
      continue;
    }

    if (char === ";" || char === "|" || char === "\n") {
      flush();
      continue;
    }

    current += char;
  }

  flush();
  return segments;
}
@@ -47,6 +47,7 @@ export type AutoresearchResultEntry = {
47
47
  readonly description: string;
48
48
  readonly timestamp: number;
49
49
  readonly segment: number;
50
+ readonly confidence: number | null;
50
51
  };
51
52
 
52
53
  export function appendResultEntry(cwd: string, entry: AutoresearchResultEntry): void {
@@ -0,0 +1,24 @@
1
/**
 * Matches one whole `METRIC name=value` line.
 *
 * The name may contain ASCII letters, digits, `_`, `.`, `-` and `µ`. The value
 * is an optionally negative decimal with an optional exponent.
 */
const METRIC_LINE_RE =
  /^METRIC\s+([A-Za-z0-9_.\-µ]+)\s*=\s*(-?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?)\s*$/;

/**
 * Extracts `METRIC name=number` lines from command output.
 *
 * Lines are trimmed before matching. Lines that do not match, or whose value
 * is not finite, are ignored. When a name repeats, the last value wins.
 *
 * @param output Raw text, with LF or CRLF line endings.
 * @returns Metric values keyed by metric name.
 */
export function parseMetricLines(output: string): Record<string, number> {
  const collected = new Map<string, number>();

  output.split(/\r?\n/).forEach((rawLine) => {
    const hit = METRIC_LINE_RE.exec(rawLine.trim());
    if (hit === null) {
      return;
    }

    const name = hit[1];
    const value = Number(hit[2]);
    if (name && Number.isFinite(value)) {
      collected.set(name, value);
    }
  });

  return Object.fromEntries(collected);
}
@@ -7,12 +7,26 @@ export type PendingAutoresearchCommand =
7
7
  }
8
8
  | null;
9
9
 
10
/**
 * Result of an experiment run that has been captured but not logged yet.
 *
 * NOTE(review): the field descriptions below are inferred from the field
 * names and types; confirm them against the code that builds this object.
 */
export type PendingExperimentRun = {
  /** Command that was executed. */
  readonly command: string;
  /** Commit associated with the run, or `null` when none was recorded. */
  readonly commit: string | null;
  /** Value of the primary metric, or `null` when it was not captured. */
  readonly primaryMetric: number | null;
  /** Captured metric values keyed by metric name. */
  readonly metrics: Record<string, number>;
  /** Duration of the run in seconds. */
  readonly durationSeconds: number;
  /** Exit code of the command, or `null` when none is available. */
  readonly exitCode: number | null;
  /** Whether the run is considered to have passed. */
  readonly passed: boolean;
  /** Whether the run timed out. */
  readonly timedOut: boolean;
  /** Trailing portion of the command output. */
  readonly tailOutput: string;
  /** Time at which the run was captured — presumably epoch milliseconds; confirm. */
  readonly capturedAt: number;
};
22
+
10
23
  export type AutoresearchRuntimeSnapshot = {
11
24
  readonly mode: AutoresearchRuntimeMode;
12
25
  readonly runInFlight: boolean;
13
26
  readonly queuedSteers: readonly string[];
14
27
  readonly needsContinuationReminder: boolean;
15
28
  readonly pendingCommand: PendingAutoresearchCommand;
29
+ readonly pendingRun: PendingExperimentRun | null;
16
30
  };
17
31
 
18
32
  type MutableAutoresearchRuntimeState = {
@@ -21,6 +35,7 @@ type MutableAutoresearchRuntimeState = {
21
35
  queuedSteers: string[];
22
36
  needsContinuationReminder: boolean;
23
37
  pendingCommand: PendingAutoresearchCommand;
38
+ pendingRun: PendingExperimentRun | null;
24
39
  };
25
40
 
26
41
  const MAX_QUEUED_STEERS = 20;
@@ -33,6 +48,7 @@ function createDefaultRuntimeState(): MutableAutoresearchRuntimeState {
33
48
  queuedSteers: [],
34
49
  needsContinuationReminder: false,
35
50
  pendingCommand: null,
51
+ pendingRun: null,
36
52
  };
37
53
  }
38
54
 
@@ -53,6 +69,7 @@ export function getAutoresearchRuntimeState(cwd: string): AutoresearchRuntimeSna
53
69
  queuedSteers: [...state.queuedSteers],
54
70
  needsContinuationReminder: state.needsContinuationReminder,
55
71
  pendingCommand: state.pendingCommand,
72
+ pendingRun: state.pendingRun,
56
73
  };
57
74
  }
58
75
 
@@ -138,6 +155,26 @@ export function consumeAutoresearchContinuationReminder(cwd: string): boolean {
138
155
  return needsReminder;
139
156
  }
140
157
 
158
/**
 * Stores (or clears, when `null`) the pending run for `cwd`.
 *
 * @param cwd Directory that keys the runtime state.
 * @param pendingRun Run to remember, or `null` to clear it.
 * @returns A fresh runtime snapshot taken after the update.
 */
export function setAutoresearchPendingRun(
  cwd: string,
  pendingRun: PendingExperimentRun | null,
): AutoresearchRuntimeSnapshot {
  getMutableRuntimeState(cwd).pendingRun = pendingRun;
  return getAutoresearchRuntimeState(cwd);
}
166
+
167
/**
 * Returns the pending run stored for `cwd` without clearing it.
 *
 * @param cwd Directory that keys the runtime state.
 * @returns The pending run, or `null` when there is none.
 */
export function getAutoresearchPendingRun(cwd: string): PendingExperimentRun | null {
  const { pendingRun } = getMutableRuntimeState(cwd);
  return pendingRun;
}
170
+
171
/**
 * Takes the pending run for `cwd`: returns it and resets the slot to `null`.
 *
 * @param cwd Directory that keys the runtime state.
 * @returns The run that was pending, or `null` when there was none.
 */
export function consumeAutoresearchPendingRun(cwd: string): PendingExperimentRun | null {
  const runtime = getMutableRuntimeState(cwd);
  const { pendingRun: taken } = runtime;
  runtime.pendingRun = null;
  return taken;
}
177
+
141
178
  export function clearAutoresearchRuntimeState(cwd: string): void {
142
179
  runtimeStates.delete(cwd);
143
180
  }
@@ -0,0 +1,104 @@
1
+ import * as fs from "node:fs";
2
+ import { AUTORESEARCH_ROOT_FILES, getAutoresearchRootFilePath } from "./files.js";
3
+ import type { AutoresearchCheckpoint } from "./checkpoint.js";
4
+ import { formatConfidenceLine } from "./confidence.js";
5
+
6
/**
 * Rewrites the plugin-managed sections of the session doc from a checkpoint.
 *
 * The existing file is read if present, and a title is ensured. The
 * "Metrics", "How to Run", "What's Been Tried" and "Plugin Checkpoint"
 * sections are then inserted or replaced in that order, and the result is
 * written back with exactly one trailing newline.
 *
 * @param cwd Directory that owns the session doc.
 * @param checkpoint Checkpoint whose contents drive the managed sections.
 */
export function syncAutoresearchSessionDoc(
  cwd: string,
  checkpoint: AutoresearchCheckpoint,
): void {
  const { session } = checkpoint;
  const target = getAutoresearchRootFilePath(cwd, "sessionDoc");
  const current = fs.existsSync(target) ? fs.readFileSync(target, "utf8") : "";

  const primaryLine = `- **Primary**: ${session.metricName} (${session.metricUnit || "unitless"}, ${session.bestDirection} is better)`;
  const howToRun = `\`${AUTORESEARCH_ROOT_FILES.runnerScript}\` — should emit \`METRIC name=number\` lines for ${session.metricName}.`;

  const managedSections: ReadonlyArray<readonly [string, string]> = [
    ["## Metrics", primaryLine],
    ["## How to Run", howToRun],
    ["## What's Been Tried", buildTriedSection(checkpoint)],
    ["## Plugin Checkpoint", buildCheckpointSection(checkpoint)],
  ];

  const synced = managedSections.reduce(
    (doc, [heading, body]) => upsertSection(doc, heading, body),
    ensureTitle(current, session.name),
  );

  fs.writeFileSync(target, `${synced.trimEnd()}\n`);
}
33
+
34
/**
 * Guarantees that the session doc carries a level-one heading.
 *
 * @param doc Current document text; may be empty.
 * @param sessionName Session name for a generated title; `"Session"` is used
 *   when it is `null`.
 * @returns A generated title line for a blank document; the trimmed document
 *   unchanged when any line starts with `#` followed by whitespace; otherwise
 *   the trimmed document with a generated title prepended.
 */
function ensureTitle(doc: string, sessionName: string | null): string {
  const content = doc.trim();
  const generatedTitle = `# Autoresearch: ${sessionName ?? "Session"}`;

  if (content.length === 0) {
    return `${generatedTitle}\n`;
  }

  const hasTopLevelHeading = /^#\s+/m.test(content);
  return hasTopLevelHeading ? content : `${generatedTitle}\n\n${content}`;
}
46
+
47
/**
 * Replaces the body of a `##` section, or appends the section when the
 * heading is not present.
 *
 * A section runs from its heading line up to the next line that starts with
 * `##` followed by whitespace, or to the end of the document.
 *
 * @param doc Markdown document to update.
 * @param heading Full heading line, e.g. `"## Metrics"`.
 * @param body New section body; it is trimmed before being inserted.
 * @returns The updated document.
 */
function upsertSection(doc: string, heading: string, body: string): string {
  const escapedHeading = heading.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // JavaScript has no `\Z` anchor (it would match a literal "Z"), and with the
  // `m` flag `$` matches at every line end, so end-of-input is expressed as
  // "no character follows": `(?![\s\S])`.
  const sectionRe = new RegExp(
    `^${escapedHeading}\\n[\\s\\S]*?(?=^##\\s|(?![\\s\\S]))`,
    "m",
  );
  const rendered = `${heading}\n${body.trim()}\n\n`;

  if (sectionRe.test(doc)) {
    // A replacer function keeps `$&`, `$1`, ... in the body from being
    // interpreted as replacement patterns.
    return doc.replace(sectionRe, () => rendered);
  }

  return `${doc.trimEnd()}\n\n${rendered}`;
}
58
+
59
/**
 * Renders the "What's Been Tried" list: one bullet per recent logged run.
 *
 * @param checkpoint Checkpoint providing the runs and the metric unit.
 * @returns A placeholder bullet when there are no runs; otherwise one line per
 *   run with its number, status, metric (with the unit appended directly),
 *   commit and description.
 */
function buildTriedSection(checkpoint: AutoresearchCheckpoint): string {
  const runs = checkpoint.recentLoggedRuns;
  if (runs.length === 0) {
    return "- No logged experiments yet.";
  }

  const bullets: string[] = [];
  for (const run of runs) {
    const unit = checkpoint.session.metricUnit;
    // A missing or empty unit adds no suffix to the metric.
    const suffix = unit && unit.length > 0 ? unit : "";
    bullets.push(
      `- #${run.run} ${run.status} ${run.metric}${suffix} ${run.commit} — ${run.description}`,
    );
  }

  return bullets.join("\n");
}
73
+
74
/**
 * Renders the "Plugin Checkpoint" bullet list.
 *
 * The list always contains the update time (ISO 8601), run counts, baseline,
 * best kept metric and confidence. The last logged run and the pending run
 * are each added only when present.
 *
 * @param checkpoint Checkpoint to summarise.
 * @returns The bullets joined with newlines.
 */
function buildCheckpointSection(checkpoint: AutoresearchCheckpoint): string {
  const { session, lastLoggedRun, pendingRun, updatedAt } = checkpoint;
  const unit = session.metricUnit;

  const entries: string[] = [
    `Last updated: ${new Date(updatedAt).toISOString()}`,
    `Runs tracked: ${session.currentRunCount} current / ${session.totalRunCount} total`,
    `Baseline: ${formatMetric(session.currentBaselineMetric, unit)}`,
    `Best kept: ${formatMetric(session.currentBestMetric, unit)}`,
    formatConfidenceLine(session.confidence),
  ];

  if (lastLoggedRun) {
    entries.push(
      `Last logged run: #${lastLoggedRun.run} ${lastLoggedRun.status} ${lastLoggedRun.commit} — ${lastLoggedRun.description}`,
    );
  }

  if (pendingRun) {
    entries.push(
      `Pending run awaiting log_experiment: ${pendingRun.command} (${formatMetric(pendingRun.primaryMetric, unit)})`,
    );
  }

  return entries.map((entry) => `- ${entry}`).join("\n");
}
97
+
98
/**
 * Formats a metric value with its unit appended directly (no separator).
 *
 * @param value Metric value, or `null` when unavailable.
 * @param unit Unit suffix; may be empty.
 * @returns `"n/a"` for `null`, otherwise the value followed by the unit.
 */
function formatMetric(value: number | null, unit: string): string {
  return value === null ? "n/a" : `${value}${unit}`;
}