@oh-my-pi/pi-coding-agent 14.5.14 → 14.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +39 -0
- package/package.json +7 -7
- package/src/autoresearch/command-resume.md +5 -8
- package/src/autoresearch/git.ts +41 -51
- package/src/autoresearch/helpers.ts +43 -359
- package/src/autoresearch/index.ts +281 -273
- package/src/autoresearch/prompt-setup.md +43 -0
- package/src/autoresearch/prompt.md +52 -193
- package/src/autoresearch/resume-message.md +2 -8
- package/src/autoresearch/state.ts +59 -166
- package/src/autoresearch/storage.ts +687 -0
- package/src/autoresearch/tools/init-experiment.ts +201 -290
- package/src/autoresearch/tools/log-experiment.ts +304 -517
- package/src/autoresearch/tools/run-experiment.ts +117 -296
- package/src/autoresearch/tools/update-notes.ts +116 -0
- package/src/autoresearch/types.ts +16 -66
- package/src/config/settings-schema.ts +1 -1
- package/src/config/settings.ts +20 -1
- package/src/cursor.ts +1 -1
- package/src/edit/index.ts +9 -31
- package/src/edit/line-hash.ts +70 -43
- package/src/edit/modes/hashline.lark +26 -0
- package/src/edit/modes/hashline.ts +898 -1099
- package/src/edit/modes/patch.ts +0 -7
- package/src/edit/modes/replace.ts +0 -4
- package/src/edit/renderer.ts +22 -20
- package/src/edit/streaming.ts +8 -28
- package/src/eval/eval.lark +24 -30
- package/src/eval/js/context-manager.ts +5 -162
- package/src/eval/js/prelude.txt +0 -12
- package/src/eval/parse.ts +129 -129
- package/src/eval/py/prelude.py +1 -219
- package/src/export/html/template.generated.ts +1 -1
- package/src/export/html/template.js +2 -2
- package/src/internal-urls/docs-index.generated.ts +1 -1
- package/src/modes/components/session-observer-overlay.ts +5 -2
- package/src/modes/components/status-line/segments.ts +1 -1
- package/src/modes/components/status-line.ts +3 -5
- package/src/modes/components/tree-selector.ts +4 -5
- package/src/modes/components/welcome.ts +11 -1
- package/src/modes/controllers/command-controller.ts +2 -6
- package/src/modes/controllers/event-controller.ts +1 -2
- package/src/modes/controllers/extension-ui-controller.ts +3 -15
- package/src/modes/controllers/input-controller.ts +0 -1
- package/src/modes/controllers/selector-controller.ts +1 -1
- package/src/modes/interactive-mode.ts +5 -7
- package/src/prompts/system/system-prompt.md +14 -38
- package/src/prompts/tools/ast-edit.md +8 -8
- package/src/prompts/tools/ast-grep.md +10 -10
- package/src/prompts/tools/eval.md +13 -31
- package/src/prompts/tools/find.md +2 -1
- package/src/prompts/tools/hashline.md +66 -57
- package/src/prompts/tools/search.md +2 -2
- package/src/session/session-manager.ts +17 -13
- package/src/tools/ast-edit.ts +141 -44
- package/src/tools/ast-grep.ts +112 -36
- package/src/tools/eval.ts +2 -53
- package/src/tools/find.ts +16 -15
- package/src/tools/path-utils.ts +36 -196
- package/src/tools/search.ts +56 -35
- package/src/utils/edit-mode.ts +2 -11
- package/src/utils/file-display-mode.ts +1 -1
- package/src/utils/git.ts +17 -0
- package/src/utils/session-color.ts +0 -12
- package/src/utils/title-generator.ts +22 -38
- package/src/autoresearch/apply-contract-to-state.ts +0 -24
- package/src/autoresearch/contract.ts +0 -288
- package/src/edit/modes/atom.lark +0 -29
- package/src/edit/modes/atom.ts +0 -1773
- package/src/prompts/tools/atom.md +0 -150
|
@@ -2,36 +2,21 @@ import * as fs from "node:fs";
|
|
|
2
2
|
import * as path from "node:path";
|
|
3
3
|
import { StringEnum } from "@oh-my-pi/pi-ai";
|
|
4
4
|
import { Text } from "@oh-my-pi/pi-tui";
|
|
5
|
-
import { logger } from "@oh-my-pi/pi-utils";
|
|
6
5
|
import { Type } from "@sinclair/typebox";
|
|
7
6
|
import type { ToolDefinition } from "../../extensibility/extensions";
|
|
8
7
|
import type { Theme } from "../../modes/theme/theme";
|
|
9
8
|
import { replaceTabs, truncateToWidth } from "../../tools/render-utils";
|
|
10
9
|
import * as git from "../../utils/git";
|
|
11
|
-
import {
|
|
12
|
-
import {
|
|
13
|
-
import { computeRunModifiedPaths, getCurrentAutoresearchBranch, parseWorkDirDirtyPathsWithStatus } from "../git";
|
|
10
|
+
import { computeRunModifiedPaths, getCurrentAutoresearchBranch, parseWorkDirDirtyPaths } from "../git";
|
|
11
|
+
import { ensureNumericMetricMap, formatNum, mergeAsi, pathMatchesSpec, sanitizeAsi } from "../helpers";
|
|
14
12
|
import {
|
|
15
|
-
|
|
16
|
-
formatNum,
|
|
17
|
-
inferMetricUnitFromName,
|
|
18
|
-
isAutoresearchCommittableFile,
|
|
19
|
-
isAutoresearchLocalStatePath,
|
|
20
|
-
isAutoresearchShCommand,
|
|
21
|
-
isBetter,
|
|
22
|
-
mergeAsi,
|
|
23
|
-
readPendingRunSummary,
|
|
24
|
-
resolveWorkDir,
|
|
25
|
-
validateWorkDir,
|
|
26
|
-
} from "../helpers";
|
|
27
|
-
import {
|
|
28
|
-
cloneExperimentState,
|
|
13
|
+
buildExperimentState,
|
|
29
14
|
computeConfidence,
|
|
30
15
|
currentResults,
|
|
31
|
-
findBaselineMetric,
|
|
32
16
|
findBaselineSecondary,
|
|
33
17
|
findBestKeptMetric,
|
|
34
18
|
} from "../state";
|
|
19
|
+
import { openAutoresearchStorageIfExists, type SessionRow } from "../storage";
|
|
35
20
|
import type {
|
|
36
21
|
ASIData,
|
|
37
22
|
AutoresearchToolFactoryOptions,
|
|
@@ -41,50 +26,50 @@ import type {
|
|
|
41
26
|
NumericMetricMap,
|
|
42
27
|
} from "../types";
|
|
43
28
|
|
|
44
|
-
const EXPERIMENT_TOOL_NAMES = ["init_experiment", "run_experiment", "log_experiment"];
|
|
29
|
+
const EXPERIMENT_TOOL_NAMES = ["init_experiment", "run_experiment", "log_experiment", "update_notes"];
|
|
45
30
|
|
|
46
31
|
const logExperimentSchema = Type.Object({
|
|
47
|
-
commit: Type.String({
|
|
48
|
-
description: "Current git commit hash or placeholder.",
|
|
49
|
-
}),
|
|
50
32
|
metric: Type.Number({
|
|
51
|
-
description: "Primary metric value for this run.",
|
|
33
|
+
description: "Primary metric value for this run. May differ from the parsed value; deviation is recorded.",
|
|
52
34
|
}),
|
|
53
35
|
status: StringEnum(["keep", "discard", "crash", "checks_failed"], {
|
|
54
36
|
description: "Outcome for this run.",
|
|
55
37
|
}),
|
|
56
|
-
description: Type.String({
|
|
57
|
-
description: "Short description of the experiment.",
|
|
58
|
-
}),
|
|
38
|
+
description: Type.String({ description: "Short description of the experiment." }),
|
|
59
39
|
metrics: Type.Optional(
|
|
60
|
-
Type.Record(Type.String(), Type.Number(), {
|
|
61
|
-
description: "Secondary metrics for this run.",
|
|
62
|
-
}),
|
|
40
|
+
Type.Record(Type.String(), Type.Number(), { description: "Secondary metrics for this run." }),
|
|
63
41
|
),
|
|
64
|
-
|
|
65
|
-
Type.
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
42
|
+
asi: Type.Optional(
|
|
43
|
+
Type.Object(
|
|
44
|
+
{},
|
|
45
|
+
{
|
|
46
|
+
additionalProperties: Type.Unknown(),
|
|
47
|
+
description: "Free-form structured metadata captured for this run (hypothesis, learnings, etc.).",
|
|
48
|
+
},
|
|
49
|
+
),
|
|
69
50
|
),
|
|
70
|
-
|
|
71
|
-
Type.
|
|
51
|
+
commit: Type.Optional(
|
|
52
|
+
Type.String({ description: "Override the commit hash recorded for this run. Defaults to the current HEAD." }),
|
|
53
|
+
),
|
|
54
|
+
justification: Type.Optional(
|
|
55
|
+
Type.String({
|
|
72
56
|
description:
|
|
73
|
-
"
|
|
57
|
+
"Required when the run modifies paths outside scope or inside off-limits and you still want it kept. Free-form explanation.",
|
|
74
58
|
}),
|
|
75
59
|
),
|
|
76
|
-
|
|
77
|
-
Type.
|
|
78
|
-
|
|
79
|
-
|
|
60
|
+
flag_runs: Type.Optional(
|
|
61
|
+
Type.Array(
|
|
62
|
+
Type.Object({
|
|
63
|
+
run_id: Type.Number({ description: "Run id (#) of a previously logged run to flag as suspect." }),
|
|
64
|
+
reason: Type.String({
|
|
65
|
+
description: "Why this earlier run is suspect (e.g. reward-hacked, broken metric).",
|
|
66
|
+
}),
|
|
67
|
+
}),
|
|
68
|
+
{ description: "Mark earlier runs as flagged. Flagged runs are excluded from baseline and best-metric math." },
|
|
69
|
+
),
|
|
80
70
|
),
|
|
81
71
|
});
|
|
82
72
|
|
|
83
|
-
interface KeepCommitResult {
|
|
84
|
-
error?: string;
|
|
85
|
-
note?: string;
|
|
86
|
-
}
|
|
87
|
-
|
|
88
73
|
export function createLogExperimentTool(
|
|
89
74
|
options: AutoresearchToolFactoryOptions,
|
|
90
75
|
): ToolDefinition<typeof logExperimentSchema, LogDetails> {
|
|
@@ -92,189 +77,111 @@ export function createLogExperimentTool(
|
|
|
92
77
|
name: "log_experiment",
|
|
93
78
|
label: "Log Experiment",
|
|
94
79
|
description:
|
|
95
|
-
"Log the
|
|
80
|
+
"Log the result of the latest run_experiment. Records the metric, optional ASI metadata, modified paths, and scope deviations. On `keep`, modified files are committed; on `discard`/`crash`/`checks_failed`, the worktree is reverted. Pass `flag_runs` to mark earlier runs as suspect; flagged runs are excluded from baseline and best-metric math.",
|
|
96
81
|
parameters: logExperimentSchema,
|
|
97
82
|
defaultInactive: true,
|
|
98
83
|
async execute(_toolCallId, params, _signal, _onUpdate, ctx) {
|
|
99
|
-
const
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
};
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
const runtime = options.getRuntime(ctx);
|
|
107
|
-
const state = runtime.state;
|
|
108
|
-
const workDir = resolveWorkDir(ctx.cwd);
|
|
109
|
-
|
|
110
|
-
const contractResult = readAutoresearchContract(workDir);
|
|
111
|
-
const scriptSnapshot = loadAutoresearchScriptSnapshot(workDir);
|
|
112
|
-
const contractErrors = [...contractResult.errors, ...scriptSnapshot.errors];
|
|
113
|
-
if (contractErrors.length > 0) {
|
|
114
|
-
return {
|
|
115
|
-
content: [{ type: "text", text: `Error: ${contractErrors.join(" ")}` }],
|
|
116
|
-
};
|
|
117
|
-
}
|
|
118
|
-
const benchmarkForSync = contractResult.contract.benchmark;
|
|
119
|
-
if (benchmarkForSync.command && !isAutoresearchShCommand(benchmarkForSync.command)) {
|
|
84
|
+
const storage = await openAutoresearchStorageIfExists(ctx.cwd);
|
|
85
|
+
const currentBranch = (await git.branch.current(ctx.cwd)) ?? null;
|
|
86
|
+
const session = storage?.getActiveSessionForBranch(currentBranch) ?? null;
|
|
87
|
+
if (!storage || !session) {
|
|
120
88
|
return {
|
|
121
89
|
content: [
|
|
122
90
|
{
|
|
123
91
|
type: "text",
|
|
124
|
-
text:
|
|
125
|
-
"Error: Benchmark.command in autoresearch.md must invoke `autoresearch.sh` directly before logging. " +
|
|
126
|
-
"Fix autoresearch.md or move the workload into autoresearch.sh.",
|
|
92
|
+
text: "Error: no active autoresearch session for the current branch. Call init_experiment first.",
|
|
127
93
|
},
|
|
128
94
|
],
|
|
129
95
|
};
|
|
130
96
|
}
|
|
131
|
-
|
|
132
|
-
const pendingRun =
|
|
133
|
-
runtime.lastRunSummary ?? (await readPendingRunSummary(workDir, collectLoggedRunNumbers(state.results)));
|
|
97
|
+
const pendingRun = storage.getPendingRun(session.id);
|
|
134
98
|
if (!pendingRun) {
|
|
135
99
|
return {
|
|
136
|
-
content: [{ type: "text", text: "Error: no
|
|
137
|
-
};
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
applyAutoresearchContractToExperimentState(contractResult.contract, state);
|
|
141
|
-
const logPreamble =
|
|
142
|
-
"Refreshed session fields from autoresearch.md before logging (benchmark, scope, constraints).\n\n";
|
|
143
|
-
runtime.lastRunSummary = pendingRun;
|
|
144
|
-
runtime.lastRunAsi = pendingRun.parsedAsi;
|
|
145
|
-
runtime.lastRunChecks =
|
|
146
|
-
pendingRun.checksPass === null
|
|
147
|
-
? null
|
|
148
|
-
: {
|
|
149
|
-
pass: pendingRun.checksPass,
|
|
150
|
-
output: "",
|
|
151
|
-
duration: pendingRun.checksDurationSeconds ?? 0,
|
|
152
|
-
};
|
|
153
|
-
runtime.lastRunDuration = pendingRun.durationSeconds;
|
|
154
|
-
|
|
155
|
-
if (pendingRun.parsedPrimary !== null && params.metric !== pendingRun.parsedPrimary) {
|
|
156
|
-
return {
|
|
157
|
-
content: [
|
|
158
|
-
{
|
|
159
|
-
type: "text",
|
|
160
|
-
text:
|
|
161
|
-
"Error: metric does not match the parsed primary metric from the pending run.\n" +
|
|
162
|
-
`Expected: ${pendingRun.parsedPrimary}\nReceived: ${params.metric}`,
|
|
163
|
-
},
|
|
164
|
-
],
|
|
165
|
-
};
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
if (params.status === "keep" && !pendingRun.passed) {
|
|
169
|
-
return {
|
|
170
|
-
content: [
|
|
171
|
-
{
|
|
172
|
-
type: "text",
|
|
173
|
-
text: "Error: cannot keep this run because the pending benchmark did not pass. Log it as crash or checks_failed instead.",
|
|
174
|
-
},
|
|
175
|
-
],
|
|
176
|
-
};
|
|
177
|
-
}
|
|
178
|
-
|
|
179
|
-
if (params.status === "keep" && runtime.lastRunChecks && !runtime.lastRunChecks.pass) {
|
|
180
|
-
return {
|
|
181
|
-
content: [
|
|
182
|
-
{
|
|
183
|
-
type: "text",
|
|
184
|
-
text: "Error: cannot keep this run because autoresearch.checks.sh failed. Log it as checks_failed instead.",
|
|
185
|
-
},
|
|
186
|
-
],
|
|
187
|
-
};
|
|
188
|
-
}
|
|
189
|
-
|
|
190
|
-
const observedStatusError = validateObservedStatus(params.status, pendingRun);
|
|
191
|
-
if (observedStatusError) {
|
|
192
|
-
return {
|
|
193
|
-
content: [{ type: "text", text: `Error: ${observedStatusError}` }],
|
|
100
|
+
content: [{ type: "text", text: "Error: no pending run available. Run run_experiment first." }],
|
|
194
101
|
};
|
|
195
102
|
}
|
|
196
103
|
|
|
197
|
-
const
|
|
198
|
-
const secondaryMetrics = buildSecondaryMetrics(params.metrics, pendingRun.parsedMetrics, state.metricName);
|
|
104
|
+
const runtime = options.getRuntime(ctx);
|
|
199
105
|
|
|
200
|
-
const
|
|
201
|
-
|
|
202
|
-
const
|
|
203
|
-
if (
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
};
|
|
207
|
-
}
|
|
106
|
+
const flaggedRuns: LogDetails["flaggedRuns"] = [];
|
|
107
|
+
for (const flag of params.flag_runs ?? []) {
|
|
108
|
+
const target = storage.getRunById(flag.run_id);
|
|
109
|
+
if (!target || target.sessionId !== session.id) continue;
|
|
110
|
+
storage.flagRun(flag.run_id, flag.reason);
|
|
111
|
+
flaggedRuns.push({ runId: flag.run_id, reason: flag.reason });
|
|
208
112
|
}
|
|
209
113
|
|
|
210
|
-
const
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
type: "text",
|
|
230
|
-
text:
|
|
231
|
-
"Error: cannot keep this run because the primary metric regressed.\n" +
|
|
232
|
-
`Current best: ${currentBestMetric}\nReceived: ${params.metric}`,
|
|
233
|
-
},
|
|
234
|
-
],
|
|
235
|
-
};
|
|
236
|
-
}
|
|
237
|
-
keepScopeValidation = scopeValidation;
|
|
114
|
+
const branchName = await getCurrentAutoresearchBranch(options.pi, ctx.cwd);
|
|
115
|
+
const onAutoresearchBranch = branchName !== null;
|
|
116
|
+
|
|
117
|
+
let allModified: string[];
|
|
118
|
+
if (onAutoresearchBranch) {
|
|
119
|
+
// On a dedicated autoresearch branch every iteration starts from a clean
|
|
120
|
+
// worktree (init_experiment baseline + previous keep commit / discard reset),
|
|
121
|
+
// so any currently-dirty path is the agent's iteration change. Off-branch we
|
|
122
|
+
// can't tell user dirt apart from agent edits, so we keep the (lossy)
|
|
123
|
+
// preRunDirtyPaths filter.
|
|
124
|
+
const statusText = await tryGitStatus(ctx.cwd);
|
|
125
|
+
const workDirPrefix = await tryGitPrefix(ctx.cwd);
|
|
126
|
+
allModified = parseWorkDirDirtyPaths(statusText, workDirPrefix);
|
|
127
|
+
} else {
|
|
128
|
+
const { modifiedTracked, modifiedUntracked } = await detectModifiedPaths(
|
|
129
|
+
ctx.cwd,
|
|
130
|
+
pendingRun.preRunDirtyPaths,
|
|
131
|
+
);
|
|
132
|
+
allModified = [...modifiedTracked, ...modifiedUntracked];
|
|
238
133
|
}
|
|
134
|
+
const scopeDeviations = computeScopeDeviations(allModified, session);
|
|
239
135
|
|
|
240
|
-
const
|
|
241
|
-
|
|
242
|
-
commit: params.commit.slice(0, 7),
|
|
243
|
-
metric: params.metric,
|
|
244
|
-
metrics: secondaryMetrics,
|
|
245
|
-
status: params.status,
|
|
246
|
-
description: params.description,
|
|
247
|
-
timestamp: Date.now(),
|
|
248
|
-
segment: state.currentSegment,
|
|
249
|
-
confidence: null,
|
|
250
|
-
asi: mergedAsi,
|
|
251
|
-
};
|
|
136
|
+
const justification = params.justification?.trim() || null;
|
|
137
|
+
const warnings: string[] = [];
|
|
252
138
|
|
|
253
|
-
const
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
content: [
|
|
257
|
-
{
|
|
258
|
-
type: "text",
|
|
259
|
-
text:
|
|
260
|
-
"Error: autoresearch keep/discard actions require an active `autoresearch/...` branch. " +
|
|
261
|
-
"Run `/autoresearch` again to restore the protected branch before logging this run.",
|
|
262
|
-
},
|
|
263
|
-
],
|
|
264
|
-
};
|
|
265
|
-
}
|
|
139
|
+
const headSha = await tryReadHeadSha(ctx.cwd);
|
|
140
|
+
const explicitCommit = params.commit?.trim();
|
|
141
|
+
let commitHash = explicitCommit && explicitCommit.length > 0 ? explicitCommit : headSha;
|
|
266
142
|
|
|
267
143
|
let gitNote: string | null = null;
|
|
268
144
|
if (params.status === "keep") {
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
145
|
+
if (onAutoresearchBranch && allModified.length > 0) {
|
|
146
|
+
const commitResult = await commitKeptExperiment(
|
|
147
|
+
ctx.cwd,
|
|
148
|
+
params.description,
|
|
149
|
+
params.status,
|
|
150
|
+
params.metric,
|
|
151
|
+
params.metrics ?? {},
|
|
152
|
+
allModified,
|
|
153
|
+
session.primaryMetric,
|
|
154
|
+
);
|
|
155
|
+
if (commitResult.error) {
|
|
156
|
+
return {
|
|
157
|
+
content: [{ type: "text", text: `Error: ${commitResult.error}` }],
|
|
158
|
+
};
|
|
159
|
+
}
|
|
160
|
+
gitNote = commitResult.note ?? null;
|
|
161
|
+
const newSha = await tryReadHeadSha(ctx.cwd);
|
|
162
|
+
if (newSha) commitHash = newSha;
|
|
163
|
+
} else if (!onAutoresearchBranch) {
|
|
164
|
+
warnings.push(
|
|
165
|
+
"Auto-commit skipped: not on a dedicated autoresearch branch. Modified files remain in the worktree.",
|
|
166
|
+
);
|
|
167
|
+
} else if (allModified.length === 0) {
|
|
168
|
+
gitNote = "nothing to commit";
|
|
169
|
+
}
|
|
170
|
+
if (scopeDeviations.length > 0) {
|
|
171
|
+
if (justification === null) {
|
|
172
|
+
warnings.push(
|
|
173
|
+
`Kept with unjustified scope deviations: ${scopeDeviations.join(", ")}. Pass \`justification\` next time or \`flag_runs\` this entry on a future log_experiment if it was a mistake.`,
|
|
174
|
+
);
|
|
175
|
+
} else {
|
|
176
|
+
warnings.push(`Kept with scope deviations (justified): ${scopeDeviations.join(", ")}`);
|
|
177
|
+
}
|
|
274
178
|
}
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
179
|
+
} else {
|
|
180
|
+
const revertResult = await revertFailedExperiment(
|
|
181
|
+
ctx.cwd,
|
|
182
|
+
pendingRun.preRunDirtyPaths,
|
|
183
|
+
onAutoresearchBranch,
|
|
184
|
+
);
|
|
278
185
|
if (revertResult.error) {
|
|
279
186
|
return {
|
|
280
187
|
content: [{ type: "text", text: `Error: ${revertResult.error}` }],
|
|
@@ -283,57 +190,78 @@ export function createLogExperimentTool(
|
|
|
283
190
|
gitNote = revertResult.note ?? null;
|
|
284
191
|
}
|
|
285
192
|
|
|
286
|
-
const
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
options.dashboard.updateWidget(ctx, runtime);
|
|
299
|
-
options.dashboard.requestRender();
|
|
300
|
-
throw error;
|
|
301
|
-
}
|
|
302
|
-
try {
|
|
303
|
-
await updateRunMetadata(runtime.lastRunArtifactDir ?? pendingRun.runDirectory, {
|
|
304
|
-
commit: experiment.commit,
|
|
305
|
-
confidence: experiment.confidence,
|
|
306
|
-
description: experiment.description,
|
|
307
|
-
gitNote,
|
|
308
|
-
loggedAt: new Date(experiment.timestamp).toISOString(),
|
|
309
|
-
loggedAsi: experiment.asi,
|
|
310
|
-
loggedMetric: experiment.metric,
|
|
311
|
-
loggedMetrics: experiment.metrics,
|
|
312
|
-
runNumber: runtime.lastRunNumber ?? pendingRun.runNumber,
|
|
313
|
-
status: experiment.status,
|
|
314
|
-
wallClockSeconds,
|
|
315
|
-
});
|
|
316
|
-
} catch (error) {
|
|
317
|
-
logger.warn("Failed to update autoresearch run metadata after persisting JSONL history", {
|
|
318
|
-
error: error instanceof Error ? error.message : String(error),
|
|
319
|
-
runDirectory: runtime.lastRunArtifactDir ?? pendingRun.runDirectory,
|
|
320
|
-
runNumber: runtime.lastRunNumber ?? pendingRun.runNumber,
|
|
321
|
-
});
|
|
193
|
+
const metric = params.metric;
|
|
194
|
+
const secondaryMetrics: NumericMetricMap = mergeMetrics(
|
|
195
|
+
pendingRun.parsedMetrics,
|
|
196
|
+
params.metrics,
|
|
197
|
+
session.primaryMetric,
|
|
198
|
+
);
|
|
199
|
+
const asi: ASIData | undefined = mergeAsi(pendingRun.parsedAsi, sanitizeAsi(params.asi));
|
|
200
|
+
|
|
201
|
+
if (pendingRun.parsedPrimary !== null && metric !== pendingRun.parsedPrimary) {
|
|
202
|
+
warnings.push(
|
|
203
|
+
`Logged metric ${metric} differs from parsed primary ${pendingRun.parsedPrimary}. Both values stored.`,
|
|
204
|
+
);
|
|
322
205
|
}
|
|
323
206
|
|
|
207
|
+
const loggedAt = Date.now();
|
|
208
|
+
const tentativeRun = storage.markRunLogged({
|
|
209
|
+
runId: pendingRun.id,
|
|
210
|
+
status: params.status,
|
|
211
|
+
description: params.description,
|
|
212
|
+
metric,
|
|
213
|
+
metrics: secondaryMetrics,
|
|
214
|
+
asi: asi ?? null,
|
|
215
|
+
commitHash,
|
|
216
|
+
confidence: null,
|
|
217
|
+
modifiedPaths: allModified,
|
|
218
|
+
scopeDeviations,
|
|
219
|
+
justification,
|
|
220
|
+
loggedAt,
|
|
221
|
+
});
|
|
222
|
+
|
|
223
|
+
// Recompute confidence with this run included
|
|
224
|
+
const refreshedSession = storage.getSessionById(session.id) ?? session;
|
|
225
|
+
const loggedRuns = storage.listLoggedRuns(session.id);
|
|
226
|
+
const stateForConfidence = buildExperimentState(refreshedSession, loggedRuns);
|
|
227
|
+
const confidence = computeConfidence(
|
|
228
|
+
stateForConfidence.results,
|
|
229
|
+
stateForConfidence.currentSegment,
|
|
230
|
+
stateForConfidence.bestDirection,
|
|
231
|
+
);
|
|
232
|
+
storage.updateRunConfidence(tentativeRun.id, confidence);
|
|
233
|
+
|
|
234
|
+
const finalState = buildExperimentState(refreshedSession, storage.listLoggedRuns(session.id));
|
|
235
|
+
runtime.state = finalState;
|
|
324
236
|
runtime.runningExperiment = null;
|
|
325
|
-
runtime.
|
|
237
|
+
runtime.lastRunSummary = null;
|
|
326
238
|
runtime.lastRunDuration = null;
|
|
327
239
|
runtime.lastRunAsi = null;
|
|
328
240
|
runtime.lastRunArtifactDir = null;
|
|
329
241
|
runtime.lastRunNumber = null;
|
|
330
|
-
runtime.lastRunSummary = null;
|
|
331
242
|
runtime.autoResumeArmed = true;
|
|
332
243
|
runtime.lastAutoResumePendingRunNumber = null;
|
|
333
244
|
|
|
334
|
-
const
|
|
335
|
-
|
|
336
|
-
|
|
245
|
+
const experiment: ExperimentResult = {
|
|
246
|
+
runNumber: tentativeRun.id,
|
|
247
|
+
commit: (commitHash ?? "").slice(0, 12),
|
|
248
|
+
metric,
|
|
249
|
+
metrics: secondaryMetrics,
|
|
250
|
+
status: params.status,
|
|
251
|
+
description: params.description,
|
|
252
|
+
timestamp: loggedAt,
|
|
253
|
+
segment: pendingRun.segment,
|
|
254
|
+
confidence,
|
|
255
|
+
asi,
|
|
256
|
+
modifiedPaths: allModified,
|
|
257
|
+
scopeDeviations,
|
|
258
|
+
justification,
|
|
259
|
+
flagged: false,
|
|
260
|
+
flaggedReason: null,
|
|
261
|
+
};
|
|
262
|
+
|
|
263
|
+
const segmentRunCount = currentResults(finalState.results, finalState.currentSegment).length;
|
|
264
|
+
if (finalState.maxExperiments !== null && segmentRunCount >= finalState.maxExperiments) {
|
|
337
265
|
runtime.autoresearchMode = false;
|
|
338
266
|
options.pi.appendEntry(
|
|
339
267
|
"autoresearch-control",
|
|
@@ -343,19 +271,30 @@ export function createLogExperimentTool(
|
|
|
343
271
|
options.pi.getActiveTools().filter(name => !EXPERIMENT_TOOL_NAMES.includes(name)),
|
|
344
272
|
);
|
|
345
273
|
}
|
|
274
|
+
|
|
346
275
|
options.dashboard.updateWidget(ctx, runtime);
|
|
347
276
|
options.dashboard.requestRender();
|
|
348
277
|
|
|
278
|
+
const wallClockSeconds = pendingRun.durationMs !== null ? pendingRun.durationMs / 1000 : null;
|
|
279
|
+
const text = buildLogText(
|
|
280
|
+
finalState,
|
|
281
|
+
experiment,
|
|
282
|
+
segmentRunCount,
|
|
283
|
+
wallClockSeconds,
|
|
284
|
+
gitNote,
|
|
285
|
+
warnings,
|
|
286
|
+
flaggedRuns,
|
|
287
|
+
);
|
|
288
|
+
|
|
349
289
|
return {
|
|
350
290
|
content: [{ type: "text", text }],
|
|
351
291
|
details: {
|
|
352
|
-
experiment
|
|
353
|
-
|
|
354
|
-
metrics: { ...experiment.metrics },
|
|
355
|
-
asi: experiment.asi ? structuredClone(experiment.asi) : undefined,
|
|
356
|
-
},
|
|
357
|
-
state: cloneExperimentState(state),
|
|
292
|
+
experiment,
|
|
293
|
+
state: finalState,
|
|
358
294
|
wallClockSeconds,
|
|
295
|
+
scopeDeviations,
|
|
296
|
+
justification,
|
|
297
|
+
flaggedRuns,
|
|
359
298
|
},
|
|
360
299
|
};
|
|
361
300
|
},
|
|
@@ -373,320 +312,163 @@ export function createLogExperimentTool(
|
|
|
373
312
|
if (!details) {
|
|
374
313
|
return new Text(replaceTabs(result.content.find(part => part.type === "text")?.text ?? ""), 0, 0);
|
|
375
314
|
}
|
|
376
|
-
|
|
377
|
-
return new Text(summary, 0, 0);
|
|
315
|
+
return new Text(renderSummary(details, theme), 0, 0);
|
|
378
316
|
},
|
|
379
317
|
};
|
|
380
318
|
}
|
|
381
319
|
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
function buildSecondaryMetrics(
|
|
387
|
-
overrides: NumericMetricMap | undefined,
|
|
388
|
-
parsedMetrics: NumericMetricMap | null,
|
|
389
|
-
primaryMetricName: string,
|
|
390
|
-
): NumericMetricMap {
|
|
391
|
-
const merged: NumericMetricMap = {};
|
|
392
|
-
for (const [name, value] of Object.entries(parsedMetrics ?? {})) {
|
|
393
|
-
if (name === "__proto__" || name === "constructor" || name === "prototype") continue;
|
|
394
|
-
if (name === primaryMetricName) continue;
|
|
395
|
-
merged[name] = value;
|
|
396
|
-
}
|
|
397
|
-
for (const [name, value] of Object.entries(cloneMetrics(overrides))) {
|
|
398
|
-
if (name === "__proto__" || name === "constructor" || name === "prototype") continue;
|
|
399
|
-
merged[name] = value;
|
|
400
|
-
}
|
|
401
|
-
return merged;
|
|
402
|
-
}
|
|
403
|
-
|
|
404
|
-
function sanitizeAsi(value: { [key: string]: unknown } | undefined): ASIData | undefined {
|
|
405
|
-
if (!value) return undefined;
|
|
406
|
-
const result: ASIData = {};
|
|
407
|
-
for (const [key, entryValue] of Object.entries(value)) {
|
|
408
|
-
if (key === "__proto__" || key === "constructor" || key === "prototype") continue;
|
|
409
|
-
const sanitized = sanitizeAsiValue(entryValue);
|
|
410
|
-
if (sanitized !== undefined) {
|
|
411
|
-
result[key] = sanitized;
|
|
412
|
-
}
|
|
413
|
-
}
|
|
414
|
-
return Object.keys(result).length > 0 ? result : undefined;
|
|
415
|
-
}
|
|
416
|
-
|
|
417
|
-
function sanitizeAsiValue(value: unknown): ASIData[string] | undefined {
|
|
418
|
-
if (value === null) return null;
|
|
419
|
-
if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") return value;
|
|
420
|
-
if (Array.isArray(value)) {
|
|
421
|
-
const items = value
|
|
422
|
-
.map(item => sanitizeAsiValue(item))
|
|
423
|
-
.filter((item): item is NonNullable<typeof item> => item !== undefined);
|
|
424
|
-
return items;
|
|
425
|
-
}
|
|
426
|
-
if (typeof value === "object") {
|
|
427
|
-
const objectValue = value as { [key: string]: unknown };
|
|
428
|
-
const result: ASIData = {};
|
|
429
|
-
for (const [key, entryValue] of Object.entries(objectValue)) {
|
|
430
|
-
if (key === "__proto__" || key === "constructor" || key === "prototype") continue;
|
|
431
|
-
const sanitized = sanitizeAsiValue(entryValue);
|
|
432
|
-
if (sanitized !== undefined) {
|
|
433
|
-
result[key] = sanitized;
|
|
434
|
-
}
|
|
435
|
-
}
|
|
436
|
-
return result;
|
|
437
|
-
}
|
|
438
|
-
return undefined;
|
|
439
|
-
}
|
|
440
|
-
|
|
441
|
-
export function validateAsiRequirements(asi: ASIData | undefined, status: ExperimentResult["status"]): string | null {
|
|
442
|
-
if (!asi) {
|
|
443
|
-
return "asi is required. Include at minimum a non-empty hypothesis.";
|
|
444
|
-
}
|
|
445
|
-
if (typeof asi.hypothesis !== "string" || asi.hypothesis.trim().length === 0) {
|
|
446
|
-
return "asi.hypothesis is required and must be a non-empty string.";
|
|
447
|
-
}
|
|
448
|
-
if (status === "keep") return null;
|
|
449
|
-
if (typeof asi.rollback_reason !== "string" || asi.rollback_reason.trim().length === 0) {
|
|
450
|
-
return "asi.rollback_reason is required for discard, crash, and checks_failed results.";
|
|
451
|
-
}
|
|
452
|
-
if (typeof asi.next_action_hint !== "string" || asi.next_action_hint.trim().length === 0) {
|
|
453
|
-
return "asi.next_action_hint is required for discard, crash, and checks_failed results.";
|
|
454
|
-
}
|
|
455
|
-
return null;
|
|
456
|
-
}
|
|
457
|
-
|
|
458
|
-
function registerSecondaryMetrics(state: ExperimentState, metrics: NumericMetricMap): void {
|
|
459
|
-
for (const name of Object.keys(metrics)) {
|
|
460
|
-
if (state.secondaryMetrics.some(metric => metric.name === name)) continue;
|
|
461
|
-
state.secondaryMetrics.push({
|
|
462
|
-
name,
|
|
463
|
-
unit: inferMetricUnitFromName(name),
|
|
464
|
-
});
|
|
465
|
-
}
|
|
466
|
-
}
|
|
467
|
-
|
|
468
|
-
function persistRun(workDir: string, experiment: ExperimentResult): void {
|
|
469
|
-
const entry = {
|
|
470
|
-
run: experiment.runNumber,
|
|
471
|
-
...experiment,
|
|
472
|
-
};
|
|
473
|
-
const jsonlPath = path.join(workDir, "autoresearch.jsonl");
|
|
474
|
-
fs.appendFileSync(jsonlPath, `${JSON.stringify(entry)}\n`);
|
|
475
|
-
}
|
|
476
|
-
function validateObservedStatus(
|
|
477
|
-
status: ExperimentResult["status"],
|
|
478
|
-
pendingRun: { checksPass: boolean | null; passed: boolean },
|
|
479
|
-
): string | null {
|
|
480
|
-
if (pendingRun.checksPass === false) {
|
|
481
|
-
return status === "checks_failed"
|
|
482
|
-
? null
|
|
483
|
-
: "benchmark checks failed for the pending run. Log it as checks_failed.";
|
|
484
|
-
}
|
|
485
|
-
if (!pendingRun.passed) {
|
|
486
|
-
return status === "crash" ? null : "the pending benchmark failed. Log it as crash.";
|
|
487
|
-
}
|
|
488
|
-
return status === "keep" || status === "discard" ? null : "the pending benchmark passed. Log it as keep or discard.";
|
|
320
|
+
interface KeepCommitResult {
|
|
321
|
+
error?: string;
|
|
322
|
+
note?: string;
|
|
489
323
|
}
|
|
490
324
|
|
|
491
325
|
async function commitKeptExperiment(
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
326
|
+
cwd: string,
|
|
327
|
+
description: string,
|
|
328
|
+
status: ExperimentResult["status"],
|
|
329
|
+
metric: number,
|
|
330
|
+
metrics: NumericMetricMap,
|
|
331
|
+
files: string[],
|
|
332
|
+
primaryMetric: string,
|
|
497
333
|
): Promise<KeepCommitResult> {
|
|
498
|
-
if (
|
|
499
|
-
return { note: "nothing to commit" };
|
|
500
|
-
}
|
|
501
|
-
|
|
334
|
+
if (files.length === 0) return { note: "nothing to commit" };
|
|
502
335
|
try {
|
|
503
|
-
await git.stage.files(
|
|
336
|
+
await git.stage.files(cwd, files);
|
|
504
337
|
} catch (err) {
|
|
505
|
-
return {
|
|
506
|
-
error: `git add failed: ${err instanceof Error ? err.message : String(err)}`,
|
|
507
|
-
};
|
|
338
|
+
return { error: `git add failed: ${err instanceof Error ? err.message : String(err)}` };
|
|
508
339
|
}
|
|
509
|
-
|
|
510
|
-
if (!(await git.diff.has(workDir, { cached: true, files: scopeValidation.committablePaths }))) {
|
|
340
|
+
if (!(await git.diff.has(cwd, { cached: true, files }))) {
|
|
511
341
|
return { note: "nothing to commit" };
|
|
512
342
|
}
|
|
513
|
-
|
|
514
343
|
const payload: { [key: string]: string | number } = {
|
|
515
|
-
status
|
|
516
|
-
[
|
|
344
|
+
status,
|
|
345
|
+
[primaryMetric]: metric,
|
|
517
346
|
};
|
|
518
|
-
for (const [name, value] of Object.entries(
|
|
347
|
+
for (const [name, value] of Object.entries(metrics)) {
|
|
519
348
|
payload[name] = value;
|
|
520
349
|
}
|
|
521
|
-
const commitMessage = `${
|
|
522
|
-
let commitResultText = "";
|
|
350
|
+
const commitMessage = `${description}\n\nResult: ${JSON.stringify(payload)}`;
|
|
523
351
|
try {
|
|
524
|
-
const commitResult = await git.commit(
|
|
525
|
-
|
|
526
|
-
}
|
|
527
|
-
commitResultText = mergeStdoutStderr(commitResult);
|
|
352
|
+
const commitResult = await git.commit(cwd, commitMessage, { files });
|
|
353
|
+
const summary = `${commitResult.stdout}${commitResult.stderr}`.split("\n").find(line => line.trim().length > 0);
|
|
354
|
+
return { note: summary?.trim() ?? "committed" };
|
|
528
355
|
} catch (err) {
|
|
529
|
-
return {
|
|
530
|
-
error: `git commit failed: ${err instanceof Error ? err.message : String(err)}`,
|
|
531
|
-
};
|
|
532
|
-
}
|
|
533
|
-
|
|
534
|
-
const newCommit = (await git.head.short(workDir, 7)) ?? "";
|
|
535
|
-
if (newCommit.length >= 7) {
|
|
536
|
-
experiment.commit = newCommit;
|
|
356
|
+
return { error: `git commit failed: ${err instanceof Error ? err.message : String(err)}` };
|
|
537
357
|
}
|
|
538
|
-
const summaryLine = commitResultText.split("\n").find(line => line.trim().length > 0) ?? "committed";
|
|
539
|
-
return { note: summaryLine.trim() };
|
|
540
358
|
}
|
|
541
359
|
|
|
542
360
|
async function revertFailedExperiment(
|
|
543
|
-
|
|
544
|
-
workDir: string,
|
|
361
|
+
cwd: string,
|
|
545
362
|
preRunDirtyPaths: string[],
|
|
363
|
+
onAutoresearchBranch: boolean,
|
|
546
364
|
): Promise<KeepCommitResult> {
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
};
|
|
365
|
+
if (onAutoresearchBranch) {
|
|
366
|
+
// Discard reverts only the current iteration's uncommitted changes — never
|
|
367
|
+
// rewinds prior `keep` commits. Reset to HEAD so any kept improvements
|
|
368
|
+
// already on the branch survive.
|
|
369
|
+
try {
|
|
370
|
+
await git.reset(cwd, { hard: true, target: "HEAD" });
|
|
371
|
+
await git.clean(cwd);
|
|
372
|
+
return { note: "worktree reset to HEAD" };
|
|
373
|
+
} catch (err) {
|
|
374
|
+
return { error: `git reset/clean failed: ${err instanceof Error ? err.message : String(err)}` };
|
|
375
|
+
}
|
|
559
376
|
}
|
|
560
377
|
|
|
561
|
-
const
|
|
378
|
+
const statusText = await tryGitStatus(cwd);
|
|
379
|
+
const workDirPrefix = await tryGitPrefix(cwd);
|
|
562
380
|
const { tracked, untracked } = computeRunModifiedPaths(preRunDirtyPaths, statusText, workDirPrefix);
|
|
563
|
-
const
|
|
564
|
-
if (
|
|
565
|
-
return { note: "nothing to revert" };
|
|
566
|
-
}
|
|
567
|
-
|
|
381
|
+
const total = tracked.length + untracked.length;
|
|
382
|
+
if (total === 0) return { note: "nothing to revert" };
|
|
568
383
|
if (tracked.length > 0) {
|
|
569
384
|
try {
|
|
570
|
-
await git.restore(
|
|
385
|
+
await git.restore(cwd, { files: tracked, source: "HEAD", staged: true, worktree: true });
|
|
571
386
|
} catch (err) {
|
|
572
|
-
return {
|
|
573
|
-
error: `git restore failed: ${err instanceof Error ? err.message : String(err)}`,
|
|
574
|
-
};
|
|
387
|
+
return { error: `git restore failed: ${err instanceof Error ? err.message : String(err)}` };
|
|
575
388
|
}
|
|
576
389
|
}
|
|
577
|
-
|
|
578
390
|
for (const filePath of untracked) {
|
|
579
|
-
const absolutePath = path.join(workDir, filePath);
|
|
580
391
|
try {
|
|
581
|
-
fs.rmSync(
|
|
392
|
+
fs.rmSync(path.join(cwd, filePath), { force: true, recursive: true });
|
|
582
393
|
} catch {
|
|
583
|
-
//
|
|
394
|
+
// best effort
|
|
584
395
|
}
|
|
585
396
|
}
|
|
397
|
+
return { note: `reverted ${total} file${total === 1 ? "" : "s"}` };
|
|
398
|
+
}
|
|
586
399
|
|
|
587
|
-
|
|
400
|
+
async function detectModifiedPaths(
|
|
401
|
+
cwd: string,
|
|
402
|
+
preRunDirtyPaths: string[],
|
|
403
|
+
): Promise<{ modifiedTracked: string[]; modifiedUntracked: string[] }> {
|
|
404
|
+
const statusText = await tryGitStatus(cwd);
|
|
405
|
+
const workDirPrefix = await tryGitPrefix(cwd);
|
|
406
|
+
const { tracked, untracked } = computeRunModifiedPaths(preRunDirtyPaths, statusText, workDirPrefix);
|
|
407
|
+
return { modifiedTracked: tracked, modifiedUntracked: untracked };
|
|
588
408
|
}
|
|
589
409
|
|
|
590
|
-
function
|
|
591
|
-
|
|
410
|
+
function computeScopeDeviations(modifiedPaths: string[], session: SessionRow): string[] {
|
|
411
|
+
const deviations: string[] = [];
|
|
412
|
+
for (const filePath of modifiedPaths) {
|
|
413
|
+
if (session.offLimits.some(spec => pathMatchesSpec(filePath, spec))) {
|
|
414
|
+
deviations.push(filePath);
|
|
415
|
+
continue;
|
|
416
|
+
}
|
|
417
|
+
if (session.scopePaths.length > 0 && !session.scopePaths.some(spec => pathMatchesSpec(filePath, spec))) {
|
|
418
|
+
deviations.push(filePath);
|
|
419
|
+
}
|
|
420
|
+
}
|
|
421
|
+
return deviations;
|
|
592
422
|
}
|
|
593
423
|
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
):
|
|
599
|
-
|
|
600
|
-
|
|
424
|
+
function mergeMetrics(
|
|
425
|
+
parsed: NumericMetricMap | null,
|
|
426
|
+
overrides: NumericMetricMap | undefined,
|
|
427
|
+
primaryMetricName: string,
|
|
428
|
+
): NumericMetricMap {
|
|
429
|
+
const merged: NumericMetricMap = {};
|
|
430
|
+
for (const [name, value] of Object.entries(parsed ?? {})) {
|
|
431
|
+
if (name === primaryMetricName) continue;
|
|
432
|
+
merged[name] = value;
|
|
601
433
|
}
|
|
434
|
+
for (const [name, value] of Object.entries(ensureNumericMetricMap(overrides))) {
|
|
435
|
+
merged[name] = value;
|
|
436
|
+
}
|
|
437
|
+
return merged;
|
|
438
|
+
}
|
|
602
439
|
|
|
603
|
-
|
|
440
|
+
async function tryReadHeadSha(cwd: string): Promise<string | null> {
|
|
604
441
|
try {
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
untrackedFiles: "all",
|
|
609
|
-
z: true,
|
|
610
|
-
});
|
|
611
|
-
} catch (err) {
|
|
612
|
-
return `git status failed: ${err instanceof Error ? err.message : String(err)}`;
|
|
442
|
+
return (await git.head.sha(cwd)) ?? null;
|
|
443
|
+
} catch {
|
|
444
|
+
return null;
|
|
613
445
|
}
|
|
446
|
+
}
|
|
614
447
|
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
}
|
|
621
|
-
if (isAutoresearchCommittableFile(entry.path)) {
|
|
622
|
-
committablePaths.push(entry.path);
|
|
623
|
-
continue;
|
|
624
|
-
}
|
|
625
|
-
if (state.offLimits.some(spec => pathMatchesContractPath(entry.path, spec))) {
|
|
626
|
-
return `cannot keep this run because ${entry.path} is listed under Off Limits in autoresearch.md`;
|
|
627
|
-
}
|
|
628
|
-
if (!state.scopePaths.some(spec => pathMatchesContractPath(entry.path, spec))) {
|
|
629
|
-
return `cannot keep this run because ${entry.path} is outside Files in Scope`;
|
|
630
|
-
}
|
|
631
|
-
committablePaths.push(entry.path);
|
|
448
|
+
async function tryGitStatus(cwd: string): Promise<string> {
|
|
449
|
+
try {
|
|
450
|
+
return await git.status(cwd, { porcelainV1: true, untrackedFiles: "all", z: true });
|
|
451
|
+
} catch {
|
|
452
|
+
return "";
|
|
632
453
|
}
|
|
633
|
-
|
|
634
|
-
return { committablePaths };
|
|
635
454
|
}
|
|
636
455
|
|
|
637
|
-
async function
|
|
638
|
-
runDirectory: string | null,
|
|
639
|
-
metadata: {
|
|
640
|
-
commit: string;
|
|
641
|
-
confidence: number | null;
|
|
642
|
-
description: string;
|
|
643
|
-
gitNote: string | null;
|
|
644
|
-
loggedAt: string;
|
|
645
|
-
loggedAsi: ASIData | undefined;
|
|
646
|
-
loggedMetric: number;
|
|
647
|
-
loggedMetrics: NumericMetricMap;
|
|
648
|
-
runNumber: number | null;
|
|
649
|
-
status: ExperimentResult["status"];
|
|
650
|
-
wallClockSeconds: number | null;
|
|
651
|
-
},
|
|
652
|
-
): Promise<void> {
|
|
653
|
-
if (!runDirectory) return;
|
|
654
|
-
const runJsonPath = path.join(runDirectory, "run.json");
|
|
655
|
-
let existing: Record<string, unknown> = {};
|
|
456
|
+
async function tryGitPrefix(cwd: string): Promise<string> {
|
|
656
457
|
try {
|
|
657
|
-
|
|
458
|
+
return await git.show.prefix(cwd);
|
|
658
459
|
} catch {
|
|
659
|
-
|
|
460
|
+
return "";
|
|
660
461
|
}
|
|
661
|
-
await Bun.write(
|
|
662
|
-
runJsonPath,
|
|
663
|
-
JSON.stringify(
|
|
664
|
-
{
|
|
665
|
-
...existing,
|
|
666
|
-
loggedRunNumber: metadata.runNumber,
|
|
667
|
-
loggedAt: metadata.loggedAt,
|
|
668
|
-
loggedAsi: metadata.loggedAsi,
|
|
669
|
-
loggedMetric: metadata.loggedMetric,
|
|
670
|
-
loggedMetrics: metadata.loggedMetrics,
|
|
671
|
-
status: metadata.status,
|
|
672
|
-
description: metadata.description,
|
|
673
|
-
commit: metadata.commit,
|
|
674
|
-
gitNote: metadata.gitNote,
|
|
675
|
-
confidence: metadata.confidence,
|
|
676
|
-
wallClockSeconds: metadata.wallClockSeconds,
|
|
677
|
-
},
|
|
678
|
-
null,
|
|
679
|
-
2,
|
|
680
|
-
),
|
|
681
|
-
);
|
|
682
462
|
}
|
|
683
463
|
|
|
684
464
|
function buildLogText(
|
|
685
465
|
state: ExperimentState,
|
|
686
466
|
experiment: ExperimentResult,
|
|
687
|
-
|
|
467
|
+
segmentRunCount: number,
|
|
688
468
|
wallClockSeconds: number | null,
|
|
689
469
|
gitNote: string | null,
|
|
470
|
+
warnings: string[],
|
|
471
|
+
flaggedRuns: LogDetails["flaggedRuns"],
|
|
690
472
|
): string {
|
|
691
473
|
const displayRunNumber = experiment.runNumber ?? state.results.length;
|
|
692
474
|
const lines = [`Logged run #${displayRunNumber}: ${experiment.status} - ${experiment.description}`];
|
|
@@ -696,7 +478,7 @@ function buildLogText(
|
|
|
696
478
|
if (state.bestMetric !== null) {
|
|
697
479
|
lines.push(`Baseline ${state.metricName}: ${formatNum(state.bestMetric, state.metricUnit)}`);
|
|
698
480
|
}
|
|
699
|
-
if (
|
|
481
|
+
if (segmentRunCount > 1 && state.bestMetric !== null && experiment.metric !== state.bestMetric) {
|
|
700
482
|
const delta = ((experiment.metric - state.bestMetric) / state.bestMetric) * 100;
|
|
701
483
|
const sign = delta > 0 ? "+" : "";
|
|
702
484
|
lines.push(`This run: ${formatNum(experiment.metric, state.metricUnit)} (${sign}${delta.toFixed(1)}%)`);
|
|
@@ -708,7 +490,7 @@ function buildLogText(
|
|
|
708
490
|
const parts = Object.entries(experiment.metrics).map(([name, value]) => {
|
|
709
491
|
const unit = state.secondaryMetrics.find(metric => metric.name === name)?.unit ?? "";
|
|
710
492
|
const baseline = baselineSecondary[name];
|
|
711
|
-
if (baseline === undefined || baseline === 0 ||
|
|
493
|
+
if (baseline === undefined || baseline === 0 || segmentRunCount === 1) {
|
|
712
494
|
return `${name}: ${formatNum(value, unit)}`;
|
|
713
495
|
}
|
|
714
496
|
const delta = ((value - baseline) / baseline) * 100;
|
|
@@ -717,6 +499,10 @@ function buildLogText(
|
|
|
717
499
|
});
|
|
718
500
|
lines.push(`Secondary metrics: ${parts.join(" ")}`);
|
|
719
501
|
}
|
|
502
|
+
const bestKept = findBestKeptMetric(state.results, state.currentSegment, state.bestDirection);
|
|
503
|
+
if (bestKept !== null && state.bestMetric !== null && bestKept !== state.bestMetric) {
|
|
504
|
+
lines.push(`Best kept ${state.metricName}: ${formatNum(bestKept, state.metricUnit)}`);
|
|
505
|
+
}
|
|
720
506
|
if (experiment.asi) {
|
|
721
507
|
const asiSummary = Object.entries(experiment.asi)
|
|
722
508
|
.map(([key, value]) => `${key}: ${truncateAsiValue(value)}`)
|
|
@@ -731,21 +517,19 @@ function buildLogText(
|
|
|
731
517
|
lines.push(`Git: ${gitNote}`);
|
|
732
518
|
}
|
|
733
519
|
if (state.maxExperiments !== null) {
|
|
734
|
-
lines.push(`Progress: ${
|
|
735
|
-
if (
|
|
520
|
+
lines.push(`Progress: ${segmentRunCount}/${state.maxExperiments} runs in current segment`);
|
|
521
|
+
if (segmentRunCount >= state.maxExperiments) {
|
|
736
522
|
lines.push(`Maximum experiments reached (${state.maxExperiments}). Autoresearch mode is now off.`);
|
|
737
523
|
}
|
|
738
524
|
}
|
|
739
|
-
|
|
740
|
-
}
|
|
741
|
-
|
|
742
|
-
async function readGitWorkDirPrefix(options: AutoresearchToolFactoryOptions, workDir: string): Promise<string> {
|
|
743
|
-
void options;
|
|
744
|
-
try {
|
|
745
|
-
return await git.show.prefix(workDir);
|
|
746
|
-
} catch {
|
|
747
|
-
return "";
|
|
525
|
+
if (flaggedRuns.length > 0) {
|
|
526
|
+
const formatted = flaggedRuns.map(({ runId, reason }) => `#${runId} (${reason})`).join(", ");
|
|
527
|
+
lines.push(`Flagged: ${formatted}`);
|
|
748
528
|
}
|
|
529
|
+
for (const warning of warnings) {
|
|
530
|
+
lines.push(`Warning: ${warning}`);
|
|
531
|
+
}
|
|
532
|
+
return lines.join("\n");
|
|
749
533
|
}
|
|
750
534
|
|
|
751
535
|
function truncateAsiValue(value: ASIData[string]): string {
|
|
@@ -764,5 +548,8 @@ function renderSummary(details: LogDetails, theme: Theme): string {
|
|
|
764
548
|
if (state.confidence !== null) {
|
|
765
549
|
summary += ` ${theme.fg("dim", `conf ${state.confidence.toFixed(1)}x`)}`;
|
|
766
550
|
}
|
|
551
|
+
if (details.scopeDeviations.length > 0) {
|
|
552
|
+
summary += ` ${theme.fg("warning", `deviations:${details.scopeDeviations.length}`)}`;
|
|
553
|
+
}
|
|
767
554
|
return summary;
|
|
768
555
|
}
|