@oh-my-pi/pi-coding-agent 14.5.14 → 14.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +39 -0
- package/package.json +7 -7
- package/src/autoresearch/command-resume.md +5 -8
- package/src/autoresearch/git.ts +41 -51
- package/src/autoresearch/helpers.ts +43 -359
- package/src/autoresearch/index.ts +281 -273
- package/src/autoresearch/prompt-setup.md +43 -0
- package/src/autoresearch/prompt.md +52 -193
- package/src/autoresearch/resume-message.md +2 -8
- package/src/autoresearch/state.ts +59 -166
- package/src/autoresearch/storage.ts +687 -0
- package/src/autoresearch/tools/init-experiment.ts +201 -290
- package/src/autoresearch/tools/log-experiment.ts +304 -517
- package/src/autoresearch/tools/run-experiment.ts +117 -296
- package/src/autoresearch/tools/update-notes.ts +116 -0
- package/src/autoresearch/types.ts +16 -66
- package/src/config/settings-schema.ts +1 -1
- package/src/config/settings.ts +20 -1
- package/src/cursor.ts +1 -1
- package/src/edit/index.ts +9 -31
- package/src/edit/line-hash.ts +70 -43
- package/src/edit/modes/hashline.lark +26 -0
- package/src/edit/modes/hashline.ts +898 -1099
- package/src/edit/modes/patch.ts +0 -7
- package/src/edit/modes/replace.ts +0 -4
- package/src/edit/renderer.ts +22 -20
- package/src/edit/streaming.ts +8 -28
- package/src/eval/eval.lark +24 -30
- package/src/eval/js/context-manager.ts +5 -162
- package/src/eval/js/prelude.txt +0 -12
- package/src/eval/parse.ts +129 -129
- package/src/eval/py/prelude.py +1 -219
- package/src/export/html/template.generated.ts +1 -1
- package/src/export/html/template.js +2 -2
- package/src/internal-urls/docs-index.generated.ts +1 -1
- package/src/modes/components/session-observer-overlay.ts +5 -2
- package/src/modes/components/status-line/segments.ts +1 -1
- package/src/modes/components/status-line.ts +3 -5
- package/src/modes/components/tree-selector.ts +4 -5
- package/src/modes/components/welcome.ts +11 -1
- package/src/modes/controllers/command-controller.ts +2 -6
- package/src/modes/controllers/event-controller.ts +1 -2
- package/src/modes/controllers/extension-ui-controller.ts +3 -15
- package/src/modes/controllers/input-controller.ts +0 -1
- package/src/modes/controllers/selector-controller.ts +1 -1
- package/src/modes/interactive-mode.ts +5 -7
- package/src/prompts/system/system-prompt.md +14 -38
- package/src/prompts/tools/ast-edit.md +8 -8
- package/src/prompts/tools/ast-grep.md +10 -10
- package/src/prompts/tools/eval.md +13 -31
- package/src/prompts/tools/find.md +2 -1
- package/src/prompts/tools/hashline.md +66 -57
- package/src/prompts/tools/search.md +2 -2
- package/src/session/session-manager.ts +17 -13
- package/src/tools/ast-edit.ts +141 -44
- package/src/tools/ast-grep.ts +112 -36
- package/src/tools/eval.ts +2 -53
- package/src/tools/find.ts +16 -15
- package/src/tools/path-utils.ts +36 -196
- package/src/tools/search.ts +56 -35
- package/src/utils/edit-mode.ts +2 -11
- package/src/utils/file-display-mode.ts +1 -1
- package/src/utils/git.ts +17 -0
- package/src/utils/session-color.ts +0 -12
- package/src/utils/title-generator.ts +22 -38
- package/src/autoresearch/apply-contract-to-state.ts +0 -24
- package/src/autoresearch/contract.ts +0 -288
- package/src/edit/modes/atom.lark +0 -29
- package/src/edit/modes/atom.ts +0 -1773
- package/src/prompts/tools/atom.md +0 -150
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import * as fs from "node:fs";
|
|
2
1
|
import * as path from "node:path";
|
|
3
2
|
import { StringEnum } from "@oh-my-pi/pi-ai";
|
|
4
3
|
import { Text } from "@oh-my-pi/pi-tui";
|
|
@@ -6,87 +5,66 @@ import { Type } from "@sinclair/typebox";
|
|
|
6
5
|
import type { ToolDefinition } from "../../extensibility/extensions";
|
|
7
6
|
import type { Theme } from "../../modes/theme/theme";
|
|
8
7
|
import { replaceTabs, truncateToWidth } from "../../tools/render-utils";
|
|
9
|
-
import
|
|
10
|
-
import {
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
readAutoresearchContract,
|
|
15
|
-
} from "../contract";
|
|
16
|
-
import {
|
|
17
|
-
abandonUnloggedAutoresearchRuns,
|
|
18
|
-
collectLoggedRunNumbers,
|
|
19
|
-
isAutoresearchShCommand,
|
|
20
|
-
readMaxExperiments,
|
|
21
|
-
readPendingRunSummary,
|
|
22
|
-
resolveWorkDir,
|
|
23
|
-
validateWorkDir,
|
|
24
|
-
} from "../helpers";
|
|
25
|
-
import { cloneExperimentState } from "../state";
|
|
8
|
+
import * as git from "../../utils/git";
|
|
9
|
+
import { parseWorkDirDirtyPaths } from "../git";
|
|
10
|
+
import { dedupeStrings, normalizePathSpec } from "../helpers";
|
|
11
|
+
import { buildExperimentState } from "../state";
|
|
12
|
+
import { openAutoresearchStorage, type SessionRow } from "../storage";
|
|
26
13
|
import type { AutoresearchToolFactoryOptions, ExperimentState } from "../types";
|
|
27
14
|
|
|
15
|
+
export const HARNESS_FILENAME = "autoresearch.sh";
|
|
16
|
+
export const DEFAULT_HARNESS_COMMAND = `bash ${HARNESS_FILENAME}`;
|
|
17
|
+
const HARNESS_COMMIT_TITLE = "autoresearch: harness setup";
|
|
18
|
+
|
|
28
19
|
const initExperimentSchema = Type.Object({
|
|
29
|
-
name: Type.String({
|
|
30
|
-
|
|
20
|
+
name: Type.String({ description: "Human-readable experiment name." }),
|
|
21
|
+
goal: Type.Optional(Type.String({ description: "Free-form description of what this session optimizes." })),
|
|
22
|
+
primary_metric: Type.String({
|
|
23
|
+
description:
|
|
24
|
+
"Primary metric name shown in the dashboard. Match the `METRIC <name>=<value>` lines printed by the benchmark.",
|
|
31
25
|
}),
|
|
32
|
-
from_autoresearch_md: Type.Optional(
|
|
33
|
-
Type.Boolean({
|
|
34
|
-
description:
|
|
35
|
-
"When true, load benchmark command, metrics, scope, off-limits, and constraints from autoresearch.md instead of passing mirrored fields below.",
|
|
36
|
-
}),
|
|
37
|
-
),
|
|
38
|
-
abandon_unlogged_runs: Type.Optional(
|
|
39
|
-
Type.Boolean({
|
|
40
|
-
description:
|
|
41
|
-
"When true, mark all completed but unlogged run artifacts as abandoned so initialization can proceed without logging them first.",
|
|
42
|
-
}),
|
|
43
|
-
),
|
|
44
|
-
new_segment: Type.Optional(
|
|
45
|
-
Type.Boolean({
|
|
46
|
-
description:
|
|
47
|
-
"When true, force a new segment even when the contract fields have not changed. Without this, re-initialization with matching contract is a no-op.",
|
|
48
|
-
}),
|
|
49
|
-
),
|
|
50
|
-
metric_name: Type.Optional(
|
|
51
|
-
Type.String({
|
|
52
|
-
description: "Primary metric name shown in the dashboard. Required when from_autoresearch_md is false.",
|
|
53
|
-
}),
|
|
54
|
-
),
|
|
55
26
|
metric_unit: Type.Optional(
|
|
56
|
-
Type.String({
|
|
57
|
-
description: "Unit for the primary metric, for example µs, ms, s, kb, or empty.",
|
|
58
|
-
}),
|
|
27
|
+
Type.String({ description: "Unit for the primary metric (e.g. ms, µs, mb). Empty when unitless." }),
|
|
59
28
|
),
|
|
60
29
|
direction: Type.Optional(
|
|
61
|
-
StringEnum(["lower", "higher"], {
|
|
62
|
-
description: "Whether lower or higher values are better. Defaults to lower.",
|
|
63
|
-
}),
|
|
30
|
+
StringEnum(["lower", "higher"], { description: "Whether lower or higher values are better. Defaults to lower." }),
|
|
64
31
|
),
|
|
65
|
-
|
|
66
|
-
Type.String({
|
|
67
|
-
description: "
|
|
32
|
+
secondary_metrics: Type.Optional(
|
|
33
|
+
Type.Array(Type.String(), {
|
|
34
|
+
description: "Names of secondary metrics tracked alongside the primary metric.",
|
|
68
35
|
}),
|
|
69
36
|
),
|
|
70
37
|
scope_paths: Type.Optional(
|
|
71
38
|
Type.Array(Type.String(), {
|
|
72
|
-
description:
|
|
73
|
-
|
|
39
|
+
description:
|
|
40
|
+
"Files or directories the agent expects to modify. Used post-hoc to flag scope deviations on log_experiment; never used to block edits.",
|
|
74
41
|
}),
|
|
75
42
|
),
|
|
76
43
|
off_limits: Type.Optional(
|
|
77
44
|
Type.Array(Type.String(), {
|
|
78
|
-
description:
|
|
45
|
+
description:
|
|
46
|
+
"Paths the agent SHOULD NOT modify. Used post-hoc to flag scope deviations on log_experiment; never used to block edits.",
|
|
79
47
|
}),
|
|
80
48
|
),
|
|
81
49
|
constraints: Type.Optional(
|
|
82
|
-
Type.Array(Type.String(), {
|
|
83
|
-
|
|
50
|
+
Type.Array(Type.String(), { description: "Free-form constraints (e.g. 'no api break')." }),
|
|
51
|
+
),
|
|
52
|
+
max_iterations: Type.Optional(Type.Number({ description: "Soft cap on iterations per segment. Optional." })),
|
|
53
|
+
new_segment: Type.Optional(
|
|
54
|
+
Type.Boolean({
|
|
55
|
+
description:
|
|
56
|
+
"When true, bump to a new segment even when an active session exists. New baselines and best-metric reset.",
|
|
84
57
|
}),
|
|
85
58
|
),
|
|
86
59
|
});
|
|
87
60
|
|
|
88
61
|
interface InitExperimentDetails {
|
|
89
62
|
state: ExperimentState;
|
|
63
|
+
createdSession: boolean;
|
|
64
|
+
bumpedSegment: boolean;
|
|
65
|
+
abandonedRuns: number;
|
|
66
|
+
harnessCommitted: boolean;
|
|
67
|
+
baselineCommit: string | null;
|
|
90
68
|
}
|
|
91
69
|
|
|
92
70
|
export function createInitExperimentTool(
|
|
@@ -96,253 +74,117 @@ export function createInitExperimentTool(
|
|
|
96
74
|
name: "init_experiment",
|
|
97
75
|
label: "Init Experiment",
|
|
98
76
|
description:
|
|
99
|
-
"Initialize or
|
|
77
|
+
"Initialize or reconfigure the autoresearch session. On first call (Phase 1 → Phase 2 transition), requires `./autoresearch.sh` to exist and pending harness changes are auto-committed on an autoresearch branch. Pass `new_segment: true` to start a fresh baseline within an existing session.",
|
|
100
78
|
parameters: initExperimentSchema,
|
|
101
79
|
defaultInactive: true,
|
|
102
80
|
async execute(_toolCallId, params, _signal, _onUpdate, ctx) {
|
|
103
|
-
const
|
|
104
|
-
if (workDirError) {
|
|
105
|
-
return {
|
|
106
|
-
content: [{ type: "text", text: `Error: ${workDirError}` }],
|
|
107
|
-
};
|
|
108
|
-
}
|
|
109
|
-
|
|
81
|
+
const storage = await openAutoresearchStorage(ctx.cwd);
|
|
110
82
|
const runtime = options.getRuntime(ctx);
|
|
111
|
-
const state = runtime.state;
|
|
112
|
-
const isReinitializing = state.results.length > 0;
|
|
113
|
-
const workDir = resolveWorkDir(ctx.cwd);
|
|
114
|
-
const loggedRunNumbers = collectLoggedRunNumbers(state.results);
|
|
115
|
-
|
|
116
|
-
let abandonSummary = "";
|
|
117
|
-
if (params.abandon_unlogged_runs === true) {
|
|
118
|
-
const abandoned = await abandonUnloggedAutoresearchRuns(workDir, loggedRunNumbers);
|
|
119
|
-
if (abandoned > 0) {
|
|
120
|
-
abandonSummary =
|
|
121
|
-
abandoned === 1
|
|
122
|
-
? "Abandoned 1 unlogged run artifact.\n"
|
|
123
|
-
: `Abandoned ${abandoned} unlogged run artifacts.\n`;
|
|
124
|
-
}
|
|
125
|
-
}
|
|
126
|
-
|
|
127
|
-
const pendingRun = await readPendingRunSummary(workDir, loggedRunNumbers);
|
|
128
|
-
if (pendingRun) {
|
|
129
|
-
const metricInfo = pendingRun.parsedPrimary !== null ? `, metric=${pendingRun.parsedPrimary}` : "";
|
|
130
|
-
const passedInfo = pendingRun.passed ? "passed" : "failed";
|
|
131
|
-
return {
|
|
132
|
-
content: [
|
|
133
|
-
{
|
|
134
|
-
type: "text",
|
|
135
|
-
text:
|
|
136
|
-
abandonSummary +
|
|
137
|
-
`Error: run #${pendingRun.runNumber} has not been logged yet.\n` +
|
|
138
|
-
`Pending: command="${pendingRun.command}"${metricInfo}, ${passedInfo}\n` +
|
|
139
|
-
"Call log_experiment before re-initializing, or pass abandon_unlogged_runs=true.",
|
|
140
|
-
},
|
|
141
|
-
],
|
|
142
|
-
};
|
|
143
|
-
}
|
|
144
83
|
|
|
145
|
-
const
|
|
146
|
-
const
|
|
147
|
-
const
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
84
|
+
const direction = params.direction ?? "lower";
|
|
85
|
+
const metricUnit = params.metric_unit ?? "";
|
|
86
|
+
const scopePaths = dedupeStrings((params.scope_paths ?? []).map(normalizePathSpec));
|
|
87
|
+
const offLimits = dedupeStrings((params.off_limits ?? []).map(normalizePathSpec));
|
|
88
|
+
const constraints = dedupeStrings(params.constraints ?? []);
|
|
89
|
+
const secondaryMetrics = dedupeStrings(params.secondary_metrics ?? []);
|
|
90
|
+
const goal = params.goal?.trim() || null;
|
|
91
|
+
const maxIterations =
|
|
92
|
+
params.max_iterations !== undefined && Number.isFinite(params.max_iterations) && params.max_iterations > 0
|
|
93
|
+
? Math.floor(params.max_iterations)
|
|
94
|
+
: null;
|
|
95
|
+
const branch = (await git.branch.current(ctx.cwd)) ?? null;
|
|
96
|
+
const onAutoresearchBranch = branch?.startsWith("autoresearch/") ?? false;
|
|
153
97
|
|
|
154
|
-
const
|
|
155
|
-
const
|
|
156
|
-
const
|
|
157
|
-
if (benchmarkContract.command && !isAutoresearchShCommand(benchmarkContract.command)) {
|
|
158
|
-
return {
|
|
159
|
-
content: [
|
|
160
|
-
{
|
|
161
|
-
type: "text",
|
|
162
|
-
text:
|
|
163
|
-
abandonSummary +
|
|
164
|
-
"Error: Benchmark.command in autoresearch.md must invoke `autoresearch.sh` directly. " +
|
|
165
|
-
"Move the real workload into `autoresearch.sh` and re-run init_experiment.",
|
|
166
|
-
},
|
|
167
|
-
],
|
|
168
|
-
};
|
|
169
|
-
}
|
|
98
|
+
const existing = storage.getActiveSessionForBranch(branch);
|
|
99
|
+
const isNewSegmentInit = existing !== null && params.new_segment === true;
|
|
100
|
+
const requiresHarness = !existing || isNewSegmentInit;
|
|
170
101
|
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
const benchmarkCommand = params.benchmark_command?.trim();
|
|
175
|
-
const scopePaths = params.scope_paths;
|
|
176
|
-
if (!metricName || !benchmarkCommand || !scopePaths || scopePaths.length === 0) {
|
|
177
|
-
return {
|
|
178
|
-
content: [
|
|
179
|
-
{
|
|
180
|
-
type: "text",
|
|
181
|
-
text:
|
|
182
|
-
abandonSummary +
|
|
183
|
-
"Error: when from_autoresearch_md is false or omitted, metric_name, benchmark_command, and scope_paths are required and must match autoresearch.md. " +
|
|
184
|
-
"Alternatively pass from_autoresearch_md=true with only name (plus optional flags).",
|
|
185
|
-
},
|
|
186
|
-
],
|
|
187
|
-
};
|
|
188
|
-
}
|
|
189
|
-
if (benchmarkContract.command !== benchmarkCommand) {
|
|
190
|
-
return {
|
|
191
|
-
content: [
|
|
192
|
-
{
|
|
193
|
-
type: "text",
|
|
194
|
-
text:
|
|
195
|
-
abandonSummary +
|
|
196
|
-
"Error: benchmark_command does not match autoresearch.md. " +
|
|
197
|
-
`Expected: ${benchmarkContract.command ?? "(missing)"}\nReceived: ${params.benchmark_command}`,
|
|
198
|
-
},
|
|
199
|
-
],
|
|
200
|
-
};
|
|
201
|
-
}
|
|
202
|
-
if (benchmarkContract.primaryMetric !== metricName) {
|
|
203
|
-
return {
|
|
204
|
-
content: [
|
|
205
|
-
{
|
|
206
|
-
type: "text",
|
|
207
|
-
text:
|
|
208
|
-
abandonSummary +
|
|
209
|
-
"Error: metric_name does not match autoresearch.md. " +
|
|
210
|
-
`Expected: ${benchmarkContract.primaryMetric ?? "(missing)"}\nReceived: ${params.metric_name}`,
|
|
211
|
-
},
|
|
212
|
-
],
|
|
213
|
-
};
|
|
214
|
-
}
|
|
215
|
-
if ((params.metric_unit ?? "") !== expectedMetricUnit) {
|
|
216
|
-
return {
|
|
217
|
-
content: [
|
|
218
|
-
{
|
|
219
|
-
type: "text",
|
|
220
|
-
text:
|
|
221
|
-
abandonSummary +
|
|
222
|
-
"Error: metric_unit does not match autoresearch.md. " +
|
|
223
|
-
`Expected: ${expectedMetricUnit || "(empty)"}\nReceived: ${params.metric_unit ?? "(empty)"}`,
|
|
224
|
-
},
|
|
225
|
-
],
|
|
226
|
-
};
|
|
227
|
-
}
|
|
228
|
-
if ((params.direction ?? "lower") !== expectedDirection) {
|
|
229
|
-
return {
|
|
230
|
-
content: [
|
|
231
|
-
{
|
|
232
|
-
type: "text",
|
|
233
|
-
text:
|
|
234
|
-
abandonSummary +
|
|
235
|
-
"Error: direction does not match autoresearch.md. " +
|
|
236
|
-
`Expected: ${expectedDirection}\nReceived: ${params.direction ?? "lower"}`,
|
|
237
|
-
},
|
|
238
|
-
],
|
|
239
|
-
};
|
|
240
|
-
}
|
|
241
|
-
if (!contractPathListsEqual(scopePaths, contractResult.contract.scopePaths)) {
|
|
242
|
-
return {
|
|
243
|
-
content: [
|
|
244
|
-
{
|
|
245
|
-
type: "text",
|
|
246
|
-
text:
|
|
247
|
-
abandonSummary +
|
|
248
|
-
"Error: scope_paths do not match autoresearch.md. " +
|
|
249
|
-
`Expected: ${contractResult.contract.scopePaths.join(", ")}`,
|
|
250
|
-
},
|
|
251
|
-
],
|
|
252
|
-
};
|
|
253
|
-
}
|
|
254
|
-
if (!contractPathListsEqual(params.off_limits ?? [], contractResult.contract.offLimits)) {
|
|
255
|
-
return {
|
|
256
|
-
content: [
|
|
257
|
-
{
|
|
258
|
-
type: "text",
|
|
259
|
-
text:
|
|
260
|
-
abandonSummary +
|
|
261
|
-
"Error: off_limits do not match autoresearch.md. " +
|
|
262
|
-
`Expected: ${contractResult.contract.offLimits.join(", ") || "(empty)"}`,
|
|
263
|
-
},
|
|
264
|
-
],
|
|
265
|
-
};
|
|
266
|
-
}
|
|
267
|
-
if (!contractListsEqual(params.constraints ?? [], contractResult.contract.constraints)) {
|
|
102
|
+
if (requiresHarness) {
|
|
103
|
+
const harnessExists = await Bun.file(path.join(ctx.cwd, HARNESS_FILENAME)).exists();
|
|
104
|
+
if (!harnessExists) {
|
|
268
105
|
return {
|
|
269
106
|
content: [
|
|
270
107
|
{
|
|
271
108
|
type: "text",
|
|
272
|
-
text:
|
|
273
|
-
abandonSummary +
|
|
274
|
-
"Error: constraints do not match autoresearch.md. " +
|
|
275
|
-
`Expected: ${contractResult.contract.constraints.join(", ") || "(empty)"}`,
|
|
109
|
+
text: `Error: ./${HARNESS_FILENAME} does not exist. Phase 1 of autoresearch is harness setup — write \`./${HARNESS_FILENAME}\` so it exits 0 and prints \`METRIC <name>=<value>\`, validate it via \`bash ${HARNESS_FILENAME}\`, then call init_experiment again.`,
|
|
276
110
|
},
|
|
277
111
|
],
|
|
278
112
|
};
|
|
279
113
|
}
|
|
280
114
|
}
|
|
281
115
|
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
const
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
runtime.autoresearchMode = true;
|
|
296
|
-
runtime.autoResumeArmed = true;
|
|
297
|
-
options.dashboard.updateWidget(ctx, runtime);
|
|
298
|
-
options.dashboard.requestRender();
|
|
299
|
-
return {
|
|
300
|
-
content: [
|
|
301
|
-
{
|
|
302
|
-
type: "text",
|
|
303
|
-
text:
|
|
304
|
-
abandonSummary +
|
|
305
|
-
`Experiment session already initialized with matching contract. Continuing segment ${state.currentSegment}.`,
|
|
306
|
-
},
|
|
307
|
-
],
|
|
308
|
-
details: { state: cloneExperimentState(state) },
|
|
309
|
-
};
|
|
116
|
+
let harnessCommitted = false;
|
|
117
|
+
let commitWarning: string | null = null;
|
|
118
|
+
if (requiresHarness && onAutoresearchBranch) {
|
|
119
|
+
const dirty = await detectPendingChanges(ctx.cwd);
|
|
120
|
+
if (dirty) {
|
|
121
|
+
try {
|
|
122
|
+
await git.stage.files(ctx.cwd, []);
|
|
123
|
+
const message = buildHarnessCommitMessage(goal, params.name);
|
|
124
|
+
await git.commit(ctx.cwd, message);
|
|
125
|
+
harnessCommitted = true;
|
|
126
|
+
} catch (err) {
|
|
127
|
+
commitWarning = `Failed to auto-commit harness changes: ${err instanceof Error ? err.message : String(err)}. Recording baseline at current HEAD; discard may not preserve uncommitted harness files.`;
|
|
128
|
+
}
|
|
310
129
|
}
|
|
311
130
|
}
|
|
312
131
|
|
|
313
|
-
|
|
314
|
-
state.name = params.name;
|
|
315
|
-
state.maxExperiments = readMaxExperiments(ctx.cwd);
|
|
316
|
-
state.bestMetric = null;
|
|
317
|
-
state.confidence = null;
|
|
318
|
-
if (isReinitializing) {
|
|
319
|
-
state.currentSegment += 1;
|
|
320
|
-
}
|
|
132
|
+
const baselineCommit = await tryReadHeadSha(ctx.cwd);
|
|
321
133
|
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
metricName: state.metricName,
|
|
327
|
-
metricUnit: state.metricUnit,
|
|
328
|
-
bestDirection: state.bestDirection,
|
|
329
|
-
benchmarkCommand: state.benchmarkCommand,
|
|
330
|
-
secondaryMetrics: state.secondaryMetrics.map(metric => metric.name),
|
|
331
|
-
scopePaths: state.scopePaths,
|
|
332
|
-
offLimits: state.offLimits,
|
|
333
|
-
constraints: state.constraints,
|
|
334
|
-
});
|
|
134
|
+
let session: SessionRow;
|
|
135
|
+
let createdSession = false;
|
|
136
|
+
let bumpedSegment = false;
|
|
137
|
+
let abandonedRuns = 0;
|
|
335
138
|
|
|
336
|
-
if (
|
|
337
|
-
|
|
139
|
+
if (!existing) {
|
|
140
|
+
session = storage.openSession({
|
|
141
|
+
name: params.name,
|
|
142
|
+
goal,
|
|
143
|
+
primaryMetric: params.primary_metric,
|
|
144
|
+
metricUnit,
|
|
145
|
+
direction,
|
|
146
|
+
preferredCommand: DEFAULT_HARNESS_COMMAND,
|
|
147
|
+
branch,
|
|
148
|
+
baselineCommit,
|
|
149
|
+
maxIterations,
|
|
150
|
+
scopePaths,
|
|
151
|
+
offLimits,
|
|
152
|
+
constraints,
|
|
153
|
+
secondaryMetrics,
|
|
154
|
+
});
|
|
155
|
+
createdSession = true;
|
|
338
156
|
} else {
|
|
339
|
-
|
|
157
|
+
abandonedRuns = storage.abandonPendingRuns(existing.id);
|
|
158
|
+
const updates: Parameters<typeof storage.updateSession>[1] = {
|
|
159
|
+
goal,
|
|
160
|
+
maxIterations,
|
|
161
|
+
scopePaths,
|
|
162
|
+
offLimits,
|
|
163
|
+
constraints,
|
|
164
|
+
secondaryMetrics,
|
|
165
|
+
primaryMetric: params.primary_metric,
|
|
166
|
+
metricUnit,
|
|
167
|
+
direction,
|
|
168
|
+
branch,
|
|
169
|
+
};
|
|
170
|
+
if (isNewSegmentInit) {
|
|
171
|
+
updates.baselineCommit = baselineCommit;
|
|
172
|
+
}
|
|
173
|
+
let updated = storage.updateSession(existing.id, updates);
|
|
174
|
+
if (isNewSegmentInit) {
|
|
175
|
+
updated = storage.bumpSegment(existing.id);
|
|
176
|
+
bumpedSegment = true;
|
|
177
|
+
}
|
|
178
|
+
session = updated;
|
|
340
179
|
}
|
|
341
180
|
|
|
181
|
+
const loggedRuns = storage.listLoggedRuns(session.id);
|
|
182
|
+
const state = buildExperimentState(session, loggedRuns);
|
|
183
|
+
runtime.state = state;
|
|
184
|
+
runtime.goal = session.goal;
|
|
342
185
|
runtime.autoresearchMode = true;
|
|
343
186
|
runtime.autoResumeArmed = true;
|
|
344
187
|
runtime.lastAutoResumePendingRunNumber = null;
|
|
345
|
-
runtime.lastRunChecks = null;
|
|
346
188
|
runtime.lastRunDuration = null;
|
|
347
189
|
runtime.lastRunAsi = null;
|
|
348
190
|
runtime.lastRunArtifactDir = null;
|
|
@@ -351,24 +193,65 @@ export function createInitExperimentTool(
|
|
|
351
193
|
options.dashboard.updateWidget(ctx, runtime);
|
|
352
194
|
options.dashboard.requestRender();
|
|
353
195
|
|
|
354
|
-
const lines = [
|
|
355
|
-
|
|
356
|
-
`
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
`
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
196
|
+
const lines: string[] = [];
|
|
197
|
+
if (abandonedRuns > 0) {
|
|
198
|
+
lines.push(`Abandoned ${abandonedRuns} pending run${abandonedRuns === 1 ? "" : "s"} before reconfiguring.`);
|
|
199
|
+
}
|
|
200
|
+
if (harnessCommitted && session.baselineCommit) {
|
|
201
|
+
lines.push(`Committed harness setup at ${session.baselineCommit.slice(0, 12)}.`);
|
|
202
|
+
}
|
|
203
|
+
if (commitWarning) {
|
|
204
|
+
lines.push(commitWarning);
|
|
205
|
+
}
|
|
206
|
+
if (createdSession) {
|
|
207
|
+
lines.push(`Started session #${session.id}: ${session.name}`);
|
|
208
|
+
} else if (bumpedSegment) {
|
|
209
|
+
lines.push(`Bumped segment to ${session.currentSegment} for session #${session.id}: ${session.name}`);
|
|
210
|
+
} else {
|
|
211
|
+
lines.push(`Updated session #${session.id} (segment ${session.currentSegment}): ${session.name}`);
|
|
212
|
+
}
|
|
213
|
+
lines.push(
|
|
214
|
+
`Metric: ${session.primaryMetric} (${session.metricUnit || "unitless"}, ${session.direction} is better)`,
|
|
215
|
+
);
|
|
216
|
+
lines.push(`Benchmark entrypoint: ${DEFAULT_HARNESS_COMMAND}`);
|
|
217
|
+
if (session.scopePaths.length > 0) {
|
|
218
|
+
lines.push(`Files in scope: ${session.scopePaths.join(", ")}`);
|
|
219
|
+
}
|
|
220
|
+
if (session.offLimits.length > 0) {
|
|
221
|
+
lines.push(`Off limits: ${session.offLimits.join(", ")}`);
|
|
222
|
+
}
|
|
223
|
+
if (session.maxIterations !== null) {
|
|
224
|
+
lines.push(`Max iterations per segment: ${session.maxIterations}`);
|
|
225
|
+
}
|
|
226
|
+
if (session.branch) {
|
|
227
|
+
lines.push(`Active branch: ${session.branch}`);
|
|
228
|
+
}
|
|
229
|
+
if (session.baselineCommit) {
|
|
230
|
+
lines.push(`Baseline commit: ${session.baselineCommit.slice(0, 12)}`);
|
|
231
|
+
}
|
|
232
|
+
if (createdSession) {
|
|
233
|
+
lines.push(
|
|
234
|
+
"Phase 2: iteration loop is active. Run the baseline experiment with `run_experiment` and log it.",
|
|
235
|
+
);
|
|
236
|
+
} else if (bumpedSegment) {
|
|
237
|
+
lines.push("Run a fresh baseline for the new segment.");
|
|
238
|
+
}
|
|
239
|
+
if (requiresHarness && !onAutoresearchBranch) {
|
|
240
|
+
lines.push(
|
|
241
|
+
"Note: not on a dedicated `autoresearch/*` branch — `log_experiment discard` will only revert run-modified files, not reset to baseline.",
|
|
242
|
+
);
|
|
367
243
|
}
|
|
368
244
|
|
|
369
245
|
return {
|
|
370
246
|
content: [{ type: "text", text: lines.join("\n") }],
|
|
371
|
-
details: {
|
|
247
|
+
details: {
|
|
248
|
+
state,
|
|
249
|
+
createdSession,
|
|
250
|
+
bumpedSegment,
|
|
251
|
+
abandonedRuns,
|
|
252
|
+
harnessCommitted,
|
|
253
|
+
baselineCommit: session.baselineCommit,
|
|
254
|
+
},
|
|
372
255
|
};
|
|
373
256
|
},
|
|
374
257
|
renderCall(args, _options, theme): Text {
|
|
@@ -384,3 +267,31 @@ export function createInitExperimentTool(
|
|
|
384
267
|
function renderInitCall(name: string, theme: Theme): string {
|
|
385
268
|
return `${theme.fg("toolTitle", theme.bold("init_experiment"))} ${theme.fg("accent", truncateToWidth(replaceTabs(name), 100))}`;
|
|
386
269
|
}
|
|
270
|
+
|
|
271
|
+
async function tryReadHeadSha(cwd: string): Promise<string | null> {
|
|
272
|
+
try {
|
|
273
|
+
return (await git.head.sha(cwd)) ?? null;
|
|
274
|
+
} catch {
|
|
275
|
+
return null;
|
|
276
|
+
}
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
async function detectPendingChanges(cwd: string): Promise<boolean> {
|
|
280
|
+
try {
|
|
281
|
+
const statusText = await git.status(cwd, { porcelainV1: true, untrackedFiles: "all", z: true });
|
|
282
|
+
const workDirPrefix = await git.show.prefix(cwd).catch(() => "");
|
|
283
|
+
return parseWorkDirDirtyPaths(statusText, workDirPrefix).length > 0;
|
|
284
|
+
} catch {
|
|
285
|
+
return false;
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
function buildHarnessCommitMessage(goal: string | null, name: string): string {
|
|
290
|
+
const lines = [HARNESS_COMMIT_TITLE, "", `Benchmark entrypoint: ${DEFAULT_HARNESS_COMMAND}`];
|
|
291
|
+
if (goal) {
|
|
292
|
+
lines.push(`Goal: ${goal}`);
|
|
293
|
+
} else {
|
|
294
|
+
lines.push(`Session: ${name}`);
|
|
295
|
+
}
|
|
296
|
+
return lines.join("\n");
|
|
297
|
+
}
|