@oh-my-pi/pi-coding-agent 14.5.14 → 14.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +39 -0
- package/package.json +7 -7
- package/src/autoresearch/command-resume.md +5 -8
- package/src/autoresearch/git.ts +41 -51
- package/src/autoresearch/helpers.ts +43 -359
- package/src/autoresearch/index.ts +281 -273
- package/src/autoresearch/prompt-setup.md +43 -0
- package/src/autoresearch/prompt.md +52 -193
- package/src/autoresearch/resume-message.md +2 -8
- package/src/autoresearch/state.ts +59 -166
- package/src/autoresearch/storage.ts +687 -0
- package/src/autoresearch/tools/init-experiment.ts +201 -290
- package/src/autoresearch/tools/log-experiment.ts +304 -517
- package/src/autoresearch/tools/run-experiment.ts +117 -296
- package/src/autoresearch/tools/update-notes.ts +116 -0
- package/src/autoresearch/types.ts +16 -66
- package/src/config/settings-schema.ts +1 -1
- package/src/config/settings.ts +20 -1
- package/src/cursor.ts +1 -1
- package/src/edit/index.ts +9 -31
- package/src/edit/line-hash.ts +70 -43
- package/src/edit/modes/hashline.lark +26 -0
- package/src/edit/modes/hashline.ts +898 -1099
- package/src/edit/modes/patch.ts +0 -7
- package/src/edit/modes/replace.ts +0 -4
- package/src/edit/renderer.ts +22 -20
- package/src/edit/streaming.ts +8 -28
- package/src/eval/eval.lark +24 -30
- package/src/eval/js/context-manager.ts +5 -162
- package/src/eval/js/prelude.txt +0 -12
- package/src/eval/parse.ts +129 -129
- package/src/eval/py/prelude.py +1 -219
- package/src/export/html/template.generated.ts +1 -1
- package/src/export/html/template.js +2 -2
- package/src/internal-urls/docs-index.generated.ts +1 -1
- package/src/modes/components/session-observer-overlay.ts +5 -2
- package/src/modes/components/status-line/segments.ts +1 -1
- package/src/modes/components/status-line.ts +3 -5
- package/src/modes/components/tree-selector.ts +4 -5
- package/src/modes/components/welcome.ts +11 -1
- package/src/modes/controllers/command-controller.ts +2 -6
- package/src/modes/controllers/event-controller.ts +1 -2
- package/src/modes/controllers/extension-ui-controller.ts +3 -15
- package/src/modes/controllers/input-controller.ts +0 -1
- package/src/modes/controllers/selector-controller.ts +1 -1
- package/src/modes/interactive-mode.ts +5 -7
- package/src/prompts/system/system-prompt.md +14 -38
- package/src/prompts/tools/ast-edit.md +8 -8
- package/src/prompts/tools/ast-grep.md +10 -10
- package/src/prompts/tools/eval.md +13 -31
- package/src/prompts/tools/find.md +2 -1
- package/src/prompts/tools/hashline.md +66 -57
- package/src/prompts/tools/search.md +2 -2
- package/src/session/session-manager.ts +17 -13
- package/src/tools/ast-edit.ts +141 -44
- package/src/tools/ast-grep.ts +112 -36
- package/src/tools/eval.ts +2 -53
- package/src/tools/find.ts +16 -15
- package/src/tools/path-utils.ts +36 -196
- package/src/tools/search.ts +56 -35
- package/src/utils/edit-mode.ts +2 -11
- package/src/utils/file-display-mode.ts +1 -1
- package/src/utils/git.ts +17 -0
- package/src/utils/session-color.ts +0 -12
- package/src/utils/title-generator.ts +22 -38
- package/src/autoresearch/apply-contract-to-state.ts +0 -24
- package/src/autoresearch/contract.ts +0 -288
- package/src/edit/modes/atom.lark +0 -29
- package/src/edit/modes/atom.ts +0 -1773
- package/src/prompts/tools/atom.md +0 -150
|
@@ -7,48 +7,25 @@ import { Type } from "@sinclair/typebox";
|
|
|
7
7
|
import type { ToolDefinition } from "../../extensibility/extensions";
|
|
8
8
|
import type { Theme } from "../../modes/theme/theme";
|
|
9
9
|
import { DEFAULT_MAX_BYTES, DEFAULT_MAX_LINES, truncateTail } from "../../session/streaming-output";
|
|
10
|
-
import { replaceTabs, shortenPath
|
|
10
|
+
import { replaceTabs, shortenPath } from "../../tools/render-utils";
|
|
11
11
|
import * as git from "../../utils/git";
|
|
12
12
|
import { parseWorkDirDirtyPaths } from "../git";
|
|
13
13
|
import {
|
|
14
|
-
collectLoggedRunNumbers,
|
|
15
14
|
EXPERIMENT_MAX_BYTES,
|
|
16
15
|
EXPERIMENT_MAX_LINES,
|
|
17
16
|
formatElapsed,
|
|
18
17
|
formatNum,
|
|
19
|
-
getAutoresearchRunDirectory,
|
|
20
|
-
getNextAutoresearchRunNumber,
|
|
21
|
-
isAutoresearchLocalStatePath,
|
|
22
|
-
isAutoresearchShCommand,
|
|
23
18
|
killTree,
|
|
24
19
|
parseAsiLines,
|
|
25
20
|
parseMetricLines,
|
|
26
|
-
readPendingRunSummary,
|
|
27
|
-
resolveWorkDir,
|
|
28
|
-
validateWorkDir,
|
|
29
21
|
} from "../helpers";
|
|
22
|
+
import { buildExperimentState } from "../state";
|
|
23
|
+
import { openAutoresearchStorageIfExists } from "../storage";
|
|
30
24
|
import type { AutoresearchToolFactoryOptions, RunDetails, RunExperimentProgressDetails } from "../types";
|
|
25
|
+
import { DEFAULT_HARNESS_COMMAND } from "./init-experiment";
|
|
31
26
|
|
|
32
27
|
const runExperimentSchema = Type.Object({
|
|
33
|
-
|
|
34
|
-
description: "Shell command to run for this experiment.",
|
|
35
|
-
}),
|
|
36
|
-
timeout_seconds: Type.Optional(
|
|
37
|
-
Type.Number({
|
|
38
|
-
description: "Timeout in seconds. Defaults to 600.",
|
|
39
|
-
}),
|
|
40
|
-
),
|
|
41
|
-
checks_timeout_seconds: Type.Optional(
|
|
42
|
-
Type.Number({
|
|
43
|
-
description: "Timeout in seconds for autoresearch.checks.sh. Defaults to 300.",
|
|
44
|
-
}),
|
|
45
|
-
),
|
|
46
|
-
force: Type.Optional(
|
|
47
|
-
Type.Boolean({
|
|
48
|
-
description:
|
|
49
|
-
"When true, allow a command that differs from the segment benchmark command and skip the rule that autoresearch.sh must be invoked directly when that script exists.",
|
|
50
|
-
}),
|
|
51
|
-
),
|
|
28
|
+
timeout_seconds: Type.Optional(Type.Number({ description: "Timeout in seconds. Defaults to 600." })),
|
|
52
29
|
});
|
|
53
30
|
|
|
54
31
|
interface ProcessExecutionResult {
|
|
@@ -58,13 +35,6 @@ interface ProcessExecutionResult {
|
|
|
58
35
|
output: string;
|
|
59
36
|
}
|
|
60
37
|
|
|
61
|
-
interface ChecksExecutionResult {
|
|
62
|
-
code: number | null;
|
|
63
|
-
killed: boolean;
|
|
64
|
-
logPath: string;
|
|
65
|
-
output: string;
|
|
66
|
-
}
|
|
67
|
-
|
|
68
38
|
interface ProgressSnapshot {
|
|
69
39
|
elapsed: string;
|
|
70
40
|
runDirectory: string;
|
|
@@ -80,136 +50,73 @@ export function createRunExperimentTool(
|
|
|
80
50
|
name: "run_experiment",
|
|
81
51
|
label: "Run Experiment",
|
|
82
52
|
description:
|
|
83
|
-
"Run
|
|
53
|
+
"Run any benchmark command. Output is captured automatically; `METRIC name=value` and `ASI key=value` lines printed by the command are parsed.",
|
|
84
54
|
parameters: runExperimentSchema,
|
|
85
55
|
defaultInactive: true,
|
|
86
56
|
async execute(_toolCallId, params, signal, onUpdate, ctx) {
|
|
87
|
-
const
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
};
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
const runtime = options.getRuntime(ctx);
|
|
95
|
-
const state = runtime.state;
|
|
96
|
-
const workDir = resolveWorkDir(ctx.cwd);
|
|
97
|
-
const checksPath = path.join(workDir, "autoresearch.checks.sh");
|
|
98
|
-
const autoresearchScriptPath = path.join(workDir, "autoresearch.sh");
|
|
99
|
-
|
|
100
|
-
const forceCommand = params.force === true;
|
|
101
|
-
if (!forceCommand && state.benchmarkCommand && params.command.trim() !== state.benchmarkCommand) {
|
|
57
|
+
const storage = await openAutoresearchStorageIfExists(ctx.cwd);
|
|
58
|
+
const currentBranch = (await git.branch.current(ctx.cwd)) ?? null;
|
|
59
|
+
const session = storage?.getActiveSessionForBranch(currentBranch) ?? null;
|
|
60
|
+
if (!storage || !session) {
|
|
102
61
|
return {
|
|
103
62
|
content: [
|
|
104
63
|
{
|
|
105
64
|
type: "text",
|
|
106
|
-
text:
|
|
107
|
-
"Error: command does not match the benchmark command recorded for this segment.\n" +
|
|
108
|
-
`Expected: ${state.benchmarkCommand}\nReceived: ${params.command}`,
|
|
65
|
+
text: "Error: no active autoresearch session for the current branch. Call init_experiment first.",
|
|
109
66
|
},
|
|
110
67
|
],
|
|
111
68
|
};
|
|
112
69
|
}
|
|
113
70
|
|
|
114
|
-
|
|
115
|
-
return {
|
|
116
|
-
content: [
|
|
117
|
-
{
|
|
118
|
-
type: "text",
|
|
119
|
-
text:
|
|
120
|
-
`Error: autoresearch.sh exists. Run it directly instead of using a different command.\n` +
|
|
121
|
-
`Expected something like: bash autoresearch.sh\n` +
|
|
122
|
-
`Received: ${params.command}`,
|
|
123
|
-
},
|
|
124
|
-
],
|
|
125
|
-
};
|
|
126
|
-
}
|
|
71
|
+
const runtime = options.getRuntime(ctx);
|
|
127
72
|
|
|
128
|
-
|
|
129
|
-
const
|
|
130
|
-
if (
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
type: "text",
|
|
135
|
-
text: `Maximum experiments reached (${state.maxExperiments}). Re-initialize to start a new segment.`,
|
|
136
|
-
},
|
|
137
|
-
],
|
|
138
|
-
};
|
|
139
|
-
}
|
|
140
|
-
}
|
|
73
|
+
const abandonedPriorRun = (() => {
|
|
74
|
+
const pending = storage.getPendingRun(session.id);
|
|
75
|
+
if (!pending) return null;
|
|
76
|
+
storage.abandonPendingRuns(session.id);
|
|
77
|
+
return pending.id;
|
|
78
|
+
})();
|
|
141
79
|
|
|
142
|
-
const
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
content: [
|
|
147
|
-
{
|
|
148
|
-
type: "text",
|
|
149
|
-
text:
|
|
150
|
-
`Error: run #${pendingRun.runNumber} has not been logged yet. ` +
|
|
151
|
-
"Call log_experiment before starting another benchmark run.",
|
|
152
|
-
},
|
|
153
|
-
],
|
|
154
|
-
};
|
|
155
|
-
}
|
|
80
|
+
const resolvedCommand = DEFAULT_HARNESS_COMMAND;
|
|
81
|
+
const preRunStatus = await tryGitStatus(ctx.cwd);
|
|
82
|
+
const workDirPrefix = await tryGitPrefix(ctx.cwd);
|
|
83
|
+
const preRunDirtyPaths = parseWorkDirDirtyPaths(preRunStatus, workDirPrefix);
|
|
156
84
|
|
|
157
|
-
const
|
|
158
|
-
const
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
porcelainV1: true,
|
|
166
|
-
untrackedFiles: "all",
|
|
167
|
-
z: true,
|
|
85
|
+
const startedAt = Date.now();
|
|
86
|
+
const insertedRun = storage.insertRun({
|
|
87
|
+
sessionId: session.id,
|
|
88
|
+
segment: session.currentSegment,
|
|
89
|
+
command: resolvedCommand,
|
|
90
|
+
logPath: "", // patched after we know the run id
|
|
91
|
+
preRunDirtyPaths,
|
|
92
|
+
startedAt,
|
|
168
93
|
});
|
|
169
|
-
const workDirPrefix = await git.show.prefix(workDir);
|
|
170
|
-
const preRunDirtyPaths = parseWorkDirDirtyPaths(preRunStatus, workDirPrefix).filter(
|
|
171
|
-
p => !isAutoresearchLocalStatePath(p),
|
|
172
|
-
);
|
|
173
94
|
|
|
174
|
-
|
|
95
|
+
const runDirectory = path.join(storage.projectDir, "runs", String(insertedRun.id).padStart(4, "0"));
|
|
96
|
+
const benchmarkLogPath = path.join(runDirectory, "benchmark.log");
|
|
97
|
+
fs.mkdirSync(runDirectory, { recursive: true });
|
|
98
|
+
storage.updateRunLogPath(insertedRun.id, benchmarkLogPath);
|
|
99
|
+
|
|
175
100
|
runtime.lastRunDuration = null;
|
|
176
101
|
runtime.lastRunAsi = null;
|
|
177
102
|
runtime.lastRunArtifactDir = runDirectory;
|
|
178
|
-
runtime.lastRunNumber =
|
|
103
|
+
runtime.lastRunNumber = insertedRun.id;
|
|
179
104
|
runtime.lastRunSummary = null;
|
|
180
|
-
await Bun.write(
|
|
181
|
-
runJsonPath,
|
|
182
|
-
JSON.stringify(
|
|
183
|
-
{
|
|
184
|
-
runNumber,
|
|
185
|
-
runDirectory,
|
|
186
|
-
benchmarkLogPath,
|
|
187
|
-
checksLogPath,
|
|
188
|
-
command: params.command,
|
|
189
|
-
preRunDirtyPaths,
|
|
190
|
-
startedAt: new Date().toISOString(),
|
|
191
|
-
},
|
|
192
|
-
null,
|
|
193
|
-
2,
|
|
194
|
-
),
|
|
195
|
-
);
|
|
196
|
-
|
|
197
105
|
runtime.runningExperiment = {
|
|
198
|
-
startedAt
|
|
199
|
-
command:
|
|
106
|
+
startedAt,
|
|
107
|
+
command: resolvedCommand,
|
|
200
108
|
runDirectory,
|
|
201
|
-
runNumber,
|
|
109
|
+
runNumber: insertedRun.id,
|
|
202
110
|
};
|
|
203
111
|
options.dashboard.updateWidget(ctx, runtime);
|
|
204
112
|
options.dashboard.requestRender();
|
|
205
113
|
|
|
206
114
|
const timeoutMs = Math.max(0, Math.floor((params.timeout_seconds ?? 600) * 1000));
|
|
207
|
-
const startedAt = Date.now();
|
|
208
115
|
let execution: ProcessExecutionResult;
|
|
209
116
|
try {
|
|
210
117
|
execution = await executeProcess({
|
|
211
|
-
command: ["bash", "-lc",
|
|
212
|
-
cwd:
|
|
118
|
+
command: ["bash", "-lc", resolvedCommand],
|
|
119
|
+
cwd: ctx.cwd,
|
|
213
120
|
logPath: benchmarkLogPath,
|
|
214
121
|
timeoutMs,
|
|
215
122
|
signal,
|
|
@@ -232,41 +139,11 @@ export function createRunExperimentTool(
|
|
|
232
139
|
options.dashboard.requestRender();
|
|
233
140
|
}
|
|
234
141
|
|
|
235
|
-
const
|
|
142
|
+
const completedAt = Date.now();
|
|
143
|
+
const durationMs = completedAt - startedAt;
|
|
144
|
+
const durationSeconds = durationMs / 1000;
|
|
236
145
|
runtime.lastRunDuration = durationSeconds;
|
|
237
146
|
|
|
238
|
-
const benchmarkPassed = execution.exitCode === 0 && !execution.killed;
|
|
239
|
-
let checksPass: boolean | null = null;
|
|
240
|
-
let checksTimedOut = false;
|
|
241
|
-
let checksOutput = "";
|
|
242
|
-
let checksDuration = 0;
|
|
243
|
-
let checksLogPathValue: string | undefined;
|
|
244
|
-
|
|
245
|
-
if (benchmarkPassed && fs.existsSync(checksPath)) {
|
|
246
|
-
const checksStartedAt = Date.now();
|
|
247
|
-
const checksResult = await runChecks({
|
|
248
|
-
cwd: workDir,
|
|
249
|
-
pathToChecks: checksPath,
|
|
250
|
-
logPath: checksLogPath,
|
|
251
|
-
timeoutMs: Math.max(0, Math.floor((params.checks_timeout_seconds ?? 300) * 1000)),
|
|
252
|
-
signal,
|
|
253
|
-
});
|
|
254
|
-
checksDuration = (Date.now() - checksStartedAt) / 1000;
|
|
255
|
-
checksTimedOut = checksResult.killed;
|
|
256
|
-
checksPass = checksResult.code === 0 && !checksResult.killed;
|
|
257
|
-
checksOutput = checksResult.output;
|
|
258
|
-
checksLogPathValue = checksResult.logPath;
|
|
259
|
-
}
|
|
260
|
-
|
|
261
|
-
runtime.lastRunChecks =
|
|
262
|
-
checksPass === null
|
|
263
|
-
? null
|
|
264
|
-
: {
|
|
265
|
-
pass: checksPass,
|
|
266
|
-
output: checksOutput,
|
|
267
|
-
duration: checksDuration,
|
|
268
|
-
};
|
|
269
|
-
|
|
270
147
|
const llmTruncation = truncateTail(execution.output, {
|
|
271
148
|
maxBytes: EXPERIMENT_MAX_BYTES,
|
|
272
149
|
maxLines: EXPERIMENT_MAX_LINES,
|
|
@@ -278,113 +155,87 @@ export function createRunExperimentTool(
|
|
|
278
155
|
|
|
279
156
|
const parsedMetricsMap = parseMetricLines(execution.output);
|
|
280
157
|
const parsedMetrics = parsedMetricsMap.size > 0 ? Object.fromEntries(parsedMetricsMap.entries()) : null;
|
|
281
|
-
const parsedPrimary = parsedMetricsMap.get(
|
|
158
|
+
const parsedPrimary = parsedMetricsMap.get(session.primaryMetric) ?? null;
|
|
282
159
|
const parsedAsi = parseAsiLines(execution.output);
|
|
283
160
|
runtime.lastRunAsi = parsedAsi;
|
|
284
161
|
|
|
162
|
+
storage.markRunCompleted({
|
|
163
|
+
runId: insertedRun.id,
|
|
164
|
+
completedAt,
|
|
165
|
+
durationMs,
|
|
166
|
+
exitCode: execution.exitCode,
|
|
167
|
+
timedOut: execution.killed,
|
|
168
|
+
parsedPrimary,
|
|
169
|
+
parsedMetrics,
|
|
170
|
+
parsedAsi,
|
|
171
|
+
});
|
|
172
|
+
|
|
173
|
+
const passed = execution.exitCode === 0 && !execution.killed;
|
|
285
174
|
const resultDetails: RunDetails = {
|
|
286
|
-
runNumber,
|
|
175
|
+
runNumber: insertedRun.id,
|
|
287
176
|
runDirectory,
|
|
288
177
|
benchmarkLogPath,
|
|
289
|
-
|
|
290
|
-
command: params.command,
|
|
178
|
+
command: resolvedCommand,
|
|
291
179
|
exitCode: execution.exitCode,
|
|
292
180
|
durationSeconds,
|
|
293
|
-
passed
|
|
294
|
-
crashed: execution.exitCode !== 0 || execution.killed
|
|
181
|
+
passed,
|
|
182
|
+
crashed: execution.exitCode !== 0 || execution.killed,
|
|
295
183
|
timedOut: execution.killed,
|
|
296
184
|
tailOutput: displayTruncation.content,
|
|
297
|
-
checksPass,
|
|
298
|
-
checksTimedOut,
|
|
299
|
-
checksOutput: checksOutput.split("\n").slice(-80).join("\n"),
|
|
300
|
-
checksDuration,
|
|
301
185
|
parsedMetrics,
|
|
302
186
|
parsedPrimary,
|
|
303
187
|
parsedAsi,
|
|
304
|
-
metricName:
|
|
305
|
-
metricUnit:
|
|
188
|
+
metricName: session.primaryMetric,
|
|
189
|
+
metricUnit: session.metricUnit,
|
|
306
190
|
preRunDirtyPaths,
|
|
191
|
+
abandonedPriorRun,
|
|
307
192
|
truncation: llmTruncation.truncated ? llmTruncation : undefined,
|
|
308
193
|
fullOutputPath: execution.logPath,
|
|
309
194
|
};
|
|
195
|
+
|
|
310
196
|
runtime.lastRunSummary = {
|
|
311
|
-
|
|
312
|
-
checksPass,
|
|
313
|
-
checksTimedOut,
|
|
314
|
-
command: params.command,
|
|
197
|
+
command: resolvedCommand,
|
|
315
198
|
durationSeconds,
|
|
316
199
|
parsedAsi,
|
|
317
200
|
parsedMetrics,
|
|
318
201
|
parsedPrimary,
|
|
319
|
-
passed
|
|
202
|
+
passed,
|
|
320
203
|
preRunDirtyPaths,
|
|
321
204
|
runDirectory,
|
|
322
|
-
runNumber,
|
|
205
|
+
runNumber: insertedRun.id,
|
|
206
|
+
exitCode: execution.exitCode,
|
|
207
|
+
timedOut: execution.killed,
|
|
323
208
|
};
|
|
324
209
|
runtime.autoResumeArmed = true;
|
|
325
210
|
runtime.lastAutoResumePendingRunNumber = null;
|
|
211
|
+
|
|
212
|
+
// Refresh state to reflect any prior abandonment changes (logged set unchanged).
|
|
213
|
+
const refreshedSession = storage.getSessionById(session.id);
|
|
214
|
+
if (refreshedSession) {
|
|
215
|
+
runtime.state = buildExperimentState(refreshedSession, storage.listLoggedRuns(session.id));
|
|
216
|
+
}
|
|
326
217
|
options.dashboard.updateWidget(ctx, runtime);
|
|
327
218
|
options.dashboard.requestRender();
|
|
328
219
|
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
{
|
|
333
|
-
runNumber,
|
|
334
|
-
runDirectory,
|
|
335
|
-
benchmarkLogPath,
|
|
336
|
-
checksLogPath: checksLogPathValue,
|
|
337
|
-
command: params.command,
|
|
338
|
-
completedAt: new Date().toISOString(),
|
|
339
|
-
durationSeconds,
|
|
340
|
-
exitCode: execution.exitCode,
|
|
341
|
-
timedOut: execution.killed,
|
|
342
|
-
checks: {
|
|
343
|
-
durationSeconds: checksDuration,
|
|
344
|
-
passed: checksPass,
|
|
345
|
-
timedOut: checksTimedOut,
|
|
346
|
-
},
|
|
347
|
-
parsedMetrics,
|
|
348
|
-
parsedPrimary,
|
|
349
|
-
parsedAsi,
|
|
350
|
-
preRunDirtyPaths,
|
|
351
|
-
truncation: resultDetails.truncation,
|
|
352
|
-
fullOutputPath: resultDetails.fullOutputPath,
|
|
353
|
-
},
|
|
354
|
-
null,
|
|
355
|
-
2,
|
|
356
|
-
),
|
|
357
|
-
);
|
|
358
|
-
|
|
359
|
-
const commandWarnings: string[] = [];
|
|
360
|
-
if (forceCommand) {
|
|
361
|
-
if (state.benchmarkCommand && params.command.trim() !== state.benchmarkCommand) {
|
|
362
|
-
commandWarnings.push(
|
|
363
|
-
`Warning: command override (force=true). Segment benchmark is ${state.benchmarkCommand}; ran ${params.command}.`,
|
|
364
|
-
);
|
|
365
|
-
}
|
|
366
|
-
if (fs.existsSync(autoresearchScriptPath) && !isAutoresearchShCommand(params.command)) {
|
|
367
|
-
commandWarnings.push(
|
|
368
|
-
"Warning: autoresearch.sh exists but the command was not a direct autoresearch.sh invocation (force=true).",
|
|
369
|
-
);
|
|
370
|
-
}
|
|
220
|
+
const headerLines: string[] = [];
|
|
221
|
+
if (abandonedPriorRun !== null) {
|
|
222
|
+
headerLines.push(`Note: abandoned prior pending run #${abandonedPriorRun} before starting this run.`);
|
|
371
223
|
}
|
|
372
|
-
const warningPrefix =
|
|
224
|
+
const warningPrefix = headerLines.length > 0 ? `${headerLines.join("\n")}\n\n` : "";
|
|
373
225
|
|
|
374
226
|
return {
|
|
375
227
|
content: [
|
|
376
228
|
{
|
|
377
229
|
type: "text",
|
|
378
|
-
text: warningPrefix + buildRunText(resultDetails, llmTruncation.content, state.bestMetric),
|
|
230
|
+
text: warningPrefix + buildRunText(resultDetails, llmTruncation.content, runtime.state.bestMetric),
|
|
379
231
|
},
|
|
380
232
|
],
|
|
381
233
|
details: resultDetails,
|
|
382
234
|
};
|
|
383
235
|
},
|
|
384
|
-
renderCall(
|
|
385
|
-
const commandPreview = truncateToWidth(replaceTabs(args.command), 100);
|
|
236
|
+
renderCall(_args, _options, theme): Text {
|
|
386
237
|
return new Text(
|
|
387
|
-
`${theme.fg("toolTitle", theme.bold("run_experiment"))} ${theme.fg("muted",
|
|
238
|
+
`${theme.fg("toolTitle", theme.bold("run_experiment"))} ${theme.fg("muted", DEFAULT_HARNESS_COMMAND)}`,
|
|
388
239
|
0,
|
|
389
240
|
0,
|
|
390
241
|
);
|
|
@@ -395,17 +246,14 @@ export function createRunExperimentTool(
|
|
|
395
246
|
const preview = replaceTabs(result.content.find(part => part.type === "text")?.text ?? "");
|
|
396
247
|
return new Text(preview ? `${header}\n${theme.fg("dim", preview)}` : header, 0, 0);
|
|
397
248
|
}
|
|
398
|
-
|
|
399
249
|
const details = result.details;
|
|
400
250
|
if (!details || !isRunDetails(details)) {
|
|
401
251
|
return new Text(replaceTabs(result.content.find(part => part.type === "text")?.text ?? ""), 0, 0);
|
|
402
252
|
}
|
|
403
|
-
|
|
404
253
|
const statusText = renderStatus(details, theme);
|
|
405
254
|
if (!options.expanded && details.tailOutput.trim().length === 0) {
|
|
406
255
|
return new Text(statusText, 0, 0);
|
|
407
256
|
}
|
|
408
|
-
|
|
409
257
|
const preview = replaceTabs(
|
|
410
258
|
options.expanded ? details.tailOutput : details.tailOutput.split("\n").slice(-5).join("\n"),
|
|
411
259
|
);
|
|
@@ -418,7 +266,23 @@ export function createRunExperimentTool(
|
|
|
418
266
|
};
|
|
419
267
|
}
|
|
420
268
|
|
|
421
|
-
async function
|
|
269
|
+
async function tryGitStatus(cwd: string): Promise<string> {
|
|
270
|
+
try {
|
|
271
|
+
return await git.status(cwd, { porcelainV1: true, untrackedFiles: "all", z: true });
|
|
272
|
+
} catch {
|
|
273
|
+
return "";
|
|
274
|
+
}
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
async function tryGitPrefix(cwd: string): Promise<string> {
|
|
278
|
+
try {
|
|
279
|
+
return await git.show.prefix(cwd);
|
|
280
|
+
} catch {
|
|
281
|
+
return "";
|
|
282
|
+
}
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
async function executeProcess(opts: {
|
|
422
286
|
command: string[];
|
|
423
287
|
cwd: string;
|
|
424
288
|
logPath: string;
|
|
@@ -427,8 +291,8 @@ async function executeProcess(options: {
|
|
|
427
291
|
onProgress?(details: ProgressSnapshot): void;
|
|
428
292
|
}): Promise<ProcessExecutionResult> {
|
|
429
293
|
const { promise, resolve, reject } = Promise.withResolvers<ProcessExecutionResult>();
|
|
430
|
-
const child = childProcess.spawn(
|
|
431
|
-
cwd:
|
|
294
|
+
const child = childProcess.spawn(opts.command[0] ?? "bash", opts.command.slice(1), {
|
|
295
|
+
cwd: opts.cwd,
|
|
432
296
|
detached: true,
|
|
433
297
|
stdio: ["ignore", "pipe", "pipe"],
|
|
434
298
|
});
|
|
@@ -437,7 +301,7 @@ async function executeProcess(options: {
|
|
|
437
301
|
let chunksBytes = 0;
|
|
438
302
|
let killedByTimeout = false;
|
|
439
303
|
let resolved = false;
|
|
440
|
-
let writeStream: fs.WriteStream | undefined = fs.createWriteStream(
|
|
304
|
+
let writeStream: fs.WriteStream | undefined = fs.createWriteStream(opts.logPath);
|
|
441
305
|
let forceKillTimeout: NodeJS.Timeout | undefined;
|
|
442
306
|
|
|
443
307
|
const closeWriteStream = (): Promise<void> => {
|
|
@@ -459,7 +323,7 @@ async function executeProcess(options: {
|
|
|
459
323
|
if (progressTimer) clearInterval(progressTimer);
|
|
460
324
|
if (timeoutHandle) clearTimeout(timeoutHandle);
|
|
461
325
|
if (forceKillTimeout) clearTimeout(forceKillTimeout);
|
|
462
|
-
|
|
326
|
+
opts.signal?.removeEventListener("abort", abortHandler);
|
|
463
327
|
};
|
|
464
328
|
|
|
465
329
|
const finish = (callback: () => void): void => {
|
|
@@ -486,8 +350,8 @@ async function executeProcess(options: {
|
|
|
486
350
|
});
|
|
487
351
|
return {
|
|
488
352
|
elapsed: formatElapsed(Date.now() - startedAt),
|
|
489
|
-
runDirectory: path.dirname(
|
|
490
|
-
fullOutputPath:
|
|
353
|
+
runDirectory: path.dirname(opts.logPath),
|
|
354
|
+
fullOutputPath: opts.logPath,
|
|
491
355
|
tailOutput: tail.content,
|
|
492
356
|
truncation: tail.truncated ? tail : undefined,
|
|
493
357
|
};
|
|
@@ -503,26 +367,26 @@ async function executeProcess(options: {
|
|
|
503
367
|
};
|
|
504
368
|
|
|
505
369
|
const startedAt = Date.now();
|
|
506
|
-
const progressTimer =
|
|
370
|
+
const progressTimer = opts.onProgress
|
|
507
371
|
? setInterval(() => {
|
|
508
|
-
|
|
372
|
+
opts.onProgress?.(snapshot());
|
|
509
373
|
}, 1000)
|
|
510
374
|
: undefined;
|
|
511
375
|
const timeoutHandle =
|
|
512
|
-
|
|
376
|
+
opts.timeoutMs > 0
|
|
513
377
|
? setTimeout(() => {
|
|
514
378
|
killedByTimeout = true;
|
|
515
379
|
killTreeWithEscalation();
|
|
516
|
-
},
|
|
380
|
+
}, opts.timeoutMs)
|
|
517
381
|
: undefined;
|
|
518
382
|
|
|
519
383
|
const abortHandler = (): void => {
|
|
520
384
|
killTreeWithEscalation();
|
|
521
385
|
};
|
|
522
|
-
if (
|
|
386
|
+
if (opts.signal?.aborted) {
|
|
523
387
|
abortHandler();
|
|
524
388
|
} else {
|
|
525
|
-
|
|
389
|
+
opts.signal?.addEventListener("abort", abortHandler, { once: true });
|
|
526
390
|
}
|
|
527
391
|
|
|
528
392
|
child.stdout?.on("data", data => {
|
|
@@ -539,16 +403,16 @@ async function executeProcess(options: {
|
|
|
539
403
|
child.on("close", async code => {
|
|
540
404
|
try {
|
|
541
405
|
await closeWriteStream();
|
|
542
|
-
if (
|
|
406
|
+
if (opts.signal?.aborted) {
|
|
543
407
|
finish(() => reject(new Error("aborted")));
|
|
544
408
|
return;
|
|
545
409
|
}
|
|
546
|
-
const output = await fs.promises.readFile(
|
|
410
|
+
const output = await fs.promises.readFile(opts.logPath, "utf8");
|
|
547
411
|
finish(() =>
|
|
548
412
|
resolve({
|
|
549
413
|
exitCode: code,
|
|
550
414
|
killed: killedByTimeout,
|
|
551
|
-
logPath:
|
|
415
|
+
logPath: opts.logPath,
|
|
552
416
|
output,
|
|
553
417
|
}),
|
|
554
418
|
);
|
|
@@ -560,31 +424,9 @@ async function executeProcess(options: {
|
|
|
560
424
|
return promise;
|
|
561
425
|
}
|
|
562
426
|
|
|
563
|
-
async function runChecks(options: {
|
|
564
|
-
cwd: string;
|
|
565
|
-
pathToChecks: string;
|
|
566
|
-
logPath: string;
|
|
567
|
-
timeoutMs: number;
|
|
568
|
-
signal?: AbortSignal;
|
|
569
|
-
}): Promise<ChecksExecutionResult> {
|
|
570
|
-
const result = await executeProcess({
|
|
571
|
-
command: ["bash", options.pathToChecks],
|
|
572
|
-
cwd: options.cwd,
|
|
573
|
-
logPath: options.logPath,
|
|
574
|
-
timeoutMs: options.timeoutMs,
|
|
575
|
-
signal: options.signal,
|
|
576
|
-
});
|
|
577
|
-
return {
|
|
578
|
-
code: result.exitCode,
|
|
579
|
-
killed: result.killed,
|
|
580
|
-
logPath: result.logPath,
|
|
581
|
-
output: result.output.trim(),
|
|
582
|
-
};
|
|
583
|
-
}
|
|
584
|
-
|
|
585
427
|
function buildRunText(details: RunDetails, outputPreview: string, bestMetric: number | null): string {
|
|
586
428
|
const lines: string[] = [];
|
|
587
|
-
lines.push(`Run directory: ${details.runDirectory}`);
|
|
429
|
+
lines.push(`Run #${details.runNumber} directory: ${details.runDirectory}`);
|
|
588
430
|
if (details.timedOut) {
|
|
589
431
|
lines.push(`TIMEOUT after ${details.durationSeconds.toFixed(1)}s`);
|
|
590
432
|
} else if (details.exitCode !== 0) {
|
|
@@ -592,13 +434,6 @@ function buildRunText(details: RunDetails, outputPreview: string, bestMetric: nu
|
|
|
592
434
|
} else {
|
|
593
435
|
lines.push(`PASSED in ${details.durationSeconds.toFixed(1)}s`);
|
|
594
436
|
}
|
|
595
|
-
if (details.checksTimedOut) {
|
|
596
|
-
lines.push(`Checks timed out after ${details.checksDuration.toFixed(1)}s`);
|
|
597
|
-
} else if (details.checksPass === false) {
|
|
598
|
-
lines.push(`Checks failed in ${details.checksDuration.toFixed(1)}s`);
|
|
599
|
-
} else if (details.checksPass === true) {
|
|
600
|
-
lines.push(`Checks passed in ${details.checksDuration.toFixed(1)}s`);
|
|
601
|
-
}
|
|
602
437
|
if (bestMetric !== null) {
|
|
603
438
|
lines.push(`Current baseline ${details.metricName}: ${formatNum(bestMetric, details.metricUnit)}`);
|
|
604
439
|
}
|
|
@@ -627,14 +462,6 @@ function buildRunText(details: RunDetails, outputPreview: string, bestMetric: nu
|
|
|
627
462
|
`Output truncated (${formatBytes(EXPERIMENT_MAX_BYTES)} limit). Full output: ${details.fullOutputPath}`,
|
|
628
463
|
);
|
|
629
464
|
}
|
|
630
|
-
if (details.checksLogPath) {
|
|
631
|
-
lines.push(`Checks log: ${details.checksLogPath}`);
|
|
632
|
-
}
|
|
633
|
-
if (details.checksPass === false && details.checksOutput.length > 0) {
|
|
634
|
-
lines.push("");
|
|
635
|
-
lines.push("Checks output:");
|
|
636
|
-
lines.push(details.checksOutput);
|
|
637
|
-
}
|
|
638
465
|
return lines.join("\n").trimEnd();
|
|
639
466
|
}
|
|
640
467
|
|
|
@@ -642,12 +469,6 @@ function renderStatus(details: RunDetails, theme: Theme): string {
|
|
|
642
469
|
if (details.timedOut) {
|
|
643
470
|
return theme.fg("error", `TIMEOUT ${details.durationSeconds.toFixed(1)}s`);
|
|
644
471
|
}
|
|
645
|
-
if (details.checksTimedOut) {
|
|
646
|
-
return theme.fg("warning", `Checks timeout ${details.checksDuration.toFixed(1)}s`);
|
|
647
|
-
}
|
|
648
|
-
if (details.checksPass === false) {
|
|
649
|
-
return theme.fg("error", `Checks failed ${details.checksDuration.toFixed(1)}s`);
|
|
650
|
-
}
|
|
651
472
|
if (details.exitCode !== 0) {
|
|
652
473
|
return theme.fg("error", `FAIL exit=${details.exitCode} ${details.durationSeconds.toFixed(1)}s`);
|
|
653
474
|
}
|
|
@@ -665,5 +486,5 @@ function isRunDetails(value: unknown): value is RunDetails {
|
|
|
665
486
|
|
|
666
487
|
function isProgressDetails(value: unknown): value is RunExperimentProgressDetails {
|
|
667
488
|
if (typeof value !== "object" || value === null) return false;
|
|
668
|
-
return "phase" in value && value.phase === "running";
|
|
489
|
+
return "phase" in value && (value as { phase: unknown }).phase === "running";
|
|
669
490
|
}
|