@oh-my-pi/pi-coding-agent 14.5.14 → 14.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/CHANGELOG.md +39 -0
  2. package/package.json +7 -7
  3. package/src/autoresearch/command-resume.md +5 -8
  4. package/src/autoresearch/git.ts +41 -51
  5. package/src/autoresearch/helpers.ts +43 -359
  6. package/src/autoresearch/index.ts +281 -273
  7. package/src/autoresearch/prompt-setup.md +43 -0
  8. package/src/autoresearch/prompt.md +52 -193
  9. package/src/autoresearch/resume-message.md +2 -8
  10. package/src/autoresearch/state.ts +59 -166
  11. package/src/autoresearch/storage.ts +687 -0
  12. package/src/autoresearch/tools/init-experiment.ts +201 -290
  13. package/src/autoresearch/tools/log-experiment.ts +304 -517
  14. package/src/autoresearch/tools/run-experiment.ts +117 -296
  15. package/src/autoresearch/tools/update-notes.ts +116 -0
  16. package/src/autoresearch/types.ts +16 -66
  17. package/src/config/settings-schema.ts +1 -1
  18. package/src/config/settings.ts +20 -1
  19. package/src/cursor.ts +1 -1
  20. package/src/edit/index.ts +9 -31
  21. package/src/edit/line-hash.ts +70 -43
  22. package/src/edit/modes/hashline.lark +26 -0
  23. package/src/edit/modes/hashline.ts +898 -1099
  24. package/src/edit/modes/patch.ts +0 -7
  25. package/src/edit/modes/replace.ts +0 -4
  26. package/src/edit/renderer.ts +22 -20
  27. package/src/edit/streaming.ts +8 -28
  28. package/src/eval/eval.lark +24 -30
  29. package/src/eval/js/context-manager.ts +5 -162
  30. package/src/eval/js/prelude.txt +0 -12
  31. package/src/eval/parse.ts +129 -129
  32. package/src/eval/py/prelude.py +1 -219
  33. package/src/export/html/template.generated.ts +1 -1
  34. package/src/export/html/template.js +2 -2
  35. package/src/internal-urls/docs-index.generated.ts +1 -1
  36. package/src/modes/components/session-observer-overlay.ts +5 -2
  37. package/src/modes/components/status-line/segments.ts +1 -1
  38. package/src/modes/components/status-line.ts +3 -5
  39. package/src/modes/components/tree-selector.ts +4 -5
  40. package/src/modes/components/welcome.ts +11 -1
  41. package/src/modes/controllers/command-controller.ts +2 -6
  42. package/src/modes/controllers/event-controller.ts +1 -2
  43. package/src/modes/controllers/extension-ui-controller.ts +3 -15
  44. package/src/modes/controllers/input-controller.ts +0 -1
  45. package/src/modes/controllers/selector-controller.ts +1 -1
  46. package/src/modes/interactive-mode.ts +5 -7
  47. package/src/prompts/system/system-prompt.md +14 -38
  48. package/src/prompts/tools/ast-edit.md +8 -8
  49. package/src/prompts/tools/ast-grep.md +10 -10
  50. package/src/prompts/tools/eval.md +13 -31
  51. package/src/prompts/tools/find.md +2 -1
  52. package/src/prompts/tools/hashline.md +66 -57
  53. package/src/prompts/tools/search.md +2 -2
  54. package/src/session/session-manager.ts +17 -13
  55. package/src/tools/ast-edit.ts +141 -44
  56. package/src/tools/ast-grep.ts +112 -36
  57. package/src/tools/eval.ts +2 -53
  58. package/src/tools/find.ts +16 -15
  59. package/src/tools/path-utils.ts +36 -196
  60. package/src/tools/search.ts +56 -35
  61. package/src/utils/edit-mode.ts +2 -11
  62. package/src/utils/file-display-mode.ts +1 -1
  63. package/src/utils/git.ts +17 -0
  64. package/src/utils/session-color.ts +0 -12
  65. package/src/utils/title-generator.ts +22 -38
  66. package/src/autoresearch/apply-contract-to-state.ts +0 -24
  67. package/src/autoresearch/contract.ts +0 -288
  68. package/src/edit/modes/atom.lark +0 -29
  69. package/src/edit/modes/atom.ts +0 -1773
  70. package/src/prompts/tools/atom.md +0 -150
@@ -1,4 +1,3 @@
1
- import * as fs from "node:fs";
2
1
  import * as path from "node:path";
3
2
  import { StringEnum } from "@oh-my-pi/pi-ai";
4
3
  import { Text } from "@oh-my-pi/pi-tui";
@@ -6,87 +5,66 @@ import { Type } from "@sinclair/typebox";
6
5
  import type { ToolDefinition } from "../../extensibility/extensions";
7
6
  import type { Theme } from "../../modes/theme/theme";
8
7
  import { replaceTabs, truncateToWidth } from "../../tools/render-utils";
9
- import { applyAutoresearchContractToExperimentState } from "../apply-contract-to-state";
10
- import {
11
- contractListsEqual,
12
- contractPathListsEqual,
13
- loadAutoresearchScriptSnapshot,
14
- readAutoresearchContract,
15
- } from "../contract";
16
- import {
17
- abandonUnloggedAutoresearchRuns,
18
- collectLoggedRunNumbers,
19
- isAutoresearchShCommand,
20
- readMaxExperiments,
21
- readPendingRunSummary,
22
- resolveWorkDir,
23
- validateWorkDir,
24
- } from "../helpers";
25
- import { cloneExperimentState } from "../state";
8
+ import * as git from "../../utils/git";
9
+ import { parseWorkDirDirtyPaths } from "../git";
10
+ import { dedupeStrings, normalizePathSpec } from "../helpers";
11
+ import { buildExperimentState } from "../state";
12
+ import { openAutoresearchStorage, type SessionRow } from "../storage";
26
13
  import type { AutoresearchToolFactoryOptions, ExperimentState } from "../types";
27
14
 
15
+ export const HARNESS_FILENAME = "autoresearch.sh";
16
+ export const DEFAULT_HARNESS_COMMAND = `bash ${HARNESS_FILENAME}`;
17
+ const HARNESS_COMMIT_TITLE = "autoresearch: harness setup";
18
+
28
19
  const initExperimentSchema = Type.Object({
29
- name: Type.String({
30
- description: "Human-readable experiment name.",
20
+ name: Type.String({ description: "Human-readable experiment name." }),
21
+ goal: Type.Optional(Type.String({ description: "Free-form description of what this session optimizes." })),
22
+ primary_metric: Type.String({
23
+ description:
24
+ "Primary metric name shown in the dashboard. Match the `METRIC <name>=<value>` lines printed by the benchmark.",
31
25
  }),
32
- from_autoresearch_md: Type.Optional(
33
- Type.Boolean({
34
- description:
35
- "When true, load benchmark command, metrics, scope, off-limits, and constraints from autoresearch.md instead of passing mirrored fields below.",
36
- }),
37
- ),
38
- abandon_unlogged_runs: Type.Optional(
39
- Type.Boolean({
40
- description:
41
- "When true, mark all completed but unlogged run artifacts as abandoned so initialization can proceed without logging them first.",
42
- }),
43
- ),
44
- new_segment: Type.Optional(
45
- Type.Boolean({
46
- description:
47
- "When true, force a new segment even when the contract fields have not changed. Without this, re-initialization with matching contract is a no-op.",
48
- }),
49
- ),
50
- metric_name: Type.Optional(
51
- Type.String({
52
- description: "Primary metric name shown in the dashboard. Required when from_autoresearch_md is false.",
53
- }),
54
- ),
55
26
  metric_unit: Type.Optional(
56
- Type.String({
57
- description: "Unit for the primary metric, for example µs, ms, s, kb, or empty.",
58
- }),
27
+ Type.String({ description: "Unit for the primary metric (e.g. ms, µs, mb). Empty when unitless." }),
59
28
  ),
60
29
  direction: Type.Optional(
61
- StringEnum(["lower", "higher"], {
62
- description: "Whether lower or higher values are better. Defaults to lower.",
63
- }),
30
+ StringEnum(["lower", "higher"], { description: "Whether lower or higher values are better. Defaults to lower." }),
64
31
  ),
65
- benchmark_command: Type.Optional(
66
- Type.String({
67
- description: "Benchmark command recorded in autoresearch.md. Required when from_autoresearch_md is false.",
32
+ secondary_metrics: Type.Optional(
33
+ Type.Array(Type.String(), {
34
+ description: "Names of secondary metrics tracked alongside the primary metric.",
68
35
  }),
69
36
  ),
70
37
  scope_paths: Type.Optional(
71
38
  Type.Array(Type.String(), {
72
- description: "Files in Scope from autoresearch.md. Required when from_autoresearch_md is false.",
73
- minItems: 1,
39
+ description:
40
+ "Files or directories the agent expects to modify. Used post-hoc to flag scope deviations on log_experiment; never used to block edits.",
74
41
  }),
75
42
  ),
76
43
  off_limits: Type.Optional(
77
44
  Type.Array(Type.String(), {
78
- description: "Off Limits paths from autoresearch.md.",
45
+ description:
46
+ "Paths the agent SHOULD NOT modify. Used post-hoc to flag scope deviations on log_experiment; never used to block edits.",
79
47
  }),
80
48
  ),
81
49
  constraints: Type.Optional(
82
- Type.Array(Type.String(), {
83
- description: "Constraints from autoresearch.md.",
50
+ Type.Array(Type.String(), { description: "Free-form constraints (e.g. 'no api break')." }),
51
+ ),
52
+ max_iterations: Type.Optional(Type.Number({ description: "Soft cap on iterations per segment. Optional." })),
53
+ new_segment: Type.Optional(
54
+ Type.Boolean({
55
+ description:
56
+ "When true, bump to a new segment even when an active session exists. New baselines and best-metric reset.",
84
57
  }),
85
58
  ),
86
59
  });
87
60
 
88
61
  interface InitExperimentDetails {
89
62
  state: ExperimentState;
63
+ createdSession: boolean;
64
+ bumpedSegment: boolean;
65
+ abandonedRuns: number;
66
+ harnessCommitted: boolean;
67
+ baselineCommit: string | null;
90
68
  }
91
69
 
92
70
  export function createInitExperimentTool(
@@ -96,253 +74,117 @@ export function createInitExperimentTool(
96
74
  name: "init_experiment",
97
75
  label: "Init Experiment",
98
76
  description:
99
- "Initialize or reset the autoresearch session for the current optimization target before the first logged run of a segment.",
77
+ "Initialize or reconfigure the autoresearch session. On first call (Phase 1 Phase 2 transition), requires `./autoresearch.sh` to exist and pending harness changes are auto-committed on an autoresearch branch. Pass `new_segment: true` to start a fresh baseline within an existing session.",
100
78
  parameters: initExperimentSchema,
101
79
  defaultInactive: true,
102
80
  async execute(_toolCallId, params, _signal, _onUpdate, ctx) {
103
- const workDirError = validateWorkDir(ctx.cwd);
104
- if (workDirError) {
105
- return {
106
- content: [{ type: "text", text: `Error: ${workDirError}` }],
107
- };
108
- }
109
-
81
+ const storage = await openAutoresearchStorage(ctx.cwd);
110
82
  const runtime = options.getRuntime(ctx);
111
- const state = runtime.state;
112
- const isReinitializing = state.results.length > 0;
113
- const workDir = resolveWorkDir(ctx.cwd);
114
- const loggedRunNumbers = collectLoggedRunNumbers(state.results);
115
-
116
- let abandonSummary = "";
117
- if (params.abandon_unlogged_runs === true) {
118
- const abandoned = await abandonUnloggedAutoresearchRuns(workDir, loggedRunNumbers);
119
- if (abandoned > 0) {
120
- abandonSummary =
121
- abandoned === 1
122
- ? "Abandoned 1 unlogged run artifact.\n"
123
- : `Abandoned ${abandoned} unlogged run artifacts.\n`;
124
- }
125
- }
126
-
127
- const pendingRun = await readPendingRunSummary(workDir, loggedRunNumbers);
128
- if (pendingRun) {
129
- const metricInfo = pendingRun.parsedPrimary !== null ? `, metric=${pendingRun.parsedPrimary}` : "";
130
- const passedInfo = pendingRun.passed ? "passed" : "failed";
131
- return {
132
- content: [
133
- {
134
- type: "text",
135
- text:
136
- abandonSummary +
137
- `Error: run #${pendingRun.runNumber} has not been logged yet.\n` +
138
- `Pending: command="${pendingRun.command}"${metricInfo}, ${passedInfo}\n` +
139
- "Call log_experiment before re-initializing, or pass abandon_unlogged_runs=true.",
140
- },
141
- ],
142
- };
143
- }
144
83
 
145
- const contractResult = readAutoresearchContract(workDir);
146
- const scriptSnapshot = loadAutoresearchScriptSnapshot(workDir);
147
- const errors = [...contractResult.errors, ...scriptSnapshot.errors];
148
- if (errors.length > 0) {
149
- return {
150
- content: [{ type: "text", text: `${abandonSummary}Error: ${errors.join(" ")}` }],
151
- };
152
- }
84
+ const direction = params.direction ?? "lower";
85
+ const metricUnit = params.metric_unit ?? "";
86
+ const scopePaths = dedupeStrings((params.scope_paths ?? []).map(normalizePathSpec));
87
+ const offLimits = dedupeStrings((params.off_limits ?? []).map(normalizePathSpec));
88
+ const constraints = dedupeStrings(params.constraints ?? []);
89
+ const secondaryMetrics = dedupeStrings(params.secondary_metrics ?? []);
90
+ const goal = params.goal?.trim() || null;
91
+ const maxIterations =
92
+ params.max_iterations !== undefined && Number.isFinite(params.max_iterations) && params.max_iterations > 0
93
+ ? Math.floor(params.max_iterations)
94
+ : null;
95
+ const branch = (await git.branch.current(ctx.cwd)) ?? null;
96
+ const onAutoresearchBranch = branch?.startsWith("autoresearch/") ?? false;
153
97
 
154
- const benchmarkContract = contractResult.contract.benchmark;
155
- const expectedDirection = benchmarkContract.direction ?? "lower";
156
- const expectedMetricUnit = benchmarkContract.metricUnit;
157
- if (benchmarkContract.command && !isAutoresearchShCommand(benchmarkContract.command)) {
158
- return {
159
- content: [
160
- {
161
- type: "text",
162
- text:
163
- abandonSummary +
164
- "Error: Benchmark.command in autoresearch.md must invoke `autoresearch.sh` directly. " +
165
- "Move the real workload into `autoresearch.sh` and re-run init_experiment.",
166
- },
167
- ],
168
- };
169
- }
98
+ const existing = storage.getActiveSessionForBranch(branch);
99
+ const isNewSegmentInit = existing !== null && params.new_segment === true;
100
+ const requiresHarness = !existing || isNewSegmentInit;
170
101
 
171
- const fromMd = params.from_autoresearch_md === true;
172
- if (!fromMd) {
173
- const metricName = params.metric_name?.trim();
174
- const benchmarkCommand = params.benchmark_command?.trim();
175
- const scopePaths = params.scope_paths;
176
- if (!metricName || !benchmarkCommand || !scopePaths || scopePaths.length === 0) {
177
- return {
178
- content: [
179
- {
180
- type: "text",
181
- text:
182
- abandonSummary +
183
- "Error: when from_autoresearch_md is false or omitted, metric_name, benchmark_command, and scope_paths are required and must match autoresearch.md. " +
184
- "Alternatively pass from_autoresearch_md=true with only name (plus optional flags).",
185
- },
186
- ],
187
- };
188
- }
189
- if (benchmarkContract.command !== benchmarkCommand) {
190
- return {
191
- content: [
192
- {
193
- type: "text",
194
- text:
195
- abandonSummary +
196
- "Error: benchmark_command does not match autoresearch.md. " +
197
- `Expected: ${benchmarkContract.command ?? "(missing)"}\nReceived: ${params.benchmark_command}`,
198
- },
199
- ],
200
- };
201
- }
202
- if (benchmarkContract.primaryMetric !== metricName) {
203
- return {
204
- content: [
205
- {
206
- type: "text",
207
- text:
208
- abandonSummary +
209
- "Error: metric_name does not match autoresearch.md. " +
210
- `Expected: ${benchmarkContract.primaryMetric ?? "(missing)"}\nReceived: ${params.metric_name}`,
211
- },
212
- ],
213
- };
214
- }
215
- if ((params.metric_unit ?? "") !== expectedMetricUnit) {
216
- return {
217
- content: [
218
- {
219
- type: "text",
220
- text:
221
- abandonSummary +
222
- "Error: metric_unit does not match autoresearch.md. " +
223
- `Expected: ${expectedMetricUnit || "(empty)"}\nReceived: ${params.metric_unit ?? "(empty)"}`,
224
- },
225
- ],
226
- };
227
- }
228
- if ((params.direction ?? "lower") !== expectedDirection) {
229
- return {
230
- content: [
231
- {
232
- type: "text",
233
- text:
234
- abandonSummary +
235
- "Error: direction does not match autoresearch.md. " +
236
- `Expected: ${expectedDirection}\nReceived: ${params.direction ?? "lower"}`,
237
- },
238
- ],
239
- };
240
- }
241
- if (!contractPathListsEqual(scopePaths, contractResult.contract.scopePaths)) {
242
- return {
243
- content: [
244
- {
245
- type: "text",
246
- text:
247
- abandonSummary +
248
- "Error: scope_paths do not match autoresearch.md. " +
249
- `Expected: ${contractResult.contract.scopePaths.join(", ")}`,
250
- },
251
- ],
252
- };
253
- }
254
- if (!contractPathListsEqual(params.off_limits ?? [], contractResult.contract.offLimits)) {
255
- return {
256
- content: [
257
- {
258
- type: "text",
259
- text:
260
- abandonSummary +
261
- "Error: off_limits do not match autoresearch.md. " +
262
- `Expected: ${contractResult.contract.offLimits.join(", ") || "(empty)"}`,
263
- },
264
- ],
265
- };
266
- }
267
- if (!contractListsEqual(params.constraints ?? [], contractResult.contract.constraints)) {
102
+ if (requiresHarness) {
103
+ const harnessExists = await Bun.file(path.join(ctx.cwd, HARNESS_FILENAME)).exists();
104
+ if (!harnessExists) {
268
105
  return {
269
106
  content: [
270
107
  {
271
108
  type: "text",
272
- text:
273
- abandonSummary +
274
- "Error: constraints do not match autoresearch.md. " +
275
- `Expected: ${contractResult.contract.constraints.join(", ") || "(empty)"}`,
109
+ text: `Error: ./${HARNESS_FILENAME} does not exist. Phase 1 of autoresearch is harness setup — write \`./${HARNESS_FILENAME}\` so it exits 0 and prints \`METRIC <name>=<value>\`, validate it via \`bash ${HARNESS_FILENAME}\`, then call init_experiment again.`,
276
110
  },
277
111
  ],
278
112
  };
279
113
  }
280
114
  }
281
115
 
282
- // Check if contract matches current state — if so, re-init is a no-op
283
- if (isReinitializing && params.new_segment !== true) {
284
- const contract = contractResult.contract;
285
- const bm = contract.benchmark;
286
- const contractMatches =
287
- (bm.primaryMetric ?? "metric") === state.metricName &&
288
- bm.metricUnit === state.metricUnit &&
289
- (bm.direction ?? "lower") === state.bestDirection &&
290
- (bm.command ?? null) === state.benchmarkCommand &&
291
- contractPathListsEqual(contract.scopePaths, state.scopePaths) &&
292
- contractPathListsEqual(contract.offLimits, state.offLimits) &&
293
- contractListsEqual(contract.constraints, state.constraints);
294
- if (contractMatches) {
295
- runtime.autoresearchMode = true;
296
- runtime.autoResumeArmed = true;
297
- options.dashboard.updateWidget(ctx, runtime);
298
- options.dashboard.requestRender();
299
- return {
300
- content: [
301
- {
302
- type: "text",
303
- text:
304
- abandonSummary +
305
- `Experiment session already initialized with matching contract. Continuing segment ${state.currentSegment}.`,
306
- },
307
- ],
308
- details: { state: cloneExperimentState(state) },
309
- };
116
+ let harnessCommitted = false;
117
+ let commitWarning: string | null = null;
118
+ if (requiresHarness && onAutoresearchBranch) {
119
+ const dirty = await detectPendingChanges(ctx.cwd);
120
+ if (dirty) {
121
+ try {
122
+ await git.stage.files(ctx.cwd, []);
123
+ const message = buildHarnessCommitMessage(goal, params.name);
124
+ await git.commit(ctx.cwd, message);
125
+ harnessCommitted = true;
126
+ } catch (err) {
127
+ commitWarning = `Failed to auto-commit harness changes: ${err instanceof Error ? err.message : String(err)}. Recording baseline at current HEAD; discard may not preserve uncommitted harness files.`;
128
+ }
310
129
  }
311
130
  }
312
131
 
313
- applyAutoresearchContractToExperimentState(contractResult.contract, state);
314
- state.name = params.name;
315
- state.maxExperiments = readMaxExperiments(ctx.cwd);
316
- state.bestMetric = null;
317
- state.confidence = null;
318
- if (isReinitializing) {
319
- state.currentSegment += 1;
320
- }
132
+ const baselineCommit = await tryReadHeadSha(ctx.cwd);
321
133
 
322
- const jsonlPath = path.join(workDir, "autoresearch.jsonl");
323
- const configLine = JSON.stringify({
324
- type: "config",
325
- name: state.name,
326
- metricName: state.metricName,
327
- metricUnit: state.metricUnit,
328
- bestDirection: state.bestDirection,
329
- benchmarkCommand: state.benchmarkCommand,
330
- secondaryMetrics: state.secondaryMetrics.map(metric => metric.name),
331
- scopePaths: state.scopePaths,
332
- offLimits: state.offLimits,
333
- constraints: state.constraints,
334
- });
134
+ let session: SessionRow;
135
+ let createdSession = false;
136
+ let bumpedSegment = false;
137
+ let abandonedRuns = 0;
335
138
 
336
- if (isReinitializing) {
337
- fs.appendFileSync(jsonlPath, `${configLine}\n`);
139
+ if (!existing) {
140
+ session = storage.openSession({
141
+ name: params.name,
142
+ goal,
143
+ primaryMetric: params.primary_metric,
144
+ metricUnit,
145
+ direction,
146
+ preferredCommand: DEFAULT_HARNESS_COMMAND,
147
+ branch,
148
+ baselineCommit,
149
+ maxIterations,
150
+ scopePaths,
151
+ offLimits,
152
+ constraints,
153
+ secondaryMetrics,
154
+ });
155
+ createdSession = true;
338
156
  } else {
339
- fs.writeFileSync(jsonlPath, `${configLine}\n`);
157
+ abandonedRuns = storage.abandonPendingRuns(existing.id);
158
+ const updates: Parameters<typeof storage.updateSession>[1] = {
159
+ goal,
160
+ maxIterations,
161
+ scopePaths,
162
+ offLimits,
163
+ constraints,
164
+ secondaryMetrics,
165
+ primaryMetric: params.primary_metric,
166
+ metricUnit,
167
+ direction,
168
+ branch,
169
+ };
170
+ if (isNewSegmentInit) {
171
+ updates.baselineCommit = baselineCommit;
172
+ }
173
+ let updated = storage.updateSession(existing.id, updates);
174
+ if (isNewSegmentInit) {
175
+ updated = storage.bumpSegment(existing.id);
176
+ bumpedSegment = true;
177
+ }
178
+ session = updated;
340
179
  }
341
180
 
181
+ const loggedRuns = storage.listLoggedRuns(session.id);
182
+ const state = buildExperimentState(session, loggedRuns);
183
+ runtime.state = state;
184
+ runtime.goal = session.goal;
342
185
  runtime.autoresearchMode = true;
343
186
  runtime.autoResumeArmed = true;
344
187
  runtime.lastAutoResumePendingRunNumber = null;
345
- runtime.lastRunChecks = null;
346
188
  runtime.lastRunDuration = null;
347
189
  runtime.lastRunAsi = null;
348
190
  runtime.lastRunArtifactDir = null;
@@ -351,24 +193,65 @@ export function createInitExperimentTool(
351
193
  options.dashboard.updateWidget(ctx, runtime);
352
194
  options.dashboard.requestRender();
353
195
 
354
- const lines = [
355
- abandonSummary.trimEnd(),
356
- `Experiment initialized: ${state.name}`,
357
- `Metric: ${state.metricName} (${state.metricUnit || "unitless"}, ${state.bestDirection} is better)`,
358
- `Benchmark command: ${state.benchmarkCommand}`,
359
- `Working directory: ${workDir}`,
360
- `Files in Scope: ${state.scopePaths.join(", ")}`,
361
- isReinitializing
362
- ? "Previous results remain in history. This starts a new segment and requires a fresh baseline."
363
- : "Now run the baseline experiment and log it.",
364
- ].filter(line => line.length > 0);
365
- if (state.maxExperiments !== null) {
366
- lines.push(`Max iterations: ${state.maxExperiments}`);
196
+ const lines: string[] = [];
197
+ if (abandonedRuns > 0) {
198
+ lines.push(`Abandoned ${abandonedRuns} pending run${abandonedRuns === 1 ? "" : "s"} before reconfiguring.`);
199
+ }
200
+ if (harnessCommitted && session.baselineCommit) {
201
+ lines.push(`Committed harness setup at ${session.baselineCommit.slice(0, 12)}.`);
202
+ }
203
+ if (commitWarning) {
204
+ lines.push(commitWarning);
205
+ }
206
+ if (createdSession) {
207
+ lines.push(`Started session #${session.id}: ${session.name}`);
208
+ } else if (bumpedSegment) {
209
+ lines.push(`Bumped segment to ${session.currentSegment} for session #${session.id}: ${session.name}`);
210
+ } else {
211
+ lines.push(`Updated session #${session.id} (segment ${session.currentSegment}): ${session.name}`);
212
+ }
213
+ lines.push(
214
+ `Metric: ${session.primaryMetric} (${session.metricUnit || "unitless"}, ${session.direction} is better)`,
215
+ );
216
+ lines.push(`Benchmark entrypoint: ${DEFAULT_HARNESS_COMMAND}`);
217
+ if (session.scopePaths.length > 0) {
218
+ lines.push(`Files in scope: ${session.scopePaths.join(", ")}`);
219
+ }
220
+ if (session.offLimits.length > 0) {
221
+ lines.push(`Off limits: ${session.offLimits.join(", ")}`);
222
+ }
223
+ if (session.maxIterations !== null) {
224
+ lines.push(`Max iterations per segment: ${session.maxIterations}`);
225
+ }
226
+ if (session.branch) {
227
+ lines.push(`Active branch: ${session.branch}`);
228
+ }
229
+ if (session.baselineCommit) {
230
+ lines.push(`Baseline commit: ${session.baselineCommit.slice(0, 12)}`);
231
+ }
232
+ if (createdSession) {
233
+ lines.push(
234
+ "Phase 2: iteration loop is active. Run the baseline experiment with `run_experiment` and log it.",
235
+ );
236
+ } else if (bumpedSegment) {
237
+ lines.push("Run a fresh baseline for the new segment.");
238
+ }
239
+ if (requiresHarness && !onAutoresearchBranch) {
240
+ lines.push(
241
+ "Note: not on a dedicated `autoresearch/*` branch — `log_experiment discard` will only revert run-modified files, not reset to baseline.",
242
+ );
367
243
  }
368
244
 
369
245
  return {
370
246
  content: [{ type: "text", text: lines.join("\n") }],
371
- details: { state: cloneExperimentState(state) },
247
+ details: {
248
+ state,
249
+ createdSession,
250
+ bumpedSegment,
251
+ abandonedRuns,
252
+ harnessCommitted,
253
+ baselineCommit: session.baselineCommit,
254
+ },
372
255
  };
373
256
  },
374
257
  renderCall(args, _options, theme): Text {
@@ -384,3 +267,31 @@ export function createInitExperimentTool(
384
267
  function renderInitCall(name: string, theme: Theme): string {
385
268
  return `${theme.fg("toolTitle", theme.bold("init_experiment"))} ${theme.fg("accent", truncateToWidth(replaceTabs(name), 100))}`;
386
269
  }
270
+
271
+ async function tryReadHeadSha(cwd: string): Promise<string | null> {
272
+ try {
273
+ return (await git.head.sha(cwd)) ?? null;
274
+ } catch {
275
+ return null;
276
+ }
277
+ }
278
+
279
+ async function detectPendingChanges(cwd: string): Promise<boolean> {
280
+ try {
281
+ const statusText = await git.status(cwd, { porcelainV1: true, untrackedFiles: "all", z: true });
282
+ const workDirPrefix = await git.show.prefix(cwd).catch(() => "");
283
+ return parseWorkDirDirtyPaths(statusText, workDirPrefix).length > 0;
284
+ } catch {
285
+ return false;
286
+ }
287
+ }
288
+
289
+ function buildHarnessCommitMessage(goal: string | null, name: string): string {
290
+ const lines = [HARNESS_COMMIT_TITLE, "", `Benchmark entrypoint: ${DEFAULT_HARNESS_COMMAND}`];
291
+ if (goal) {
292
+ lines.push(`Goal: ${goal}`);
293
+ } else {
294
+ lines.push(`Session: ${name}`);
295
+ }
296
+ return lines.join("\n");
297
+ }