@oh-my-pi/pi-coding-agent 13.14.0 → 13.15.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/CHANGELOG.md +140 -0
  2. package/package.json +10 -8
  3. package/src/autoresearch/command-initialize.md +34 -0
  4. package/src/autoresearch/command-resume.md +17 -0
  5. package/src/autoresearch/contract.ts +332 -0
  6. package/src/autoresearch/dashboard.ts +447 -0
  7. package/src/autoresearch/git.ts +243 -0
  8. package/src/autoresearch/helpers.ts +458 -0
  9. package/src/autoresearch/index.ts +693 -0
  10. package/src/autoresearch/prompt.md +227 -0
  11. package/src/autoresearch/resume-message.md +16 -0
  12. package/src/autoresearch/state.ts +386 -0
  13. package/src/autoresearch/tools/init-experiment.ts +310 -0
  14. package/src/autoresearch/tools/log-experiment.ts +833 -0
  15. package/src/autoresearch/tools/run-experiment.ts +640 -0
  16. package/src/autoresearch/types.ts +218 -0
  17. package/src/cli/args.ts +8 -2
  18. package/src/cli/initial-message.ts +58 -0
  19. package/src/config/keybindings.ts +417 -212
  20. package/src/config/model-registry.ts +1 -0
  21. package/src/config/model-resolver.ts +57 -9
  22. package/src/config/settings-schema.ts +38 -10
  23. package/src/config/settings.ts +1 -4
  24. package/src/exec/bash-executor.ts +7 -5
  25. package/src/export/html/template.css +43 -13
  26. package/src/export/html/template.generated.ts +1 -1
  27. package/src/export/html/template.html +1 -0
  28. package/src/export/html/template.js +107 -0
  29. package/src/extensibility/extensions/types.ts +31 -8
  30. package/src/internal-urls/docs-index.generated.ts +1 -1
  31. package/src/lsp/index.ts +1 -1
  32. package/src/main.ts +44 -44
  33. package/src/mcp/oauth-discovery.ts +1 -1
  34. package/src/modes/acp/acp-agent.ts +957 -0
  35. package/src/modes/acp/acp-event-mapper.ts +531 -0
  36. package/src/modes/acp/acp-mode.ts +13 -0
  37. package/src/modes/acp/index.ts +2 -0
  38. package/src/modes/components/agent-dashboard.ts +5 -4
  39. package/src/modes/components/bash-execution.ts +40 -11
  40. package/src/modes/components/custom-editor.ts +47 -47
  41. package/src/modes/components/extensions/extension-dashboard.ts +2 -1
  42. package/src/modes/components/history-search.ts +2 -1
  43. package/src/modes/components/hook-editor.ts +2 -1
  44. package/src/modes/components/hook-input.ts +8 -7
  45. package/src/modes/components/hook-selector.ts +15 -10
  46. package/src/modes/components/keybinding-hints.ts +9 -9
  47. package/src/modes/components/login-dialog.ts +3 -3
  48. package/src/modes/components/mcp-add-wizard.ts +2 -1
  49. package/src/modes/components/model-selector.ts +14 -3
  50. package/src/modes/components/oauth-selector.ts +2 -1
  51. package/src/modes/components/python-execution.ts +2 -3
  52. package/src/modes/components/session-selector.ts +2 -1
  53. package/src/modes/components/settings-selector.ts +2 -1
  54. package/src/modes/components/status-line-segment-editor.ts +2 -1
  55. package/src/modes/components/tool-execution.ts +4 -5
  56. package/src/modes/components/tree-selector.ts +3 -2
  57. package/src/modes/components/user-message-selector.ts +3 -8
  58. package/src/modes/components/user-message.ts +16 -0
  59. package/src/modes/controllers/command-controller.ts +0 -2
  60. package/src/modes/controllers/extension-ui-controller.ts +89 -4
  61. package/src/modes/controllers/input-controller.ts +29 -23
  62. package/src/modes/controllers/mcp-command-controller.ts +1 -1
  63. package/src/modes/index.ts +1 -0
  64. package/src/modes/interactive-mode.ts +17 -5
  65. package/src/modes/print-mode.ts +1 -1
  66. package/src/modes/prompt-action-autocomplete.ts +7 -7
  67. package/src/modes/rpc/rpc-mode.ts +7 -2
  68. package/src/modes/rpc/rpc-types.ts +1 -0
  69. package/src/modes/theme/theme.ts +53 -44
  70. package/src/modes/types.ts +9 -2
  71. package/src/modes/utils/hotkeys-markdown.ts +19 -19
  72. package/src/modes/utils/keybinding-matchers.ts +21 -0
  73. package/src/modes/utils/ui-helpers.ts +1 -1
  74. package/src/patch/hashline.ts +139 -127
  75. package/src/patch/index.ts +77 -59
  76. package/src/patch/shared.ts +19 -11
  77. package/src/prompts/tools/hashline.md +43 -116
  78. package/src/sdk.ts +34 -17
  79. package/src/session/agent-session.ts +123 -30
  80. package/src/session/session-manager.ts +32 -31
  81. package/src/session/streaming-output.ts +87 -37
  82. package/src/tools/ask.ts +56 -30
  83. package/src/tools/bash-interactive.ts +2 -6
  84. package/src/tools/bash-interceptor.ts +1 -39
  85. package/src/tools/bash-skill-urls.ts +1 -1
  86. package/src/tools/browser.ts +1 -1
  87. package/src/tools/gemini-image.ts +1 -1
  88. package/src/tools/python.ts +2 -2
  89. package/src/tools/resolve.ts +1 -1
  90. package/src/utils/child-process.ts +88 -0
@@ -0,0 +1,833 @@
1
+ import * as fs from "node:fs";
2
+ import * as path from "node:path";
3
+ import { StringEnum } from "@oh-my-pi/pi-ai";
4
+ import { Text } from "@oh-my-pi/pi-tui";
5
+ import { logger } from "@oh-my-pi/pi-utils";
6
+ import { Type } from "@sinclair/typebox";
7
+ import type { ToolDefinition } from "../../extensibility/extensions";
8
+ import type { Theme } from "../../modes/theme/theme";
9
+ import { replaceTabs, truncateToWidth } from "../../tools/render-utils";
10
+ import { getAutoresearchFingerprintMismatchError, pathMatchesContractPath } from "../contract";
11
+ import { getCurrentAutoresearchBranch, parseWorkDirDirtyPaths } from "../git";
12
+ import {
13
+ AUTORESEARCH_COMMITTABLE_FILES,
14
+ formatNum,
15
+ inferMetricUnitFromName,
16
+ isAutoresearchCommittableFile,
17
+ isAutoresearchLocalStatePath,
18
+ isBetter,
19
+ mergeAsi,
20
+ readPendingRunSummary,
21
+ resolveWorkDir,
22
+ validateWorkDir,
23
+ } from "../helpers";
24
+ import {
25
+ cloneExperimentState,
26
+ computeConfidence,
27
+ currentResults,
28
+ findBaselineMetric,
29
+ findBaselineSecondary,
30
+ findBestKeptMetric,
31
+ } from "../state";
32
+ import type {
33
+ ASIData,
34
+ AutoresearchToolFactoryOptions,
35
+ ExperimentResult,
36
+ ExperimentState,
37
+ LogDetails,
38
+ NumericMetricMap,
39
+ } from "../types";
40
+
41
/**
 * Names of the experiment tools. When the experiment budget for the current
 * segment is used up, every tool listed here is filtered out of the active
 * tool set. Declared read-only because the list is only ever queried.
 */
const EXPERIMENT_TOOL_NAMES: readonly string[] = ["init_experiment", "run_experiment", "log_experiment"];

/** Parameter schema accepted by the `log_experiment` tool. */
const logExperimentSchema = Type.Object({
	commit: Type.String({
		description: "Current git commit hash or placeholder.",
	}),
	metric: Type.Number({
		description: "Primary metric value for this run.",
	}),
	status: StringEnum(["keep", "discard", "crash", "checks_failed"], {
		description: "Outcome for this run.",
	}),
	description: Type.String({
		description: "Short description of the experiment.",
	}),
	metrics: Type.Optional(
		Type.Record(Type.String(), Type.Number(), {
			description: "Secondary metrics for this run.",
		}),
	),
	force: Type.Optional(
		Type.Boolean({
			description: "Allow introducing new secondary metrics.",
		}),
	),
	asi: Type.Optional(
		Type.Record(Type.String(), Type.Unknown(), {
			description: "Actionable side information captured for this run.",
		}),
	),
});

/**
 * In-memory snapshot of one file: its raw bytes plus the absolute path it was
 * read from, so it can be written back after git cleanup.
 */
interface PreservedFile {
	content: Buffer;
	path: string;
}

/**
 * Outcome of the git keep/revert step. `error` is set when the step failed;
 * otherwise `note` may carry a short human-readable summary.
 */
interface KeepCommitResult {
	error?: string;
	note?: string;
}
82
+
83
/**
 * Creates the `log_experiment` tool definition.
 *
 * `execute` validates the pending run against the logged parameters, applies the
 * git keep (commit) or revert step, appends the result to `autoresearch.jsonl`,
 * updates the run's `run.json` on a best-effort basis, clears the per-run runtime
 * fields, and refreshes the dashboard. Validation failures are reported as a
 * single `Error: …` text part; a failure to append the JSONL entry restores the
 * previous state and rethrows.
 *
 * @param options - Factory options providing the runtime accessor, the `pi` API and the dashboard.
 * @returns The tool definition, inactive by default.
 */
export function createLogExperimentTool(
	options: AutoresearchToolFactoryOptions,
): ToolDefinition<typeof logExperimentSchema, LogDetails> {
	return {
		name: "log_experiment",
		label: "Log Experiment",
		description:
			"Log the experiment result, update dashboard state, persist JSONL history, and apply git keep or revert behavior.",
		parameters: logExperimentSchema,
		defaultInactive: true,
		async execute(_toolCallId, params, _signal, _onUpdate, ctx) {
			const workDirError = validateWorkDir(ctx.cwd);
			if (workDirError) return logExperimentError(workDirError);

			const runtime = options.getRuntime(ctx);
			const state = runtime.state;
			const workDir = resolveWorkDir(ctx.cwd);
			const fingerprintError = getAutoresearchFingerprintMismatchError(state.segmentFingerprint, workDir);
			if (fingerprintError) return logExperimentError(fingerprintError);

			// Prefer the in-memory summary; otherwise look for an on-disk run that has not been logged yet.
			const pendingRun =
				runtime.lastRunSummary ?? (await readPendingRunSummary(workDir, collectLoggedRunNumbers(state.results)));
			if (!pendingRun) {
				return logExperimentError("no unlogged run is available. Run run_experiment first.");
			}
			runtime.lastRunSummary = pendingRun;
			runtime.lastRunAsi = pendingRun.parsedAsi;
			runtime.lastRunChecks =
				pendingRun.checksPass === null
					? null
					: {
							pass: pendingRun.checksPass,
							output: "",
							duration: pendingRun.checksDurationSeconds ?? 0,
						};
			runtime.lastRunDuration = pendingRun.durationSeconds;

			if (pendingRun.parsedPrimary !== null && params.metric !== pendingRun.parsedPrimary) {
				return logExperimentError(
					"metric does not match the parsed primary metric from the pending run.\n" +
						`Expected: ${pendingRun.parsedPrimary}\nReceived: ${params.metric}`,
				);
			}

			if (params.status === "keep" && !pendingRun.passed) {
				return logExperimentError(
					"cannot keep this run because the pending benchmark did not pass. Log it as crash or checks_failed instead.",
				);
			}

			if (params.status === "keep" && runtime.lastRunChecks && !runtime.lastRunChecks.pass) {
				return logExperimentError(
					"cannot keep this run because autoresearch.checks.sh failed. Log it as checks_failed instead.",
				);
			}

			const observedStatusError = validateObservedStatus(params.status, pendingRun);
			if (observedStatusError) return logExperimentError(observedStatusError);

			const secondaryMetrics = buildSecondaryMetrics(params.metrics, pendingRun.parsedMetrics, state.metricName);
			const validationError = validateSecondaryMetrics(state, secondaryMetrics, params.force ?? false);
			if (validationError) return logExperimentError(validationError);

			const mergedAsi = mergeAsi(runtime.lastRunAsi, sanitizeAsi(params.asi));
			const asiValidationError = validateAsiRequirements(mergedAsi, params.status);
			if (asiValidationError) return logExperimentError(asiValidationError);

			let keepScopeValidation: { committablePaths: string[] } | undefined;
			if (params.status === "keep") {
				const scopeValidation = await validateKeepPaths(options, workDir, state);
				if (typeof scopeValidation === "string") return logExperimentError(scopeValidation);

				// A kept run must tie or beat the best kept metric of the current segment.
				const currentBestMetric = findBestKeptMetric(state.results, state.currentSegment, state.bestDirection);
				if (
					currentBestMetric !== null &&
					params.metric !== currentBestMetric &&
					!isBetter(params.metric, currentBestMetric, state.bestDirection)
				) {
					return logExperimentError(
						"cannot keep this run because the primary metric regressed.\n" +
							`Current best: ${currentBestMetric}\nReceived: ${params.metric}`,
					);
				}
				keepScopeValidation = scopeValidation;
			}

			// Resolved once so the JSONL entry and run.json always record the same run number.
			const runNumber = runtime.lastRunNumber ?? pendingRun.runNumber;
			const experiment: ExperimentResult = {
				runNumber,
				commit: params.commit.slice(0, 7),
				metric: params.metric,
				metrics: secondaryMetrics,
				status: params.status,
				description: params.description,
				timestamp: Date.now(),
				segment: state.currentSegment,
				confidence: null,
				asi: mergedAsi,
			};

			const activeBranch = await getCurrentAutoresearchBranch(options.pi, workDir);
			if (!activeBranch) {
				return logExperimentError(
					"autoresearch keep/discard actions require an active `autoresearch/...` branch. " +
						"Run `/autoresearch` again to restore the protected branch before logging this run.",
				);
			}

			// Git side effect: commit in-scope changes for "keep", revert the worktree for every other status.
			const gitResult =
				params.status === "keep"
					? await commitKeptExperiment(options, workDir, state, experiment, keepScopeValidation)
					: await revertFailedExperiment(options, workDir);
			if (gitResult.error) return logExperimentError(gitResult.error);
			const gitNote: string | null = gitResult.note ?? null;

			// Snapshot before mutating so a failed JSONL append can be rolled back.
			const previousState = cloneExperimentState(state);
			state.results.push(experiment);
			registerSecondaryMetrics(state, secondaryMetrics);
			state.bestMetric = findBaselineMetric(state.results, state.currentSegment);
			state.confidence = computeConfidence(state.results, state.currentSegment, state.bestDirection);
			experiment.confidence = state.confidence;

			const wallClockSeconds = runtime.lastRunDuration;
			try {
				persistRun(workDir, experiment);
			} catch (error) {
				runtime.state = previousState;
				options.dashboard.updateWidget(ctx, runtime);
				options.dashboard.requestRender();
				throw error;
			}

			// run.json is supplementary: a failure here is logged, not surfaced to the caller.
			const runDirectory = runtime.lastRunArtifactDir ?? pendingRun.runDirectory;
			try {
				await updateRunMetadata(runDirectory, {
					commit: experiment.commit,
					confidence: experiment.confidence,
					description: experiment.description,
					gitNote,
					loggedAt: new Date(experiment.timestamp).toISOString(),
					loggedAsi: experiment.asi,
					loggedMetric: experiment.metric,
					loggedMetrics: experiment.metrics,
					runNumber,
					status: experiment.status,
					wallClockSeconds,
				});
			} catch (error) {
				logger.warn("Failed to update autoresearch run metadata after persisting JSONL history", {
					error: error instanceof Error ? error.message : String(error),
					runDirectory,
					runNumber,
				});
			}

			// The run is logged: clear all per-run runtime fields.
			runtime.runningExperiment = null;
			runtime.lastRunChecks = null;
			runtime.lastRunDuration = null;
			runtime.lastRunAsi = null;
			runtime.lastRunArtifactDir = null;
			runtime.lastRunNumber = null;
			runtime.lastRunSummary = null;
			runtime.autoResumeArmed = true;
			runtime.lastAutoResumePendingRunNumber = null;

			const currentSegmentRuns = currentResults(state.results, state.currentSegment).length;
			const text = buildLogText(state, experiment, currentSegmentRuns, wallClockSeconds, gitNote);
			if (state.maxExperiments !== null && currentSegmentRuns >= state.maxExperiments) {
				// Budget exhausted: switch autoresearch mode off and deactivate the experiment tools.
				runtime.autoresearchMode = false;
				options.pi.appendEntry(
					"autoresearch-control",
					runtime.goal ? { mode: "off", goal: runtime.goal } : { mode: "off" },
				);
				await options.pi.setActiveTools(
					options.pi.getActiveTools().filter(name => !EXPERIMENT_TOOL_NAMES.includes(name)),
				);
			}
			options.dashboard.updateWidget(ctx, runtime);
			options.dashboard.requestRender();

			return {
				content: [{ type: "text", text }],
				details: {
					experiment: {
						...experiment,
						metrics: { ...experiment.metrics },
						asi: experiment.asi ? structuredClone(experiment.asi) : undefined,
					},
					state: cloneExperimentState(state),
					wallClockSeconds,
				},
			};
		},
		renderCall(args, _options, theme): Text {
			const color = args.status === "keep" ? "success" : args.status === "discard" ? "warning" : "error";
			const description = truncateToWidth(replaceTabs(args.description), 100);
			return new Text(
				`${theme.fg("toolTitle", theme.bold("log_experiment"))} ${theme.fg(color, args.status)} ${theme.fg("muted", description)}`,
				0,
				0,
			);
		},
		renderResult(result, _options, theme): Text {
			const details = result.details;
			if (!details) {
				// Error results carry no details; show their text part as-is.
				return new Text(replaceTabs(result.content.find(part => part.type === "text")?.text ?? ""), 0, 0);
			}
			const summary = renderSummary(details, theme);
			return new Text(summary, 0, 0);
		},
	};
}

/** Builds a details-free tool result holding one text part of the form `Error: <message>`. */
function logExperimentError(message: string): { content: { type: "text"; text: string }[] } {
	return { content: [{ type: "text", text: `Error: ${message}` }] };
}
357
+
358
/** Returns a shallow copy of `value`, or a fresh empty map when no map is supplied. */
function cloneMetrics(value: NumericMetricMap | undefined): NumericMetricMap {
	if (!value) {
		return {};
	}
	return { ...value };
}
361
+
362
/**
 * Merges the metrics parsed from the pending run with the caller-supplied overrides.
 *
 * Parsed metrics are applied first (the primary metric is left out), then the
 * overrides, which win on name clashes. The keys `__proto__`, `constructor` and
 * `prototype` are never copied from either source.
 */
function buildSecondaryMetrics(
	overrides: NumericMetricMap | undefined,
	parsedMetrics: NumericMetricMap | null,
	primaryMetricName: string,
): NumericMetricMap {
	const isReservedKey = (key: string): boolean =>
		key === "__proto__" || key === "constructor" || key === "prototype";

	const merged: NumericMetricMap = {};
	const parsedEntries = Object.entries(parsedMetrics ?? {});
	for (const [name, value] of parsedEntries) {
		if (!isReservedKey(name) && name !== primaryMetricName) {
			merged[name] = value;
		}
	}
	const overrideEntries = Object.entries(cloneMetrics(overrides));
	for (const [name, value] of overrideEntries) {
		if (!isReservedKey(name)) {
			merged[name] = value;
		}
	}
	return merged;
}
379
+
380
/**
 * Sanitizes the top-level ASI record supplied by the caller.
 *
 * Reserved keys (`__proto__`, `constructor`, `prototype`) and entries whose
 * value sanitizes to `undefined` are dropped. Returns `undefined` when the
 * input is missing or nothing survives.
 */
function sanitizeAsi(value: { [key: string]: unknown } | undefined): ASIData | undefined {
	if (!value) {
		return undefined;
	}
	const cleaned: ASIData = {};
	for (const key of Object.keys(value)) {
		const reserved = key === "__proto__" || key === "constructor" || key === "prototype";
		if (reserved) continue;
		const cleanedValue = sanitizeAsiValue(value[key]);
		if (cleanedValue === undefined) continue;
		cleaned[key] = cleanedValue;
	}
	if (Object.keys(cleaned).length === 0) {
		return undefined;
	}
	return cleaned;
}
392
+
393
/**
 * Recursively reduces an arbitrary value to the subset kept in ASI data.
 *
 * - `null`, strings, numbers and booleans are returned unchanged.
 * - Arrays are sanitized element-wise; elements that sanitize to `undefined`
 *   are removed, `null` elements are kept.
 * - Other objects are rebuilt from their own enumerable entries, skipping the
 *   keys `__proto__`, `constructor` and `prototype` and any entry that
 *   sanitizes to `undefined`.
 * - Everything else (e.g. `undefined`, functions, symbols, bigints) yields `undefined`.
 *
 * @param value - The raw value to sanitize.
 * @returns The sanitized value, or `undefined` when it cannot be represented.
 */
function sanitizeAsiValue(value: unknown): ASIData[string] | undefined {
	if (value === null) return null;
	if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") return value;
	if (Array.isArray(value)) {
		// Only `undefined` is filtered out, so the predicate must keep `null` in the element type.
		return value
			.map(item => sanitizeAsiValue(item))
			.filter((item): item is Exclude<typeof item, undefined> => item !== undefined);
	}
	if (typeof value === "object") {
		const result: ASIData = {};
		for (const [key, entryValue] of Object.entries(value as { [key: string]: unknown })) {
			// Skip keys that could reach the prototype chain when assigned.
			if (key === "__proto__" || key === "constructor" || key === "prototype") continue;
			const sanitized = sanitizeAsiValue(entryValue);
			if (sanitized !== undefined) {
				result[key] = sanitized;
			}
		}
		return result;
	}
	return undefined;
}
416
+
417
/**
 * Checks that the ASI data carries the fields required for the given status.
 *
 * A non-blank `hypothesis` is always required. Every status other than `keep`
 * additionally requires non-blank `rollback_reason` and `next_action_hint`.
 *
 * @returns An error message for the first violated requirement, or `null` when all are met.
 */
export function validateAsiRequirements(asi: ASIData | undefined, status: ExperimentResult["status"]): string | null {
	const hasText = (field: unknown): boolean => typeof field === "string" && field.trim().length > 0;

	if (!asi) {
		return "asi is required. Include at minimum a non-empty hypothesis.";
	}
	if (!hasText(asi.hypothesis)) {
		return "asi.hypothesis is required and must be a non-empty string.";
	}
	if (status !== "keep") {
		if (!hasText(asi.rollback_reason)) {
			return "asi.rollback_reason is required for discard, crash, and checks_failed results.";
		}
		if (!hasText(asi.next_action_hint)) {
			return "asi.next_action_hint is required for discard, crash, and checks_failed results.";
		}
	}
	return null;
}
433
+
434
/**
 * Compares the provided secondary metrics with the ones already registered on the state.
 *
 * With no registered metrics anything is accepted. Otherwise every registered
 * metric must be present, and metrics that are not registered yet are only
 * accepted when `force` is true.
 *
 * @returns An error message, or `null` when the metrics are acceptable.
 */
function validateSecondaryMetrics(state: ExperimentState, metrics: NumericMetricMap, force: boolean): string | null {
	const registered = new Set<string>();
	for (const metric of state.secondaryMetrics) {
		registered.add(metric.name);
	}
	if (state.secondaryMetrics.length === 0) {
		return null;
	}

	const supplied = Object.keys(metrics);
	const suppliedLookup = new Set(supplied);

	const absent: string[] = [];
	for (const name of registered) {
		if (!suppliedLookup.has(name)) absent.push(name);
	}
	if (absent.length > 0) {
		return `missing secondary metrics: ${absent.join(", ")}`;
	}

	if (force) {
		return null;
	}
	const unregistered = supplied.filter(name => !registered.has(name));
	return unregistered.length > 0 ? `new secondary metrics require force=true: ${unregistered.join(", ")}` : null;
}
450
+
451
/**
 * Appends an entry to `state.secondaryMetrics` for every metric name that is
 * not registered yet; the unit of a new entry is inferred from its name.
 */
function registerSecondaryMetrics(state: ExperimentState, metrics: NumericMetricMap): void {
	const alreadyRegistered = new Set(state.secondaryMetrics.map(metric => metric.name));
	for (const name of Object.keys(metrics)) {
		if (alreadyRegistered.has(name)) continue;
		alreadyRegistered.add(name);
		state.secondaryMetrics.push({ name, unit: inferMetricUnitFromName(name) });
	}
}
460
+
461
/**
 * Appends the experiment as one JSON line to `autoresearch.jsonl` inside `workDir`.
 * The line leads with a `run` field that mirrors `runNumber`, followed by the experiment's own fields.
 */
function persistRun(workDir: string, experiment: ExperimentResult): void {
	const record = { run: experiment.runNumber, ...experiment };
	const line = `${JSON.stringify(record)}\n`;
	const target = path.join(workDir, "autoresearch.jsonl");
	fs.appendFileSync(target, line);
}
469
+
470
/** Collects the distinct run numbers of all results, leaving out results whose run number is `null`. */
function collectLoggedRunNumbers(results: ExperimentResult[]): Set<number> {
	return results.reduce((seen, entry) => {
		if (entry.runNumber !== null) {
			seen.add(entry.runNumber);
		}
		return seen;
	}, new Set<number>());
}
479
+
480
/**
 * Verifies that the requested status agrees with what the pending run observed.
 *
 * Failed checks only allow `checks_failed`; otherwise a failed benchmark only
 * allows `crash`; a passed benchmark allows `keep` or `discard`.
 *
 * @returns An error message naming the expected status, or `null` when the status is consistent.
 */
function validateObservedStatus(
	status: ExperimentResult["status"],
	pendingRun: { checksPass: boolean | null; passed: boolean },
): string | null {
	let permitted: readonly string[];
	let complaint: string;
	if (pendingRun.checksPass === false) {
		permitted = ["checks_failed"];
		complaint = "benchmark checks failed for the pending run. Log it as checks_failed.";
	} else if (!pendingRun.passed) {
		permitted = ["crash"];
		complaint = "the pending benchmark failed. Log it as crash.";
	} else {
		permitted = ["keep", "discard"];
		complaint = "the pending benchmark passed. Log it as keep or discard.";
	}
	return permitted.includes(status) ? null : complaint;
}
494
+
495
/**
 * Stages and commits the validated paths of a kept experiment.
 *
 * The commit message is the experiment description followed by a `Result:` line
 * holding a JSON payload with the status, the primary metric and all secondary
 * metrics. After a successful commit `experiment.commit` is replaced with the
 * short hash of the new `HEAD` when one can be read.
 *
 * @param options - Factory options; `options.pi.exec` runs the git commands.
 * @param workDir - Directory the git commands run in.
 * @param state - Experiment state; supplies the primary metric name.
 * @param experiment - The experiment being kept; its `commit` field may be updated.
 * @param scopeValidation - Paths approved for committing, if any.
 * @returns `{ note }` on success (or when nothing needed committing), `{ error }` when a git step failed.
 */
async function commitKeptExperiment(
	options: AutoresearchToolFactoryOptions,
	workDir: string,
	state: ExperimentState,
	experiment: ExperimentResult,
	scopeValidation: { committablePaths: string[] } | undefined,
): Promise<KeepCommitResult> {
	if (!scopeValidation || scopeValidation.committablePaths.length === 0) {
		return { note: "nothing to commit" };
	}
	const committablePaths = scopeValidation.committablePaths;

	const addResult = await options.pi.exec("git", ["add", "--all", "--", ...committablePaths], {
		cwd: workDir,
		timeout: 10_000,
	});
	if (addResult.code !== 0) {
		return {
			error: `git add failed: ${mergeStdoutStderr(addResult).trim() || `exit ${addResult.code}`}`,
		};
	}

	const diffResult = await options.pi.exec("git", ["diff", "--cached", "--quiet", "--", ...committablePaths], {
		cwd: workDir,
		timeout: 10_000,
	});
	if (diffResult.code === 0) {
		return { note: "nothing to commit" };
	}
	// `git diff --quiet` exits 1 when there are differences; any other status is a
	// failure of the command itself and must not be mistaken for "staged changes exist".
	if (diffResult.code !== 1) {
		return {
			error: `git diff failed: ${mergeStdoutStderr(diffResult).trim() || `exit ${diffResult.code}`}`,
		};
	}

	const payload: { [key: string]: string | number } = {
		status: experiment.status,
		[state.metricName]: experiment.metric,
	};
	for (const [name, value] of Object.entries(experiment.metrics)) {
		payload[name] = value;
	}
	const commitMessage = `${experiment.description}\n\nResult: ${JSON.stringify(payload)}`;
	const commitResult = await options.pi.exec("git", ["commit", "-m", commitMessage, "--", ...committablePaths], {
		cwd: workDir,
		timeout: 10_000,
	});
	if (commitResult.code !== 0) {
		return {
			error: `git commit failed: ${mergeStdoutStderr(commitResult).trim() || `exit ${commitResult.code}`}`,
		};
	}

	const revParseResult = await options.pi.exec("git", ["rev-parse", "--short=7", "HEAD"], {
		cwd: workDir,
		timeout: 5_000,
	});
	const newCommit = revParseResult.stdout.trim();
	// Keep the caller-provided hash when rev-parse yields nothing usable.
	if (newCommit.length >= 7) {
		experiment.commit = newCommit;
	}
	// The first non-blank line of git's commit output serves as the note.
	const summaryLine =
		mergeStdoutStderr(commitResult)
			.split("\n")
			.find(line => line.trim().length > 0) ?? "committed";
	return { note: summaryLine.trim() };
}
564
+
565
/**
 * Reverts the worktree after a run that is not kept.
 *
 * Autoresearch files are snapshotted first, then `git restore` and two
 * `git clean` passes (untracked, then ignored files) are run, and the snapshot
 * is written back. The write-back sits in a `finally` block so the snapshot is
 * restored even when one of the git invocations throws. Afterwards the
 * worktree must be clean apart from autoresearch local state.
 *
 * @param options - Factory options; `options.pi.exec` runs the git commands.
 * @param workDir - Directory the git commands run in.
 * @returns `{ note: "reverted changes" }` on success, `{ error }` when a git step failed or paths stayed dirty.
 */
async function revertFailedExperiment(
	options: AutoresearchToolFactoryOptions,
	workDir: string,
): Promise<KeepCommitResult> {
	const preservedFiles = preserveAutoresearchFiles(workDir);
	let cleanup: Awaited<ReturnType<typeof runGitCleanup>>;
	try {
		cleanup = await runGitCleanup(options, workDir);
	} finally {
		restoreAutoresearchFiles(preservedFiles);
	}
	const { restoreResult, cleanResult, cleanIgnoredResult } = cleanup;

	if (restoreResult.code !== 0) {
		return {
			error: `git restore failed: ${mergeStdoutStderr(restoreResult).trim() || `exit ${restoreResult.code}`}`,
		};
	}
	if (cleanResult.code !== 0) {
		return {
			error: `git clean failed: ${mergeStdoutStderr(cleanResult).trim() || `exit ${cleanResult.code}`}`,
		};
	}
	if (cleanIgnoredResult.code !== 0) {
		return {
			error: `git clean -X failed: ${mergeStdoutStderr(cleanIgnoredResult).trim() || `exit ${cleanIgnoredResult.code}`}`,
		};
	}

	const dirtyCheckResult = await options.pi.exec(
		"git",
		["status", "--porcelain=v1", "-z", "--untracked-files=all", "--", "."],
		{ cwd: workDir, timeout: 10_000 },
	);
	if (dirtyCheckResult.code !== 0) {
		return {
			error: `git status failed after cleanup: ${mergeStdoutStderr(dirtyCheckResult).trim() || `exit ${dirtyCheckResult.code}`}`,
		};
	}
	const workDirPrefix = await readGitWorkDirPrefix(options, workDir);
	// Autoresearch local state is expected to remain dirty; anything else is a failed cleanup.
	const remainingDirtyPaths = parseWorkDirDirtyPaths(dirtyCheckResult.stdout, workDirPrefix).filter(
		relativePath => !isAutoresearchLocalStatePath(relativePath),
	);
	if (remainingDirtyPaths.length > 0) {
		return {
			error:
				"Autoresearch cleanup left the worktree dirty. Resolve these paths before continuing: " +
				remainingDirtyPaths.join(", "),
		};
	}
	return { note: "reverted changes" };
}

/** Runs the restore and both clean passes in order and returns their results without interpreting them. */
async function runGitCleanup(options: AutoresearchToolFactoryOptions, workDir: string) {
	const restoreResult = await options.pi.exec(
		"git",
		["restore", "--source=HEAD", "--staged", "--worktree", "--", "."],
		{ cwd: workDir, timeout: 10_000 },
	);
	const cleanResult = await options.pi.exec("git", ["clean", "-fd", "--", "."], { cwd: workDir, timeout: 10_000 });
	const cleanIgnoredResult = await options.pi.exec("git", ["clean", "-fdX", "--", "."], {
		cwd: workDir,
		timeout: 10_000,
	});
	return { restoreResult, cleanResult, cleanIgnoredResult };
}
619
+
620
/**
 * Snapshots the autoresearch files that exist in `workDir`: each committable
 * autoresearch file, `autoresearch.jsonl`, and everything under `.autoresearch`.
 *
 * @returns The captured files with their absolute paths and contents.
 */
function preserveAutoresearchFiles(workDir: string): PreservedFile[] {
	const snapshots: PreservedFile[] = [];
	const trackedNames = [...AUTORESEARCH_COMMITTABLE_FILES, "autoresearch.jsonl"];
	for (const name of trackedNames) {
		const target = path.join(workDir, name);
		if (fs.existsSync(target)) {
			snapshots.push({ content: fs.readFileSync(target), path: target });
		}
	}

	const stateDirectory = path.join(workDir, ".autoresearch");
	if (fs.existsSync(stateDirectory)) {
		collectDirectoryFiles(stateDirectory, snapshots);
	}
	return snapshots;
}
636
+
637
/** Writes every snapshotted file back to its path, creating missing parent directories first. */
function restoreAutoresearchFiles(files: PreservedFile[]): void {
	files.forEach(({ path: target, content }) => {
		const parent = path.dirname(target);
		fs.mkdirSync(parent, { recursive: true });
		fs.writeFileSync(target, content);
	});
}
643
+
644
/** Joins a command's stdout and stderr into one string, stdout first, with no separator. */
function mergeStdoutStderr(result: { stderr: string; stdout: string }): string {
	const { stdout, stderr } = result;
	return "".concat(stdout, stderr);
}
647
+
648
/**
 * Determines which dirty paths may be committed for a kept run.
 *
 * Autoresearch local-state paths are ignored and committable autoresearch files
 * are always accepted. Any other dirty path must not match an Off Limits entry
 * and must match a Files in Scope entry.
 *
 * @returns The committable paths, or an error message describing the first violation.
 */
async function validateKeepPaths(
	options: AutoresearchToolFactoryOptions,
	workDir: string,
	state: ExperimentState,
): Promise<{ committablePaths: string[] } | string> {
	if (state.scopePaths.length === 0) {
		return "Files in Scope is empty for the current segment. Re-run init_experiment after fixing autoresearch.md.";
	}

	const statusArgs = ["status", "--porcelain=v1", "-z", "--untracked-files=all", "--", "."];
	const statusResult = await options.pi.exec("git", statusArgs, { cwd: workDir, timeout: 10_000 });
	if (statusResult.code !== 0) {
		const output = mergeStdoutStderr(statusResult).trim();
		return `git status failed: ${output || `exit ${statusResult.code}`}`;
	}

	const workDirPrefix = await readGitWorkDirPrefix(options, workDir);
	const dirtyPaths = parseWorkDirDirtyPaths(statusResult.stdout, workDirPrefix);
	const committablePaths: string[] = [];
	for (const dirtyPath of dirtyPaths) {
		if (isAutoresearchLocalStatePath(dirtyPath)) continue;
		if (!isAutoresearchCommittableFile(dirtyPath)) {
			const isOffLimits = state.offLimits.some(spec => pathMatchesContractPath(dirtyPath, spec));
			if (isOffLimits) {
				return `cannot keep this run because ${dirtyPath} is listed under Off Limits in autoresearch.md`;
			}
			const isInScope = state.scopePaths.some(spec => pathMatchesContractPath(dirtyPath, spec));
			if (!isInScope) {
				return `cannot keep this run because ${dirtyPath} is outside Files in Scope`;
			}
		}
		committablePaths.push(dirtyPath);
	}

	return { committablePaths };
}
690
+
691
/**
 * Walks `directory` recursively and appends a snapshot (absolute path plus
 * contents) of every non-directory entry to `files`.
 */
function collectDirectoryFiles(directory: string, files: PreservedFile[]): void {
	const entries = fs.readdirSync(directory, { withFileTypes: true });
	entries.forEach(entry => {
		const entryPath = path.join(directory, entry.name);
		if (entry.isDirectory()) {
			collectDirectoryFiles(entryPath, files);
		} else {
			files.push({ content: fs.readFileSync(entryPath), path: entryPath });
		}
	});
}
704
+
705
/**
 * Merges the logging outcome into `run.json` inside the run directory.
 *
 * Existing top-level fields of `run.json` are kept unless they are overwritten
 * by one of the logged fields. A missing or unparsable file, or a file whose
 * JSON content is not a plain object, is treated as empty. Does nothing when
 * `runDirectory` is `null` or empty.
 *
 * @param runDirectory - Directory containing `run.json`, or `null` when the run has none.
 * @param metadata - Values recorded for the logged run.
 */
async function updateRunMetadata(
	runDirectory: string | null,
	metadata: {
		commit: string;
		confidence: number | null;
		description: string;
		gitNote: string | null;
		loggedAt: string;
		loggedAsi: ASIData | undefined;
		loggedMetric: number;
		loggedMetrics: NumericMetricMap;
		runNumber: number | null;
		status: ExperimentResult["status"];
		wallClockSeconds: number | null;
	},
): Promise<void> {
	if (!runDirectory) return;
	const runJsonPath = path.join(runDirectory, "run.json");
	let existing: Record<string, unknown> = {};
	try {
		const parsed: unknown = await Bun.file(runJsonPath).json();
		// Only a plain JSON object can be merged; spreading an array or a string
		// would inject index keys into the rewritten file.
		if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
			existing = parsed as Record<string, unknown>;
		}
	} catch {
		// A missing or malformed run.json is not an error: start from an empty record.
		existing = {};
	}
	await Bun.write(
		runJsonPath,
		JSON.stringify(
			{
				...existing,
				loggedRunNumber: metadata.runNumber,
				loggedAt: metadata.loggedAt,
				loggedAsi: metadata.loggedAsi,
				loggedMetric: metadata.loggedMetric,
				loggedMetrics: metadata.loggedMetrics,
				status: metadata.status,
				description: metadata.description,
				commit: metadata.commit,
				gitNote: metadata.gitNote,
				confidence: metadata.confidence,
				wallClockSeconds: metadata.wallClockSeconds,
			},
			null,
			2,
		),
	);
}
751
+
752
/**
 * Builds the multi-line plain-text report for a freshly logged experiment.
 *
 * Lines are emitted in this order, each only when its data is present:
 * header, wall clock, baseline, this run's primary metric (with a percentage
 * delta against `state.bestMetric` when one can be computed), secondary
 * metrics, ASI entries, confidence, git note, and progress towards
 * `state.maxExperiments`.
 *
 * @param state Experiment state read for baseline, units, confidence and limits.
 * @param experiment The result that was just logged.
 * @param currentSegmentRuns Number of runs in the current segment; deltas are
 *   only shown when this is greater than 1.
 * @param wallClockSeconds Run duration in seconds, or `null` to omit the line.
 * @param gitNote Text for the `Git:` line, or `null`/empty to omit it.
 * @returns The report lines joined with `\n`.
 */
function buildLogText(
	state: ExperimentState,
	experiment: ExperimentResult,
	currentSegmentRuns: number,
	wallClockSeconds: number | null,
	gitNote: string | null,
): string {
	const displayRunNumber = experiment.runNumber ?? state.results.length;
	const lines = [`Logged run #${displayRunNumber}: ${experiment.status} - ${experiment.description}`];
	if (wallClockSeconds !== null) {
		lines.push(`Wall clock: ${wallClockSeconds.toFixed(1)}s`);
	}

	const baselineMetric = state.bestMetric;
	if (baselineMetric !== null) {
		lines.push(`Baseline ${state.metricName}: ${formatNum(baselineMetric, state.metricUnit)}`);
	}

	// A percentage change is undefined against a zero baseline (the division
	// yields +/-Infinity), so the delta is skipped in that case, mirroring
	// the zero-baseline guard used for secondary metrics below.
	if (
		currentSegmentRuns > 1 &&
		baselineMetric !== null &&
		baselineMetric !== 0 &&
		experiment.metric !== baselineMetric
	) {
		const delta = ((experiment.metric - baselineMetric) / baselineMetric) * 100;
		const sign = delta > 0 ? "+" : "";
		lines.push(`This run: ${formatNum(experiment.metric, state.metricUnit)} (${sign}${delta.toFixed(1)}%)`);
	} else {
		lines.push(`This run: ${formatNum(experiment.metric, state.metricUnit)}`);
	}

	if (Object.keys(experiment.metrics).length > 0) {
		const baselineSecondary = findBaselineSecondary(state.results, state.currentSegment, state.secondaryMetrics);
		const parts = Object.entries(experiment.metrics).map(([name, value]) => {
			const unit = state.secondaryMetrics.find(metric => metric.name === name)?.unit ?? "";
			const baseline = baselineSecondary[name];
			// No delta without a usable baseline or on the first run of the segment.
			if (baseline === undefined || baseline === 0 || currentSegmentRuns === 1) {
				return `${name}: ${formatNum(value, unit)}`;
			}
			const delta = ((value - baseline) / baseline) * 100;
			const sign = delta > 0 ? "+" : "";
			return `${name}: ${formatNum(value, unit)} (${sign}${delta.toFixed(1)}%)`;
		});
		lines.push(`Secondary metrics: ${parts.join(" ")}`);
	}

	if (experiment.asi) {
		const asiSummary = Object.entries(experiment.asi)
			.map(([key, value]) => `${key}: ${truncateAsiValue(value)}`)
			.join(" | ");
		lines.push(`ASI: ${asiSummary}`);
	}

	if (state.confidence !== null) {
		// Thresholds: >= 2 is "likely real", >= 1 is "marginal", below that "within noise".
		const status = state.confidence >= 2 ? "likely real" : state.confidence >= 1 ? "marginal" : "within noise";
		lines.push(`Confidence: ${state.confidence.toFixed(1)}x noise floor (${status})`);
	}

	if (gitNote) {
		lines.push(`Git: ${gitNote}`);
	}

	if (state.maxExperiments !== null) {
		lines.push(`Progress: ${currentSegmentRuns}/${state.maxExperiments} runs in current segment`);
		if (currentSegmentRuns >= state.maxExperiments) {
			lines.push(`Maximum experiments reached (${state.maxExperiments}). Autoresearch mode is now off.`);
		}
	}

	return lines.join("\n");
}
809
+
810
/**
 * Runs `git rev-parse --show-prefix` in `workDir` and returns its trimmed
 * output.
 *
 * @param options Tool factory options; only `options.pi.exec` is used.
 * @param workDir Working directory passed as `cwd` to the git command.
 * @returns The trimmed stdout, or an empty string when git exits non-zero.
 */
async function readGitWorkDirPrefix(options: AutoresearchToolFactoryOptions, workDir: string): Promise<string> {
	const execOptions = { cwd: workDir, timeout: 5_000 };
	const outcome = await options.pi.exec("git", ["rev-parse", "--show-prefix"], execOptions);
	return outcome.code === 0 ? outcome.stdout.trim() : "";
}
815
+
816
/**
 * Renders a single ASI value as text capped at 120 characters.
 *
 * Strings are used as-is; every other value is serialised with
 * `JSON.stringify`. `JSON.stringify` returns `undefined` for values it cannot
 * represent (`undefined`, functions, symbols), so those fall back to
 * `String(value)` instead of failing on `.length`.
 *
 * @param value The ASI entry value to render.
 * @returns The text itself when it is at most 120 characters, otherwise its
 *   first 117 characters followed by `...`.
 */
function truncateAsiValue(value: ASIData[string]): string {
	let text: string;
	if (typeof value === "string") {
		text = value;
	} else {
		const serialized: string | undefined = JSON.stringify(value);
		text = serialized ?? String(value);
	}
	return text.length > 120 ? `${text.slice(0, 117)}...` : text;
}
820
+
821
/**
 * Produces the one-line, themed summary for a logged experiment.
 *
 * The line contains, separated by single spaces: the upper-cased status
 * (coloured `success` for `keep`, `warning` for `discard`, `error`
 * otherwise), the description with tabs replaced and truncated to width 100,
 * the primary metric as `name=value`, and, when they are not `null`, the
 * baseline and the confidence multiplier.
 *
 * @param details Logged experiment together with the state it was logged into.
 * @param theme Theme whose `fg` method applies the colours.
 * @returns The assembled summary line.
 */
function renderSummary(details: LogDetails, theme: Theme): string {
	const { experiment, state } = details;

	let statusColor: "success" | "warning" | "error";
	switch (experiment.status) {
		case "keep":
			statusColor = "success";
			break;
		case "discard":
			statusColor = "warning";
			break;
		default:
			statusColor = "error";
	}

	const segments: string[] = [
		theme.fg(statusColor, experiment.status.toUpperCase()),
		theme.fg("muted", truncateToWidth(replaceTabs(experiment.description), 100)),
		theme.fg("accent", `${state.metricName}=${formatNum(experiment.metric, state.metricUnit)}`),
	];
	if (state.bestMetric !== null) {
		segments.push(theme.fg("dim", `baseline ${formatNum(state.bestMetric, state.metricUnit)}`));
	}
	if (state.confidence !== null) {
		segments.push(theme.fg("dim", `conf ${state.confidence.toFixed(1)}x`));
	}
	return segments.join(" ");
}