npm - @oh-my-pi/pi-coding-agent - Versions diffs - 14.5.14 → 14.6.0 - Mend

@oh-my-pi/pi-coding-agent 14.5.14 → 14.6.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (70) hide show

package/CHANGELOG.md +39 -0
package/package.json +7 -7
package/src/autoresearch/command-resume.md +5 -8
package/src/autoresearch/git.ts +41 -51
package/src/autoresearch/helpers.ts +43 -359
package/src/autoresearch/index.ts +281 -273
package/src/autoresearch/prompt-setup.md +43 -0
package/src/autoresearch/prompt.md +52 -193
package/src/autoresearch/resume-message.md +2 -8
package/src/autoresearch/state.ts +59 -166
package/src/autoresearch/storage.ts +687 -0
package/src/autoresearch/tools/init-experiment.ts +201 -290
package/src/autoresearch/tools/log-experiment.ts +304 -517
package/src/autoresearch/tools/run-experiment.ts +117 -296
package/src/autoresearch/tools/update-notes.ts +116 -0
package/src/autoresearch/types.ts +16 -66
package/src/config/settings-schema.ts +1 -1
package/src/config/settings.ts +20 -1
package/src/cursor.ts +1 -1
package/src/edit/index.ts +9 -31
package/src/edit/line-hash.ts +70 -43
package/src/edit/modes/hashline.lark +26 -0
package/src/edit/modes/hashline.ts +898 -1099
package/src/edit/modes/patch.ts +0 -7
package/src/edit/modes/replace.ts +0 -4
package/src/edit/renderer.ts +22 -20
package/src/edit/streaming.ts +8 -28
package/src/eval/eval.lark +24 -30
package/src/eval/js/context-manager.ts +5 -162
package/src/eval/js/prelude.txt +0 -12
package/src/eval/parse.ts +129 -129
package/src/eval/py/prelude.py +1 -219
package/src/export/html/template.generated.ts +1 -1
package/src/export/html/template.js +2 -2
package/src/internal-urls/docs-index.generated.ts +1 -1
package/src/modes/components/session-observer-overlay.ts +5 -2
package/src/modes/components/status-line/segments.ts +1 -1
package/src/modes/components/status-line.ts +3 -5
package/src/modes/components/tree-selector.ts +4 -5
package/src/modes/components/welcome.ts +11 -1
package/src/modes/controllers/command-controller.ts +2 -6
package/src/modes/controllers/event-controller.ts +1 -2
package/src/modes/controllers/extension-ui-controller.ts +3 -15
package/src/modes/controllers/input-controller.ts +0 -1
package/src/modes/controllers/selector-controller.ts +1 -1
package/src/modes/interactive-mode.ts +5 -7
package/src/prompts/system/system-prompt.md +14 -38
package/src/prompts/tools/ast-edit.md +8 -8
package/src/prompts/tools/ast-grep.md +10 -10
package/src/prompts/tools/eval.md +13 -31
package/src/prompts/tools/find.md +2 -1
package/src/prompts/tools/hashline.md +66 -57
package/src/prompts/tools/search.md +2 -2
package/src/session/session-manager.ts +17 -13
package/src/tools/ast-edit.ts +141 -44
package/src/tools/ast-grep.ts +112 -36
package/src/tools/eval.ts +2 -53
package/src/tools/find.ts +16 -15
package/src/tools/path-utils.ts +36 -196
package/src/tools/search.ts +56 -35
package/src/utils/edit-mode.ts +2 -11
package/src/utils/file-display-mode.ts +1 -1
package/src/utils/git.ts +17 -0
package/src/utils/session-color.ts +0 -12
package/src/utils/title-generator.ts +22 -38
package/src/autoresearch/apply-contract-to-state.ts +0 -24
package/src/autoresearch/contract.ts +0 -288
package/src/edit/modes/atom.lark +0 -29
package/src/edit/modes/atom.ts +0 -1773
package/src/prompts/tools/atom.md +0 -150

package/src/autoresearch/tools/log-experiment.ts CHANGED Viewed

@@ -2,36 +2,21 @@ import * as fs from "node:fs";
 import * as path from "node:path";
 import { StringEnum } from "@oh-my-pi/pi-ai";
 import { Text } from "@oh-my-pi/pi-tui";
-import { logger } from "@oh-my-pi/pi-utils";
 import { Type } from "@sinclair/typebox";
 import type { ToolDefinition } from "../../extensibility/extensions";
 import type { Theme } from "../../modes/theme/theme";
 import { replaceTabs, truncateToWidth } from "../../tools/render-utils";
 import * as git from "../../utils/git";
-import { applyAutoresearchContractToExperimentState } from "../apply-contract-to-state";
-import { loadAutoresearchScriptSnapshot, pathMatchesContractPath, readAutoresearchContract } from "../contract";
-import { computeRunModifiedPaths, getCurrentAutoresearchBranch, parseWorkDirDirtyPathsWithStatus } from "../git";
+import { computeRunModifiedPaths, getCurrentAutoresearchBranch, parseWorkDirDirtyPaths } from "../git";
+import { ensureNumericMetricMap, formatNum, mergeAsi, pathMatchesSpec, sanitizeAsi } from "../helpers";
 import {
-	collectLoggedRunNumbers,
-	formatNum,
-	inferMetricUnitFromName,
-	isAutoresearchCommittableFile,
-	isAutoresearchLocalStatePath,
-	isAutoresearchShCommand,
-	isBetter,
-	mergeAsi,
-	readPendingRunSummary,
-	resolveWorkDir,
-	validateWorkDir,
-} from "../helpers";
-import {
-	cloneExperimentState,
+	buildExperimentState,
 	computeConfidence,
 	currentResults,
-	findBaselineMetric,
 	findBaselineSecondary,
 	findBestKeptMetric,
 } from "../state";
+import { openAutoresearchStorageIfExists, type SessionRow } from "../storage";
 import type {
 	ASIData,
 	AutoresearchToolFactoryOptions,
@@ -41,50 +26,50 @@ import type {
 	NumericMetricMap,
 } from "../types";
-const EXPERIMENT_TOOL_NAMES = ["init_experiment", "run_experiment", "log_experiment"];
+const EXPERIMENT_TOOL_NAMES = ["init_experiment", "run_experiment", "log_experiment", "update_notes"];
 const logExperimentSchema = Type.Object({
-	commit: Type.String({
-		description: "Current git commit hash or placeholder.",
-	}),
 	metric: Type.Number({
-		description: "Primary metric value for this run.",
+		description: "Primary metric value for this run. May differ from the parsed value; deviation is recorded.",
 	}),
 	status: StringEnum(["keep", "discard", "crash", "checks_failed"], {
 		description: "Outcome for this run.",
 	}),
-	description: Type.String({
-		description: "Short description of the experiment.",
-	}),
+	description: Type.String({ description: "Short description of the experiment." }),
 	metrics: Type.Optional(
-		Type.Record(Type.String(), Type.Number(), {
-			description: "Secondary metrics for this run.",
-		}),
+		Type.Record(Type.String(), Type.Number(), { description: "Secondary metrics for this run." }),
 	),
-	force: Type.Optional(
-		Type.Boolean({
-			description:
-				"When true: skip ASI field requirements and allow keeping a run whose primary metric regressed versus the best kept run.",
-		}),
+	asi: Type.Optional(
+		Type.Object(
+			{},
+			{
+				additionalProperties: Type.Unknown(),
+				description: "Free-form structured metadata captured for this run (hypothesis, learnings, etc.).",
+			},
+		),
 	),
-	skip_restore: Type.Optional(
-		Type.Boolean({
+	commit: Type.Optional(
+		Type.String({ description: "Override the commit hash recorded for this run. Defaults to the current HEAD." }),
+	),
+	justification: Type.Optional(
+		Type.String({
 			description:
-				"When true and status is discard/crash/checks_failed: skip reverting the working tree to HEAD. Useful when the experiment did not modify tracked files or you want to preserve the current state.",
+				"Required when the run modifies paths outside scope or inside off-limits and you still want it kept. Free-form explanation.",
 		}),
 	),
-	asi: Type.Optional(
-		Type.Record(Type.String(), Type.Unknown(), {
-			description: "Actionable side information captured for this run.",
-		}),
+	flag_runs: Type.Optional(
+		Type.Array(
+			Type.Object({
+				run_id: Type.Number({ description: "Run id (#) of a previously logged run to flag as suspect." }),
+				reason: Type.String({
+					description: "Why this earlier run is suspect (e.g. reward-hacked, broken metric).",
+				}),
+			}),
+			{ description: "Mark earlier runs as flagged. Flagged runs are excluded from baseline and best-metric math." },
+		),
 	),
 });
-interface KeepCommitResult {
-	error?: string;
-	note?: string;
-}
 export function createLogExperimentTool(
 	options: AutoresearchToolFactoryOptions,
 ): ToolDefinition<typeof logExperimentSchema, LogDetails> {
@@ -92,189 +77,111 @@ export function createLogExperimentTool(
 		name: "log_experiment",
 		label: "Log Experiment",
 		description:
-			"Log the experiment result, update dashboard state, persist JSONL history, and apply git keep or revert behavior.",
+			"Log the result of the latest run_experiment. Records the metric, optional ASI metadata, modified paths, and scope deviations. On `keep`, modified files are committed; on `discard`/`crash`/`checks_failed`, the worktree is reverted. Pass `flag_runs` to mark earlier runs as suspect; flagged runs are excluded from baseline and best-metric math.",
 		parameters: logExperimentSchema,
 		defaultInactive: true,
 		async execute(_toolCallId, params, _signal, _onUpdate, ctx) {
-			const workDirError = validateWorkDir(ctx.cwd);
-			if (workDirError) {
-				return {
-					content: [{ type: "text", text: `Error: ${workDirError}` }],
-				};
-			}
-			const runtime = options.getRuntime(ctx);
-			const state = runtime.state;
-			const workDir = resolveWorkDir(ctx.cwd);
-			const contractResult = readAutoresearchContract(workDir);
-			const scriptSnapshot = loadAutoresearchScriptSnapshot(workDir);
-			const contractErrors = [...contractResult.errors, ...scriptSnapshot.errors];
-			if (contractErrors.length > 0) {
-				return {
-					content: [{ type: "text", text: `Error: ${contractErrors.join(" ")}` }],
-				};
-			}
-			const benchmarkForSync = contractResult.contract.benchmark;
-			if (benchmarkForSync.command && !isAutoresearchShCommand(benchmarkForSync.command)) {
+			const storage = await openAutoresearchStorageIfExists(ctx.cwd);
+			const currentBranch = (await git.branch.current(ctx.cwd)) ?? null;
+			const session = storage?.getActiveSessionForBranch(currentBranch) ?? null;
+			if (!storage || !session) {
 				return {
 					content: [
 						{
 							type: "text",
-							text:
-								"Error: Benchmark.command in autoresearch.md must invoke `autoresearch.sh` directly before logging. " +
-								"Fix autoresearch.md or move the workload into autoresearch.sh.",
+							text: "Error: no active autoresearch session for the current branch. Call init_experiment first.",
 						},
 					],
 				};
 			}
-			const pendingRun =
-				runtime.lastRunSummary ?? (await readPendingRunSummary(workDir, collectLoggedRunNumbers(state.results)));
+			const pendingRun = storage.getPendingRun(session.id);
 			if (!pendingRun) {
 				return {
-					content: [{ type: "text", text: "Error: no unlogged run is available. Run run_experiment first." }],
-				};
-			}
-			applyAutoresearchContractToExperimentState(contractResult.contract, state);
-			const logPreamble =
-				"Refreshed session fields from autoresearch.md before logging (benchmark, scope, constraints).\n\n";
-			runtime.lastRunSummary = pendingRun;
-			runtime.lastRunAsi = pendingRun.parsedAsi;
-			runtime.lastRunChecks =
-				pendingRun.checksPass === null
-					? null
-					: {
-							pass: pendingRun.checksPass,
-							output: "",
-							duration: pendingRun.checksDurationSeconds ?? 0,
-						};
-			runtime.lastRunDuration = pendingRun.durationSeconds;
-			if (pendingRun.parsedPrimary !== null && params.metric !== pendingRun.parsedPrimary) {
-				return {
-					content: [
-						{
-							type: "text",
-							text:
-								"Error: metric does not match the parsed primary metric from the pending run.\n" +
-								`Expected: ${pendingRun.parsedPrimary}\nReceived: ${params.metric}`,
-						},
-					],
-				};
-			}
-			if (params.status === "keep" && !pendingRun.passed) {
-				return {
-					content: [
-						{
-							type: "text",
-							text: "Error: cannot keep this run because the pending benchmark did not pass. Log it as crash or checks_failed instead.",
-						},
-					],
-				};
-			}
-			if (params.status === "keep" && runtime.lastRunChecks && !runtime.lastRunChecks.pass) {
-				return {
-					content: [
-						{
-							type: "text",
-							text: "Error: cannot keep this run because autoresearch.checks.sh failed. Log it as checks_failed instead.",
-						},
-					],
-				};
-			}
-			const observedStatusError = validateObservedStatus(params.status, pendingRun);
-			if (observedStatusError) {
-				return {
-					content: [{ type: "text", text: `Error: ${observedStatusError}` }],
+					content: [{ type: "text", text: "Error: no pending run available. Run run_experiment first." }],
 				};
 			}
-			const forceLoose = params.force === true;
-			const secondaryMetrics = buildSecondaryMetrics(params.metrics, pendingRun.parsedMetrics, state.metricName);
+			const runtime = options.getRuntime(ctx);
-			const mergedAsi = mergeAsi(runtime.lastRunAsi, sanitizeAsi(params.asi));
-			if (!forceLoose) {
-				const asiValidationError = validateAsiRequirements(mergedAsi, params.status);
-				if (asiValidationError) {
-					return {
-						content: [{ type: "text", text: `Error: ${asiValidationError}` }],
-					};
-				}
+			const flaggedRuns: LogDetails["flaggedRuns"] = [];
+			for (const flag of params.flag_runs ?? []) {
+				const target = storage.getRunById(flag.run_id);
+				if (!target || target.sessionId !== session.id) continue;
+				storage.flagRun(flag.run_id, flag.reason);
+				flaggedRuns.push({ runId: flag.run_id, reason: flag.reason });
 			}
-			const preRunDirtyPaths = pendingRun.preRunDirtyPaths;
-			let keepScopeValidation: { committablePaths: string[] } | undefined;
-			if (params.status === "keep") {
-				const scopeValidation = await validateKeepPaths(options, workDir, state);
-				if (typeof scopeValidation === "string") {
-					return {
-						content: [{ type: "text", text: `Error: ${scopeValidation}` }],
-					};
-				}
-				const currentBestMetric = findBestKeptMetric(state.results, state.currentSegment, state.bestDirection);
-				if (
-					!forceLoose &&
-					currentBestMetric !== null &&
-					params.metric !== currentBestMetric &&
-					!isBetter(params.metric, currentBestMetric, state.bestDirection)
-				) {
-					return {
-						content: [
-							{
-								type: "text",
-								text:
-									"Error: cannot keep this run because the primary metric regressed.\n" +
-									`Current best: ${currentBestMetric}\nReceived: ${params.metric}`,
-							},
-						],
-					};
-				}
-				keepScopeValidation = scopeValidation;
+			const branchName = await getCurrentAutoresearchBranch(options.pi, ctx.cwd);
+			const onAutoresearchBranch = branchName !== null;
+			let allModified: string[];
+			if (onAutoresearchBranch) {
+				// On a dedicated autoresearch branch every iteration starts from a clean
+				// worktree (init_experiment baseline + previous keep commit / discard reset),
+				// so any currently-dirty path is the agent's iteration change. Off-branch we
+				// can't tell user dirt apart from agent edits, so we keep the (lossy)
+				// preRunDirtyPaths filter.
+				const statusText = await tryGitStatus(ctx.cwd);
+				const workDirPrefix = await tryGitPrefix(ctx.cwd);
+				allModified = parseWorkDirDirtyPaths(statusText, workDirPrefix);
+			} else {
+				const { modifiedTracked, modifiedUntracked } = await detectModifiedPaths(
+					ctx.cwd,
+					pendingRun.preRunDirtyPaths,
+				);
+				allModified = [...modifiedTracked, ...modifiedUntracked];
 			}
+			const scopeDeviations = computeScopeDeviations(allModified, session);
-			const experiment: ExperimentResult = {
-				runNumber: runtime.lastRunNumber ?? pendingRun.runNumber,
-				commit: params.commit.slice(0, 7),
-				metric: params.metric,
-				metrics: secondaryMetrics,
-				status: params.status,
-				description: params.description,
-				timestamp: Date.now(),
-				segment: state.currentSegment,
-				confidence: null,
-				asi: mergedAsi,
-			};
+			const justification = params.justification?.trim() || null;
+			const warnings: string[] = [];
-			const activeBranch = await getCurrentAutoresearchBranch(options.pi, workDir);
-			if (!activeBranch) {
-				return {
-					content: [
-						{
-							type: "text",
-							text:
-								"Error: autoresearch keep/discard actions require an active `autoresearch/...` branch. " +
-								"Run `/autoresearch` again to restore the protected branch before logging this run.",
-						},
-					],
-				};
-			}
+			const headSha = await tryReadHeadSha(ctx.cwd);
+			const explicitCommit = params.commit?.trim();
+			let commitHash = explicitCommit && explicitCommit.length > 0 ? explicitCommit : headSha;
 			let gitNote: string | null = null;
 			if (params.status === "keep") {
-				const commitResult = await commitKeptExperiment(options, workDir, state, experiment, keepScopeValidation);
-				if (commitResult.error) {
-					return {
-						content: [{ type: "text", text: `Error: ${commitResult.error}` }],
-					};
+				if (onAutoresearchBranch && allModified.length > 0) {
+					const commitResult = await commitKeptExperiment(
+						ctx.cwd,
+						params.description,
+						params.status,
+						params.metric,
+						params.metrics ?? {},
+						allModified,
+						session.primaryMetric,
+					);
+					if (commitResult.error) {
+						return {
+							content: [{ type: "text", text: `Error: ${commitResult.error}` }],
+						};
+					}
+					gitNote = commitResult.note ?? null;
+					const newSha = await tryReadHeadSha(ctx.cwd);
+					if (newSha) commitHash = newSha;
+				} else if (!onAutoresearchBranch) {
+					warnings.push(
+						"Auto-commit skipped: not on a dedicated autoresearch branch. Modified files remain in the worktree.",
+					);
+				} else if (allModified.length === 0) {
+					gitNote = "nothing to commit";
+				}
+				if (scopeDeviations.length > 0) {
+					if (justification === null) {
+						warnings.push(
+							`Kept with unjustified scope deviations: ${scopeDeviations.join(", ")}. Pass \`justification\` next time or \`flag_runs\` this entry on a future log_experiment if it was a mistake.`,
+						);
+					} else {
+						warnings.push(`Kept with scope deviations (justified): ${scopeDeviations.join(", ")}`);
+					}
 				}
-				gitNote = commitResult.note ?? null;
-			} else if (!params.skip_restore) {
-				const revertResult = await revertFailedExperiment(options, workDir, preRunDirtyPaths);
+			} else {
+				const revertResult = await revertFailedExperiment(
+					ctx.cwd,
+					pendingRun.preRunDirtyPaths,
+					onAutoresearchBranch,
+				);
 				if (revertResult.error) {
 					return {
 						content: [{ type: "text", text: `Error: ${revertResult.error}` }],
@@ -283,57 +190,78 @@ export function createLogExperimentTool(
 				gitNote = revertResult.note ?? null;
 			}
-			const previousState = cloneExperimentState(state);
-			state.results.push(experiment);
-			registerSecondaryMetrics(state, secondaryMetrics);
-			state.bestMetric = findBaselineMetric(state.results, state.currentSegment);
-			state.confidence = computeConfidence(state.results, state.currentSegment, state.bestDirection);
-			experiment.confidence = state.confidence;
-			const wallClockSeconds = runtime.lastRunDuration;
-			try {
-				persistRun(workDir, experiment);
-			} catch (error) {
-				runtime.state = previousState;
-				options.dashboard.updateWidget(ctx, runtime);
-				options.dashboard.requestRender();
-				throw error;
-			}
-			try {
-				await updateRunMetadata(runtime.lastRunArtifactDir ?? pendingRun.runDirectory, {
-					commit: experiment.commit,
-					confidence: experiment.confidence,
-					description: experiment.description,
-					gitNote,
-					loggedAt: new Date(experiment.timestamp).toISOString(),
-					loggedAsi: experiment.asi,
-					loggedMetric: experiment.metric,
-					loggedMetrics: experiment.metrics,
-					runNumber: runtime.lastRunNumber ?? pendingRun.runNumber,
-					status: experiment.status,
-					wallClockSeconds,
-				});
-			} catch (error) {
-				logger.warn("Failed to update autoresearch run metadata after persisting JSONL history", {
-					error: error instanceof Error ? error.message : String(error),
-					runDirectory: runtime.lastRunArtifactDir ?? pendingRun.runDirectory,
-					runNumber: runtime.lastRunNumber ?? pendingRun.runNumber,
-				});
+			const metric = params.metric;
+			const secondaryMetrics: NumericMetricMap = mergeMetrics(
+				pendingRun.parsedMetrics,
+				params.metrics,
+				session.primaryMetric,
+			);
+			const asi: ASIData | undefined = mergeAsi(pendingRun.parsedAsi, sanitizeAsi(params.asi));
+			if (pendingRun.parsedPrimary !== null && metric !== pendingRun.parsedPrimary) {
+				warnings.push(
+					`Logged metric ${metric} differs from parsed primary ${pendingRun.parsedPrimary}. Both values stored.`,
+				);
 			}
+			const loggedAt = Date.now();
+			const tentativeRun = storage.markRunLogged({
+				runId: pendingRun.id,
+				status: params.status,
+				description: params.description,
+				metric,
+				metrics: secondaryMetrics,
+				asi: asi ?? null,
+				commitHash,
+				confidence: null,
+				modifiedPaths: allModified,
+				scopeDeviations,
+				justification,
+				loggedAt,
+			});
+			// Recompute confidence with this run included
+			const refreshedSession = storage.getSessionById(session.id) ?? session;
+			const loggedRuns = storage.listLoggedRuns(session.id);
+			const stateForConfidence = buildExperimentState(refreshedSession, loggedRuns);
+			const confidence = computeConfidence(
+				stateForConfidence.results,
+				stateForConfidence.currentSegment,
+				stateForConfidence.bestDirection,
+			);
+			storage.updateRunConfidence(tentativeRun.id, confidence);
+			const finalState = buildExperimentState(refreshedSession, storage.listLoggedRuns(session.id));
+			runtime.state = finalState;
 			runtime.runningExperiment = null;
-			runtime.lastRunChecks = null;
+			runtime.lastRunSummary = null;
 			runtime.lastRunDuration = null;
 			runtime.lastRunAsi = null;
 			runtime.lastRunArtifactDir = null;
 			runtime.lastRunNumber = null;
-			runtime.lastRunSummary = null;
 			runtime.autoResumeArmed = true;
 			runtime.lastAutoResumePendingRunNumber = null;
-			const currentSegmentRuns = currentResults(state.results, state.currentSegment).length;
-			const text = logPreamble + buildLogText(state, experiment, currentSegmentRuns, wallClockSeconds, gitNote);
-			if (state.maxExperiments !== null && currentSegmentRuns >= state.maxExperiments) {
+			const experiment: ExperimentResult = {
+				runNumber: tentativeRun.id,
+				commit: (commitHash ?? "").slice(0, 12),
+				metric,
+				metrics: secondaryMetrics,
+				status: params.status,
+				description: params.description,
+				timestamp: loggedAt,
+				segment: pendingRun.segment,
+				confidence,
+				asi,
+				modifiedPaths: allModified,
+				scopeDeviations,
+				justification,
+				flagged: false,
+				flaggedReason: null,
+			};
+			const segmentRunCount = currentResults(finalState.results, finalState.currentSegment).length;
+			if (finalState.maxExperiments !== null && segmentRunCount >= finalState.maxExperiments) {
 				runtime.autoresearchMode = false;
 				options.pi.appendEntry(
 					"autoresearch-control",
@@ -343,19 +271,30 @@ export function createLogExperimentTool(
 					options.pi.getActiveTools().filter(name => !EXPERIMENT_TOOL_NAMES.includes(name)),
 				);
 			}
 			options.dashboard.updateWidget(ctx, runtime);
 			options.dashboard.requestRender();
+			const wallClockSeconds = pendingRun.durationMs !== null ? pendingRun.durationMs / 1000 : null;
+			const text = buildLogText(
+				finalState,
+				experiment,
+				segmentRunCount,
+				wallClockSeconds,
+				gitNote,
+				warnings,
+				flaggedRuns,
+			);
 			return {
 				content: [{ type: "text", text }],
 				details: {
-					experiment: {
-						...experiment,
-						metrics: { ...experiment.metrics },
-						asi: experiment.asi ? structuredClone(experiment.asi) : undefined,
-					},
-					state: cloneExperimentState(state),
+					experiment,
+					state: finalState,
 					wallClockSeconds,
+					scopeDeviations,
+					justification,
+					flaggedRuns,
 				},
 			};
 		},
@@ -373,320 +312,163 @@ export function createLogExperimentTool(
 			if (!details) {
 				return new Text(replaceTabs(result.content.find(part => part.type === "text")?.text ?? ""), 0, 0);
 			}
-			const summary = renderSummary(details, theme);
-			return new Text(summary, 0, 0);
+			return new Text(renderSummary(details, theme), 0, 0);
 		},
 	};
 }
-function cloneMetrics(value: NumericMetricMap | undefined): NumericMetricMap {
-	return value ? { ...value } : {};
-}
-function buildSecondaryMetrics(
-	overrides: NumericMetricMap | undefined,
-	parsedMetrics: NumericMetricMap | null,
-	primaryMetricName: string,
-): NumericMetricMap {
-	const merged: NumericMetricMap = {};
-	for (const [name, value] of Object.entries(parsedMetrics ?? {})) {
-		if (name === "__proto__" || name === "constructor" || name === "prototype") continue;
-		if (name === primaryMetricName) continue;
-		merged[name] = value;
-	}
-	for (const [name, value] of Object.entries(cloneMetrics(overrides))) {
-		if (name === "__proto__" || name === "constructor" || name === "prototype") continue;
-		merged[name] = value;
-	}
-	return merged;
-}
-function sanitizeAsi(value: { [key: string]: unknown } | undefined): ASIData | undefined {
-	if (!value) return undefined;
-	const result: ASIData = {};
-	for (const [key, entryValue] of Object.entries(value)) {
-		if (key === "__proto__" || key === "constructor" || key === "prototype") continue;
-		const sanitized = sanitizeAsiValue(entryValue);
-		if (sanitized !== undefined) {
-			result[key] = sanitized;
-		}
-	}
-	return Object.keys(result).length > 0 ? result : undefined;
-}
-function sanitizeAsiValue(value: unknown): ASIData[string] | undefined {
-	if (value === null) return null;
-	if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") return value;
-	if (Array.isArray(value)) {
-		const items = value
-			.map(item => sanitizeAsiValue(item))
-			.filter((item): item is NonNullable<typeof item> => item !== undefined);
-		return items;
-	}
-	if (typeof value === "object") {
-		const objectValue = value as { [key: string]: unknown };
-		const result: ASIData = {};
-		for (const [key, entryValue] of Object.entries(objectValue)) {
-			if (key === "__proto__" || key === "constructor" || key === "prototype") continue;
-			const sanitized = sanitizeAsiValue(entryValue);
-			if (sanitized !== undefined) {
-				result[key] = sanitized;
-			}
-		}
-		return result;
-	}
-	return undefined;
-}
-export function validateAsiRequirements(asi: ASIData | undefined, status: ExperimentResult["status"]): string | null {
-	if (!asi) {
-		return "asi is required. Include at minimum a non-empty hypothesis.";
-	}
-	if (typeof asi.hypothesis !== "string" || asi.hypothesis.trim().length === 0) {
-		return "asi.hypothesis is required and must be a non-empty string.";
-	}
-	if (status === "keep") return null;
-	if (typeof asi.rollback_reason !== "string" || asi.rollback_reason.trim().length === 0) {
-		return "asi.rollback_reason is required for discard, crash, and checks_failed results.";
-	}
-	if (typeof asi.next_action_hint !== "string" || asi.next_action_hint.trim().length === 0) {
-		return "asi.next_action_hint is required for discard, crash, and checks_failed results.";
-	}
-	return null;
-}
-function registerSecondaryMetrics(state: ExperimentState, metrics: NumericMetricMap): void {
-	for (const name of Object.keys(metrics)) {
-		if (state.secondaryMetrics.some(metric => metric.name === name)) continue;
-		state.secondaryMetrics.push({
-			name,
-			unit: inferMetricUnitFromName(name),
-		});
-	}
-}
-function persistRun(workDir: string, experiment: ExperimentResult): void {
-	const entry = {
-		run: experiment.runNumber,
-		...experiment,
-	};
-	const jsonlPath = path.join(workDir, "autoresearch.jsonl");
-	fs.appendFileSync(jsonlPath, `${JSON.stringify(entry)}\n`);
-}
-function validateObservedStatus(
-	status: ExperimentResult["status"],
-	pendingRun: { checksPass: boolean | null; passed: boolean },
-): string | null {
-	if (pendingRun.checksPass === false) {
-		return status === "checks_failed"
-			? null
-			: "benchmark checks failed for the pending run. Log it as checks_failed.";
-	}
-	if (!pendingRun.passed) {
-		return status === "crash" ? null : "the pending benchmark failed. Log it as crash.";
-	}
-	return status === "keep" || status === "discard" ? null : "the pending benchmark passed. Log it as keep or discard.";
+interface KeepCommitResult {
+	error?: string;
+	note?: string;
 }
 async function commitKeptExperiment(
-	_options: AutoresearchToolFactoryOptions,
-	workDir: string,
-	state: ExperimentState,
-	experiment: ExperimentResult,
-	scopeValidation: { committablePaths: string[] } | undefined,
+	cwd: string,
+	description: string,
+	status: ExperimentResult["status"],
+	metric: number,
+	metrics: NumericMetricMap,
+	files: string[],
+	primaryMetric: string,
 ): Promise<KeepCommitResult> {
-	if (!scopeValidation || scopeValidation.committablePaths.length === 0) {
-		return { note: "nothing to commit" };
-	}
+	if (files.length === 0) return { note: "nothing to commit" };
 	try {
-		await git.stage.files(workDir, scopeValidation.committablePaths);
+		await git.stage.files(cwd, files);
 	} catch (err) {
-		return {
-			error: `git add failed: ${err instanceof Error ? err.message : String(err)}`,
-		};
+		return { error: `git add failed: ${err instanceof Error ? err.message : String(err)}` };
 	}
-	if (!(await git.diff.has(workDir, { cached: true, files: scopeValidation.committablePaths }))) {
+	if (!(await git.diff.has(cwd, { cached: true, files }))) {
 		return { note: "nothing to commit" };
 	}
 	const payload: { [key: string]: string | number } = {
-		status: experiment.status,
-		[state.metricName]: experiment.metric,
+		status,
+		[primaryMetric]: metric,
 	};
-	for (const [name, value] of Object.entries(experiment.metrics)) {
+	for (const [name, value] of Object.entries(metrics)) {
 		payload[name] = value;
 	}
-	const commitMessage = `${experiment.description}\n\nResult: ${JSON.stringify(payload)}`;
-	let commitResultText = "";
+	const commitMessage = `${description}\n\nResult: ${JSON.stringify(payload)}`;
 	try {
-		const commitResult = await git.commit(workDir, commitMessage, {
-			files: scopeValidation.committablePaths,
-		});
-		commitResultText = mergeStdoutStderr(commitResult);
+		const commitResult = await git.commit(cwd, commitMessage, { files });
+		const summary = `${commitResult.stdout}${commitResult.stderr}`.split("\n").find(line => line.trim().length > 0);
+		return { note: summary?.trim() ?? "committed" };
 	} catch (err) {
-		return {
-			error: `git commit failed: ${err instanceof Error ? err.message : String(err)}`,
-		};
-	}
-	const newCommit = (await git.head.short(workDir, 7)) ?? "";
-	if (newCommit.length >= 7) {
-		experiment.commit = newCommit;
+		return { error: `git commit failed: ${err instanceof Error ? err.message : String(err)}` };
 	}
-	const summaryLine = commitResultText.split("\n").find(line => line.trim().length > 0) ?? "committed";
-	return { note: summaryLine.trim() };
 }
 async function revertFailedExperiment(
-	options: AutoresearchToolFactoryOptions,
-	workDir: string,
+	cwd: string,
 	preRunDirtyPaths: string[],
+	onAutoresearchBranch: boolean,
 ): Promise<KeepCommitResult> {
-	let statusText: string;
-	try {
-		statusText = await git.status(workDir, {
-			pathspecs: ["."],
-			porcelainV1: true,
-			untrackedFiles: "all",
-			z: true,
-		});
-	} catch (err) {
-		return {
-			error: `git status failed: ${err instanceof Error ? err.message : String(err)}`,
-		};
+	if (onAutoresearchBranch) {
+		// Discard reverts only the current iteration's uncommitted changes — never
+		// rewinds prior `keep` commits. Reset to HEAD so any kept improvements
+		// already on the branch survive.
+		try {
+			await git.reset(cwd, { hard: true, target: "HEAD" });
+			await git.clean(cwd);
+			return { note: "worktree reset to HEAD" };
+		} catch (err) {
+			return { error: `git reset/clean failed: ${err instanceof Error ? err.message : String(err)}` };
+		}
 	}
-	const workDirPrefix = await readGitWorkDirPrefix(options, workDir);
+	const statusText = await tryGitStatus(cwd);
+	const workDirPrefix = await tryGitPrefix(cwd);
 	const { tracked, untracked } = computeRunModifiedPaths(preRunDirtyPaths, statusText, workDirPrefix);
-	const totalReverted = tracked.length + untracked.length;
-	if (totalReverted === 0) {
-		return { note: "nothing to revert" };
-	}
+	const total = tracked.length + untracked.length;
+	if (total === 0) return { note: "nothing to revert" };
 	if (tracked.length > 0) {
 		try {
-			await git.restore(workDir, { files: tracked, source: "HEAD", staged: true, worktree: true });
+			await git.restore(cwd, { files: tracked, source: "HEAD", staged: true, worktree: true });
 		} catch (err) {
-			return {
-				error: `git restore failed: ${err instanceof Error ? err.message : String(err)}`,
-			};
+			return { error: `git restore failed: ${err instanceof Error ? err.message : String(err)}` };
 		}
 	}
 	for (const filePath of untracked) {
-		const absolutePath = path.join(workDir, filePath);
 		try {
-			fs.rmSync(absolutePath, { force: true, recursive: true });
+			fs.rmSync(path.join(cwd, filePath), { force: true, recursive: true });
 		} catch {
-			// Best-effort removal of untracked files
+			// best effort
 		}
 	}
+	return { note: `reverted ${total} file${total === 1 ? "" : "s"}` };
+}
-	return { note: `reverted ${totalReverted} file${totalReverted === 1 ? "" : "s"}` };
+async function detectModifiedPaths(
+	cwd: string,
+	preRunDirtyPaths: string[],
+): Promise<{ modifiedTracked: string[]; modifiedUntracked: string[] }> {
+	const statusText = await tryGitStatus(cwd);
+	const workDirPrefix = await tryGitPrefix(cwd);
+	const { tracked, untracked } = computeRunModifiedPaths(preRunDirtyPaths, statusText, workDirPrefix);
+	return { modifiedTracked: tracked, modifiedUntracked: untracked };
 }
-function mergeStdoutStderr(result: { stderr: string; stdout: string }): string {
-	return `${result.stdout}${result.stderr}`;
+function computeScopeDeviations(modifiedPaths: string[], session: SessionRow): string[] {
+	const deviations: string[] = [];
+	for (const filePath of modifiedPaths) {
+		if (session.offLimits.some(spec => pathMatchesSpec(filePath, spec))) {
+			deviations.push(filePath);
+			continue;
+		}
+		if (session.scopePaths.length > 0 && !session.scopePaths.some(spec => pathMatchesSpec(filePath, spec))) {
+			deviations.push(filePath);
+		}
+	}
+	return deviations;
 }
-async function validateKeepPaths(
-	options: AutoresearchToolFactoryOptions,
-	workDir: string,
-	state: ExperimentState,
-): Promise<{ committablePaths: string[] } | string> {
-	if (state.scopePaths.length === 0) {
-		return "Files in Scope is empty for the current segment. Re-run init_experiment after fixing autoresearch.md.";
+function mergeMetrics(
+	parsed: NumericMetricMap | null,
+	overrides: NumericMetricMap | undefined,
+	primaryMetricName: string,
+): NumericMetricMap {
+	const merged: NumericMetricMap = {};
+	for (const [name, value] of Object.entries(parsed ?? {})) {
+		if (name === primaryMetricName) continue;
+		merged[name] = value;
 	}
+	for (const [name, value] of Object.entries(ensureNumericMetricMap(overrides))) {
+		merged[name] = value;
+	}
+	return merged;
+}
-	let statusText: string;
+async function tryReadHeadSha(cwd: string): Promise<string | null> {
 	try {
-		statusText = await git.status(workDir, {
-			pathspecs: ["."],
-			porcelainV1: true,
-			untrackedFiles: "all",
-			z: true,
-		});
-	} catch (err) {
-		return `git status failed: ${err instanceof Error ? err.message : String(err)}`;
+		return (await git.head.sha(cwd)) ?? null;
+	} catch {
+		return null;
 	}
+}
-	const workDirPrefix = await readGitWorkDirPrefix(options, workDir);
-	const committablePaths: string[] = [];
-	for (const entry of parseWorkDirDirtyPathsWithStatus(statusText, workDirPrefix)) {
-		if (isAutoresearchLocalStatePath(entry.path)) {
-			continue;
-		}
-		if (isAutoresearchCommittableFile(entry.path)) {
-			committablePaths.push(entry.path);
-			continue;
-		}
-		if (state.offLimits.some(spec => pathMatchesContractPath(entry.path, spec))) {
-			return `cannot keep this run because ${entry.path} is listed under Off Limits in autoresearch.md`;
-		}
-		if (!state.scopePaths.some(spec => pathMatchesContractPath(entry.path, spec))) {
-			return `cannot keep this run because ${entry.path} is outside Files in Scope`;
-		}
-		committablePaths.push(entry.path);
+async function tryGitStatus(cwd: string): Promise<string> {
+	try {
+		return await git.status(cwd, { porcelainV1: true, untrackedFiles: "all", z: true });
+	} catch {
+		return "";
 	}
-	return { committablePaths };
 }
-async function updateRunMetadata(
-	runDirectory: string | null,
-	metadata: {
-		commit: string;
-		confidence: number | null;
-		description: string;
-		gitNote: string | null;
-		loggedAt: string;
-		loggedAsi: ASIData | undefined;
-		loggedMetric: number;
-		loggedMetrics: NumericMetricMap;
-		runNumber: number | null;
-		status: ExperimentResult["status"];
-		wallClockSeconds: number | null;
-	},
-): Promise<void> {
-	if (!runDirectory) return;
-	const runJsonPath = path.join(runDirectory, "run.json");
-	let existing: Record<string, unknown> = {};
+async function tryGitPrefix(cwd: string): Promise<string> {
 	try {
-		existing = (await Bun.file(runJsonPath).json()) as Record<string, unknown>;
+		return await git.show.prefix(cwd);
 	} catch {
-		existing = {};
+		return "";
 	}
-	await Bun.write(
-		runJsonPath,
-		JSON.stringify(
-			{
-				...existing,
-				loggedRunNumber: metadata.runNumber,
-				loggedAt: metadata.loggedAt,
-				loggedAsi: metadata.loggedAsi,
-				loggedMetric: metadata.loggedMetric,
-				loggedMetrics: metadata.loggedMetrics,
-				status: metadata.status,
-				description: metadata.description,
-				commit: metadata.commit,
-				gitNote: metadata.gitNote,
-				confidence: metadata.confidence,
-				wallClockSeconds: metadata.wallClockSeconds,
-			},
-			null,
-			2,
-		),
-	);
 }
 function buildLogText(
 	state: ExperimentState,
 	experiment: ExperimentResult,
-	currentSegmentRuns: number,
+	segmentRunCount: number,
 	wallClockSeconds: number | null,
 	gitNote: string | null,
+	warnings: string[],
+	flaggedRuns: LogDetails["flaggedRuns"],
 ): string {
 	const displayRunNumber = experiment.runNumber ?? state.results.length;
 	const lines = [`Logged run #${displayRunNumber}: ${experiment.status} - ${experiment.description}`];
@@ -696,7 +478,7 @@ function buildLogText(
 	if (state.bestMetric !== null) {
 		lines.push(`Baseline ${state.metricName}: ${formatNum(state.bestMetric, state.metricUnit)}`);
 	}
-	if (currentSegmentRuns > 1 && state.bestMetric !== null && experiment.metric !== state.bestMetric) {
+	if (segmentRunCount > 1 && state.bestMetric !== null && experiment.metric !== state.bestMetric) {
 		const delta = ((experiment.metric - state.bestMetric) / state.bestMetric) * 100;
 		const sign = delta > 0 ? "+" : "";
 		lines.push(`This run: ${formatNum(experiment.metric, state.metricUnit)} (${sign}${delta.toFixed(1)}%)`);
@@ -708,7 +490,7 @@ function buildLogText(
 		const parts = Object.entries(experiment.metrics).map(([name, value]) => {
 			const unit = state.secondaryMetrics.find(metric => metric.name === name)?.unit ?? "";
 			const baseline = baselineSecondary[name];
-			if (baseline === undefined || baseline === 0 || currentSegmentRuns === 1) {
+			if (baseline === undefined || baseline === 0 || segmentRunCount === 1) {
 				return `${name}: ${formatNum(value, unit)}`;
 			}
 			const delta = ((value - baseline) / baseline) * 100;
@@ -717,6 +499,10 @@ function buildLogText(
 		});
 		lines.push(`Secondary metrics: ${parts.join("  ")}`);
 	}
+	const bestKept = findBestKeptMetric(state.results, state.currentSegment, state.bestDirection);
+	if (bestKept !== null && state.bestMetric !== null && bestKept !== state.bestMetric) {
+		lines.push(`Best kept ${state.metricName}: ${formatNum(bestKept, state.metricUnit)}`);
+	}
 	if (experiment.asi) {
 		const asiSummary = Object.entries(experiment.asi)
 			.map(([key, value]) => `${key}: ${truncateAsiValue(value)}`)
@@ -731,21 +517,19 @@ function buildLogText(
 		lines.push(`Git: ${gitNote}`);
 	}
 	if (state.maxExperiments !== null) {
-		lines.push(`Progress: ${currentSegmentRuns}/${state.maxExperiments} runs in current segment`);
-		if (currentSegmentRuns >= state.maxExperiments) {
+		lines.push(`Progress: ${segmentRunCount}/${state.maxExperiments} runs in current segment`);
+		if (segmentRunCount >= state.maxExperiments) {
 			lines.push(`Maximum experiments reached (${state.maxExperiments}). Autoresearch mode is now off.`);
 		}
 	}
-	return lines.join("\n");
-}
-async function readGitWorkDirPrefix(options: AutoresearchToolFactoryOptions, workDir: string): Promise<string> {
-	void options;
-	try {
-		return await git.show.prefix(workDir);
-	} catch {
-		return "";
+	if (flaggedRuns.length > 0) {
+		const formatted = flaggedRuns.map(({ runId, reason }) => `#${runId} (${reason})`).join(", ");
+		lines.push(`Flagged: ${formatted}`);
 	}
+	for (const warning of warnings) {
+		lines.push(`Warning: ${warning}`);
+	}
+	return lines.join("\n");
 }
 function truncateAsiValue(value: ASIData[string]): string {
@@ -764,5 +548,8 @@ function renderSummary(details: LogDetails, theme: Theme): string {
 	if (state.confidence !== null) {
 		summary += ` ${theme.fg("dim", `conf ${state.confidence.toFixed(1)}x`)}`;
 	}
+	if (details.scopeDeviations.length > 0) {
+		summary += ` ${theme.fg("warning", `deviations:${details.scopeDeviations.length}`)}`;
+	}
 	return summary;
 }