npm - @oh-my-pi/pi-coding-agent - Versions diffs - 14.5.14 → 14.6.0 - Mend

@oh-my-pi/pi-coding-agent 14.5.14 → 14.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (70)

package/CHANGELOG.md +39 -0
package/package.json +7 -7
package/src/autoresearch/command-resume.md +5 -8
package/src/autoresearch/git.ts +41 -51
package/src/autoresearch/helpers.ts +43 -359
package/src/autoresearch/index.ts +281 -273
package/src/autoresearch/prompt-setup.md +43 -0
package/src/autoresearch/prompt.md +52 -193
package/src/autoresearch/resume-message.md +2 -8
package/src/autoresearch/state.ts +59 -166
package/src/autoresearch/storage.ts +687 -0
package/src/autoresearch/tools/init-experiment.ts +201 -290
package/src/autoresearch/tools/log-experiment.ts +304 -517
package/src/autoresearch/tools/run-experiment.ts +117 -296
package/src/autoresearch/tools/update-notes.ts +116 -0
package/src/autoresearch/types.ts +16 -66
package/src/config/settings-schema.ts +1 -1
package/src/config/settings.ts +20 -1
package/src/cursor.ts +1 -1
package/src/edit/index.ts +9 -31
package/src/edit/line-hash.ts +70 -43
package/src/edit/modes/hashline.lark +26 -0
package/src/edit/modes/hashline.ts +898 -1099
package/src/edit/modes/patch.ts +0 -7
package/src/edit/modes/replace.ts +0 -4
package/src/edit/renderer.ts +22 -20
package/src/edit/streaming.ts +8 -28
package/src/eval/eval.lark +24 -30
package/src/eval/js/context-manager.ts +5 -162
package/src/eval/js/prelude.txt +0 -12
package/src/eval/parse.ts +129 -129
package/src/eval/py/prelude.py +1 -219
package/src/export/html/template.generated.ts +1 -1
package/src/export/html/template.js +2 -2
package/src/internal-urls/docs-index.generated.ts +1 -1
package/src/modes/components/session-observer-overlay.ts +5 -2
package/src/modes/components/status-line/segments.ts +1 -1
package/src/modes/components/status-line.ts +3 -5
package/src/modes/components/tree-selector.ts +4 -5
package/src/modes/components/welcome.ts +11 -1
package/src/modes/controllers/command-controller.ts +2 -6
package/src/modes/controllers/event-controller.ts +1 -2
package/src/modes/controllers/extension-ui-controller.ts +3 -15
package/src/modes/controllers/input-controller.ts +0 -1
package/src/modes/controllers/selector-controller.ts +1 -1
package/src/modes/interactive-mode.ts +5 -7
package/src/prompts/system/system-prompt.md +14 -38
package/src/prompts/tools/ast-edit.md +8 -8
package/src/prompts/tools/ast-grep.md +10 -10
package/src/prompts/tools/eval.md +13 -31
package/src/prompts/tools/find.md +2 -1
package/src/prompts/tools/hashline.md +66 -57
package/src/prompts/tools/search.md +2 -2
package/src/session/session-manager.ts +17 -13
package/src/tools/ast-edit.ts +141 -44
package/src/tools/ast-grep.ts +112 -36
package/src/tools/eval.ts +2 -53
package/src/tools/find.ts +16 -15
package/src/tools/path-utils.ts +36 -196
package/src/tools/search.ts +56 -35
package/src/utils/edit-mode.ts +2 -11
package/src/utils/file-display-mode.ts +1 -1
package/src/utils/git.ts +17 -0
package/src/utils/session-color.ts +0 -12
package/src/utils/title-generator.ts +22 -38
package/src/autoresearch/apply-contract-to-state.ts +0 -24
package/src/autoresearch/contract.ts +0 -288
package/src/edit/modes/atom.lark +0 -29
package/src/edit/modes/atom.ts +0 -1773
package/src/prompts/tools/atom.md +0 -150

package/src/autoresearch/prompt-setup.md ADDED

@@ -0,0 +1,43 @@
+{{base_system_prompt}}
+## Autoresearch Mode — Phase 1: Harness Setup
+Autoresearch mode is active and there is no session yet. Your job in this turn is to **build the benchmark harness**, not to optimise anything. Optimisation starts only after you call `init_experiment`.
+{{#if has_goal}}
+Primary goal (for context — implement the harness so it can measure this):
+{{goal}}
+{{else}}
+There is no goal recorded yet. Infer what to optimise from the latest user message and design the harness to measure that. Capture the goal when you call `init_experiment`.
+{{/if}}
+Working directory: `{{working_dir}}`
+{{#if has_branch}}Active branch: `{{branch}}`{{/if}}
+{{#if has_baseline_warning}}
+{{baseline_warning}}
+{{/if}}
+### What you must produce
+Write `./autoresearch.sh` at the working directory. It is the canonical benchmark entrypoint and must:
+- exit 0 on success and non-zero on failure;
+- print the primary metric as a single line `METRIC <name>=<value>`;
+- print any secondary metrics as additional `METRIC <name>=<value>` lines;
+- run the same workload deterministically every time (no live network, no time-of-day dependencies, fixed seeds where applicable).
+You **may** edit anything else needed to make `autoresearch.sh` work — benchmark binaries, `Cargo.toml`, `package.json`, helper scripts, fixtures. All those edits are part of the harness baseline and will be committed for you when you call `init_experiment` on an autoresearch branch.
+### Steps
+1. Inspect the target. Read source, identify what to measure, decide on the workload.
+2. Write `autoresearch.sh` plus any supporting files (benchmark binaries, fixtures, etc.).
+3. Validate it: invoke `bash autoresearch.sh` through the regular `bash` tool. Confirm it exits 0 and emits at least one `METRIC` line. Iterate on the harness until it does.
+4. Call `init_experiment` with the goal, primary metric (matching the `METRIC` name), and scope. This snapshots the worktree as the baseline and starts Phase 2 (the iteration loop).
+### Rules
+- Do **not** call `run_experiment`, `log_experiment`, or `update_notes` yet. They will error with "no active autoresearch session" until `init_experiment` runs.
+- Do **not** treat a compile-only check as a benchmark. The harness must actually execute the workload and emit `METRIC`.
+- Do **not** create `autoresearch.md`, `autoresearch.checks.sh`, `autoresearch.program.md`, `autoresearch.ideas.md`, `autoresearch.jsonl`, `.autoresearch/`, or `autoresearch.config.json`. Session state is tracked for you.

package/src/autoresearch/prompt.md CHANGED

@@ -8,29 +8,50 @@ Autoresearch mode is active.
 Primary goal:
 {{goal}}
 {{else}}
-{{#if has_autoresearch_md}}
-Primary goal is documented in `autoresearch.md` for this session.
-{{else}}
-There is no `autoresearch.md` yet. Infer what to optimize from the latest user message and the conversation; after you create `autoresearch.md`, keep it as the durable source of truth for goal and benchmark contract.
-{{/if}}
+There is no goal recorded for this session yet. Infer what to optimize from the latest user message and the conversation; capture the goal in your notes (`update_notes`) once it is clear.
 {{/if}}
-Working directory:
-`{{working_dir}}`
+Session state and run artifacts are managed for you. The benchmark entrypoint is `bash autoresearch.sh` (committed during Phase 1). Do not edit `autoresearch.sh` mid-segment unless you intentionally bump segment via `init_experiment new_segment: true`. Do not create `autoresearch.md` or `.autoresearch/` in this repo.
+Working directory: `{{working_dir}}`
+{{#if has_branch}}Active branch: `{{branch}}`{{/if}}
+{{#if has_baseline_commit}}Baseline commit: `{{baseline_commit}}`{{/if}}
 You are running an autonomous experiment loop. Keep iterating until the user interrupts you or the configured maximum iteration count is reached.
-{{#if has_program}}
-### Local Playbook
+### Available tools
+- `init_experiment` — open or reconfigure the session. Pass `new_segment: true` to start a fresh baseline within the current session.
+- `run_experiment` — run the benchmark (`bash autoresearch.sh`). Output is captured automatically and `METRIC name=value` / `ASI key=value` lines printed by the harness are parsed back to you. The command is fixed; if you need a different workload, edit `autoresearch.sh` and bump segment via `init_experiment new_segment: true`.
+- `log_experiment` — record the result. On `keep`, modified files are committed for you; on `discard`/`crash`/`checks_failed`, the worktree is reverted. Pass `flag_runs` to mark earlier runs as suspect; flagged runs are excluded from baseline and best-metric math.
+- `update_notes` — replace the durable session playbook (`body`) or append to the ideas backlog (`append_idea`). The notes are injected into your system prompt every iteration.
-`autoresearch.program.md` exists at `{{program_path}}`.
+### Operating protocol
+1. Understand the target before touching code: read source, identify the bottleneck, verify prerequisites and benchmark inputs.
+2. Update goal, scope, or constraints via another `init_experiment` call (no segment bump) or `update_notes`. Bump segment when you intentionally change `autoresearch.sh`.
+3. Establish a baseline first.
+4. Iterate: change code, run `run_experiment`, log honestly with `log_experiment`. One coherent experiment per iteration.
+5. Keep the primary metric as the decision maker:
+   - `keep` when it improves;
+   - `discard` when it regresses or stays flat;
+   - `crash` when the run fails;
+   - `checks_failed` when validation fails (you decide what validation means; run it through the regular `bash` tool).
+6. Use ASI freely — it is opaque, just stash useful learnings (`hypothesis`, `rollback_reason`, `next_action_hint`, anything else).
+7. When confidence is low, re-run promising changes before keeping them. `log_experiment` reports a confidence score (multiples of the observed noise floor) on each kept run.
+### Scope, off-limits, and accountability
+- Edits are not blocked. You can change anything.
+- `log_experiment` records the modified paths. Files outside `scope_paths` or inside `off_limits` are recorded as `scope_deviations` on the run.
+- If you keep a run with deviations, pass `justification` explaining why. Without it, the run logs but is flagged in the next iteration's prompt as unjustified.
+- If a previous run looks reward-hacked or otherwise wrong, pass `flag_runs: [{ run_id, reason }]` on the next `log_experiment` to exclude it from baseline and best-metric calculations.
+{{#if has_notes}}
+### Your notes (use `update_notes` to edit)
+{{notes}}
-Use it as a repo-local strategy overlay for this session. `autoresearch.md` remains the source of truth for benchmark, scope, and constraints.
 {{/if}}
 {{#if has_recent_results}}
-### Current Segment Snapshot
+### Current segment snapshot
 - segment: `{{current_segment}}`
 - runs in current segment: `{{current_segment_run_count}}`
 {{#if has_baseline_metric}}
@@ -46,199 +67,37 @@ Recent runs:
 {{#if has_asi_summary}}
   ASI: {{asi_summary}}
 {{/if}}
+{{#if has_deviations}}
+  Modified outside scope: {{deviations}}{{#unless justified}} (no justification){{/unless}}
+{{/if}}
+{{#if flagged}}
+  FLAGGED: {{flagged_reason}}
+{{/if}}
 {{/each}}
 {{/if}}
-{{#if has_pending_run}}
-### Pending Run
+{{#if has_unjustified_runs}}
-An unlogged run artifact exists at `{{pending_run_directory}}`.
+### Unjustified deviations
+{{#each unjustified_runs}}
+- run `#{{run_number}}` modified `{{paths}}` outside scope without justification. Either accept it, justify it on the next log, or `flag_runs` it.
+{{/each}}
+{{/if}}
+{{#if has_pending_run}}
+### Pending run
+An unlogged run is waiting:
 - run: `#{{pending_run_number}}`
 - command: `{{pending_run_command}}`
 {{#if has_pending_run_metric}}
 - parsed `{{metric_name}}`: `{{pending_run_metric_display}}`
 {{/if}}
-- result status: {{#if pending_run_passed}}passed{{else}}failed{{/if}}
-- finish the `log_experiment` step before starting another benchmark
-{{/if}}
-### Available tools
-- `init_experiment` — initialize or reset the experiment session for the current optimization target.
-- `run_experiment` — run a benchmark or experiment command with timing, output capture, structured metric parsing, and optional backpressure checks.
-- `log_experiment` — record the result, update the dashboard, persist JSONL history, auto-commit kept experiments, and revert only run-modified files for discarded or failed experiments (pre-existing uncommitted changes are preserved).
+- result: {{#if pending_run_passed}}passed{{else}}failed{{/if}}
-### Operating protocol
-1. Understand the target before touching code.
-   - Read the relevant source files.
-   - Identify the true bottleneck or quality constraint.
-   - Check existing scripts, benchmark harnesses, and config files.
-   - Verify prerequisites, one-time setup, and benchmark inputs before the first run of a segment.
-2. Keep your notes in `autoresearch.md`.
-   - Record the goal, the benchmark command, the primary metric, important secondary metrics, the files in scope, hard constraints, preflight requirements, and the benchmark comparability invariant.
-   - Update the notes whenever the strategy changes.
-   - Keep durable conclusions in `autoresearch.md`.
-   - Use `autoresearch.ideas.md` for deferred experiment ideas that are promising but not active yet.
-3. Use `autoresearch.sh` as the canonical benchmark entrypoint.
-   - If it does not exist yet, create it.
-   - Make it print structured metric lines in the form `METRIC name=value`.
-   - Use the same workload every run unless you intentionally re-initialize with a new segment.
-   - Keep the measurement harness, evaluator, and fixed benchmark inputs stable unless you intentionally start a new segment and document the change.
-4. Initialize the loop with `init_experiment` before the first logged run of a segment.
-   - Pass `from_autoresearch_md: true` with only `name` to load the benchmark contract from `autoresearch.md` without mirroring every field in the tool call.
-   - Use `abandon_unlogged_runs: true` only when you intentionally discard unlogged run artifacts and need a fresh segment (for example after a bad or obsolete benchmark directory).
-5. Run a baseline first.
-   - Establish the baseline metric before attempting optimizations.
-   - Track secondary metrics only when they matter to correctness, quality, or obvious regressions.
-6. Iterate.
-   - Make one coherent experiment at a time.
-   - Run `run_experiment`.
-   - Interpret the result honestly.
-   - Call `log_experiment` after every run (it refreshes benchmark/scope fields from `autoresearch.md` before logging so keep validation matches the file on disk).
-   - Use `run_experiment` with `force: true` only when you must override the segment benchmark command or skip the direct-`autoresearch.sh` rule.
-   - On `log_experiment`, `force: true` relaxes ASI requirements and allows keeping a primary-metric regression; prefer normal logging when possible.
-7. Keep the primary metric as the decision maker.
-   - `keep` when the primary metric improves.
-   - `discard` when it regresses or stays flat.
-   - `crash` when the run fails.
-   - `checks_failed` when the benchmark passes but backpressure checks fail.
-8. Record ASI on every `log_experiment` call.
-   - At minimum include `hypothesis`.
-   - On `discard`, `crash`, or `checks_failed`, also include `rollback_reason` and `next_action_hint`.
-   - Use ASI to capture what you learned, not just what you changed.
-9. Prefer simpler wins.
-   - Remove dead ends.
-   - Keep equal or near-equal results when they materially simplify the implementation.
-   - Do not keep ugly complexity for tiny gains unless the payoff is clearly worth it.
-   - Do not thrash between unrelated ideas without writing down the conclusion.
-10. When confidence is low, confirm.
-    - The dashboard confidence score compares the best observed improvement against the observed noise floor.
-    - Below `1.0x` usually means the improvement is within noise.
-    - Re-run promising changes when needed before keeping them.
-### Benchmark harness guidance
-Your benchmark script SHOULD:
-- live at `autoresearch.sh`
-- run from `{{working_dir}}`
-- fail with a non-zero exit status on invalid runs
-- print the primary metric as `METRIC {{default_metric_name}}=<number>` or another explicit metric name chosen during initialization
-- print secondary metrics as additional `METRIC name=value` lines
-- avoid extra randomness when possible
-- use repeated samples and median-style summaries for fast benchmarks
-- preserve the comparability invariant for the current segment
-- keep the ground-truth evaluator and fixed benchmark inputs unchanged unless the segment is explicitly re-initialized
-### Notes file template
-Keep `autoresearch.md` concise and current.
-Suggested structure:
-```md
-# Autoresearch
-## Goal
-{{#if has_goal}}
-- {{goal}}
-{{else}}
-{{#if has_autoresearch_md}}
-- document the active target here before the first benchmark
-{{else}}
-- (derive from the user's messages, then record here)
-{{/if}}
+Finish the `log_experiment` step before starting another benchmark.
 {{/if}}
-## Benchmark
- - command:
- - primary metric:
- - metric unit:
- - direction:
- - secondary metrics: memory_mb, rss_mb
-## Files in Scope
-- path:
-## Off Limits
-- path:
-## Constraints
-- rule:
-## Baseline
-- metric:
-- notes:
-## Current best
-- metric:
-- why it won:
-## What's Been Tried
-- experiment:
-- lesson:
-```
 ### Guardrails
 - Do not game the benchmark.
 - Do not overfit to synthetic inputs if the real workload is broader.
 - Preserve correctness.
-- Only modify files that are explicitly in scope for the current session.
-- Do not use the general shell tool for file mutations during autoresearch. Use `write`, `edit`, or `ast_edit` for scoped code changes and `run_experiment` for benchmark execution.
-- If you create `autoresearch.checks.sh`, treat it as a hard gate for `keep`.
 - If the user sends another message while a run is in progress, finish the current run and logging cycle first, then address the new input in the next iteration.
-{{#if has_autoresearch_md}}
-### Resume mode
-`autoresearch.md` already exists at `{{autoresearch_md_path}}`.
-Resume from the existing notes:
-- read `autoresearch.md`
-- inspect recent git history
-- inspect `autoresearch.jsonl`
-- continue from the most promising unfinished direction on the current protected branch
-{{else}}
-### Initial setup
-`autoresearch.md` does not exist yet. You decide the benchmark contract, harness, and scope from the user's messages and the repository—do not ask the user to re-type benchmark commands or metric names in a separate UI prompt.
-Before the first benchmark:
-- Write `autoresearch.md` with goal, benchmark command (must be a **direct** invocation of `autoresearch.sh`, e.g. `bash autoresearch.sh`), primary metric name and unit, direction (`lower` or `higher`), tradeoff metrics if relevant, files in scope, off limits, and constraints.
-- Add a short preflight section: prerequisites, one-time setup, and the comparability invariant that must stay fixed across runs.
-- Mark ground-truth evaluators, fixed datasets, and other measurement-critical files as off limits or hard constraints when they define the benchmark contract.
-- Write or update `autoresearch.program.md` when you learn durable heuristics, failure patterns, or repo-specific strategy for later resume turns.
-- Create `autoresearch.sh` as the canonical benchmark entrypoint; print the primary metric as `METRIC <name>=<number>` and optional secondary metrics as additional `METRIC` lines.
-- Optionally add `autoresearch.checks.sh` if correctness or quality needs a hard gate.
-- Call `init_experiment` with arguments that match `autoresearch.md` exactly (benchmark command, metric, unit, direction, scope paths, off limits, constraints).
-- Run and log the baseline.
-Until `init_experiment` succeeds, only autoresearch control files (`autoresearch.md`, `autoresearch.sh`, `autoresearch.program.md`, `autoresearch.ideas.md`, `autoresearch.checks.sh`) may be edited; after initialization, respect Files in Scope from the contract.
-{{/if}}
-{{#if has_checks}}
-### Backpressure checks
-`autoresearch.checks.sh` exists at `{{checks_path}}` and runs automatically after passing benchmark runs.
-Treat failing checks as a failed experiment:
-- do not `keep` a run when checks fail
-- log it as `checks_failed`
-- diagnose the regression before continuing
-{{/if}}
-{{#if has_ideas}}
-### Ideas backlog
-`autoresearch.ideas.md` exists at `{{ideas_path}}`.
-Use it to keep promising but deferred experiments. `autoresearch.md` should hold durable conclusions; `autoresearch.ideas.md` is the scratch backlog. Prune stale ideas when they are disproven or superseded.
-{{/if}}

package/src/autoresearch/resume-message.md CHANGED

@@ -1,16 +1,10 @@
 Continue the autoresearch loop now.
-@{{autoresearch_md_path}}
-- Read `autoresearch.md` and `autoresearch.jsonl`.
-- Treat `autoresearch.md` as the source of truth for the current direction, scope, and constraints.
+- Re-read your notes and the recent-runs context above before deciding the next direction.
 - Inspect recent git history for context.
 {{#if has_pending_run}}
-- Inspect the latest unlogged `run.json` under `.autoresearch/runs/` and finish the pending `log_experiment` step before starting a new benchmark.
+- A previous benchmark run completed but was never logged. Finish `log_experiment` before starting a new run.
 {{/if}}
 - Continue from the most promising unfinished direction.
-{{#if has_ideas}}
-- Review `autoresearch.ideas.md` for deferred next steps and prune stale items.
-{{/if}}
 - Keep iterating until interrupted or until the configured iteration cap is reached.
 - Preserve correctness and do not game the benchmark.

package/src/autoresearch/state.ts CHANGED

@@ -1,12 +1,8 @@
-import * as fs from "node:fs";
-import * as path from "node:path";
 import type { SessionEntry } from "../session/session-manager";
-import { normalizeAutoresearchList, normalizeContractPathSpec } from "./contract";
 import { inferMetricUnitFromName, isBetter } from "./helpers";
+import type { RunRow, SessionRow } from "./storage";
 import type {
 	AutoresearchControlEntryData,
-	AutoresearchJsonConfigEntry,
-	AutoresearchJsonRunEntry,
 	AutoresearchRuntime,
 	ExperimentResult,
 	ExperimentState,
@@ -14,7 +10,6 @@ import type {
 	MetricDirection,
 	NumericMetricMap,
 	ReconstructedControlState,
-	ReconstructedExperimentData,
 	RuntimeStore,
 } from "./types";
@@ -27,13 +22,17 @@ export function createExperimentState(): ExperimentState {
 		metricUnit: "",
 		secondaryMetrics: [],
 		name: null,
+		goal: null,
 		currentSegment: 0,
 		maxExperiments: null,
 		confidence: null,
-		benchmarkCommand: null,
 		scopePaths: [],
 		offLimits: [],
 		constraints: [],
+		notes: "",
+		branch: null,
+		baselineCommit: null,
+		sessionId: null,
 	};
 }
@@ -43,7 +42,6 @@ export function createSessionRuntime(): AutoresearchRuntime {
 		autoResumeArmed: false,
 		dashboardExpanded: false,
 		lastAutoResumePendingRunNumber: null,
-		lastRunChecks: null,
 		lastRunDuration: null,
 		lastRunAsi: null,
 		lastRunArtifactDir: null,
@@ -58,11 +56,7 @@ export function createSessionRuntime(): AutoresearchRuntime {
 export function cloneExperimentState(state: ExperimentState): ExperimentState {
 	return {
 		...state,
-		results: state.results.map(result => ({
-			...result,
-			metrics: { ...result.metrics },
-			asi: result.asi ? structuredClone(result.asi) : undefined,
-		})),
+		results: state.results.map(cloneResult),
 		secondaryMetrics: state.secondaryMetrics.map(metric => ({ ...metric })),
 		scopePaths: [...state.scopePaths],
 		offLimits: [...state.offLimits],
@@ -70,12 +64,22 @@ export function cloneExperimentState(state: ExperimentState): ExperimentState {
 	};
 }
+function cloneResult(result: ExperimentResult): ExperimentResult {
+	return {
+		...result,
+		metrics: { ...result.metrics },
+		asi: result.asi ? structuredClone(result.asi) : undefined,
+		modifiedPaths: [...result.modifiedPaths],
+		scopeDeviations: [...result.scopeDeviations],
+	};
+}
 export function currentResults(results: ExperimentResult[], segment: number): ExperimentResult[] {
 	return results.filter(result => result.segment === segment);
 }
 export function findBaselineResult(results: ExperimentResult[], segment: number): ExperimentResult | null {
-	return currentResults(results, segment).find(result => result.status === "keep") ?? null;
+	return currentResults(results, segment).find(result => result.status === "keep" && !result.flagged) ?? null;
 }
 export function findBaselineMetric(results: ExperimentResult[], segment: number): number | null {
@@ -90,7 +94,7 @@ export function findBestKeptMetric(
 ): number | null {
 	let best: number | null = null;
 	for (const result of currentResults(results, segment)) {
-		if (result.status !== "keep") continue;
+		if (result.status !== "keep" || result.flagged) continue;
 		if (best === null || isBetter(result.metric, best, direction)) {
 			best = result.metric;
 		}
@@ -116,6 +120,7 @@ export function findBaselineSecondary(
 	for (const metric of knownMetrics) {
 		if (values[metric.name] !== undefined) continue;
 		for (const result of currentResults(results, segment)) {
+			if (result.flagged) continue;
 			const value = result.metrics[metric.name];
 			if (value !== undefined) {
 				values[metric.name] = value;
@@ -141,7 +146,7 @@ export function computeConfidence(
 	segment: number,
 	direction: MetricDirection,
 ): number | null {
-	const current = currentResults(results, segment).filter(result => result.metric > 0);
+	const current = currentResults(results, segment).filter(result => !result.flagged && result.metric > 0);
 	if (current.length < 3) return null;
 	const values = current.map(result => result.metric);
@@ -164,70 +169,52 @@ export function computeConfidence(
 	return Math.abs(bestKept - baseline) / mad;
 }
-export function reconstructStateFromJsonl(workDir: string): ReconstructedExperimentData {
+export function buildExperimentState(session: SessionRow, loggedRuns: RunRow[]): ExperimentState {
 	const state = createExperimentState();
-	const jsonlPath = path.join(workDir, "autoresearch.jsonl");
-	if (!fs.existsSync(jsonlPath)) {
-		return { hasLog: false, state };
-	}
-	const content = fs.readFileSync(jsonlPath, "utf8");
-	const lines = content
-		.split("\n")
-		.map(line => line.trim())
-		.filter(line => line.length > 0);
-	let segment = 0;
-	let sawConfig = false;
-	for (const line of lines) {
-		let parsed: unknown;
-		try {
-			parsed = JSON.parse(line) as unknown;
-		} catch {
-			continue;
-		}
-		const configEntry = parseConfigEntry(parsed);
-		if (configEntry) {
-			if (sawConfig || state.results.length > 0) {
-				segment += 1;
-			}
-			sawConfig = true;
-			state.currentSegment = segment;
-			if (configEntry.name) state.name = configEntry.name;
-			if (configEntry.metricName) state.metricName = configEntry.metricName;
-			if (configEntry.metricUnit !== undefined) state.metricUnit = configEntry.metricUnit;
-			if (configEntry.bestDirection) state.bestDirection = configEntry.bestDirection;
-			if (configEntry.benchmarkCommand !== undefined) state.benchmarkCommand = configEntry.benchmarkCommand;
-			state.scopePaths = cloneStringArray(configEntry.scopePaths);
-			state.offLimits = cloneStringArray(configEntry.offLimits);
-			state.constraints = cloneStringArray(configEntry.constraints);
-			state.secondaryMetrics = hydrateMetricDefs(configEntry.secondaryMetrics);
-			continue;
-		}
-		if (!isRunEntry(parsed)) continue;
+	state.name = session.name;
+	state.goal = session.goal;
+	state.metricName = session.primaryMetric;
+	state.metricUnit = session.metricUnit;
+	state.bestDirection = session.direction;
+	state.scopePaths = [...session.scopePaths];
+	state.offLimits = [...session.offLimits];
+	state.constraints = [...session.constraints];
+	state.notes = session.notes;
+	state.branch = session.branch;
+	state.baselineCommit = session.baselineCommit;
+	state.sessionId = session.id;
+	state.maxExperiments = session.maxIterations;
+	state.currentSegment = session.currentSegment;
+	state.secondaryMetrics = session.secondaryMetrics.map(name => ({ name, unit: inferMetricUnitFromName(name) }));
+	for (const run of loggedRuns) {
+		if (run.status === null) continue;
 		const result: ExperimentResult = {
-			runNumber: typeof parsed.run === "number" && Number.isFinite(parsed.run) ? parsed.run : null,
-			commit: typeof parsed.commit === "string" ? parsed.commit : "",
-			metric: typeof parsed.metric === "number" && Number.isFinite(parsed.metric) ? parsed.metric : 0,
-			metrics: cloneNumericMetrics(parsed.metrics),
-			status: isExperimentStatus(parsed.status) ? parsed.status : "keep",
-			description: typeof parsed.description === "string" ? parsed.description : "",
-			timestamp: typeof parsed.timestamp === "number" && Number.isFinite(parsed.timestamp) ? parsed.timestamp : 0,
-			segment,
-			confidence:
-				typeof parsed.confidence === "number" && Number.isFinite(parsed.confidence) ? parsed.confidence : null,
-			asi: cloneAsi(parsed.asi),
+			runNumber: run.id,
+			commit: run.commitHash ?? "",
+			metric: run.metric ?? 0,
+			metrics: run.metrics ?? {},
+			status: run.status,
+			description: run.description ?? "",
+			timestamp: run.loggedAt ?? run.startedAt,
+			segment: run.segment,
+			confidence: run.confidence,
+			asi: run.asi ?? undefined,
+			modifiedPaths: run.modifiedPaths ?? [],
+			scopeDeviations: run.scopeDeviations ?? [],
+			justification: run.justification,
+			flagged: run.flagged,
+			flaggedReason: run.flaggedReason,
 		};
 		state.results.push(result);
-		if (segment !== state.currentSegment) continue;
-		registerSecondaryMetrics(state.secondaryMetrics, result.metrics);
+		if (run.segment === state.currentSegment) {
+			registerSecondaryMetrics(state.secondaryMetrics, result.metrics);
+		}
 	}
 	state.bestMetric = findBaselineMetric(state.results, state.currentSegment);
 	state.confidence = computeConfidence(state.results, state.currentSegment, state.bestDirection);
-	return { hasLog: true, state };
+	return state;
 }
 export function reconstructControlState(entries: SessionEntry[]): ReconstructedControlState {
@@ -274,100 +261,6 @@ function registerSecondaryMetrics(metrics: MetricDef[], values: NumericMetricMap
 	}
 }
-function isConfigEntry(value: unknown): value is AutoresearchJsonConfigEntry {
-	if (typeof value !== "object" || value === null) return false;
-	const candidate = value as { type?: unknown };
-	return candidate.type === "config";
-}
-function parseConfigEntry(value: unknown): AutoresearchJsonConfigEntry | null {
-	if (!isConfigEntry(value)) return null;
-	const candidate = value as AutoresearchJsonConfigEntry;
-	const config: AutoresearchJsonConfigEntry = { type: "config" };
-	if (typeof candidate.name === "string" && candidate.name.trim().length > 0) {
-		config.name = candidate.name;
-	}
-	if (typeof candidate.metricName === "string" && candidate.metricName.trim().length > 0) {
-		config.metricName = candidate.metricName;
-	}
-	if (typeof candidate.metricUnit === "string") {
-		config.metricUnit = candidate.metricUnit;
-	}
-	if (candidate.bestDirection === "lower" || candidate.bestDirection === "higher") {
-		config.bestDirection = candidate.bestDirection;
-	}
-	if (typeof candidate.benchmarkCommand === "string" && candidate.benchmarkCommand.trim().length > 0) {
-		config.benchmarkCommand = candidate.benchmarkCommand;
-	}
-	if (Array.isArray(candidate.secondaryMetrics)) {
-		config.secondaryMetrics = normalizeAutoresearchList(
-			candidate.secondaryMetrics.filter((item): item is string => typeof item === "string"),
-		);
-	}
-	if (Array.isArray(candidate.scopePaths)) {
-		config.scopePaths = normalizeAutoresearchList(
-			candidate.scopePaths.filter((item): item is string => typeof item === "string").map(normalizeContractPathSpec),
-		);
-	}
-	if (Array.isArray(candidate.offLimits)) {
-		config.offLimits = normalizeAutoresearchList(
-			candidate.offLimits.filter((item): item is string => typeof item === "string").map(normalizeContractPathSpec),
-		);
-	}
-	if (Array.isArray(candidate.constraints)) {
-		config.constraints = normalizeAutoresearchList(
-			candidate.constraints.filter((item): item is string => typeof item === "string"),
-		);
-	}
-	return config;
-}
-function isRunEntry(value: unknown): value is AutoresearchJsonRunEntry {
-	if (typeof value !== "object" || value === null) return false;
-	const candidate = value as { type?: unknown };
-	return candidate.type === undefined || candidate.type === "run";
-}
-function isExperimentStatus(value: unknown): value is ExperimentResult["status"] {
-	return value === "keep" || value === "discard" || value === "crash" || value === "checks_failed";
-}
-function cloneNumericMetrics(value: unknown): NumericMetricMap {
-	if (typeof value !== "object" || value === null) return {};
-	const metrics = value as { [key: string]: unknown };
-	const clone: NumericMetricMap = {};
-	for (const [key, entryValue] of Object.entries(metrics)) {
-		if (key === "__proto__" || key === "constructor" || key === "prototype") continue;
-		if (typeof entryValue === "number" && Number.isFinite(entryValue)) {
-			clone[key] = entryValue;
-		}
-	}
-	return clone;
-}
-function cloneStringArray(value: unknown): string[] {
-	if (!Array.isArray(value)) return [];
-	return value.filter((item): item is string => typeof item === "string");
-}
-function hydrateMetricDefs(metricNames: string[] | undefined): MetricDef[] {
-	if (!metricNames) return [];
-	return metricNames.map(name => ({
-		name,
-		unit: inferMetricUnitFromName(name),
-	}));
-}
-function cloneAsi(value: unknown): ExperimentResult["asi"] {
-	if (typeof value !== "object" || value === null) return undefined;
-	const clone: { [key: string]: unknown } = {};
-	for (const [key, entryValue] of Object.entries(value)) {
-		if (key === "__proto__" || key === "constructor" || key === "prototype") continue;
-		clone[key] = structuredClone(entryValue);
-	}
-	return clone as ExperimentResult["asi"];
-}
 function parseControlEntry(value: unknown): AutoresearchControlEntryData | null {
 	if (typeof value !== "object" || value === null) return null;
 	const candidate = value as { goal?: unknown; mode?: unknown };