@oh-my-pi/pi-coding-agent 14.5.14 → 14.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/CHANGELOG.md +39 -0
  2. package/package.json +7 -7
  3. package/src/autoresearch/command-resume.md +5 -8
  4. package/src/autoresearch/git.ts +41 -51
  5. package/src/autoresearch/helpers.ts +43 -359
  6. package/src/autoresearch/index.ts +281 -273
  7. package/src/autoresearch/prompt-setup.md +43 -0
  8. package/src/autoresearch/prompt.md +52 -193
  9. package/src/autoresearch/resume-message.md +2 -8
  10. package/src/autoresearch/state.ts +59 -166
  11. package/src/autoresearch/storage.ts +687 -0
  12. package/src/autoresearch/tools/init-experiment.ts +201 -290
  13. package/src/autoresearch/tools/log-experiment.ts +304 -517
  14. package/src/autoresearch/tools/run-experiment.ts +117 -296
  15. package/src/autoresearch/tools/update-notes.ts +116 -0
  16. package/src/autoresearch/types.ts +16 -66
  17. package/src/config/settings-schema.ts +1 -1
  18. package/src/config/settings.ts +20 -1
  19. package/src/cursor.ts +1 -1
  20. package/src/edit/index.ts +9 -31
  21. package/src/edit/line-hash.ts +70 -43
  22. package/src/edit/modes/hashline.lark +26 -0
  23. package/src/edit/modes/hashline.ts +898 -1099
  24. package/src/edit/modes/patch.ts +0 -7
  25. package/src/edit/modes/replace.ts +0 -4
  26. package/src/edit/renderer.ts +22 -20
  27. package/src/edit/streaming.ts +8 -28
  28. package/src/eval/eval.lark +24 -30
  29. package/src/eval/js/context-manager.ts +5 -162
  30. package/src/eval/js/prelude.txt +0 -12
  31. package/src/eval/parse.ts +129 -129
  32. package/src/eval/py/prelude.py +1 -219
  33. package/src/export/html/template.generated.ts +1 -1
  34. package/src/export/html/template.js +2 -2
  35. package/src/internal-urls/docs-index.generated.ts +1 -1
  36. package/src/modes/components/session-observer-overlay.ts +5 -2
  37. package/src/modes/components/status-line/segments.ts +1 -1
  38. package/src/modes/components/status-line.ts +3 -5
  39. package/src/modes/components/tree-selector.ts +4 -5
  40. package/src/modes/components/welcome.ts +11 -1
  41. package/src/modes/controllers/command-controller.ts +2 -6
  42. package/src/modes/controllers/event-controller.ts +1 -2
  43. package/src/modes/controllers/extension-ui-controller.ts +3 -15
  44. package/src/modes/controllers/input-controller.ts +0 -1
  45. package/src/modes/controllers/selector-controller.ts +1 -1
  46. package/src/modes/interactive-mode.ts +5 -7
  47. package/src/prompts/system/system-prompt.md +14 -38
  48. package/src/prompts/tools/ast-edit.md +8 -8
  49. package/src/prompts/tools/ast-grep.md +10 -10
  50. package/src/prompts/tools/eval.md +13 -31
  51. package/src/prompts/tools/find.md +2 -1
  52. package/src/prompts/tools/hashline.md +66 -57
  53. package/src/prompts/tools/search.md +2 -2
  54. package/src/session/session-manager.ts +17 -13
  55. package/src/tools/ast-edit.ts +141 -44
  56. package/src/tools/ast-grep.ts +112 -36
  57. package/src/tools/eval.ts +2 -53
  58. package/src/tools/find.ts +16 -15
  59. package/src/tools/path-utils.ts +36 -196
  60. package/src/tools/search.ts +56 -35
  61. package/src/utils/edit-mode.ts +2 -11
  62. package/src/utils/file-display-mode.ts +1 -1
  63. package/src/utils/git.ts +17 -0
  64. package/src/utils/session-color.ts +0 -12
  65. package/src/utils/title-generator.ts +22 -38
  66. package/src/autoresearch/apply-contract-to-state.ts +0 -24
  67. package/src/autoresearch/contract.ts +0 -288
  68. package/src/edit/modes/atom.lark +0 -29
  69. package/src/edit/modes/atom.ts +0 -1773
  70. package/src/prompts/tools/atom.md +0 -150
@@ -0,0 +1,43 @@
1
+ {{base_system_prompt}}
2
+
3
+ ## Autoresearch Mode — Phase 1: Harness Setup
4
+
5
+ Autoresearch mode is active and there is no session yet. Your job in this turn is to **build the benchmark harness**, not to optimize anything. Optimization starts only after you call `init_experiment`.
6
+
7
+ {{#if has_goal}}
8
+ Primary goal (for context — implement the harness so it can measure this):
9
+ {{goal}}
10
+ {{else}}
11
+ There is no goal recorded yet. Infer what to optimize from the latest user message and design the harness to measure that. Capture the goal when you call `init_experiment`.
12
+ {{/if}}
13
+
14
+ Working directory: `{{working_dir}}`
15
+ {{#if has_branch}}Active branch: `{{branch}}`{{/if}}
16
+ {{#if has_baseline_warning}}
17
+
18
+ {{baseline_warning}}
19
+ {{/if}}
20
+
21
+ ### What you must produce
22
+
23
+ Write `./autoresearch.sh` at the working directory. It is the canonical benchmark entrypoint and must:
24
+
25
+ - exit 0 on success and non-zero on failure;
26
+ - print the primary metric as a single line `METRIC <name>=<value>`;
27
+ - print any secondary metrics as additional `METRIC <name>=<value>` lines;
28
+ - run the same workload deterministically every time (no live network, no time-of-day dependencies, fixed seeds where applicable).
29
+
30
+ You **may** edit anything else needed to make `autoresearch.sh` work — benchmark binaries, `Cargo.toml`, `package.json`, helper scripts, fixtures. All those edits are part of the harness baseline and will be committed for you when you call `init_experiment` on an autoresearch branch.
31
+
32
+ ### Steps
33
+
34
+ 1. Inspect the target. Read source, identify what to measure, decide on the workload.
35
+ 2. Write `autoresearch.sh` plus any supporting files (benchmark binaries, fixtures, etc.).
36
+ 3. Validate it: invoke `bash autoresearch.sh` through the regular `bash` tool. Confirm it exits 0 and emits at least one `METRIC` line. Iterate on the harness until it does.
37
+ 4. Call `init_experiment` with the goal, primary metric (matching the `METRIC` name), and scope. This snapshots the worktree as the baseline and starts Phase 2 (the iteration loop).
38
+
39
+ ### Rules
40
+
41
+ - Do **not** call `run_experiment`, `log_experiment`, or `update_notes` yet. They will error with "no active autoresearch session" until `init_experiment` runs.
42
+ - Do **not** treat a compile-only check as a benchmark. The harness must actually execute the workload and emit `METRIC`.
43
+ - Do **not** create `autoresearch.md`, `autoresearch.checks.sh`, `autoresearch.program.md`, `autoresearch.ideas.md`, `autoresearch.jsonl`, `.autoresearch/`, or `autoresearch.config.json`. Session state is tracked for you.
@@ -8,29 +8,50 @@ Autoresearch mode is active.
8
8
  Primary goal:
9
9
  {{goal}}
10
10
  {{else}}
11
- {{#if has_autoresearch_md}}
12
- Primary goal is documented in `autoresearch.md` for this session.
13
- {{else}}
14
- There is no `autoresearch.md` yet. Infer what to optimize from the latest user message and the conversation; after you create `autoresearch.md`, keep it as the durable source of truth for goal and benchmark contract.
15
- {{/if}}
11
+ There is no goal recorded for this session yet. Infer what to optimize from the latest user message and the conversation; capture the goal in your notes (`update_notes`) once it is clear.
16
12
  {{/if}}
17
13
 
18
- Working directory:
19
- `{{working_dir}}`
14
+ Session state and run artifacts are managed for you. The benchmark entrypoint is `bash autoresearch.sh` (committed during Phase 1). Do not edit `autoresearch.sh` mid-segment unless you intentionally bump the segment via `init_experiment new_segment: true`. Do not create `autoresearch.md` or `.autoresearch/` in this repo.
15
+
16
+ Working directory: `{{working_dir}}`
17
+ {{#if has_branch}}Active branch: `{{branch}}`{{/if}}
18
+ {{#if has_baseline_commit}}Baseline commit: `{{baseline_commit}}`{{/if}}
20
19
 
21
20
  You are running an autonomous experiment loop. Keep iterating until the user interrupts you or the configured maximum iteration count is reached.
22
- {{#if has_program}}
23
21
 
24
- ### Local Playbook
22
+ ### Available tools
23
+ - `init_experiment` — open or reconfigure the session. Pass `new_segment: true` to start a fresh baseline within the current session.
24
+ - `run_experiment` — run the benchmark (`bash autoresearch.sh`). Output is captured automatically and `METRIC name=value` / `ASI key=value` lines printed by the harness are parsed back to you. The command is fixed; if you need a different workload, edit `autoresearch.sh` and bump segment via `init_experiment new_segment: true`.
25
+ - `log_experiment` — record the result. On `keep`, modified files are committed for you; on `discard`/`crash`/`checks_failed`, the worktree is reverted. Pass `flag_runs` to mark earlier runs as suspect; flagged runs are excluded from baseline and best-metric math.
26
+ - `update_notes` — replace the durable session playbook (`body`) or append to the ideas backlog (`append_idea`). The notes are injected into your system prompt every iteration.
25
27
 
26
- `autoresearch.program.md` exists at `{{program_path}}`.
28
+ ### Operating protocol
29
+ 1. Understand the target before touching code: read source, identify the bottleneck, verify prerequisites and benchmark inputs.
30
+ 2. Update the goal, scope, or constraints via another `init_experiment` call (no segment bump) or `update_notes`. Bump the segment when you intentionally change `autoresearch.sh`.
31
+ 3. Establish a baseline first.
32
+ 4. Iterate: change code, run `run_experiment`, log honestly with `log_experiment`. One coherent experiment per iteration.
33
+ 5. Keep the primary metric as the decision maker:
34
+ - `keep` when it improves;
35
+ - `discard` when it regresses or stays flat;
36
+ - `crash` when the run fails;
37
+ - `checks_failed` when validation fails (you decide what validation means; run it through the regular `bash` tool).
38
+ 6. Use ASI freely — it is opaque, just stash useful learnings (`hypothesis`, `rollback_reason`, `next_action_hint`, anything else).
39
+ 7. When confidence is low, re-run promising changes before keeping them. `log_experiment` reports a confidence score (multiples of the observed noise floor) on each kept run.
40
+
41
+ ### Scope, off-limits, and accountability
42
+ - Edits are not blocked. You can change anything.
43
+ - `log_experiment` records the modified paths. Files outside `scope_paths` or inside `off_limits` are recorded as `scope_deviations` on the run.
44
+ - If you keep a run with deviations, pass `justification` explaining why. Without it, the run is still logged but is flagged as unjustified in the next iteration's prompt.
45
+ - If a previous run looks reward-hacked or otherwise wrong, pass `flag_runs: [{ run_id, reason }]` on the next `log_experiment` to exclude it from baseline and best-metric calculations.
46
+
47
+ {{#if has_notes}}
48
+ ### Your notes (use `update_notes` to edit)
49
+
50
+ {{notes}}
27
51
 
28
- Use it as a repo-local strategy overlay for this session. `autoresearch.md` remains the source of truth for benchmark, scope, and constraints.
29
52
  {{/if}}
30
53
  {{#if has_recent_results}}
31
-
32
- ### Current Segment Snapshot
33
-
54
+ ### Current segment snapshot
34
55
  - segment: `{{current_segment}}`
35
56
  - runs in current segment: `{{current_segment_run_count}}`
36
57
  {{#if has_baseline_metric}}
@@ -46,199 +67,37 @@ Recent runs:
46
67
  {{#if has_asi_summary}}
47
68
  ASI: {{asi_summary}}
48
69
  {{/if}}
70
+ {{#if has_deviations}}
71
+ Modified outside scope: {{deviations}}{{#unless justified}} (no justification){{/unless}}
72
+ {{/if}}
73
+ {{#if flagged}}
74
+ FLAGGED: {{flagged_reason}}
75
+ {{/if}}
49
76
  {{/each}}
50
77
  {{/if}}
51
- {{#if has_pending_run}}
52
-
53
- ### Pending Run
78
+ {{#if has_unjustified_runs}}
54
79
 
55
- An unlogged run artifact exists at `{{pending_run_directory}}`.
80
+ ### Unjustified deviations
81
+ {{#each unjustified_runs}}
82
+ - run `#{{run_number}}` modified `{{paths}}` outside scope without justification. Either accept it, justify it on the next log, or `flag_runs` it.
83
+ {{/each}}
84
+ {{/if}}
85
+ {{#if has_pending_run}}
56
86
 
87
+ ### Pending run
88
+ An unlogged run is waiting:
57
89
  - run: `#{{pending_run_number}}`
58
90
  - command: `{{pending_run_command}}`
59
91
  {{#if has_pending_run_metric}}
60
92
  - parsed `{{metric_name}}`: `{{pending_run_metric_display}}`
61
93
  {{/if}}
62
- - result status: {{#if pending_run_passed}}passed{{else}}failed{{/if}}
63
- - finish the `log_experiment` step before starting another benchmark
64
- {{/if}}
65
-
66
- ### Available tools
67
-
68
- - `init_experiment` — initialize or reset the experiment session for the current optimization target.
69
- - `run_experiment` — run a benchmark or experiment command with timing, output capture, structured metric parsing, and optional backpressure checks.
70
- - `log_experiment` — record the result, update the dashboard, persist JSONL history, auto-commit kept experiments, and revert only run-modified files for discarded or failed experiments (pre-existing uncommitted changes are preserved).
94
+ - result: {{#if pending_run_passed}}passed{{else}}failed{{/if}}
71
95
 
72
- ### Operating protocol
73
-
74
- 1. Understand the target before touching code.
75
- - Read the relevant source files.
76
- - Identify the true bottleneck or quality constraint.
77
- - Check existing scripts, benchmark harnesses, and config files.
78
- - Verify prerequisites, one-time setup, and benchmark inputs before the first run of a segment.
79
- 2. Keep your notes in `autoresearch.md`.
80
- - Record the goal, the benchmark command, the primary metric, important secondary metrics, the files in scope, hard constraints, preflight requirements, and the benchmark comparability invariant.
81
- - Update the notes whenever the strategy changes.
82
- - Keep durable conclusions in `autoresearch.md`.
83
- - Use `autoresearch.ideas.md` for deferred experiment ideas that are promising but not active yet.
84
- 3. Use `autoresearch.sh` as the canonical benchmark entrypoint.
85
- - If it does not exist yet, create it.
86
- - Make it print structured metric lines in the form `METRIC name=value`.
87
- - Use the same workload every run unless you intentionally re-initialize with a new segment.
88
- - Keep the measurement harness, evaluator, and fixed benchmark inputs stable unless you intentionally start a new segment and document the change.
89
- 4. Initialize the loop with `init_experiment` before the first logged run of a segment.
90
- - Pass `from_autoresearch_md: true` with only `name` to load the benchmark contract from `autoresearch.md` without mirroring every field in the tool call.
91
- - Use `abandon_unlogged_runs: true` only when you intentionally discard unlogged run artifacts and need a fresh segment (for example after a bad or obsolete benchmark directory).
92
- 5. Run a baseline first.
93
- - Establish the baseline metric before attempting optimizations.
94
- - Track secondary metrics only when they matter to correctness, quality, or obvious regressions.
95
- 6. Iterate.
96
- - Make one coherent experiment at a time.
97
- - Run `run_experiment`.
98
- - Interpret the result honestly.
99
- - Call `log_experiment` after every run (it refreshes benchmark/scope fields from `autoresearch.md` before logging so keep validation matches the file on disk).
100
- - Use `run_experiment` with `force: true` only when you must override the segment benchmark command or skip the direct-`autoresearch.sh` rule.
101
- - On `log_experiment`, `force: true` relaxes ASI requirements and allows keeping a primary-metric regression; prefer normal logging when possible.
102
- 7. Keep the primary metric as the decision maker.
103
- - `keep` when the primary metric improves.
104
- - `discard` when it regresses or stays flat.
105
- - `crash` when the run fails.
106
- - `checks_failed` when the benchmark passes but backpressure checks fail.
107
- 8. Record ASI on every `log_experiment` call.
108
- - At minimum include `hypothesis`.
109
- - On `discard`, `crash`, or `checks_failed`, also include `rollback_reason` and `next_action_hint`.
110
- - Use ASI to capture what you learned, not just what you changed.
111
- 9. Prefer simpler wins.
112
- - Remove dead ends.
113
- - Keep equal or near-equal results when they materially simplify the implementation.
114
- - Do not keep ugly complexity for tiny gains unless the payoff is clearly worth it.
115
- - Do not thrash between unrelated ideas without writing down the conclusion.
116
- 10. When confidence is low, confirm.
117
- - The dashboard confidence score compares the best observed improvement against the observed noise floor.
118
- - Below `1.0x` usually means the improvement is within noise.
119
- - Re-run promising changes when needed before keeping them.
120
-
121
- ### Benchmark harness guidance
122
-
123
- Your benchmark script SHOULD:
124
-
125
- - live at `autoresearch.sh`
126
- - run from `{{working_dir}}`
127
- - fail with a non-zero exit status on invalid runs
128
- - print the primary metric as `METRIC {{default_metric_name}}=<number>` or another explicit metric name chosen during initialization
129
- - print secondary metrics as additional `METRIC name=value` lines
130
- - avoid extra randomness when possible
131
- - use repeated samples and median-style summaries for fast benchmarks
132
- - preserve the comparability invariant for the current segment
133
- - keep the ground-truth evaluator and fixed benchmark inputs unchanged unless the segment is explicitly re-initialized
134
-
135
- ### Notes file template
136
-
137
- Keep `autoresearch.md` concise and current.
138
-
139
- Suggested structure:
140
-
141
- ```md
142
- # Autoresearch
143
-
144
- ## Goal
145
- {{#if has_goal}}
146
- - {{goal}}
147
- {{else}}
148
- {{#if has_autoresearch_md}}
149
- - document the active target here before the first benchmark
150
- {{else}}
151
- - (derive from the user's messages, then record here)
152
- {{/if}}
96
+ Finish the `log_experiment` step before starting another benchmark.
153
97
  {{/if}}
154
98
 
155
- ## Benchmark
156
- - command:
157
- - primary metric:
158
- - metric unit:
159
- - direction:
160
- - secondary metrics: memory_mb, rss_mb
161
-
162
- ## Files in Scope
163
- - path:
164
-
165
- ## Off Limits
166
- - path:
167
-
168
- ## Constraints
169
- - rule:
170
-
171
- ## Baseline
172
- - metric:
173
- - notes:
174
-
175
- ## Current best
176
- - metric:
177
- - why it won:
178
-
179
- ## What's Been Tried
180
- - experiment:
181
- - lesson:
182
- ```
183
-
184
99
  ### Guardrails
185
-
186
100
  - Do not game the benchmark.
187
101
  - Do not overfit to synthetic inputs if the real workload is broader.
188
102
  - Preserve correctness.
189
- - Only modify files that are explicitly in scope for the current session.
190
- - Do not use the general shell tool for file mutations during autoresearch. Use `write`, `edit`, or `ast_edit` for scoped code changes and `run_experiment` for benchmark execution.
191
- - If you create `autoresearch.checks.sh`, treat it as a hard gate for `keep`.
192
103
  - If the user sends another message while a run is in progress, finish the current run and logging cycle first, then address the new input in the next iteration.
193
-
194
- {{#if has_autoresearch_md}}
195
- ### Resume mode
196
-
197
- `autoresearch.md` already exists at `{{autoresearch_md_path}}`.
198
-
199
- Resume from the existing notes:
200
-
201
- - read `autoresearch.md`
202
- - inspect recent git history
203
- - inspect `autoresearch.jsonl`
204
- - continue from the most promising unfinished direction on the current protected branch
205
-
206
- {{else}}
207
- ### Initial setup
208
-
209
- `autoresearch.md` does not exist yet. You decide the benchmark contract, harness, and scope from the user's messages and the repository—do not ask the user to re-type benchmark commands or metric names in a separate UI prompt.
210
-
211
- Before the first benchmark:
212
-
213
- - Write `autoresearch.md` with goal, benchmark command (must be a **direct** invocation of `autoresearch.sh`, e.g. `bash autoresearch.sh`), primary metric name and unit, direction (`lower` or `higher`), tradeoff metrics if relevant, files in scope, off limits, and constraints.
214
- - Add a short preflight section: prerequisites, one-time setup, and the comparability invariant that must stay fixed across runs.
215
- - Mark ground-truth evaluators, fixed datasets, and other measurement-critical files as off limits or hard constraints when they define the benchmark contract.
216
- - Write or update `autoresearch.program.md` when you learn durable heuristics, failure patterns, or repo-specific strategy for later resume turns.
217
- - Create `autoresearch.sh` as the canonical benchmark entrypoint; print the primary metric as `METRIC <name>=<number>` and optional secondary metrics as additional `METRIC` lines.
218
- - Optionally add `autoresearch.checks.sh` if correctness or quality needs a hard gate.
219
- - Call `init_experiment` with arguments that match `autoresearch.md` exactly (benchmark command, metric, unit, direction, scope paths, off limits, constraints).
220
- - Run and log the baseline.
221
-
222
- Until `init_experiment` succeeds, only autoresearch control files (`autoresearch.md`, `autoresearch.sh`, `autoresearch.program.md`, `autoresearch.ideas.md`, `autoresearch.checks.sh`) may be edited; after initialization, respect Files in Scope from the contract.
223
-
224
- {{/if}}
225
- {{#if has_checks}}
226
- ### Backpressure checks
227
-
228
- `autoresearch.checks.sh` exists at `{{checks_path}}` and runs automatically after passing benchmark runs.
229
-
230
- Treat failing checks as a failed experiment:
231
-
232
- - do not `keep` a run when checks fail
233
- - log it as `checks_failed`
234
- - diagnose the regression before continuing
235
-
236
- {{/if}}
237
- {{#if has_ideas}}
238
- ### Ideas backlog
239
-
240
- `autoresearch.ideas.md` exists at `{{ideas_path}}`.
241
-
242
- Use it to keep promising but deferred experiments. `autoresearch.md` should hold durable conclusions; `autoresearch.ideas.md` is the scratch backlog. Prune stale ideas when they are disproven or superseded.
243
-
244
- {{/if}}
@@ -1,16 +1,10 @@
1
1
  Continue the autoresearch loop now.
2
2
 
3
- @{{autoresearch_md_path}}
4
-
5
- - Read `autoresearch.md` and `autoresearch.jsonl`.
6
- - Treat `autoresearch.md` as the source of truth for the current direction, scope, and constraints.
3
+ - Re-read your notes and the recent-runs context above before deciding the next direction.
7
4
  - Inspect recent git history for context.
8
5
  {{#if has_pending_run}}
9
- - Inspect the latest unlogged `run.json` under `.autoresearch/runs/` and finish the pending `log_experiment` step before starting a new benchmark.
6
+ - A previous benchmark run completed but was never logged. Finish `log_experiment` before starting a new run.
10
7
  {{/if}}
11
8
  - Continue from the most promising unfinished direction.
12
- {{#if has_ideas}}
13
- - Review `autoresearch.ideas.md` for deferred next steps and prune stale items.
14
- {{/if}}
15
9
  - Keep iterating until interrupted or until the configured iteration cap is reached.
16
10
  - Preserve correctness and do not game the benchmark.
@@ -1,12 +1,8 @@
1
- import * as fs from "node:fs";
2
- import * as path from "node:path";
3
1
  import type { SessionEntry } from "../session/session-manager";
4
- import { normalizeAutoresearchList, normalizeContractPathSpec } from "./contract";
5
2
  import { inferMetricUnitFromName, isBetter } from "./helpers";
3
+ import type { RunRow, SessionRow } from "./storage";
6
4
  import type {
7
5
  AutoresearchControlEntryData,
8
- AutoresearchJsonConfigEntry,
9
- AutoresearchJsonRunEntry,
10
6
  AutoresearchRuntime,
11
7
  ExperimentResult,
12
8
  ExperimentState,
@@ -14,7 +10,6 @@ import type {
14
10
  MetricDirection,
15
11
  NumericMetricMap,
16
12
  ReconstructedControlState,
17
- ReconstructedExperimentData,
18
13
  RuntimeStore,
19
14
  } from "./types";
20
15
 
@@ -27,13 +22,17 @@ export function createExperimentState(): ExperimentState {
27
22
  metricUnit: "",
28
23
  secondaryMetrics: [],
29
24
  name: null,
25
+ goal: null,
30
26
  currentSegment: 0,
31
27
  maxExperiments: null,
32
28
  confidence: null,
33
- benchmarkCommand: null,
34
29
  scopePaths: [],
35
30
  offLimits: [],
36
31
  constraints: [],
32
+ notes: "",
33
+ branch: null,
34
+ baselineCommit: null,
35
+ sessionId: null,
37
36
  };
38
37
  }
39
38
 
@@ -43,7 +42,6 @@ export function createSessionRuntime(): AutoresearchRuntime {
43
42
  autoResumeArmed: false,
44
43
  dashboardExpanded: false,
45
44
  lastAutoResumePendingRunNumber: null,
46
- lastRunChecks: null,
47
45
  lastRunDuration: null,
48
46
  lastRunAsi: null,
49
47
  lastRunArtifactDir: null,
@@ -58,11 +56,7 @@ export function createSessionRuntime(): AutoresearchRuntime {
58
56
  export function cloneExperimentState(state: ExperimentState): ExperimentState {
59
57
  return {
60
58
  ...state,
61
- results: state.results.map(result => ({
62
- ...result,
63
- metrics: { ...result.metrics },
64
- asi: result.asi ? structuredClone(result.asi) : undefined,
65
- })),
59
+ results: state.results.map(cloneResult),
66
60
  secondaryMetrics: state.secondaryMetrics.map(metric => ({ ...metric })),
67
61
  scopePaths: [...state.scopePaths],
68
62
  offLimits: [...state.offLimits],
@@ -70,12 +64,22 @@ export function cloneExperimentState(state: ExperimentState): ExperimentState {
70
64
  };
71
65
  }
72
66
 
67
+ function cloneResult(result: ExperimentResult): ExperimentResult {
68
+ return {
69
+ ...result,
70
+ metrics: { ...result.metrics },
71
+ asi: result.asi ? structuredClone(result.asi) : undefined,
72
+ modifiedPaths: [...result.modifiedPaths],
73
+ scopeDeviations: [...result.scopeDeviations],
74
+ };
75
+ }
76
+
73
77
  export function currentResults(results: ExperimentResult[], segment: number): ExperimentResult[] {
74
78
  return results.filter(result => result.segment === segment);
75
79
  }
76
80
 
77
81
  export function findBaselineResult(results: ExperimentResult[], segment: number): ExperimentResult | null {
78
- return currentResults(results, segment).find(result => result.status === "keep") ?? null;
82
+ return currentResults(results, segment).find(result => result.status === "keep" && !result.flagged) ?? null;
79
83
  }
80
84
 
81
85
  export function findBaselineMetric(results: ExperimentResult[], segment: number): number | null {
@@ -90,7 +94,7 @@ export function findBestKeptMetric(
90
94
  ): number | null {
91
95
  let best: number | null = null;
92
96
  for (const result of currentResults(results, segment)) {
93
- if (result.status !== "keep") continue;
97
+ if (result.status !== "keep" || result.flagged) continue;
94
98
  if (best === null || isBetter(result.metric, best, direction)) {
95
99
  best = result.metric;
96
100
  }
@@ -116,6 +120,7 @@ export function findBaselineSecondary(
116
120
  for (const metric of knownMetrics) {
117
121
  if (values[metric.name] !== undefined) continue;
118
122
  for (const result of currentResults(results, segment)) {
123
+ if (result.flagged) continue;
119
124
  const value = result.metrics[metric.name];
120
125
  if (value !== undefined) {
121
126
  values[metric.name] = value;
@@ -141,7 +146,7 @@ export function computeConfidence(
141
146
  segment: number,
142
147
  direction: MetricDirection,
143
148
  ): number | null {
144
- const current = currentResults(results, segment).filter(result => result.metric > 0);
149
+ const current = currentResults(results, segment).filter(result => !result.flagged && result.metric > 0);
145
150
  if (current.length < 3) return null;
146
151
 
147
152
  const values = current.map(result => result.metric);
@@ -164,70 +169,52 @@ export function computeConfidence(
164
169
  return Math.abs(bestKept - baseline) / mad;
165
170
  }
166
171
 
167
- export function reconstructStateFromJsonl(workDir: string): ReconstructedExperimentData {
172
+ export function buildExperimentState(session: SessionRow, loggedRuns: RunRow[]): ExperimentState {
168
173
  const state = createExperimentState();
169
- const jsonlPath = path.join(workDir, "autoresearch.jsonl");
170
- if (!fs.existsSync(jsonlPath)) {
171
- return { hasLog: false, state };
172
- }
173
-
174
- const content = fs.readFileSync(jsonlPath, "utf8");
175
- const lines = content
176
- .split("\n")
177
- .map(line => line.trim())
178
- .filter(line => line.length > 0);
179
-
180
- let segment = 0;
181
- let sawConfig = false;
182
- for (const line of lines) {
183
- let parsed: unknown;
184
- try {
185
- parsed = JSON.parse(line) as unknown;
186
- } catch {
187
- continue;
188
- }
189
-
190
- const configEntry = parseConfigEntry(parsed);
191
- if (configEntry) {
192
- if (sawConfig || state.results.length > 0) {
193
- segment += 1;
194
- }
195
- sawConfig = true;
196
- state.currentSegment = segment;
197
- if (configEntry.name) state.name = configEntry.name;
198
- if (configEntry.metricName) state.metricName = configEntry.metricName;
199
- if (configEntry.metricUnit !== undefined) state.metricUnit = configEntry.metricUnit;
200
- if (configEntry.bestDirection) state.bestDirection = configEntry.bestDirection;
201
- if (configEntry.benchmarkCommand !== undefined) state.benchmarkCommand = configEntry.benchmarkCommand;
202
- state.scopePaths = cloneStringArray(configEntry.scopePaths);
203
- state.offLimits = cloneStringArray(configEntry.offLimits);
204
- state.constraints = cloneStringArray(configEntry.constraints);
205
- state.secondaryMetrics = hydrateMetricDefs(configEntry.secondaryMetrics);
206
- continue;
207
- }
208
-
209
- if (!isRunEntry(parsed)) continue;
174
+ state.name = session.name;
175
+ state.goal = session.goal;
176
+ state.metricName = session.primaryMetric;
177
+ state.metricUnit = session.metricUnit;
178
+ state.bestDirection = session.direction;
179
+ state.scopePaths = [...session.scopePaths];
180
+ state.offLimits = [...session.offLimits];
181
+ state.constraints = [...session.constraints];
182
+ state.notes = session.notes;
183
+ state.branch = session.branch;
184
+ state.baselineCommit = session.baselineCommit;
185
+ state.sessionId = session.id;
186
+ state.maxExperiments = session.maxIterations;
187
+ state.currentSegment = session.currentSegment;
188
+ state.secondaryMetrics = session.secondaryMetrics.map(name => ({ name, unit: inferMetricUnitFromName(name) }));
189
+
190
+ for (const run of loggedRuns) {
191
+ if (run.status === null) continue;
210
192
  const result: ExperimentResult = {
211
- runNumber: typeof parsed.run === "number" && Number.isFinite(parsed.run) ? parsed.run : null,
212
- commit: typeof parsed.commit === "string" ? parsed.commit : "",
213
- metric: typeof parsed.metric === "number" && Number.isFinite(parsed.metric) ? parsed.metric : 0,
214
- metrics: cloneNumericMetrics(parsed.metrics),
215
- status: isExperimentStatus(parsed.status) ? parsed.status : "keep",
216
- description: typeof parsed.description === "string" ? parsed.description : "",
217
- timestamp: typeof parsed.timestamp === "number" && Number.isFinite(parsed.timestamp) ? parsed.timestamp : 0,
218
- segment,
219
- confidence:
220
- typeof parsed.confidence === "number" && Number.isFinite(parsed.confidence) ? parsed.confidence : null,
221
- asi: cloneAsi(parsed.asi),
193
+ runNumber: run.id,
194
+ commit: run.commitHash ?? "",
195
+ metric: run.metric ?? 0,
196
+ metrics: run.metrics ?? {},
197
+ status: run.status,
198
+ description: run.description ?? "",
199
+ timestamp: run.loggedAt ?? run.startedAt,
200
+ segment: run.segment,
201
+ confidence: run.confidence,
202
+ asi: run.asi ?? undefined,
203
+ modifiedPaths: run.modifiedPaths ?? [],
204
+ scopeDeviations: run.scopeDeviations ?? [],
205
+ justification: run.justification,
206
+ flagged: run.flagged,
207
+ flaggedReason: run.flaggedReason,
222
208
  };
223
209
  state.results.push(result);
224
- if (segment !== state.currentSegment) continue;
225
- registerSecondaryMetrics(state.secondaryMetrics, result.metrics);
210
+ if (run.segment === state.currentSegment) {
211
+ registerSecondaryMetrics(state.secondaryMetrics, result.metrics);
212
+ }
226
213
  }
227
214
 
228
215
  state.bestMetric = findBaselineMetric(state.results, state.currentSegment);
229
216
  state.confidence = computeConfidence(state.results, state.currentSegment, state.bestDirection);
230
- return { hasLog: true, state };
217
+ return state;
231
218
  }
232
219
 
233
220
  export function reconstructControlState(entries: SessionEntry[]): ReconstructedControlState {
@@ -274,100 +261,6 @@ function registerSecondaryMetrics(metrics: MetricDef[], values: NumericMetricMap
274
261
  }
275
262
  }
276
263
 
277
- function isConfigEntry(value: unknown): value is AutoresearchJsonConfigEntry {
278
- if (typeof value !== "object" || value === null) return false;
279
- const candidate = value as { type?: unknown };
280
- return candidate.type === "config";
281
- }
282
-
283
- function parseConfigEntry(value: unknown): AutoresearchJsonConfigEntry | null {
284
- if (!isConfigEntry(value)) return null;
285
- const candidate = value as AutoresearchJsonConfigEntry;
286
- const config: AutoresearchJsonConfigEntry = { type: "config" };
287
- if (typeof candidate.name === "string" && candidate.name.trim().length > 0) {
288
- config.name = candidate.name;
289
- }
290
- if (typeof candidate.metricName === "string" && candidate.metricName.trim().length > 0) {
291
- config.metricName = candidate.metricName;
292
- }
293
- if (typeof candidate.metricUnit === "string") {
294
- config.metricUnit = candidate.metricUnit;
295
- }
296
- if (candidate.bestDirection === "lower" || candidate.bestDirection === "higher") {
297
- config.bestDirection = candidate.bestDirection;
298
- }
299
- if (typeof candidate.benchmarkCommand === "string" && candidate.benchmarkCommand.trim().length > 0) {
300
- config.benchmarkCommand = candidate.benchmarkCommand;
301
- }
302
- if (Array.isArray(candidate.secondaryMetrics)) {
303
- config.secondaryMetrics = normalizeAutoresearchList(
304
- candidate.secondaryMetrics.filter((item): item is string => typeof item === "string"),
305
- );
306
- }
307
- if (Array.isArray(candidate.scopePaths)) {
308
- config.scopePaths = normalizeAutoresearchList(
309
- candidate.scopePaths.filter((item): item is string => typeof item === "string").map(normalizeContractPathSpec),
310
- );
311
- }
312
- if (Array.isArray(candidate.offLimits)) {
313
- config.offLimits = normalizeAutoresearchList(
314
- candidate.offLimits.filter((item): item is string => typeof item === "string").map(normalizeContractPathSpec),
315
- );
316
- }
317
- if (Array.isArray(candidate.constraints)) {
318
- config.constraints = normalizeAutoresearchList(
319
- candidate.constraints.filter((item): item is string => typeof item === "string"),
320
- );
321
- }
322
- return config;
323
- }
324
-
325
- function isRunEntry(value: unknown): value is AutoresearchJsonRunEntry {
326
- if (typeof value !== "object" || value === null) return false;
327
- const candidate = value as { type?: unknown };
328
- return candidate.type === undefined || candidate.type === "run";
329
- }
330
-
331
- function isExperimentStatus(value: unknown): value is ExperimentResult["status"] {
332
- return value === "keep" || value === "discard" || value === "crash" || value === "checks_failed";
333
- }
334
-
335
- function cloneNumericMetrics(value: unknown): NumericMetricMap {
336
- if (typeof value !== "object" || value === null) return {};
337
- const metrics = value as { [key: string]: unknown };
338
- const clone: NumericMetricMap = {};
339
- for (const [key, entryValue] of Object.entries(metrics)) {
340
- if (key === "__proto__" || key === "constructor" || key === "prototype") continue;
341
- if (typeof entryValue === "number" && Number.isFinite(entryValue)) {
342
- clone[key] = entryValue;
343
- }
344
- }
345
- return clone;
346
- }
347
-
348
- function cloneStringArray(value: unknown): string[] {
349
- if (!Array.isArray(value)) return [];
350
- return value.filter((item): item is string => typeof item === "string");
351
- }
352
-
353
- function hydrateMetricDefs(metricNames: string[] | undefined): MetricDef[] {
354
- if (!metricNames) return [];
355
- return metricNames.map(name => ({
356
- name,
357
- unit: inferMetricUnitFromName(name),
358
- }));
359
- }
360
-
361
- function cloneAsi(value: unknown): ExperimentResult["asi"] {
362
- if (typeof value !== "object" || value === null) return undefined;
363
- const clone: { [key: string]: unknown } = {};
364
- for (const [key, entryValue] of Object.entries(value)) {
365
- if (key === "__proto__" || key === "constructor" || key === "prototype") continue;
366
- clone[key] = structuredClone(entryValue);
367
- }
368
- return clone as ExperimentResult["asi"];
369
- }
370
-
371
264
  function parseControlEntry(value: unknown): AutoresearchControlEntryData | null {
372
265
  if (typeof value !== "object" || value === null) return null;
373
266
  const candidate = value as { goal?: unknown; mode?: unknown };