@oh-my-pi/pi-coding-agent 13.14.0 → 13.15.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +140 -0
- package/package.json +10 -8
- package/src/autoresearch/command-initialize.md +34 -0
- package/src/autoresearch/command-resume.md +17 -0
- package/src/autoresearch/contract.ts +332 -0
- package/src/autoresearch/dashboard.ts +447 -0
- package/src/autoresearch/git.ts +243 -0
- package/src/autoresearch/helpers.ts +458 -0
- package/src/autoresearch/index.ts +693 -0
- package/src/autoresearch/prompt.md +227 -0
- package/src/autoresearch/resume-message.md +16 -0
- package/src/autoresearch/state.ts +386 -0
- package/src/autoresearch/tools/init-experiment.ts +310 -0
- package/src/autoresearch/tools/log-experiment.ts +833 -0
- package/src/autoresearch/tools/run-experiment.ts +640 -0
- package/src/autoresearch/types.ts +218 -0
- package/src/cli/args.ts +8 -2
- package/src/cli/initial-message.ts +58 -0
- package/src/config/keybindings.ts +417 -212
- package/src/config/model-registry.ts +1 -0
- package/src/config/model-resolver.ts +57 -9
- package/src/config/settings-schema.ts +38 -10
- package/src/config/settings.ts +1 -4
- package/src/exec/bash-executor.ts +7 -5
- package/src/export/html/template.css +43 -13
- package/src/export/html/template.generated.ts +1 -1
- package/src/export/html/template.html +1 -0
- package/src/export/html/template.js +107 -0
- package/src/extensibility/extensions/types.ts +31 -8
- package/src/internal-urls/docs-index.generated.ts +1 -1
- package/src/lsp/index.ts +1 -1
- package/src/main.ts +44 -44
- package/src/mcp/oauth-discovery.ts +1 -1
- package/src/modes/acp/acp-agent.ts +957 -0
- package/src/modes/acp/acp-event-mapper.ts +531 -0
- package/src/modes/acp/acp-mode.ts +13 -0
- package/src/modes/acp/index.ts +2 -0
- package/src/modes/components/agent-dashboard.ts +5 -4
- package/src/modes/components/bash-execution.ts +40 -11
- package/src/modes/components/custom-editor.ts +47 -47
- package/src/modes/components/extensions/extension-dashboard.ts +2 -1
- package/src/modes/components/history-search.ts +2 -1
- package/src/modes/components/hook-editor.ts +2 -1
- package/src/modes/components/hook-input.ts +8 -7
- package/src/modes/components/hook-selector.ts +15 -10
- package/src/modes/components/keybinding-hints.ts +9 -9
- package/src/modes/components/login-dialog.ts +3 -3
- package/src/modes/components/mcp-add-wizard.ts +2 -1
- package/src/modes/components/model-selector.ts +14 -3
- package/src/modes/components/oauth-selector.ts +2 -1
- package/src/modes/components/python-execution.ts +2 -3
- package/src/modes/components/session-selector.ts +2 -1
- package/src/modes/components/settings-selector.ts +2 -1
- package/src/modes/components/status-line-segment-editor.ts +2 -1
- package/src/modes/components/tool-execution.ts +4 -5
- package/src/modes/components/tree-selector.ts +3 -2
- package/src/modes/components/user-message-selector.ts +3 -8
- package/src/modes/components/user-message.ts +16 -0
- package/src/modes/controllers/command-controller.ts +0 -2
- package/src/modes/controllers/extension-ui-controller.ts +89 -4
- package/src/modes/controllers/input-controller.ts +29 -23
- package/src/modes/controllers/mcp-command-controller.ts +1 -1
- package/src/modes/index.ts +1 -0
- package/src/modes/interactive-mode.ts +17 -5
- package/src/modes/print-mode.ts +1 -1
- package/src/modes/prompt-action-autocomplete.ts +7 -7
- package/src/modes/rpc/rpc-mode.ts +7 -2
- package/src/modes/rpc/rpc-types.ts +1 -0
- package/src/modes/theme/theme.ts +53 -44
- package/src/modes/types.ts +9 -2
- package/src/modes/utils/hotkeys-markdown.ts +19 -19
- package/src/modes/utils/keybinding-matchers.ts +21 -0
- package/src/modes/utils/ui-helpers.ts +1 -1
- package/src/patch/hashline.ts +139 -127
- package/src/patch/index.ts +77 -59
- package/src/patch/shared.ts +19 -11
- package/src/prompts/tools/hashline.md +43 -116
- package/src/sdk.ts +34 -17
- package/src/session/agent-session.ts +123 -30
- package/src/session/session-manager.ts +32 -31
- package/src/session/streaming-output.ts +87 -37
- package/src/tools/ask.ts +56 -30
- package/src/tools/bash-interactive.ts +2 -6
- package/src/tools/bash-interceptor.ts +1 -39
- package/src/tools/bash-skill-urls.ts +1 -1
- package/src/tools/browser.ts +1 -1
- package/src/tools/gemini-image.ts +1 -1
- package/src/tools/python.ts +2 -2
- package/src/tools/resolve.ts +1 -1
- package/src/utils/child-process.ts +88 -0
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,146 @@
|
|
|
2
2
|
|
|
3
3
|
## [Unreleased]
|
|
4
4
|
|
|
5
|
+
## [13.15.0] - 2026-03-23
|
|
6
|
+
### Breaking Changes
|
|
7
|
+
|
|
8
|
+
- Changed hashline edit schema from flat `op`/`pos`/`end`/`lines` fields to structured `loc`/`content` format with location-specific objects
|
|
9
|
+
- Renamed hashline edit operations: `replace_line` → `{ line: anchor }`, `replace_range` → `{ block: { pos, end } }`, `append_at` → `{ append: anchor }`, `prepend_at` → `{ prepend: anchor }`, `append_file` → `"append"`, `prepend_file` → `"prepend"`
|
|
10
|
+
- Changed `lines` parameter to `content` in hashline edit entries
|
|
11
|
+
- Renamed hashline edit operation types: `append` → `append_at`, `prepend` → `prepend_at`, `append_eof` → `append_file`, `prepend_bof` → `prepend_file`
|
|
12
|
+
- Changed hashline edit operation types from `replace` (with optional `end`) to explicit `replace_line` and `replace_range` operations
|
|
13
|
+
- Added required `append_eof` and `prepend_bof` operations for file-level edits; `append` and `prepend` now require an anchor position
|
|
14
|
+
- Made `pos` parameter required for `replace_line`, `append`, and `prepend` operations; `append_eof` and `prepend_bof` no longer accept anchors
|
|
15
|
+
|
|
16
|
+
### Added
|
|
17
|
+
|
|
18
|
+
- Added prompt for tradeoff metrics during autoresearch setup to collect secondary metrics alongside primary metric
|
|
19
|
+
- Added validation of contract path specifications to reject absolute paths and parent directory references
|
|
20
|
+
- Added stricter benchmark command validation in `isAutoresearchShCommand()` to reject chained commands, pipes, and redirects
|
|
21
|
+
- Added protection against prototype pollution in ASI data and metric cloning by filtering `__proto__`, `constructor`, and `prototype` keys
|
|
22
|
+
- Added `autoResumeArmed` flag to track when autoresearch should automatically resume pending runs
|
|
23
|
+
- Added `lastAutoResumePendingRunNumber` to prevent duplicate auto-resume prompts for the same pending run
|
|
24
|
+
- Added `git clean -X` invocation during failed experiment rollback to remove ignored build artifacts
|
|
25
|
+
- Added validation to reject `init_experiment` when a previous run is still pending and unlogged
|
|
26
|
+
- Added autoresearch contract system for validating benchmark commands, metrics, scope paths, off-limits paths, and constraints with fingerprint tracking to detect configuration drift
|
|
27
|
+
- Added `autoresearch.program.md` support for repo-local playbook overlays that guide session strategy while preserving `autoresearch.md` as source of truth
|
|
28
|
+
- Added pending run artifact tracking and recovery to resume incomplete experiments from `.autoresearch/runs/` directory with run numbers and benchmark logs
|
|
29
|
+
- Added run directory organization with numbered run artifacts, benchmark logs, and optional checks logs for experiment traceability
|
|
30
|
+
- Added segment fingerprinting to detect when benchmark configuration changes between runs and warn about potential incomparability
|
|
31
|
+
- Added support for secondary metrics tracking alongside primary metric with configurable direction (lower/higher is better)
|
|
32
|
+
- Added `getCurrentAutoresearchBranch()` helper to detect and validate existing autoresearch branches for session resumption
|
|
33
|
+
- Added `PendingRunSummary` type to track unlogged run state including parsed metrics, ASI data, and pass/fail status
|
|
34
|
+
- Added hidden next-turn message delivery via `deliverAs: 'nextTurn'` with optional `triggerTurn` to queue context for next LLM call without exposing in editable queue
|
|
35
|
+
- Added `#queueHiddenNextTurnMessage()` and `#promptQueuedHiddenNextTurnMessages()` to AgentSession for autonomous tool reactions
|
|
36
|
+
- Added resume context support in `command-resume.md` template for user-provided guidance when resuming sessions
|
|
37
|
+
- Added current segment snapshot display in autoresearch prompt showing recent runs, baseline metrics, and best results
|
|
38
|
+
- Added pending run indicator in autoresearch prompt to guide users to complete unlogged experiments before starting new benchmarks
|
|
39
|
+
- Added local playbook section in autoresearch prompt when `autoresearch.program.md` exists
|
|
40
|
+
- Added tab replacement in dashboard and tool output rendering to prevent display corruption from shell commands with tabs
|
|
41
|
+
- Added boundary duplication warning when replace_range or replace_line operations include a last inserted line that matches the next surviving line, helping detect off-by-one range errors
|
|
42
|
+
- Added git branch isolation for autoresearch sessions via `ensureAutoresearchBranch()` to safely revert failed experiments
|
|
43
|
+
- Added branch status line to autoresearch initialization and resume prompts showing created or reused branch name
|
|
44
|
+
- Added `Files in Scope`, `Off Limits`, and `Constraints` sections to autoresearch.md template for explicit scope definition
|
|
45
|
+
- Added validation of ASI metadata requirements in `log_experiment` tool, requiring hypothesis for all runs and rollback context for failed runs
|
|
46
|
+
- Added keybinding matcher utilities `matchesAppInterrupt()` and `matchesSelectCancel()` for consistent escape key handling across components
|
|
47
|
+
- Added support for customizable `app.interrupt` and `tui.select.cancel` keybindings in interactive components
|
|
48
|
+
- Added `defaultInactive` property to `ToolDefinition` to allow tools to be registered but excluded from the initial active set, with extension responsibility for activation/deactivation
|
|
49
|
+
- Added dynamic tool activation/deactivation in autoresearch mode via `setActiveTools()` API
|
|
50
|
+
- Added separate initialization and resume workflows for autoresearch with `command-initialize.md` and `command-resume.md` prompts
|
|
51
|
+
- Added intent dialog to prompt users for autoresearch optimization goals when starting fresh
|
|
52
|
+
- Added automatic detection of existing `autoresearch.md` to resume from previous sessions without re-prompting for intent
|
|
53
|
+
- Added autoresearch extension with autonomous experiment loop capabilities
|
|
54
|
+
- Added `init_experiment` tool to initialize and reset autoresearch sessions with configurable metrics
|
|
55
|
+
- Added `log_experiment` tool to record experiment results with metric parsing and confidence tracking
|
|
56
|
+
- Added `run_experiment` tool to execute commands and capture metrics with timeout and crash detection
|
|
57
|
+
- Added autoresearch dashboard controller for displaying experiment results and optimization progress
|
|
58
|
+
- Added support for secondary metrics tracking alongside primary metric
|
|
59
|
+
- Added `ExtensionWidgetContent` and `ExtensionUiComponentFactory` types for flexible widget configuration
|
|
60
|
+
- Added `ExtensionWidgetOptions` interface with `placement` parameter to position widgets above or below editor
|
|
61
|
+
- Added `WidgetPlacement` type supporting 'aboveEditor' and 'belowEditor' placement options
|
|
62
|
+
- Added `hookWidgetContainerAbove` and `hookWidgetContainerBelow` containers to InteractiveMode for separate widget management
|
|
63
|
+
- Added autoresearch mode for autonomous experiment loops with init_experiment, log_experiment, and run_experiment tools
|
|
64
|
+
- Added autoresearch dashboard widget displaying experiment results, metrics, and optimization progress
|
|
65
|
+
- Added support for metric tracking with configurable direction (lower/higher is better) and secondary metrics
|
|
66
|
+
- Added widget placement options to position extensions above or below the editor via `placement` parameter
|
|
67
|
+
- Added `ExtensionWidgetContent` and `ExtensionWidgetOptions` types for flexible widget configuration
|
|
68
|
+
- Added ACP (Agent Client Protocol) mode for headless agent operation via `--mode acp`
|
|
69
|
+
- Added support for Agent Client Protocol SDK integration with session management, MCP server configuration, and streaming communication
|
|
70
|
+
- Added `ensureOnDisk()` method to SessionManager to persist sessions immediately for ACP discovery
|
|
71
|
+
|
|
72
|
+
### Changed
|
|
73
|
+
|
|
74
|
+
- Changed `isAutoresearchShCommand()` to use proper command-line argument parsing instead of regex, improving accuracy for complex shell invocations
|
|
75
|
+
- Changed autoresearch initialization prompt to display collected tradeoff metrics in the setup summary
|
|
76
|
+
- Changed `command-initialize.md` template to include guidance on preflight requirements, comparability invariants, and marking measurement-critical files as off-limits
|
|
77
|
+
- Changed `command-initialize.md` to instruct users to write or update `autoresearch.program.md` with durable heuristics and repo-specific strategy
|
|
78
|
+
- Changed autoresearch resume guidance to emphasize continuing on the current protected branch rather than switching branches
|
|
79
|
+
- Changed autoresearch prompt to clarify that `autoresearch.md` holds durable conclusions while `autoresearch.ideas.md` is the scratch backlog
|
|
80
|
+
- Changed autoresearch prompt guidance to require stable measurement harness and fixed benchmark inputs unless intentionally starting a new segment
|
|
81
|
+
- Changed autoresearch prompt to recommend keeping equal or near-equal results when they materially simplify implementation
|
|
82
|
+
- Changed `init_experiment` to reset pending run state (checks, duration, ASI, artifact directory) when initializing a new segment
|
|
83
|
+
- Changed `log_experiment` to set `autoResumeArmed` flag after successfully logging a run to enable auto-resume on next agent turn
|
|
84
|
+
- Changed `run_experiment` to set `autoResumeArmed` flag and update dashboard after completing a run
|
|
85
|
+
- Changed auto-resume logic to only prompt when a new pending run exists or when `autoResumeArmed` is explicitly set, preventing duplicate prompts
|
|
86
|
+
- Changed path normalization in contract validation to use `path.posix.normalize()` for consistent path handling
|
|
87
|
+
- Changed autoresearch initialization to collect and validate benchmark command, metric definition, scope paths, off-limits list, and constraints before `init_experiment`
|
|
88
|
+
- Changed `init_experiment` to require exact benchmark command, metric definition, scope, off-limits, and constraints matching collected contract
|
|
89
|
+
- Changed `log_experiment` to record run number, benchmark command, scope paths, off-limits list, constraints, and segment fingerprint with each result
|
|
90
|
+
- Changed `run_experiment` to organize output in numbered run directories with separate benchmark and checks logs for artifact preservation
|
|
91
|
+
- Changed autoresearch dashboard to show pending run indicator when unlogged experiment exists
|
|
92
|
+
- Changed autoresearch resume workflow to detect and offer recovery of pending run artifacts before continuing experiment loop
|
|
93
|
+
- Changed `ExperimentResult` to include `runNumber`, `benchmarkCommand`, `scopePaths`, `offLimits`, `constraints`, and `segmentFingerprint` fields
|
|
94
|
+
- Changed `RunningExperiment` to track `runDirectory` and `runNumber` for artifact organization
|
|
95
|
+
- Changed `AutoresearchRuntime` to include `lastRunArtifactDir`, `lastRunNumber`, `lastRunSummary`, `benchmarkCommand`, `secondaryMetrics`, `scopePaths`, `offLimits`, `constraints`, and `segmentFingerprint`
|
|
96
|
+
- Changed autoresearch prompts to emphasize `autoresearch.md` as source of truth for benchmark, scope, and constraints
|
|
97
|
+
- Changed `command-initialize.md` to display collected setup (benchmark command, metric, direction, scope, off-limits, constraints) before initialization
|
|
98
|
+
- Changed `resume-message.md` to reference pending run artifacts and guide completion of unlogged experiments
|
|
99
|
+
- Changed `sendMessage()` API documentation to clarify `deliverAs: 'nextTurn'` behavior for hidden context delivery
|
|
100
|
+
- Changed `SendMessageHandler` type documentation to explain hidden next-turn message queuing during prompt teardown
|
|
101
|
+
- Changed autoresearch startup to create or reuse a dedicated `autoresearch/...` git branch before enabling the experiment loop
|
|
102
|
+
- Changed autoresearch to refuse startup when unrelated worktree changes would make auto-reverts unsafe
|
|
103
|
+
- Changed autoresearch prompts to emphasize scope and constraints as source of truth for session direction
|
|
104
|
+
- Changed component escape key handling to use keybinding manager for `app.interrupt` and `tui.select.cancel` with fallback to raw Escape matching
|
|
105
|
+
- Updated autoresearch prompt guidance to require explicit files in scope, off-limits paths, and session constraints
|
|
106
|
+
- Changed autoresearch command to use intent-based initialization instead of goal parameter, with user input dialog for new sessions
|
|
107
|
+
- Changed autoresearch startup to create or reuse a dedicated `autoresearch/...` git branch before enabling the experiment loop, and to refuse startup when unrelated worktree changes would make auto-reverts unsafe
|
|
108
|
+
- Changed autoresearch startup to activate experiment tools (`init_experiment`, `run_experiment`, `log_experiment`) only when autoresearch mode is enabled
|
|
109
|
+
- Changed autoresearch shutdown to deactivate experiment tools when mode is disabled or cleared
|
|
110
|
+
- Changed autoresearch session rehydration to dynamically manage experiment tool activation based on session state
|
|
111
|
+
- Changed autoresearch prompts and notes guidance to require explicit files in scope, off-limits paths, and session constraints
|
|
112
|
+
- Refactored hashline edit validation to enforce stricter anchor requirements per operation type
|
|
113
|
+
- Updated edit application logic to handle explicit file-level operations (`append_eof`, `prepend_bof`) separately from anchor-based operations
|
|
114
|
+
- Changed `setWidget` API to accept `ExtensionWidgetOptions` parameter for placement control
|
|
115
|
+
- Changed widget placement logic to manage widgets above and below editor separately
|
|
116
|
+
- Changed hashline edit application to preserve duplicated boundary lines exactly as provided instead of auto-correcting them
|
|
117
|
+
- Updated RPC mode to support widget placement option in `setWidget` requests
|
|
118
|
+
- Changed hashline edit application to preserve duplicated boundary lines exactly as provided instead of auto-correcting them
|
|
119
|
+
- Changed widget API to support placement options and component factories in addition to string arrays
|
|
120
|
+
- Updated extension UI controller to manage widgets above and below the editor separately
|
|
121
|
+
- Updated ask tool rendering to support markdown formatting in questions and option labels
|
|
122
|
+
- Refactored hook input and selector components to render titles as markdown for richer text formatting
|
|
123
|
+
- Changed session collection to include sessions with zero messages, enabling ACP mode to create discoverable sessions immediately
|
|
124
|
+
- Changed session persistence logic to use atomic file rewrite when flushing unflushed sessions to prevent duplication
|
|
125
|
+
- Removed hashline edit autocorrection for duplicated boundary lines; escaped-tab autocorrection remains available for leading `\\t` sequences
|
|
126
|
+
|
|
127
|
+
### Removed
|
|
128
|
+
|
|
129
|
+
- Removed `command-start.md` prompt template in favor of separate initialize and resume workflows
|
|
130
|
+
- Removed auto-correction of off-by-one range edits that duplicated closing braces or boundary lines
|
|
131
|
+
- Removed `shouldAutocorrect` function and related boundary line deduplication logic from hashline editor
|
|
132
|
+
- Removed auto-correction of off-by-one range edits that duplicated closing braces or boundary lines
|
|
133
|
+
|
|
134
|
+
### Fixed
|
|
135
|
+
|
|
136
|
+
- Fixed boundary duplication warnings to always display when replacement lines match the next surviving line, even when auto-correction is disabled
|
|
137
|
+
- Fixed secondary metrics validation to properly reject missing configured metrics and new metrics without force flag
|
|
138
|
+
- Fixed ASI data cloning to prevent prototype pollution attacks by filtering reserved property names
|
|
139
|
+
- Fixed autoresearch resume to detect and recover pending run artifacts that were left unlogged from previous sessions
|
|
140
|
+
- Fixed dashboard overlay to display when running experiment even with zero completed results
|
|
141
|
+
- Fixed tab character rendering in dashboard command display and tool output summaries
|
|
142
|
+
- Fixed autoresearch logging to require durable ASI metadata (hypothesis, rollback_reason, next_action_hint) for every run including rollback context for discarded, crashed, and checks-failed experiments
|
|
143
|
+
- Fixed autoresearch logging to require durable ASI metadata for every run, including rollback context for discarded, crashed, and checks-failed experiments
|
|
144
|
+
|
|
5
145
|
## [13.14.0] - 2026-03-20
|
|
6
146
|
|
|
7
147
|
### Added
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"type": "module",
|
|
3
3
|
"name": "@oh-my-pi/pi-coding-agent",
|
|
4
|
-
"version": "13.14.0",
|
|
4
|
+
"version": "13.15.2",
|
|
5
5
|
"description": "Coding agent CLI with read, bash, edit, write tools and session management",
|
|
6
6
|
"homepage": "https://github.com/can1357/oh-my-pi",
|
|
7
7
|
"author": "Can Boluk",
|
|
@@ -40,13 +40,14 @@
|
|
|
40
40
|
"test": "bun test"
|
|
41
41
|
},
|
|
42
42
|
"dependencies": {
|
|
43
|
+
"@agentclientprotocol/sdk": "0.16.1",
|
|
43
44
|
"@mozilla/readability": "^0.6",
|
|
44
|
-
"@oh-my-pi/omp-stats": "13.
|
|
45
|
-
"@oh-my-pi/pi-agent-core": "13.
|
|
46
|
-
"@oh-my-pi/pi-ai": "13.
|
|
47
|
-
"@oh-my-pi/pi-natives": "13.
|
|
48
|
-
"@oh-my-pi/pi-tui": "13.
|
|
49
|
-
"@oh-my-pi/pi-utils": "13.
|
|
45
|
+
"@oh-my-pi/omp-stats": "13.15.2",
|
|
46
|
+
"@oh-my-pi/pi-agent-core": "13.15.2",
|
|
47
|
+
"@oh-my-pi/pi-ai": "13.15.2",
|
|
48
|
+
"@oh-my-pi/pi-natives": "13.15.2",
|
|
49
|
+
"@oh-my-pi/pi-tui": "13.15.2",
|
|
50
|
+
"@oh-my-pi/pi-utils": "13.15.2",
|
|
50
51
|
"@sinclair/typebox": "^0.34",
|
|
51
52
|
"@xterm/headless": "^6.0",
|
|
52
53
|
"ajv": "^8.18",
|
|
@@ -54,7 +55,8 @@
|
|
|
54
55
|
"diff": "^8.0",
|
|
55
56
|
"handlebars": "^4.7",
|
|
56
57
|
"linkedom": "^0.18",
|
|
57
|
-
"puppeteer": "^24.37"
|
|
58
|
+
"puppeteer": "^24.37",
|
|
59
|
+
"zod": "4.3.6"
|
|
58
60
|
},
|
|
59
61
|
"devDependencies": {
|
|
60
62
|
"@types/bun": "^1.3"
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
Set up autoresearch for this intent:
|
|
2
|
+
|
|
3
|
+
{{intent}}
|
|
4
|
+
|
|
5
|
+
{{branch_status_line}}
|
|
6
|
+
|
|
7
|
+
Collected setup:
|
|
8
|
+
|
|
9
|
+
- benchmark command: `{{benchmark_command}}`
|
|
10
|
+
- primary metric: `{{metric_name}}`
|
|
11
|
+
- metric unit: `{{metric_unit}}`
|
|
12
|
+
- direction: `{{direction}}`
|
|
13
|
+
- tradeoff metrics:
|
|
14
|
+
{{{secondary_metrics_block}}}
|
|
15
|
+
- files in scope:
|
|
16
|
+
{{{scope_paths_block}}}
|
|
17
|
+
- off limits:
|
|
18
|
+
{{{off_limits_block}}}
|
|
19
|
+
- constraints:
|
|
20
|
+
{{{constraints_block}}}
|
|
21
|
+
|
|
22
|
+
Explain briefly what autoresearch will do in this repository, then initialize the workspace.
|
|
23
|
+
|
|
24
|
+
Your first actions:
|
|
25
|
+
- write `autoresearch.md`
|
|
26
|
+
- record the collected benchmark command, primary metric, metric unit, direction, tradeoff metrics, scope, off-limits list, and constraints in `autoresearch.md`
|
|
27
|
+
- add a short preflight section in `autoresearch.md` covering prerequisites, one-time setup, and the comparability invariant that must stay fixed across runs
|
|
28
|
+
- explicitly mark the ground-truth evaluator, fixed datasets, and other measurement-critical files as off-limits or hard constraints when they define the benchmark contract
|
|
29
|
+
- write or update `autoresearch.program.md` when you learn durable heuristics, failure patterns, or repo-specific strategy that future resume turns should inherit
|
|
30
|
+
- define the benchmark entrypoint in `autoresearch.sh`
|
|
31
|
+
- optionally add `autoresearch.checks.sh` if correctness or quality needs a hard gate
|
|
32
|
+
- run `init_experiment` with the exact collected benchmark command, metric definition, scope paths, off-limits list, and constraints
|
|
33
|
+
- run and log the baseline
|
|
34
|
+
- keep iterating until interrupted or until the configured iteration cap is reached
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Resume autoresearch from the attached notes.
|
|
2
|
+
|
|
3
|
+
@{{autoresearch_md_path}}
|
|
4
|
+
|
|
5
|
+
{{branch_status_line}}
|
|
6
|
+
{{#if has_resume_context}}
|
|
7
|
+
|
|
8
|
+
Additional context from the user:
|
|
9
|
+
|
|
10
|
+
{{resume_context}}
|
|
11
|
+
{{/if}}
|
|
12
|
+
|
|
13
|
+
Use the notes as the source of truth for the current direction, scope, and constraints.
|
|
14
|
+
- inspect recent git history for context
|
|
15
|
+
- inspect `autoresearch.jsonl` if it exists
|
|
16
|
+
- continue the most promising unfinished direction on the current protected branch
|
|
17
|
+
- keep iterating until interrupted or until the configured iteration cap is reached
|
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
import * as crypto from "node:crypto";
|
|
2
|
+
import * as fs from "node:fs";
|
|
3
|
+
import * as path from "node:path";
|
|
4
|
+
import type { AutoresearchBenchmarkContract, AutoresearchContract, MetricDirection } from "./types";
|
|
5
|
+
|
|
6
|
+
/**
 * Outcome of loading `autoresearch.md` from a working directory.
 */
export interface AutoresearchContractLoadResult {
	/** The parsed contract, or an empty contract when the file could not be read. */
	contract: AutoresearchContract;
	/** Human-readable problems found while reading or validating the contract; empty when none were found. */
	errors: string[];
	/** Location of the `autoresearch.md` file that was looked up. */
	path: string;
}
|
|
11
|
+
|
|
12
|
+
/**
 * Contents and locations of the autoresearch shell scripts found in a working directory.
 */
export interface AutoresearchScriptSnapshot {
	/** Text of `autoresearch.sh`; an empty string when the script could not be read. */
	benchmarkScript: string;
	/** Location where `autoresearch.sh` was looked up. */
	benchmarkScriptPath: string;
	/** Text of `autoresearch.checks.sh`, or `null` when it could not be read. */
	checksScript: string | null;
	/** Location where `autoresearch.checks.sh` was looked up. */
	checksScriptPath: string;
	/** Problems encountered while loading the scripts; empty when none occurred. */
	errors: string[];
}
|
|
19
|
+
|
|
20
|
+
/** Matches a level-two markdown heading (`## Title`); group 1 is the title without trailing whitespace. */
const HEADING_REGEX = /^##\s+(.+?)\s*$/;

/** Matches a `-` or `*` bullet, optionally indented; group 1 is the text after the bullet marker. */
const LIST_ITEM_REGEX = /^\s*[-*]\s+(.*)$/;

/**
 * Matches a `key: value` bullet. Group 1 is the key (everything up to the first colon),
 * group 2 is the value with leading whitespace after the colon removed.
 */
const KEY_VALUE_REGEX = /^\s*[-*]\s+([^:]+):\s*(.*)$/;
|
|
23
|
+
|
|
24
|
+
/**
 * Loads and validates the `autoresearch.md` contract located in `workDir`.
 *
 * The file is read synchronously as UTF-8. When the read fails, the result carries
 * an empty contract and exactly one error:
 * - a "does not exist" message when the file is missing (`ENOENT`);
 * - a "Failed to read" message carrying the underlying cause for any other failure
 *   (for example a permissions problem, or a directory sitting where the file should be),
 *   so the user is not told to create a file that is already there.
 *
 * When the read succeeds, the markdown is parsed and the validation errors
 * (possibly none) are returned together with the parsed contract.
 *
 * @param workDir - Directory expected to contain `autoresearch.md`.
 * @returns The contract, the errors found, and the path that was looked up.
 */
export function readAutoresearchContract(workDir: string): AutoresearchContractLoadResult {
	const contractPath = path.join(workDir, "autoresearch.md");

	let content: string;
	try {
		content = fs.readFileSync(contractPath, "utf8");
	} catch (error: unknown) {
		// Node attaches a string `code` (ENOENT, EACCES, EISDIR, ...) to filesystem errors.
		const code = typeof error === "object" && error !== null ? (error as { code?: unknown }).code : undefined;
		const message =
			code === "ENOENT"
				? `${contractPath} does not exist. Create it before initializing autoresearch.`
				: `Failed to read ${contractPath}: ${error instanceof Error ? error.message : String(error)}`;
		return {
			contract: createEmptyAutoresearchContract(),
			errors: [message],
			path: contractPath,
		};
	}

	const contract = parseAutoresearchContract(content);
	const errors = validateAutoresearchContract(contract);
	return { contract, errors, path: contractPath };
}
|
|
41
|
+
|
|
42
|
+
/**
 * Builds an {@link AutoresearchContract} from the text of `autoresearch.md`.
 *
 * The markdown is split into sections, and the `benchmark`, `files in scope`,
 * `off limits`, and `constraints` sections are parsed individually. A section that
 * is absent is parsed as an empty string. Path entries in the scope and off-limits
 * lists are passed through `normalizeContractPathSpec`; constraints are not.
 *
 * @param markdown - Full text of the contract file.
 * @returns The parsed contract (not yet validated).
 */
export function parseAutoresearchContract(markdown: string): AutoresearchContract {
	const sections = extractSections(markdown);
	// Missing sections fall back to an empty body so each parser always receives a string.
	const bodyOf = (sectionName: string) => sections.get(sectionName) ?? "";

	const benchmark = parseBenchmarkSection(bodyOf("benchmark"));
	const scopePaths = parseListSection(bodyOf("files in scope"), normalizeContractPathSpec);
	const offLimits = parseListSection(bodyOf("off limits"), normalizeContractPathSpec);
	const constraints = parseListSection(bodyOf("constraints"));

	return { benchmark, scopePaths, offLimits, constraints };
}
|
|
51
|
+
|
|
52
|
+
/**
 * Checks a parsed contract for missing or unsafe entries.
 *
 * Errors are reported in a fixed order: missing benchmark command, missing primary
 * metric, missing direction, empty scope list, then one error per unsafe scope path,
 * then one error per unsafe off-limits path.
 *
 * @param contract - Contract to validate.
 * @returns The list of error messages; empty when the contract passes every check.
 */
export function validateAutoresearchContract(contract: AutoresearchContract): string[] {
	const { benchmark, scopePaths, offLimits } = contract;

	// Each entry pairs "did this requirement fail?" with the message to report.
	const requirements: Array<[boolean, string]> = [
		[!benchmark.command, "Benchmark.command is required in autoresearch.md."],
		[!benchmark.primaryMetric, "Benchmark.primary metric is required in autoresearch.md."],
		[!benchmark.direction, "Benchmark.direction must be `lower` or `higher` in autoresearch.md."],
		[scopePaths.length === 0, "Files in Scope must contain at least one path in autoresearch.md."],
	];

	const errors: string[] = [];
	for (const [failed, message] of requirements) {
		if (failed) errors.push(message);
	}

	const unsafeScope = scopePaths.filter(entry => isUnsafeContractPathSpec(entry));
	errors.push(...unsafeScope.map(entry => `Files in Scope contains an invalid path: ${entry}`));

	const unsafeOffLimits = offLimits.filter(entry => isUnsafeContractPathSpec(entry));
	errors.push(...unsafeOffLimits.map(entry => `Off Limits contains an invalid path: ${entry}`));

	return errors;
}
|
|
78
|
+
|
|
79
|
+
/**
 * Computes a SHA-256 fingerprint of a contract together with its script contents.
 *
 * The fingerprint is the hex digest of the JSON serialization of
 * `{ benchmark, scopePaths, offLimits, constraints, scripts }`. Because the input is
 * produced by `JSON.stringify`, the result is sensitive to array order and to the
 * property order of the objects passed in.
 *
 * @param contract - Contract whose benchmark, scope, off-limits, and constraints are hashed.
 * @param scripts - Benchmark script text and checks script text (`null` when there is none).
 * @returns A 64-character lowercase hexadecimal digest.
 */
export function buildAutoresearchSegmentFingerprint(
	contract: AutoresearchContract,
	scripts: {
		benchmarkScript: string;
		checksScript: string | null;
	},
): string {
	const { benchmark, scopePaths, offLimits, constraints } = contract;
	// Property order below is part of the fingerprint; do not reorder.
	const serialized = JSON.stringify({ benchmark, scopePaths, offLimits, constraints, scripts });

	const hasher = crypto.createHash("sha256");
	hasher.update(serialized);
	return hasher.digest("hex");
}
|
|
95
|
+
|
|
96
|
+
/**
 * Explains why the stored segment fingerprint can no longer be trusted, if that is the case.
 *
 * The contract and scripts in `workDir` are re-read and re-fingerprinted, then compared
 * with the fingerprint recorded for the current segment.
 *
 * @param stateFingerprint - Fingerprint recorded for the current segment, or `null` when none was stored.
 * @param workDir - Directory containing `autoresearch.md` and the autoresearch scripts.
 * @returns `null` when the fingerprints match; otherwise a message telling the user to
 *   re-run `init_experiment` (missing fingerprint, load/validation errors, or changed files).
 */
export function getAutoresearchFingerprintMismatchError(
	stateFingerprint: string | null,
	workDir: string,
): string | null {
	// Covers both `null` and an empty string.
	if (stateFingerprint === null || stateFingerprint.length === 0) {
		return "The current segment has no fingerprint metadata. Re-run init_experiment before continuing.";
	}

	const { contract, errors: contractErrors } = readAutoresearchContract(workDir);
	const { benchmarkScript, checksScript, errors: scriptErrors } = loadAutoresearchScriptSnapshot(workDir);

	const problems = contractErrors.concat(scriptErrors);
	if (problems.length !== 0) {
		return `${problems.join(" ")} Re-run init_experiment after fixing the workspace contract.`;
	}

	const freshFingerprint = buildAutoresearchSegmentFingerprint(contract, { benchmarkScript, checksScript });
	return freshFingerprint === stateFingerprint
		? null
		: "autoresearch.md, autoresearch.sh, or autoresearch.checks.sh changed since the current segment was initialized. Re-run init_experiment before continuing.";
}
|
|
121
|
+
|
|
122
|
+
/**
 * Reads `autoresearch.sh` (required) and `autoresearch.checks.sh` (optional) from `workDir`.
 *
 * Missing files and unreadable files are told apart by the filesystem error code:
 * - `autoresearch.sh` missing (`ENOENT`): a "does not exist" error is recorded.
 * - `autoresearch.checks.sh` missing (`ENOENT`): not an error; `checksScript` is `null`.
 * - Either file present but unreadable (permissions, a directory in its place, ...):
 *   a "Failed to read" error carrying the underlying cause is recorded, so an existing
 *   script is never reported as missing or silently ignored.
 *
 * @param workDir - Directory expected to contain the autoresearch scripts.
 * @returns Script contents, the paths that were looked up, and any load errors.
 *   `benchmarkScript` is an empty string and `checksScript` is `null` when the
 *   corresponding file could not be read.
 */
export function loadAutoresearchScriptSnapshot(workDir: string): AutoresearchScriptSnapshot {
	type ScriptRead =
		| { status: "ok"; text: string }
		| { status: "missing" }
		| { status: "failed"; reason: string };

	// Reads one script and classifies the outcome instead of collapsing every failure into "missing".
	const readScript = (scriptPath: string): ScriptRead => {
		try {
			return { status: "ok", text: fs.readFileSync(scriptPath, "utf8") };
		} catch (error: unknown) {
			// Node attaches a string `code` (ENOENT, EACCES, EISDIR, ...) to filesystem errors.
			const code = typeof error === "object" && error !== null ? (error as { code?: unknown }).code : undefined;
			if (code === "ENOENT") {
				return { status: "missing" };
			}
			return { status: "failed", reason: error instanceof Error ? error.message : String(error) };
		}
	};

	const benchmarkScriptPath = path.join(workDir, "autoresearch.sh");
	const checksScriptPath = path.join(workDir, "autoresearch.checks.sh");
	const errors: string[] = [];

	let benchmarkScript = "";
	const benchmarkRead = readScript(benchmarkScriptPath);
	if (benchmarkRead.status === "ok") {
		benchmarkScript = benchmarkRead.text;
	} else if (benchmarkRead.status === "missing") {
		errors.push(`${benchmarkScriptPath} does not exist. Create it before initializing autoresearch.`);
	} else {
		errors.push(`Failed to read ${benchmarkScriptPath}: ${benchmarkRead.reason}`);
	}

	let checksScript: string | null = null;
	const checksRead = readScript(checksScriptPath);
	if (checksRead.status === "ok") {
		checksScript = checksRead.text;
	} else if (checksRead.status === "failed") {
		// The checks script is optional, but one that exists and cannot be read must not be skipped silently.
		errors.push(`Failed to read ${checksScriptPath}: ${checksRead.reason}`);
	}

	return {
		benchmarkScript,
		benchmarkScriptPath,
		checksScript,
		checksScriptPath,
		errors,
	};
}
|
|
149
|
+
|
|
150
|
+
/**
 * Trims every entry, drops entries that are empty after trimming, and removes
 * duplicates while keeping the position of each first occurrence.
 *
 * @param values - Raw entries; the input array is not modified.
 * @returns A new array of unique, trimmed, non-empty strings.
 */
export function normalizeAutoresearchList(values: readonly string[]): string[] {
	// A Set iterates in insertion order and ignores repeated additions, which
	// gives "first occurrence wins" de-duplication.
	const unique = new Set<string>();
	for (const entry of values) {
		const cleaned = entry.trim();
		if (cleaned) {
			unique.add(cleaned);
		}
	}
	return Array.from(unique);
}
|
|
162
|
+
|
|
163
|
+
/**
 * Canonicalizes a contract path spec: trims it, converts backslashes to forward
 * slashes, applies POSIX path normalization, and strips any leading `./` and
 * trailing slashes.
 *
 * @param value - Raw path spec.
 * @returns The canonical spec; `"."` when the spec normalizes to the current directory.
 */
export function normalizeContractPathSpec(value: string): string {
	const forwardSlashed = value.trim().split("\\").join("/");
	const collapsed = path.posix.normalize(forwardSlashed);
	if (collapsed === "." || collapsed === "./") {
		return ".";
	}
	const withoutLeadingDot = collapsed.replace(/^\.\/+/, "");
	return withoutLeadingDot.replace(/\/+$/, "");
}
|
|
168
|
+
|
|
169
|
+
/**
 * Tells whether `pathValue` is covered by the contract spec `specValue`.
 * Both values are canonicalized with `normalizeContractPathSpec` first.
 *
 * @param pathValue - Path to test.
 * @param specValue - Contract path spec to test against.
 * @returns `true` when the spec is `"."`, when both normalize to the same
 * value, or when the path lies underneath the spec directory.
 */
export function pathMatchesContractPath(pathValue: string, specValue: string): boolean {
	const candidate = normalizeContractPathSpec(pathValue);
	const spec = normalizeContractPathSpec(specValue);
	// "." matches every path.
	if (spec === ".") {
		return true;
	}
	if (candidate === spec) {
		return true;
	}
	// The separator keeps "src" from matching "src-old/file".
	return candidate.startsWith(`${spec}/`);
}
|
|
175
|
+
|
|
176
|
+
/**
 * Compares two lists after passing each through `normalizeAutoresearchList`.
 * Order is significant: the normalized lists must match element by element.
 *
 * @param left - First list.
 * @param right - Second list.
 * @returns `true` when the normalized lists have equal length and equal entries at every index.
 */
export function contractListsEqual(left: readonly string[], right: readonly string[]): boolean {
	const a = normalizeAutoresearchList(left);
	const b = normalizeAutoresearchList(right);
	return a.length === b.length && a.every((entry, position) => entry === b[position]);
}
|
|
182
|
+
|
|
183
|
+
/**
 * Compares two lists of contract path specs after passing each through
 * `normalizeContractPathList`.
 *
 * @param left - First list of path specs.
 * @param right - Second list of path specs.
 * @returns `true` when the normalized lists have equal length and equal entries at every index.
 */
export function contractPathListsEqual(left: readonly string[], right: readonly string[]): boolean {
	const a = normalizeContractPathList(left);
	const b = normalizeContractPathList(right);
	if (a.length !== b.length) {
		return false;
	}
	for (let position = 0; position < a.length; position += 1) {
		if (a[position] !== b[position]) {
			return false;
		}
	}
	return true;
}
|
|
189
|
+
|
|
190
|
+
/**
 * Builds a blank contract: every nullable benchmark field is `null`, the metric
 * unit is an empty string, and every list is empty.
 *
 * @returns A freshly allocated contract; no objects or arrays are shared between calls.
 */
function createEmptyAutoresearchContract(): AutoresearchContract {
	const benchmark: AutoresearchContract["benchmark"] = {
		command: null,
		primaryMetric: null,
		metricUnit: "",
		direction: null,
		secondaryMetrics: [],
	};
	return { benchmark, scopePaths: [], offLimits: [], constraints: [] };
}
|
|
204
|
+
|
|
205
|
+
/**
 * Canonicalizes a list of path specs so that two lists can be compared
 * regardless of order: each spec goes through `normalizeContractPathSpec`, the
 * result through `normalizeAutoresearchList`, and the outcome is sorted with
 * `localeCompare`.
 *
 * @param values - Raw path specs; the input array is not modified.
 * @returns A new, sorted array of normalized specs.
 */
function normalizeContractPathList(values: readonly string[]): string[] {
	const specs = values.map(normalizeContractPathSpec);
	const unique = normalizeAutoresearchList(specs);
	unique.sort((left, right) => left.localeCompare(right));
	return unique;
}
|
|
210
|
+
|
|
211
|
+
/**
 * Splits markdown into sections keyed by heading text.
 *
 * A line that matches `HEADING_REGEX` starts a new section; its first capture
 * group, trimmed and lower-cased, becomes the key. The lines up to the next
 * heading, joined with newlines and trimmed, become the value. Lines before the
 * first heading, and lines under a heading whose key is empty, are discarded.
 * When a heading repeats, the later section replaces the earlier one.
 *
 * @param markdown - Markdown text to split.
 * @returns Map from lower-cased heading to trimmed section body.
 */
function extractSections(markdown: string): Map<string, string> {
	const sections = new Map<string, string>();
	let heading: string | null = null;
	let body: string[] = [];

	// Stores the section collected so far, if a usable heading is active.
	const flush = (): void => {
		if (heading) {
			sections.set(heading, body.join("\n").trim());
		}
	};

	for (const line of markdown.split("\n")) {
		const headingMatch = line.match(HEADING_REGEX);
		if (!headingMatch) {
			if (heading) {
				body.push(line);
			}
			continue;
		}
		flush();
		heading = headingMatch[1]?.trim().toLowerCase() ?? null;
		body = [];
	}

	flush();
	return sections;
}
|
|
237
|
+
|
|
238
|
+
/**
 * Parses the benchmark section of the contract into its structured form.
 *
 * Every line matching `KEY_VALUE_REGEX` contributes one entry: capture group 1
 * is the key (reduced with `normalizeKey`) and capture group 2 the trimmed
 * value. For the `secondarymetrics` key, indented bullet lines that directly
 * follow are appended to the inline value as a comma-separated list.
 *
 * @param section - Body text of the benchmark section.
 * @returns The benchmark contract built from the collected entries.
 */
function parseBenchmarkSection(section: string): AutoresearchBenchmarkContract {
	const entries = new Map<string, string>();
	const lines = section.split("\n");

	let cursor = 0;
	while (cursor < lines.length) {
		const keyValue = (lines[cursor] ?? "").match(KEY_VALUE_REGEX);
		if (keyValue) {
			const key = normalizeKey(keyValue[1] ?? "");
			let value = (keyValue[2] ?? "").trim();

			if (key === "secondarymetrics") {
				const nestedItems: string[] = [];
				for (let probe = cursor + 1; probe < lines.length; probe += 1) {
					const candidate = lines[probe] ?? "";
					// The next key/value line ends the nested list.
					if (candidate.match(KEY_VALUE_REGEX)) break;
					const bullet = candidate.match(/^\s{2,}[-*]\s+(.*)$/);
					if (bullet) {
						nestedItems.push((bullet[1] ?? "").trim());
						// Consume the bullet so the outer loop resumes after it.
						cursor = probe;
					} else if (candidate.trim().length > 0) {
						// Any other non-blank line ends the list; blank lines are skipped.
						break;
					}
				}
				if (nestedItems.length > 0) {
					value = [value, ...nestedItems].filter(Boolean).join(", ");
				}
			}

			// A repeated key overwrites the earlier value.
			entries.set(key, value);
		}
		cursor += 1;
	}

	const direction = parseDirection(entries.get("direction"));
	const command = readNullableEntry(entries.get("command"));
	const primaryMetric = readNullableEntry(entries.get("primarymetric"));
	const metricUnit = entries.get("metricunit")?.trim() ?? "";
	const secondaryMetrics = parseSecondaryMetrics(entries.get("secondarymetrics"));
	return { command, primaryMetric, metricUnit, direction, secondaryMetrics };
}
|
|
276
|
+
|
|
277
|
+
/**
 * Parses a section body into a flat list of items.
 *
 * - A line matching `LIST_ITEM_REGEX` (defined elsewhere in this file;
 *   presumably a markdown bullet) starts a new item from capture group 1.
 * - A following line indented by at least two whitespace characters is
 *   appended to the current item, separated by a single space.
 * - Any other non-blank line becomes an item of its own.
 * - Blank lines are ignored, and a list item with empty text is dropped.
 *
 * Items are trimmed and de-duplicated with `normalizeAutoresearchList`. When
 * `normalizeItem` is given it is applied to each item, and the mapped values
 * are de-duplicated again so that inputs collapsing to the same normalized
 * value (for example `src` and `src/`) are returned once.
 *
 * @param section - Body text of the section.
 * @param normalizeItem - Optional per-item normalizer.
 * @returns Unique items in order of first appearance.
 */
function parseListSection(section: string, normalizeItem?: (value: string) => string): string[] {
	const items: string[] = [];
	let activeItem: string | null = null;
	for (const rawLine of section.split("\n")) {
		const line = rawLine.trimEnd();
		if (line.trim().length === 0) continue;
		const match = rawLine.match(LIST_ITEM_REGEX);
		if (match) {
			if (activeItem) items.push(activeItem);
			activeItem = (match[1] ?? "").trim();
			continue;
		}
		// Indented continuation of the item that is currently open.
		if (activeItem && /^\s{2,}\S/.test(rawLine)) {
			activeItem = `${activeItem} ${line.trim()}`;
			continue;
		}
		// A non-list line closes the open item and is kept as its own item.
		if (activeItem) {
			items.push(activeItem);
			activeItem = null;
		}
		items.push(line.trim());
	}
	if (activeItem) {
		items.push(activeItem);
	}
	const normalizedItems = normalizeAutoresearchList(items);
	if (!normalizeItem) return normalizedItems;
	// De-duplicate after mapping; a Set keeps first-occurrence order. The arrow
	// wrapper keeps map's index/array arguments away from the normalizer.
	return [...new Set(normalizedItems.map(item => normalizeItem(item)))];
}
|
|
305
|
+
|
|
306
|
+
/**
 * Reduces a key label to a comparison form: lower-cased, with every character
 * outside `a-z` and `0-9` removed.
 *
 * @param value - Raw key text.
 * @returns The lower-case alphanumeric form, e.g. `"Primary Metric"` becomes `"primarymetric"`.
 */
function normalizeKey(value: string): string {
	const lowered = value.toLowerCase();
	// Splitting on runs of disallowed characters and re-joining deletes them.
	return lowered.split(/[^a-z0-9]+/).join("");
}
|
|
309
|
+
|
|
310
|
+
/**
 * Validates a metric direction value.
 *
 * @param value - Candidate text; compared exactly (case-sensitive, no trimming).
 * @returns `"lower"` or `"higher"` when `value` is exactly one of those, otherwise `null`.
 */
function parseDirection(value: string | undefined): MetricDirection | null {
	switch (value) {
		case "lower":
		case "higher":
			return value;
		default:
			return null;
	}
}
|
|
314
|
+
|
|
315
|
+
/**
 * Converts an optional entry into a trimmed string or `null`.
 *
 * @param value - Entry text, or `undefined` when the entry is absent.
 * @returns The trimmed text, or `null` when the entry is absent or blank.
 */
function readNullableEntry(value: string | undefined): string | null {
	if (value === undefined) {
		return null;
	}
	const text = value.trim();
	return text === "" ? null : text;
}
|
|
319
|
+
|
|
320
|
+
/**
 * Splits a comma-separated list of secondary metric names.
 *
 * @param value - Comma-separated names, or `undefined`/empty when none were given.
 * @returns The trimmed, non-empty names after `normalizeAutoresearchList`;
 * an empty array when `value` is missing or empty.
 */
function parseSecondaryMetrics(value: string | undefined): string[] {
	if (!value) {
		return [];
	}
	const names: string[] = [];
	for (const piece of value.split(",")) {
		const name = piece.trim();
		if (name) {
			names.push(name);
		}
	}
	return normalizeAutoresearchList(names);
}
|
|
329
|
+
|
|
330
|
+
/**
 * Tells whether a contract path spec escapes the workspace.
 *
 * Backslashes are treated as separators, as `normalizeContractPathSpec` does,
 * so Windows-style input is judged the same way as POSIX-style input.
 *
 * @param value - Path spec to inspect. Only a leading `..` is detected, so the
 * spec should already be normalized (TODO confirm against callers).
 * @returns `true` when the spec is absolute (POSIX root, Windows drive root, or
 * UNC path) or starts by stepping into the parent directory.
 */
function isUnsafeContractPathSpec(value: string): boolean {
	const forwardSlashed = value.split("\\").join("/");
	// path.win32.isAbsolute also catches drive-letter roots such as "C:/repo",
	// which path.posix.isAbsolute reports as relative.
	if (path.posix.isAbsolute(forwardSlashed) || path.win32.isAbsolute(forwardSlashed)) {
		return true;
	}
	return forwardSlashed === ".." || forwardSlashed.startsWith("../");
}
|