autoresearcher 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +124 -0
- package/bin/autoresearcher.js +8 -0
- package/package.json +37 -0
- package/src/cli.js +357 -0
- package/src/config.js +29 -0
- package/src/internal-backend.js +72 -0
- package/src/progress-chart.js +207 -0
- package/src/run-loop.js +238 -0
- package/src/shell.js +35 -0
package/README.md
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# autoresearcher
|
|
2
|
+
|
|
3
|
+
`autoresearcher` is a standalone terminal CLI for benchmark-driven autonomous research loops.
|
|
4
|
+
|
|
5
|
+
It runs this cycle repeatedly:
|
|
6
|
+
|
|
7
|
+
1. Run one internal headless agent iteration.
|
|
8
|
+
2. Run your benchmark command.
|
|
9
|
+
3. Parse the metric from the benchmark output with a regex.
|
|
10
|
+
4. Keep the iteration only if the metric improved.
|
|
11
|
+
|
|
12
|
+
## Quick Start
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
npm i -g autoresearcher
|
|
16
|
+
|
|
17
|
+
# In your research repo:
|
|
18
|
+
cd /path/to/your/new-repo
|
|
19
|
+
autoresearcher wizard
|
|
20
|
+
autoresearcher run --iterations 20
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Commands
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
autoresearcher init
|
|
27
|
+
autoresearcher wizard
|
|
28
|
+
autoresearcher run [--iterations N]
|
|
29
|
+
autoresearcher progress [--run-id <id>] [--output <file.svg|file.png>]
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Important Config
|
|
33
|
+
|
|
34
|
+
The `init` command creates `.autoresearcher/config.json`:
|
|
35
|
+
|
|
36
|
+
```json
|
|
37
|
+
{
|
|
38
|
+
"agentMode": "internal",
|
|
39
|
+
"agentPromptFile": "program.md",
|
|
40
|
+
"agentPrompt": "Improve the benchmark metric while preserving correctness, test behavior, and safety.",
|
|
41
|
+
"agentCommand": "./scripts/agent-step.sh",
|
|
42
|
+
"backendAgent": "",
|
|
43
|
+
"backendModel": "",
|
|
44
|
+
"backendMaxIterations": 1,
|
|
45
|
+
"benchmarkCommand": "./scripts/benchmark.sh",
|
|
46
|
+
"metricRegex": "score=([0-9.]+)",
|
|
47
|
+
"direction": "max",
|
|
48
|
+
"iterations": 20,
|
|
49
|
+
"autoCommit": false,
|
|
50
|
+
"onRejectCommand": "",
|
|
51
|
+
"onKeepCommand": "",
|
|
52
|
+
"stopOnAgentFailure": true,
|
|
53
|
+
"streamAgentOutput": true,
|
|
54
|
+
"commitMessageTemplate": "research: improved metric to {metric} (iter {iteration})"
|
|
55
|
+
}
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
`agentMode: "internal"` is the default. For a fully custom step command, set `agentMode` to `"command"` and edit `agentCommand`.
|
|
59
|
+
|
|
60
|
+
## Example Configs
|
|
61
|
+
|
|
62
|
+
Default internal headless mode:
|
|
63
|
+
|
|
64
|
+
```json
|
|
65
|
+
{
|
|
66
|
+
"agentMode": "internal",
|
|
67
|
+
"agentPromptFile": "program.md",
|
|
68
|
+
"agentPrompt": "Improve benchmark with safe, minimal changes.",
|
|
69
|
+
"backendAgent": "amp",
|
|
70
|
+
"backendModel": "claude-sonnet-4-5-20250929",
|
|
71
|
+
"backendMaxIterations": 1,
|
|
72
|
+
"benchmarkCommand": "./scripts/benchmark.sh",
|
|
73
|
+
"metricRegex": "score=([0-9.]+)",
|
|
74
|
+
"direction": "max",
|
|
75
|
+
"iterations": 40,
|
|
76
|
+
"autoCommit": false
|
|
77
|
+
}
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Custom command mode:
|
|
81
|
+
|
|
82
|
+
```json
|
|
83
|
+
{
|
|
84
|
+
"agentMode": "command",
|
|
85
|
+
"agentCommand": "./scripts/agent-step.sh",
|
|
86
|
+
"benchmarkCommand": "./scripts/benchmark.sh",
|
|
87
|
+
"metricRegex": "score=([0-9.]+)",
|
|
88
|
+
"direction": "max",
|
|
89
|
+
"iterations": 40
|
|
90
|
+
}
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Typical Real Setup
|
|
94
|
+
|
|
95
|
+
1. Start with internal mode and tailor `agentPrompt` to your objective.
|
|
96
|
+
2. Set one provider key (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, or `OPENROUTER_API_KEY`).
|
|
97
|
+
3. Optionally pin `backendAgent` and `backendModel`.
|
|
98
|
+
4. Edit `program.md` with your experiment objective and constraints.
|
|
99
|
+
5. Replace `./scripts/benchmark.sh` so it prints one numeric metric, like `score=0.8123`.
|
|
100
|
+
6. Set `direction` to `max` or `min`.
|
|
101
|
+
7. Optionally switch to `agentMode: "command"` and customize `agentCommand`.
|
|
102
|
+
8. Optionally set `onRejectCommand` to revert non-improving changes.
|
|
103
|
+
|
|
104
|
+
## Progress Graph
|
|
105
|
+
|
|
106
|
+
Generate a chart from the latest run (PNG export relies on the macOS `sips` tool; use an `.svg` output path on other platforms):
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
autoresearcher progress --output progress.png
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
For a specific run ID:
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
autoresearcher progress --run-id 2026-03-13T12-51-48-680Z --output run.svg
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Example progress graph:
|
|
119
|
+
|
|
120
|
+

|
|
121
|
+
|
|
122
|
+
## Logs
|
|
123
|
+
|
|
124
|
+
Every run writes JSONL logs in `.autoresearcher/runs/<timestamp>.jsonl`.
|
package/package.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "autoresearcher",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Benchmark-driven autonomous research CLI for post-quantum and blockchain R&D",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"bin": {
|
|
7
|
+
"autoresearcher": "bin/autoresearcher.js"
|
|
8
|
+
},
|
|
9
|
+
"scripts": {
|
|
10
|
+
"start": "node ./bin/autoresearcher.js",
|
|
11
|
+
"check": "node ./bin/autoresearcher.js --help",
|
|
12
|
+
"build": "echo \"no build step for plain node cli\""
|
|
13
|
+
},
|
|
14
|
+
"keywords": [
|
|
15
|
+
"cli",
|
|
16
|
+
"autoresearch",
|
|
17
|
+
"benchmark",
|
|
18
|
+
"agent",
|
|
19
|
+
"post-quantum",
|
|
20
|
+
"blockchain",
|
|
21
|
+
"cryptography"
|
|
22
|
+
],
|
|
23
|
+
"repository": {
|
|
24
|
+
"type": "git",
|
|
25
|
+
"url": "git+https://github.com/multivmlabs/autoresearcher.git"
|
|
26
|
+
},
|
|
27
|
+
"files": [
|
|
28
|
+
"bin",
|
|
29
|
+
"src",
|
|
30
|
+
"README.md"
|
|
31
|
+
],
|
|
32
|
+
"homepage": "https://autoresearcher.multivmlabs.com",
|
|
33
|
+
"license": "MIT",
|
|
34
|
+
"dependencies": {
|
|
35
|
+
"ralph-starter": "^0.4.4"
|
|
36
|
+
}
|
|
37
|
+
}
|
package/src/cli.js
ADDED
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
import fs from 'node:fs/promises';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { stdin as input, stdout as output } from 'node:process';
|
|
4
|
+
import readline from 'node:readline/promises';
|
|
5
|
+
import { loadConfig, writeConfig, CONFIG_DIR } from './config.js';
|
|
6
|
+
import { DEFAULT_AGENT_MODE, DEFAULT_AGENT_PROMPT } from './internal-backend.js';
|
|
7
|
+
import { generateProgressChart } from './progress-chart.js';
|
|
8
|
+
import { runResearchLoop } from './run-loop.js';
|
|
9
|
+
|
|
10
|
+
// Backend agents offered by the setup wizard. The empty value means "let the backend pick".
const AGENT_CHOICES = [
  { label: 'Auto-detect (recommended)', value: '' },
  ...['amp', 'claude-code', 'codex', 'cursor', 'opencode', 'openclaw'].map((agentName) => ({
    label: agentName,
    value: agentName,
  })),
];
|
|
19
|
+
|
|
20
|
+
// Backend models offered by the setup wizard. '__custom__' is a sentinel that
// makes the wizard prompt for a free-form model ID.
const MODEL_CHOICES = [
  { label: 'Use backend default (recommended)', value: '' },
  ...['claude-sonnet-4-5-20250929', 'gpt-5'].map((modelId) => ({ label: modelId, value: modelId })),
  { label: 'Custom model ID', value: '__custom__' },
];
|
|
26
|
+
|
|
27
|
+
/**
 * Parse raw CLI tokens into an options object.
 *
 * Supported forms:
 *   - positional tokens (anything not starting with `--`) are collected in `_`
 *   - `--key value`  -> { key: 'value' }
 *   - `--key=value`  -> { key: 'value' } (split on the first `=`)
 *   - `--key`        -> { key: true } when no value follows
 *
 * An explicit empty-string value (`--key ""`) is kept as `''` rather than
 * being mistaken for a missing value.
 *
 * @param {string[]} args - Tokens after the sub-command.
 * @returns {{ _: string[], [key: string]: string | true | string[] }}
 */
function parseArgs(args) {
  const result = { _: [] };

  for (let i = 0; i < args.length; i++) {
    const token = args[i];
    if (!token.startsWith('--')) {
      result._.push(token);
      continue;
    }

    const body = token.slice(2);
    const equalsAt = body.indexOf('=');
    if (equalsAt > 0) {
      // Inline form: everything after the first "=" is the value, so values may contain "=".
      result[body.slice(0, equalsAt)] = body.slice(equalsAt + 1);
      continue;
    }

    const next = args[i + 1];
    // Only a truly absent token (or another option) means "boolean flag".
    if (next === undefined || next.startsWith('--')) {
      result[body] = true;
      continue;
    }

    result[body] = next;
    i++;
  }

  return result;
}
|
|
50
|
+
|
|
51
|
+
/**
 * Print the CLI usage text (commands, key config fields, examples) to stdout
 * with a single console.log call.
 */
function printHelp() {
  const helpText = `
autoresearcher - benchmark-driven autonomous research CLI

Usage:
autoresearcher init
autoresearcher wizard
autoresearcher run [--iterations N] [--agent-prompt "..."] [--benchmark-command "..."]
autoresearcher progress [--run-id <id>] [--output <file.svg|file.png>]
autoresearcher --help

Key config fields:
agentMode "internal" (default) or "command"
agentPromptFile Markdown objective file (default: program.md)
agentPrompt Iteration objective for internal headless agent backend
backendAgent Optional backend agent override (amp/codex/claude-code/...)
backendModel Optional backend model override (provider-specific)
agentCommand Shell command when agentMode is "command"
benchmarkCommand Shell command that prints metric output
metricRegex Regex with capture group, e.g. "score=([0-9.]+)"
direction "max" or "min"
iterations Loop count
autoCommit true/false (git add/commit on improvements)
onRejectCommand Optional command for rejected iterations

Examples:
autoresearcher init
autoresearcher wizard
autoresearcher run --iterations 30
autoresearcher progress --output progress.png
autoresearcher run --agent-prompt "improve benchmark metric without regressions"
autoresearcher run --agent-command "amp -p 'improve benchmark'" --benchmark-command "./scripts/benchmark.sh"
`;

  console.log(helpText);
}
|
|
85
|
+
|
|
86
|
+
/**
 * Build a fresh config object holding the defaults written by `init` and used
 * as the base for the wizard. Key order matches the order written to config.json.
 *
 * @returns {object} A new config object; callers may mutate it freely.
 */
function createDefaultConfig() {
  // Settings that describe how one agent step is executed.
  const agentDefaults = {
    agentMode: DEFAULT_AGENT_MODE,
    agentPromptFile: 'program.md',
    agentPrompt: DEFAULT_AGENT_PROMPT,
    agentCommand: './scripts/agent-step.sh',
    backendAgent: '',
    backendModel: '',
    backendMaxIterations: 1,
  };

  // Settings that describe the benchmark loop and what happens after each iteration.
  const loopDefaults = {
    benchmarkCommand: './scripts/benchmark.sh',
    metricRegex: 'score=([0-9.]+)',
    direction: 'max',
    iterations: 20,
    autoCommit: false,
    onRejectCommand: '',
    onKeepCommand: '',
    stopOnAgentFailure: true,
    streamAgentOutput: true,
    commitMessageTemplate: 'research: improved metric to {metric} (iter {iteration})',
  };

  return { ...agentDefaults, ...loopDefaults };
}
|
|
107
|
+
|
|
108
|
+
/**
 * Create the objective/program markdown file from a template, unless it already exists.
 *
 * The file is written with the exclusive-create flag so an existing file is
 * never overwritten (no check-then-write race), and the parent directory is
 * created first so nested paths such as `docs/program.md` work.
 *
 * @param {string} cwd - Directory that `programFile` is resolved against.
 * @param {string} [programFile='program.md'] - Target path; a falsy value disables the step.
 * @param {string} [objective] - Text placed under the "Objective:" heading.
 * @returns {Promise<void>}
 * @throws Any filesystem error other than "file already exists".
 */
async function writeProgramFile(cwd, programFile = 'program.md', objective = DEFAULT_AGENT_PROMPT) {
  if (!programFile) return;

  const programPath = path.resolve(cwd, programFile);

  const template = `# Research Program

Objective:
${objective}

Constraints:
- Preserve correctness and existing behavior.
- Keep changes minimal and measurable.
- Avoid adding unnecessary complexity.

Iteration Policy:
1. Propose one improvement.
2. Run benchmark.
3. Keep only metric improvements.
`;

  await fs.mkdir(path.dirname(programPath), { recursive: true });

  try {
    // 'wx' fails with EEXIST instead of truncating a file the user already edited.
    await fs.writeFile(programPath, template, { encoding: 'utf8', flag: 'wx' });
  } catch (err) {
    if (err.code !== 'EEXIST') throw err;
  }
}
|
|
138
|
+
|
|
139
|
+
/**
 * Show a numbered menu and keep prompting until the user picks a valid entry.
 *
 * Pressing Enter accepts the default, which is the option whose value equals
 * `currentValue` (or the first option when none matches). Only answers made up
 * entirely of digits are accepted, so input such as "1x" or "1.9" is rejected
 * instead of being silently read as 1.
 *
 * @param {{ question(prompt: string): Promise<string> }} rl - readline/promises interface.
 * @param {string} title - Heading printed above the menu.
 * @param {{ label: string, value: string }[]} options - Menu entries, in display order.
 * @param {string} [currentValue=''] - Value used to pick the default entry.
 * @returns {Promise<string>} The `value` of the chosen option.
 */
async function selectChoice(rl, title, options, currentValue = '') {
  const currentIndex = options.findIndex((option) => option.value === currentValue);
  const defaultIndex = currentIndex >= 0 ? currentIndex : 0;

  console.log(`\n${title}`);
  options.forEach((option, index) => {
    console.log(` ${index + 1}. ${option.label}`);
  });

  while (true) {
    const answer = (await rl.question(`Select option [${defaultIndex + 1}]: `)).trim();
    if (!answer) {
      return options[defaultIndex].value;
    }

    // Require a pure decimal integer; parseInt alone would accept trailing junk.
    if (/^\d+$/.test(answer)) {
      const picked = Number(answer);
      if (picked >= 1 && picked <= options.length) {
        return options[picked - 1].value;
      }
    }

    console.log(`Please enter a number between 1 and ${options.length}.`);
  }
}
|
|
163
|
+
|
|
164
|
+
/**
 * Prompt for a single line of free text.
 *
 * @param {{ question(prompt: string): Promise<string> }} rl - readline/promises interface.
 * @param {string} label - Prompt text shown before the colon.
 * @param {string} [defaultValue=''] - Shown in brackets and returned when the reply is blank.
 * @returns {Promise<string>} The trimmed reply, or `defaultValue` if the reply was empty.
 */
async function askText(rl, label, defaultValue = '') {
  const hint = defaultValue ? ` [${defaultValue}]` : '';
  const reply = await rl.question(`${label}${hint}: `);
  const trimmed = reply.trim();
  return trimmed || defaultValue;
}
|
|
169
|
+
|
|
170
|
+
/**
 * Interactive setup: asks for backend agent/model, objective, benchmark
 * command, metric regex, direction and iteration count, then writes the config
 * plus the script and program templates into the current directory.
 *
 * Existing config values (when a config can be loaded) are offered as defaults.
 * A previously saved model ID that is not one of MODEL_CHOICES pre-selects the
 * "Custom model ID" entry, so pressing Enter through the prompts keeps it
 * instead of silently resetting it to the backend default.
 *
 * @returns {Promise<void>}
 * @throws {Error} When stdin/stdout is not an interactive terminal.
 */
async function runWizard() {
  if (!process.stdin.isTTY || !process.stdout.isTTY) {
    throw new Error('Wizard requires an interactive terminal. Use "autoresearcher init" for non-interactive setup.');
  }

  const cwd = process.cwd();
  // A missing or unreadable config simply means "start from defaults".
  const existingConfig = await loadConfig(cwd).catch(() => null);
  const base = {
    ...createDefaultConfig(),
    ...(existingConfig || {}),
  };

  const rl = readline.createInterface({ input, output });

  try {
    console.log('autoresearcher setup wizard');
    console.log('This configures internal mode with your preferred agent and model.');

    const backendAgent = await selectChoice(rl, 'Choose backend agent', AGENT_CHOICES, base.backendAgent || '');

    const currentModel = base.backendModel || '';
    const isListedModel = MODEL_CHOICES.some((choice) => choice.value === currentModel);
    const modelSelection = await selectChoice(
      rl,
      'Choose backend model',
      MODEL_CHOICES,
      isListedModel ? currentModel : '__custom__'
    );

    let backendModel = modelSelection;
    if (modelSelection === '__custom__') {
      backendModel = await askText(rl, 'Enter custom model ID', currentModel);
    }

    const agentPrompt = await askText(rl, 'Iteration objective prompt', base.agentPrompt || DEFAULT_AGENT_PROMPT);
    const agentPromptFile = await askText(rl, 'Objective file path', base.agentPromptFile || 'program.md');
    const benchmarkCommand = await askText(rl, 'Benchmark command', base.benchmarkCommand || './scripts/benchmark.sh');
    const metricRegex = await askText(rl, 'Metric regex', base.metricRegex || 'score=([0-9.]+)');
    const direction = await selectChoice(
      rl,
      'Metric direction',
      [
        { label: 'max (higher is better)', value: 'max' },
        { label: 'min (lower is better)', value: 'min' },
      ],
      base.direction === 'min' ? 'min' : 'max'
    );

    const iterationInput = await askText(rl, 'Iterations per run', String(base.iterations ?? 20));
    const parsedIterations = Number.parseInt(iterationInput, 10);
    // Anything that is not a positive integer falls back to 20.
    const iterations = Number.isInteger(parsedIterations) && parsedIterations > 0 ? parsedIterations : 20;

    const config = {
      ...base,
      agentMode: DEFAULT_AGENT_MODE,
      backendAgent,
      backendModel,
      agentPromptFile,
      agentPrompt,
      benchmarkCommand,
      metricRegex,
      direction,
      iterations,
    };

    const configPath = await writeConfig(config, cwd);
    await writeExampleScripts(cwd);
    await writeProgramFile(cwd, config.agentPromptFile, config.agentPrompt);

    console.log(`\nSaved config to ${configPath}`);
    console.log('Created or refreshed program/objective and script templates');
    console.log('Run: autoresearcher run');
  } finally {
    // Always release the terminal, even when a prompt or write fails.
    rl.close();
  }
}
|
|
238
|
+
|
|
239
|
+
/**
 * Create the demo `scripts/benchmark.sh` and `scripts/agent-step.sh` templates.
 *
 * Each script is written only if it does not exist yet, so re-running `init`
 * or `wizard` never clobbers a benchmark or agent script the user has already
 * customised. Newly created scripts are made executable (0o755).
 *
 * @param {string} cwd - Project directory that receives the `scripts/` folder.
 * @returns {Promise<void>}
 * @throws Any filesystem error other than "file already exists".
 */
async function writeExampleScripts(cwd) {
  const scriptsDir = path.join(cwd, 'scripts');
  await fs.mkdir(scriptsDir, { recursive: true });

  const benchmarkScript = `#!/usr/bin/env bash
set -euo pipefail

# Demo metric: random score for quick smoke testing.
SCORE=$(awk 'BEGIN{srand(); printf "%.6f", rand()}')
echo "score=$SCORE"
`;

  const agentScript = `#!/usr/bin/env bash
set -euo pipefail

echo "agent iteration: \${AR_ITERATION:-0}"
# Optional: switch agentMode to "command" and use this script.
# Example: amp -p "improve benchmark metric"
`;

  const templates = [
    ['benchmark.sh', benchmarkScript],
    ['agent-step.sh', agentScript],
  ];

  for (const [fileName, contents] of templates) {
    const target = path.join(scriptsDir, fileName);
    try {
      // 'wx' refuses to overwrite an existing (possibly user-edited) script.
      await fs.writeFile(target, contents, { encoding: 'utf8', flag: 'wx' });
    } catch (err) {
      if (err.code === 'EEXIST') continue;
      throw err;
    }
    await fs.chmod(target, 0o755);
  }
}
|
|
267
|
+
|
|
268
|
+
/**
 * Non-interactive setup: writes the default config, the example scripts and
 * the program file into the current working directory, then prints next steps.
 *
 * @returns {Promise<void>}
 */
async function runInit() {
  const cwd = process.cwd();
  const defaults = createDefaultConfig();

  const configPath = await writeConfig(defaults, cwd);
  await writeExampleScripts(cwd);
  await writeProgramFile(cwd, defaults.agentPromptFile, defaults.agentPrompt);

  const summary = [
    `Initialized ${CONFIG_DIR} config at:`,
    ` ${configPath}`,
    'Created program.md and example scripts in ./scripts/.',
    'Next step: edit program.md and benchmark script, then run: autoresearcher run',
  ];
  for (const line of summary) {
    console.log(line);
  }
}
|
|
280
|
+
|
|
281
|
+
/**
 * Translate parsed CLI options into config overrides for `run`.
 *
 * Value options (for example `--iterations 30`) are copied to their camelCase
 * config key. A value option that was given without a value arrives here as
 * `true`; it is rejected with a clear error instead of leaking a boolean into
 * the config.
 *
 * Mode inference when `--agent-mode` is not given: `--agent-command` implies
 * "command" and `--agent-prompt` implies "internal" (the prompt wins if both
 * are present).
 *
 * @param {object} parsed - Options object as produced by parseArgs.
 * @returns {object} Partial config containing only the overridden keys.
 * @throws {Error} When a value option is present without a value.
 */
function buildOverrides(parsed) {
  const valueOptions = {
    'agent-mode': 'agentMode',
    'agent-prompt-file': 'agentPromptFile',
    'agent-prompt': 'agentPrompt',
    iterations: 'iterations',
    'agent-command': 'agentCommand',
    'backend-agent': 'backendAgent',
    'backend-model': 'backendModel',
    'backend-max-iterations': 'backendMaxIterations',
    'benchmark-command': 'benchmarkCommand',
    'metric-regex': 'metricRegex',
    direction: 'direction',
    'on-reject-command': 'onRejectCommand',
    'on-keep-command': 'onKeepCommand',
    'commit-template': 'commitMessageTemplate',
  };

  const overrides = {};
  for (const [cliKey, configKey] of Object.entries(valueOptions)) {
    const value = parsed[cliKey];
    if (value === undefined) continue;
    if (value === true) {
      throw new Error(`Option --${cliKey} requires a value.`);
    }
    overrides[configKey] = value;
  }

  if (parsed['agent-mode'] === undefined) {
    if (parsed['agent-command'] !== undefined) overrides.agentMode = 'command';
    // Checked second on purpose: an explicit prompt takes precedence over a command.
    if (parsed['agent-prompt'] !== undefined) overrides.agentMode = 'internal';
  }

  // Boolean switches; the "no-" variant is applied last and therefore wins.
  if (parsed['auto-commit'] === true) overrides.autoCommit = true;
  if (parsed['no-auto-commit'] === true) overrides.autoCommit = false;
  if (parsed['stream-agent-output'] === true) overrides.streamAgentOutput = true;
  if (parsed['no-stream-agent-output'] === true) overrides.streamAgentOutput = false;

  return overrides;
}
|
|
320
|
+
|
|
321
|
+
/**
 * CLI entry point: dispatches to the sub-command named by the first argument.
 *
 * Help is printed when no arguments are given or when `--help` / `-h` appears
 * anywhere in the argument list.
 *
 * @param {string[]} args - Command-line arguments, without the node and script paths.
 * @returns {Promise<void>}
 * @throws {Error} For an unknown command, plus anything the sub-command throws.
 */
export async function runCli(args) {
  const wantsHelp = args.length === 0 || args.some((arg) => arg === '--help' || arg === '-h');
  if (wantsHelp) {
    printHelp();
    return;
  }

  const [command, ...rest] = args;
  const parsed = parseArgs(rest);

  switch (command) {
    case 'init':
      await runInit();
      return;

    case 'wizard':
      await runWizard();
      return;

    case 'run': {
      const config = await loadConfig(process.cwd());
      const overrides = buildOverrides(parsed);
      await runResearchLoop(config, overrides);
      return;
    }

    case 'progress': {
      const chartPath = await generateProgressChart({
        runId: parsed['run-id'],
        output: parsed.output,
      });
      console.log(`Progress chart written: ${chartPath}`);
      return;
    }

    default:
      throw new Error(`Unknown command: ${command}`);
  }
}
|
package/src/config.js
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import fs from 'node:fs/promises';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
|
|
4
|
+
// Directory (relative to the project root) that holds autoresearcher state.
export const CONFIG_DIR = '.autoresearcher';
// File name of the config inside CONFIG_DIR.
export const CONFIG_FILE = 'config.json';
// Older directory name that loadConfig still falls back to.
const LEGACY_CONFIG_DIR = '.ar-agent';

/**
 * Compute the config file location for a project directory.
 *
 * @param {string} [cwd=process.cwd()] - Project root.
 * @returns {string} `<cwd>/.autoresearcher/config.json`
 */
export function getConfigPath(cwd = process.cwd()) {
  const segments = [cwd, CONFIG_DIR, CONFIG_FILE];
  return path.join(...segments);
}
|
|
11
|
+
|
|
12
|
+
/**
 * Load and parse the project config.
 *
 * Looks for `<cwd>/.autoresearcher/config.json` first and falls back to the
 * legacy `<cwd>/.ar-agent/config.json` only when the primary file does not
 * exist. Other read errors (for example permissions) are reported for the file
 * that actually failed rather than being masked by the fallback.
 *
 * @param {string} [cwd=process.cwd()] - Project root.
 * @returns {Promise<object>} The parsed config.
 * @throws {Error} With `code === 'ENOENT'` when neither file exists.
 * @throws {SyntaxError} When a config file is not valid JSON; the message names the file.
 */
export async function loadConfig(cwd = process.cwd()) {
  const candidates = [getConfigPath(cwd), path.join(cwd, LEGACY_CONFIG_DIR, CONFIG_FILE)];

  for (const filePath of candidates) {
    let raw;
    try {
      raw = await fs.readFile(filePath, 'utf8');
    } catch (err) {
      if (err.code === 'ENOENT') continue;
      throw err;
    }

    try {
      return JSON.parse(raw);
    } catch (err) {
      // Keep the original SyntaxError type but say which file is broken.
      err.message = `Invalid JSON in ${filePath}: ${err.message}`;
      throw err;
    }
  }

  const missing = new Error(`No config found at ${candidates[0]}. Run "autoresearcher init" first.`);
  // Preserve the code that a failed readFile would have carried.
  missing.code = 'ENOENT';
  throw missing;
}
|
|
22
|
+
|
|
23
|
+
/**
 * Persist a config object as pretty-printed JSON (2-space indent, trailing
 * newline) under `<cwd>/.autoresearcher/`, creating the directory if needed.
 *
 * @param {object} config - Config to serialise.
 * @param {string} [cwd=process.cwd()] - Project root.
 * @returns {Promise<string>} Path of the file that was written.
 */
export async function writeConfig(config, cwd = process.cwd()) {
  const target = getConfigPath(cwd);
  await fs.mkdir(path.join(cwd, CONFIG_DIR), { recursive: true });

  const serialized = JSON.stringify(config, null, 2);
  await fs.writeFile(target, `${serialized}\n`, 'utf8');

  return target;
}
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
import { createRequire } from 'node:module';
|
|
3
|
+
|
|
4
|
+
const require = createRequire(import.meta.url);
|
|
5
|
+
|
|
6
|
+
export const DEFAULT_AGENT_MODE = 'internal';
|
|
7
|
+
export const DEFAULT_AGENT_PROMPT =
|
|
8
|
+
'Improve the benchmark metric while preserving correctness, test behavior, and safety.';
|
|
9
|
+
|
|
10
|
+
/**
 * Wrap a value in single quotes for a POSIX shell, escaping embedded single
 * quotes with the close-quote, backslash-quote, reopen-quote sequence.
 *
 * @param {*} value - Coerced with String() before quoting.
 * @returns {string} The single-quoted string.
 */
function shellQuote(value) {
  const text = String(value);
  const escaped = text.split("'").join("'\\''");
  return "'" + escaped + "'";
}
|
|
13
|
+
|
|
14
|
+
/**
 * Work out the shell command prefix used to start the internal backend.
 *
 * Precedence:
 *   1. the AR_INTERNAL_BACKEND_COMMAND environment variable, used verbatim;
 *   2. the current Node binary plus `dist/cli.js` of the resolved
 *      `ralph-starter` package, both shell-quoted;
 *   3. the bare `ralph-starter` command when the package cannot be resolved.
 *
 * @returns {string} Command prefix to which the backend arguments are appended.
 */
function resolveBackendRunner() {
  const envOverride = process.env.AR_INTERNAL_BACKEND_COMMAND;
  if (envOverride) {
    return envOverride;
  }

  try {
    const manifestPath = require.resolve('ralph-starter/package.json');
    const backendCli = path.join(path.dirname(manifestPath), 'dist', 'cli.js');
    const quotedParts = [process.execPath, backendCli].map((part) => shellQuote(part));
    return quotedParts.join(' ');
  } catch {
    // Resolution failed: fall back to the bare command name.
    return 'ralph-starter';
  }
}
|
|
28
|
+
|
|
29
|
+
/**
 * Decide which agent mode a config selects.
 *
 * An explicit `agentMode` of "command" or "internal" is returned as is.
 * Otherwise a non-empty `agentCommand` selects "command", and anything else
 * falls back to DEFAULT_AGENT_MODE.
 *
 * @param {{ agentMode?: string, agentCommand?: string }} config
 * @returns {string} The resolved mode.
 */
export function resolveAgentMode(config) {
  const { agentMode, agentCommand } = config;

  if (['command', 'internal'].includes(agentMode)) {
    return agentMode;
  }

  return agentCommand ? 'command' : DEFAULT_AGENT_MODE;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Build the shell command that runs one internal backend iteration.
 *
 * The prompt is extended with the run id and iteration number, and every
 * user-supplied value is shell-quoted. `backendMaxIterations` is floored
 * before validation, so fractional values below 1 (for example 0.5) fall back
 * to 1 instead of producing `--max-iterations 0`.
 *
 * @param {object} options
 * @param {string} options.cwd - Passed to the backend as `--output-dir`.
 * @param {number|string} options.iteration - Current loop iteration, embedded in the prompt.
 * @param {string} options.runId - Run identifier, embedded in the prompt.
 * @param {string} [options.agentPrompt] - Objective text; defaults to DEFAULT_AGENT_PROMPT.
 * @param {string} [options.backendAgent] - Adds `--agent` when non-empty.
 * @param {string} [options.backendModel] - Adds `--model` when non-empty.
 * @param {number|string} [options.backendMaxIterations] - Backend iteration cap; defaults to 1.
 * @returns {string} The complete command line.
 */
export function buildInternalBackendCommand({
  cwd,
  iteration,
  runId,
  agentPrompt,
  backendAgent,
  backendModel,
  backendMaxIterations,
}) {
  const backendRunner = resolveBackendRunner();

  // Floor first, then validate: anything that is not a finite integer >= 1 becomes 1.
  const flooredMax = Math.floor(Number(backendMaxIterations ?? 1));
  const safeMaxIterations = Number.isFinite(flooredMax) && flooredMax >= 1 ? flooredMax : 1;

  const prompt = agentPrompt || DEFAULT_AGENT_PROMPT;
  const contextualPrompt = `${prompt}\n\nIteration context:\n- run_id: ${runId}\n- iteration: ${iteration}`;

  let command = `${backendRunner} run ${shellQuote(contextualPrompt)}`;
  command += ' --auto';
  command += ` --max-iterations ${safeMaxIterations}`;
  command += ` --output-dir ${shellQuote(cwd)}`;
  command += ' --no-track-progress --no-track-cost';

  if (backendAgent) {
    command += ` --agent ${shellQuote(backendAgent)}`;
  }

  if (backendModel) {
    command += ` --model ${shellQuote(backendModel)}`;
  }

  return command;
}
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
import fs from 'node:fs/promises';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { CONFIG_DIR } from './config.js';
|
|
4
|
+
import { runCommand } from './shell.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Single-quote a value for use as one POSIX shell argument. Embedded single
 * quotes are replaced by the close-quote, backslash-quote, reopen-quote sequence.
 *
 * @param {*} value - Coerced with String() before quoting.
 * @returns {string} The quoted argument.
 */
function shellQuote(value) {
  const pieces = String(value).split("'");
  return ["'", pieces.join("'\\''"), "'"].join('');
}
|
|
9
|
+
|
|
10
|
+
/**
 * Build the `d` attribute of an SVG path through the given points.
 *
 * @param {{ iteration: number, metric: number }[]} points - Points in drawing order.
 * @param {(iteration: number) => number} xScale - Maps an iteration to an x pixel.
 * @param {(metric: number) => number} yScale - Maps a metric to a y pixel.
 * @returns {string} "M x y L x y ..." with two decimals, or '' when there are no points.
 */
function linePath(points, xScale, yScale) {
  const segments = [];

  for (const [position, point] of points.entries()) {
    const px = xScale(point.iteration);
    const py = yScale(point.metric);
    // The first point moves the pen; every later point draws a line to it.
    const verb = position === 0 ? 'M' : 'L';
    segments.push(`${verb}${px.toFixed(2)} ${py.toFixed(2)}`);
  }

  return segments.join(' ');
}
|
|
20
|
+
|
|
21
|
+
/**
 * Produce evenly spaced tick values from `minValue` to `maxValue` inclusive.
 *
 * A degenerate range (min === max) or a tick count below 2 yields the single
 * tick `[minValue]`; without that guard a count of 1 would divide by zero and
 * produce NaN.
 *
 * @param {number} minValue - First tick.
 * @param {number} maxValue - Last tick.
 * @param {number} [count=5] - Requested number of ticks.
 * @returns {number[]} Tick values in ascending order of index.
 */
function buildTicks(minValue, maxValue, count = 5) {
  if (minValue === maxValue || !(count >= 2)) return [minValue];
  const step = (maxValue - minValue) / (count - 1);
  return Array.from({ length: count }, (_, index) => minValue + step * index);
}
|
|
26
|
+
|
|
27
|
+
/**
 * Locate the JSONL log for a run.
 *
 * With an explicit `runId` the matching `<runId>.jsonl` must exist; otherwise
 * the lexicographically last `*.jsonl` file in the runs directory is used.
 * The runs directory is created if missing so that an empty project reports
 * "no run logs" instead of a raw ENOENT.
 *
 * @param {string} cwd - Project root.
 * @param {string} [runId] - Run identifier (log file name without `.jsonl`).
 * @returns {Promise<{ runId: string, filePath: string, runsDir: string }>}
 * @throws {Error} When the requested run has no log, or no logs exist at all.
 */
async function resolveRunLogFile(cwd, runId) {
  const runsDir = path.join(cwd, CONFIG_DIR, 'runs');
  await fs.mkdir(runsDir, { recursive: true });

  if (runId) {
    const filePath = path.join(runsDir, `${runId}.jsonl`);
    try {
      await fs.access(filePath);
    } catch (err) {
      throw new Error(`Run log not found for run id "${runId}": ${filePath}`, { cause: err });
    }
    return { runId, filePath, runsDir };
  }

  const files = await fs.readdir(runsDir);
  const runFiles = files.filter((file) => file.endsWith('.jsonl')).sort();
  if (runFiles.length === 0) {
    throw new Error(`No run logs found in ${runsDir}`);
  }

  // Plain string order decides which file counts as the latest.
  const fileName = runFiles[runFiles.length - 1];
  return {
    runId: fileName.replace(/\.jsonl$/, ''),
    filePath: path.join(runsDir, fileName),
    runsDir,
  };
}
|
|
52
|
+
|
|
53
|
+
/**
 * Read a JSONL run log and extract the series needed for the progress chart.
 *
 * Lines that are blank, are not valid JSON (for example a final line truncated
 * by an interrupted run) or do not parse to an object are skipped, so one bad
 * line does not make the whole log unplottable.
 *
 * @param {string} filePath - Path of the `.jsonl` run log.
 * @returns {Promise<{
 *   metricPoints: { iteration: number, metric: number, status: * }[],
 *   bestPoints: { iteration: number, metric: number }[]
 * }>} Entries with a finite `iteration` and a finite `metric` / `bestMetric`.
 * @throws {Error} When the log contains no plottable metric point.
 */
async function loadChartData(filePath) {
  const raw = await fs.readFile(filePath, 'utf8');

  const entries = [];
  for (const line of raw.split('\n')) {
    const text = line.trim();
    if (!text) continue;

    let entry;
    try {
      entry = JSON.parse(text);
    } catch {
      // Malformed or partially written line: ignore it and keep the rest of the log.
      continue;
    }

    if (entry !== null && typeof entry === 'object') {
      entries.push(entry);
    }
  }

  const metricPoints = entries
    .filter((entry) => Number.isFinite(entry.metric) && Number.isFinite(entry.iteration))
    .map((entry) => ({
      iteration: Number(entry.iteration),
      metric: Number(entry.metric),
      status: entry.status,
    }));

  const bestPoints = entries
    .filter((entry) => Number.isFinite(entry.bestMetric) && Number.isFinite(entry.iteration))
    .map((entry) => ({
      iteration: Number(entry.iteration),
      metric: Number(entry.bestMetric),
    }));

  if (metricPoints.length === 0) {
    throw new Error('Run log does not contain plottable metric points.');
  }

  return { metricPoints, bestPoints };
}
|
|
83
|
+
|
|
84
|
+
/**
 * Escape text for safe inclusion in SVG/XML character data and attribute values.
 *
 * @param {*} value - Coerced with String().
 * @returns {string} Text with &, <, >, " and ' replaced by entities.
 */
function escapeXmlText(value) {
  return String(value)
    .replaceAll('&', '&amp;')
    .replaceAll('<', '&lt;')
    .replaceAll('>', '&gt;')
    .replaceAll('"', '&quot;')
    .replaceAll("'", '&apos;');
}

/**
 * Render the progress chart as a standalone SVG document.
 *
 * Draws the per-iteration metric line and dots ("keep" iterations are
 * highlighted), the best-so-far line, grid lines with tick labels, and a
 * legend. The run id is XML-escaped because it can come straight from the
 * command line or a file name and would otherwise corrupt the document when it
 * contains characters such as `&` or `<`.
 *
 * @param {object} chart
 * @param {string} chart.runId - Shown in the subtitle.
 * @param {{ iteration: number, metric: number, status: * }[]} chart.metricPoints - Must be non-empty.
 * @param {{ iteration: number, metric: number }[]} chart.bestPoints - May be empty.
 * @returns {string} SVG markup, 1200x680.
 */
function renderSvg({ runId, metricPoints, bestPoints }) {
  const width = 1200;
  const height = 680;
  const margin = { top: 72, right: 48, bottom: 72, left: 88 };
  const plotWidth = width - margin.left - margin.right;
  const plotHeight = height - margin.top - margin.bottom;

  const allMetrics = [...metricPoints.map((point) => point.metric), ...bestPoints.map((point) => point.metric)];
  const minMetric = Math.min(...allMetrics);
  const maxMetric = Math.max(...allMetrics);
  // Never let the range collapse to zero: a flat series still gets a visible band.
  const metricRange = Math.max(maxMetric - minMetric, Math.max(Math.abs(maxMetric) * 0.05, 1e-9));
  const yMin = minMetric - metricRange * 0.15;
  const yMax = maxMetric + metricRange * 0.15;

  const minIteration = Math.min(...metricPoints.map((point) => point.iteration));
  const maxIteration = Math.max(...metricPoints.map((point) => point.iteration));
  const iterationRange = Math.max(maxIteration - minIteration, 1);

  const xScale = (iteration) => margin.left + ((iteration - minIteration) / iterationRange) * plotWidth;
  // SVG y grows downwards, so larger metrics map to smaller y.
  const yScale = (metric) => margin.top + ((yMax - metric) / (yMax - yMin)) * plotHeight;

  const xTicks = buildTicks(minIteration, maxIteration, Math.min(6, iterationRange + 1));
  const yTicks = buildTicks(yMin, yMax, 6);

  const metricPath = linePath(metricPoints, xScale, yScale);
  const bestPath = linePath(bestPoints, xScale, yScale);

  const pointDots = metricPoints
    .map((point) => {
      const color = point.status === 'keep' ? '#7fd3ff' : '#4a5f79';
      return `<circle cx="${xScale(point.iteration).toFixed(2)}" cy="${yScale(point.metric).toFixed(2)}" r="4" fill="${color}" />`;
    })
    .join('\n');

  const xTickLines = xTicks
    .map((tick) => {
      const x = xScale(tick).toFixed(2);
      const label = Number.isInteger(tick) ? String(Math.round(tick)) : tick.toFixed(1);
      return `<line x1="${x}" y1="${margin.top}" x2="${x}" y2="${height - margin.bottom}" stroke="#14202f" stroke-width="1" />
<text x="${x}" y="${height - margin.bottom + 26}" text-anchor="middle" fill="#9bb0c8" font-size="13">${label}</text>`;
    })
    .join('\n');

  const yTickLines = yTicks
    .map((tick) => {
      const y = yScale(tick).toFixed(2);
      return `<line x1="${margin.left}" y1="${y}" x2="${width - margin.right}" y2="${y}" stroke="#14202f" stroke-width="1" />
<text x="${margin.left - 12}" y="${(Number(y) + 5).toFixed(2)}" text-anchor="end" fill="#9bb0c8" font-size="13">${tick.toFixed(6)}</text>`;
    })
    .join('\n');

  return `<?xml version="1.0" encoding="UTF-8"?>
<svg xmlns="http://www.w3.org/2000/svg" width="${width}" height="${height}" viewBox="0 0 ${width} ${height}">
<rect width="100%" height="100%" fill="#000000" />
<text x="${margin.left}" y="40" fill="#f2f7ff" font-size="28" font-family="Inter, system-ui, sans-serif" font-weight="700">autoresearcher progress</text>
<text x="${margin.left}" y="62" fill="#8ea1b9" font-size="14" font-family="Inter, system-ui, sans-serif">run ${escapeXmlText(runId)}</text>

${xTickLines}
${yTickLines}

<line x1="${margin.left}" y1="${height - margin.bottom}" x2="${width - margin.right}" y2="${height - margin.bottom}" stroke="#304156" stroke-width="1.5" />
<line x1="${margin.left}" y1="${margin.top}" x2="${margin.left}" y2="${height - margin.bottom}" stroke="#304156" stroke-width="1.5" />

<path d="${metricPath}" fill="none" stroke="#4a5f79" stroke-width="2" />
<path d="${bestPath}" fill="none" stroke="#7fd3ff" stroke-width="3" />
${pointDots}

<text x="${margin.left}" y="${height - 18}" fill="#8ea1b9" font-size="13" font-family="Inter, system-ui, sans-serif">iteration</text>
<text x="18" y="${margin.top - 18}" fill="#8ea1b9" font-size="13" font-family="Inter, system-ui, sans-serif">metric</text>

<rect x="${width - 255}" y="30" width="205" height="54" rx="10" fill="#070b11" stroke="#1b2635" />
<line x1="${width - 240}" y1="50" x2="${width - 200}" y2="50" stroke="#7fd3ff" stroke-width="3" />
<text x="${width - 192}" y="54" fill="#d5e8fb" font-size="13" font-family="Inter, system-ui, sans-serif">best so far</text>
<line x1="${width - 240}" y1="70" x2="${width - 200}" y2="70" stroke="#4a5f79" stroke-width="2" />
<text x="${width - 192}" y="74" fill="#d5e8fb" font-size="13" font-family="Inter, system-ui, sans-serif">iteration metric</text>
</svg>`;
}
|
|
161
|
+
|
|
162
|
+
/**
 * Render the progress chart for a run and write it to disk.
 *
 * The run log is located via `resolveRunLogFile`, turned into chart data by
 * `loadChartData`, and rendered with `renderSvg`. The output format is chosen
 * from the extension of the destination path: no extension or `.svg` writes
 * the SVG directly; `.png` writes a temporary SVG next to the destination and
 * converts it with the `sips` command.
 *
 * @param {object} params
 * @param {string} [params.runId] - Run to chart; forwarded to `resolveRunLogFile`.
 * @param {string} [params.output] - Destination path, resolved against the
 *   current working directory. Defaults to `<charts dir>/<run id>.svg`.
 * @returns {Promise<string>} Path of the file that was written.
 * @throws {Error} When the PNG conversion command exits non-zero, or when the
 *   destination extension is neither `.svg` nor `.png`.
 */
export async function generateProgressChart({ runId, output }) {
  const workingDir = process.cwd();
  const runLog = await resolveRunLogFile(workingDir, runId);
  const chartData = await loadChartData(runLog.filePath);

  // The charts directory sits beside the runs directory and is always created,
  // even when an explicit output path points somewhere else.
  const chartsDir = path.join(path.dirname(runLog.runsDir), 'charts');
  await fs.mkdir(chartsDir, { recursive: true });

  let destination;
  if (output) {
    destination = path.resolve(workingDir, output);
  } else {
    destination = path.join(chartsDir, `${runLog.runId}.svg`);
  }

  const extension = path.extname(destination).toLowerCase();
  const svgMarkup = renderSvg({ runId: runLog.runId, ...chartData });

  switch (extension) {
    case '':
    case '.svg': {
      // An extension-less destination gets `.svg` appended.
      const svgPath = extension === '' ? `${destination}.svg` : destination;
      await fs.mkdir(path.dirname(svgPath), { recursive: true });
      await fs.writeFile(svgPath, svgMarkup, 'utf8');
      return svgPath;
    }

    case '.png': {
      const pngDir = path.dirname(destination);
      await fs.mkdir(pngDir, { recursive: true });

      // The converter reads from a file, so stage the SVG under a
      // timestamped scratch name in the destination directory.
      const scratchName = `${path.basename(destination, '.png')}.tmp-${Date.now()}.svg`;
      const scratchPath = path.join(pngDir, scratchName);
      await fs.writeFile(scratchPath, svgMarkup, 'utf8');

      const conversion = await runCommand(
        `sips -s format png ${shellQuote(scratchPath)} --out ${shellQuote(destination)}`,
        { cwd: workingDir }
      );

      // Best-effort cleanup: a failed unlink must not mask the conversion result.
      await fs.unlink(scratchPath).catch(() => {});

      if (conversion.code !== 0) {
        throw new Error('PNG export requires macOS sips. Use --output <name>.svg or install a converter.');
      }
      return destination;
    }

    default:
      throw new Error('Unsupported output extension. Use .svg or .png');
  }
}
|
package/src/run-loop.js
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
import fs from 'node:fs/promises';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { CONFIG_DIR } from './config.js';
|
|
4
|
+
import {
|
|
5
|
+
buildInternalBackendCommand,
|
|
6
|
+
DEFAULT_AGENT_PROMPT,
|
|
7
|
+
resolveAgentMode,
|
|
8
|
+
} from './internal-backend.js';
|
|
9
|
+
import { runCommand } from './shell.js';
|
|
10
|
+
|
|
11
|
+
/**
 * Extract a numeric metric from benchmark output.
 *
 * The pattern is compiled with the multiline flag, so `^` and `$` match at
 * line boundaries. The metric is taken from the first capture group.
 *
 * @param {string} output - Combined benchmark output to search.
 * @param {string} metricRegex - Regular-expression source with one capture group.
 * @returns {number|null} The parsed finite number, or `null` when the pattern
 *   does not match, the first group is missing/blank, or it is not a finite number.
 */
function parseMetric(output, metricRegex) {
  const match = output.match(new RegExp(metricRegex, 'm'));

  // Trim before the emptiness check: Number(' ') is 0, so a whitespace-only
  // capture would otherwise be reported as a real metric of zero.
  const captured = match?.[1]?.trim();
  if (!captured) return null;

  const metric = Number(captured);
  return Number.isFinite(metric) ? metric : null;
}
|
|
18
|
+
|
|
19
|
+
/**
 * Decide whether a metric beats the best value seen so far.
 *
 * @param {number} metric - Metric from the current iteration.
 * @param {number|null|undefined} best - Best metric so far; nullish means none yet.
 * @param {string} direction - `'min'` means lower is better; any other value
 *   means higher is better.
 * @returns {boolean} `true` when there is no previous best or `metric` is
 *   strictly better than `best`.
 */
function isBetter(metric, best, direction) {
  if (best === null || best === undefined) {
    return true;
  }
  if (direction === 'min') {
    return metric < best;
  }
  return metric > best;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Append one entry to a run's JSONL log, creating the runs directory if needed.
 *
 * The log lives at `<cwd>/<CONFIG_DIR>/runs/<runId>.jsonl`; each entry is
 * written as a single JSON line.
 *
 * @param {string} cwd - Directory that contains the config directory.
 * @param {string} runId - Run identifier, used as the log file's base name.
 * @param {object} entry - JSON-serializable record to append.
 * @returns {Promise<void>}
 */
async function appendRunLog(cwd, runId, entry) {
  const logDirectory = path.join(cwd, CONFIG_DIR, 'runs');
  const logFile = path.join(logDirectory, `${runId}.jsonl`);

  await fs.mkdir(logDirectory, { recursive: true });

  const line = `${JSON.stringify(entry)}\n`;
  await fs.appendFile(logFile, line, 'utf8');
}
|
|
30
|
+
|
|
31
|
+
/**
 * Look up the abbreviated hash of the current git HEAD.
 *
 * @param {string} cwd - Directory in which to run git.
 * @returns {Promise<string|null>} The trimmed output of
 *   `git rev-parse --short HEAD`, or `null` when the command exits non-zero.
 */
async function getGitCommit(cwd) {
  const { code, stdout } = await runCommand('git rev-parse --short HEAD', { cwd });
  if (code !== 0) {
    return null;
  }
  return stdout.trim();
}
|
|
35
|
+
|
|
36
|
+
/**
 * Report whether the working tree has any changes according to git.
 *
 * @param {string} cwd - Directory in which to run git.
 * @returns {Promise<boolean>} `true` only when `git status --porcelain`
 *   succeeds and prints something other than whitespace.
 */
async function hasGitChanges(cwd) {
  const { code, stdout } = await runCommand('git status --porcelain', { cwd });
  if (code !== 0) {
    return false;
  }
  return stdout.trim().length > 0;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Work out which shell command performs the agent step for one iteration.
 *
 * In `'internal'` mode the command is produced by `buildInternalBackendCommand`;
 * in every other mode the configured `agentCommand` is used as-is.
 *
 * @param {object} merged - Effective configuration (config plus CLI overrides).
 * @param {string} cwd - Working directory passed to the internal backend builder.
 * @param {number} iteration - 1-based iteration number.
 * @param {string} runId - Identifier of the current run.
 * @param {string} agentPrompt - Objective text passed to the internal backend builder.
 * @returns {{agentMode: string, command: string}} Resolved mode and command.
 */
function getAgentStepCommand(merged, cwd, iteration, runId, agentPrompt) {
  const agentMode = resolveAgentMode(merged);

  if (agentMode !== 'internal') {
    return { agentMode, command: merged.agentCommand };
  }

  const { backendAgent, backendModel, backendMaxIterations } = merged;
  const command = buildInternalBackendCommand({
    cwd,
    iteration,
    runId,
    agentPrompt,
    backendAgent,
    backendModel,
    backendMaxIterations,
  });

  return { agentMode, command };
}
|
|
64
|
+
|
|
65
|
+
/**
 * Determine the agent objective text and where it came from.
 *
 * A non-empty `agentPromptFile` (resolved against `cwd`) takes precedence.
 * If that file cannot be read or is blank, the inline `agentPrompt` is used,
 * and failing that `DEFAULT_AGENT_PROMPT`.
 *
 * @param {object} merged - Effective configuration.
 * @param {string} cwd - Base directory for resolving `agentPromptFile`.
 * @returns {Promise<{prompt: string, source: string}>} The prompt text and a
 *   label for its origin: the prompt file name, or `'config.agentPrompt'`.
 */
async function resolveAgentPrompt(merged, cwd) {
  const inlinePrompt = () => ({
    prompt: merged.agentPrompt || DEFAULT_AGENT_PROMPT,
    source: 'config.agentPrompt',
  });

  const { agentPromptFile } = merged;
  if (!agentPromptFile) {
    return inlinePrompt();
  }

  const promptPath = path.resolve(cwd, agentPromptFile);

  let fileText;
  try {
    fileText = await fs.readFile(promptPath, 'utf8');
  } catch {
    // Any read failure (typically a missing file) falls back to the inline prompt.
    return inlinePrompt();
  }

  const trimmed = fileText.trim();
  if (trimmed) {
    return { prompt: trimmed, source: agentPromptFile };
  }
  return inlinePrompt();
}
|
|
84
|
+
|
|
85
|
+
/**
 * Quote a commit message for use as a single shell argument to `git commit -m`.
 *
 * On POSIX shells, backslash, double quote, `$` and backtick keep their
 * special meaning inside double quotes, so all four are backslash-escaped.
 * On Windows only double quotes are escaped, because a backslash before any
 * other character would be passed through literally.
 *
 * @param {string} message - Raw commit message.
 * @returns {string} The message wrapped in double quotes, safely escaped.
 */
function quoteCommitMessage(message) {
  if (process.platform === 'win32') {
    return `"${message.replaceAll('"', '\\"')}"`;
  }
  return `"${message.replace(/[\\"$`]/g, '\\$&')}"`;
}

/**
 * Run the benchmark-driven research loop.
 *
 * Each iteration runs the agent step, runs the benchmark command, parses the
 * metric from the benchmark's combined stdout/stderr, and compares it with the
 * best metric seen so far. Improved iterations are marked "keep" (optionally
 * committed and followed by `onKeepCommand`); others are marked "reject"
 * (followed by `onRejectCommand` when configured). Every iteration outcome is
 * appended to the run's JSONL log, and a summary is printed at the end.
 *
 * @param {object} config - Configuration; `benchmarkCommand` and `metricRegex`
 *   are required, and `agentCommand` is required when the agent mode is `'command'`.
 * @param {object} [cliOverrides={}] - Values that take precedence over `config`.
 * @returns {Promise<void>}
 * @throws {Error} When required configuration is missing or `iterations` is
 *   not a number.
 */
export async function runResearchLoop(config, cliOverrides = {}) {
  const cwd = process.cwd();
  const merged = {
    ...config,
    ...cliOverrides,
  };

  const required = ['benchmarkCommand', 'metricRegex'];
  for (const key of required) {
    if (!merged[key]) {
      throw new Error(`Missing required config: ${key}`);
    }
  }

  const agentMode = resolveAgentMode(merged);
  if (agentMode === 'command' && !merged.agentCommand) {
    throw new Error('Missing required config: agentCommand (required when agentMode="command")');
  }

  // Anything other than an explicit 'min' is treated as "higher is better".
  const direction = merged.direction === 'min' ? 'min' : 'max';

  const iterations = Number(merged.iterations ?? 20);
  if (Number.isNaN(iterations)) {
    // A NaN bound would make the loop below silently run zero times.
    throw new Error(`Invalid iterations value: ${merged.iterations}`);
  }

  // ISO timestamp with ':' and '.' replaced so it is usable as a file name.
  const runId = new Date().toISOString().replace(/[:.]/g, '-');
  const resolvedPrompt = await resolveAgentPrompt(merged, cwd);

  let bestMetric = null;
  let bestIteration = 0;

  console.log(`Starting run ${runId}`);
  console.log(`Agent mode: ${agentMode}`);
  if (agentMode === 'internal') {
    console.log(`Agent objective source: ${resolvedPrompt.source}`);
    console.log(`Agent objective: ${resolvedPrompt.prompt}`);
    if (merged.backendAgent) {
      console.log(`Backend agent: ${merged.backendAgent}`);
    }
    if (merged.backendModel) {
      console.log(`Backend model: ${merged.backendModel}`);
    }
  } else {
    console.log(`Agent command: ${merged.agentCommand}`);
  }
  console.log(`Benchmark command: ${merged.benchmarkCommand}`);
  console.log(`Direction: ${direction} (${direction === 'min' ? 'lower is better' : 'higher is better'})`);
  console.log(`Iterations: ${iterations}`);

  for (let i = 1; i <= iterations; i++) {
    console.log(`\n--- Iteration ${i}/${iterations} ---`);
    const beforeCommit = await getGitCommit(cwd);

    const agentStep = getAgentStepCommand(merged, cwd, i, runId, resolvedPrompt.prompt);

    const agentResult = await runCommand(agentStep.command, {
      cwd,
      stream: merged.streamAgentOutput === true,
      env: { AR_ITERATION: String(i), AR_RUN_ID: runId },
    });

    if (agentResult.code !== 0) {
      console.log(`Agent step failed with code ${agentResult.code}`);
      // Stop the whole run unless stopOnAgentFailure is explicitly false, in
      // which case the benchmark still runs for this iteration.
      if (merged.stopOnAgentFailure !== false) {
        await appendRunLog(cwd, runId, {
          iteration: i,
          status: 'agent_failed',
          agentMode: agentStep.agentMode,
          agentExitCode: agentResult.code,
          beforeCommit,
          timestamp: new Date().toISOString(),
        });
        break;
      }
    }

    const benchmarkResult = await runCommand(merged.benchmarkCommand, { cwd });
    // The metric may be printed on either stream, so search both.
    const benchmarkOutput = `${benchmarkResult.stdout}\n${benchmarkResult.stderr}`;
    const metric = parseMetric(benchmarkOutput, merged.metricRegex);

    if (benchmarkResult.code !== 0 || metric == null) {
      console.log('Benchmark failed or metric could not be parsed.');
      if (benchmarkResult.stdout) console.log(benchmarkResult.stdout.trim());
      if (benchmarkResult.stderr) console.log(benchmarkResult.stderr.trim());

      if (merged.onRejectCommand) {
        await runCommand(merged.onRejectCommand, { cwd, stream: true });
      }

      await appendRunLog(cwd, runId, {
        iteration: i,
        status: 'benchmark_failed',
        agentMode: agentStep.agentMode,
        benchmarkExitCode: benchmarkResult.code,
        parsedMetric: metric,
        beforeCommit,
        timestamp: new Date().toISOString(),
      });
      continue;
    }

    const improved = isBetter(metric, bestMetric, direction);
    console.log(`Metric: ${metric}${bestMetric == null ? ' (baseline)' : ` | best: ${bestMetric}`}`);

    if (improved) {
      bestMetric = metric;
      bestIteration = i;
      console.log('Result: improved -> keep');

      if (merged.autoCommit === true && (await hasGitChanges(cwd))) {
        const commitMessage = merged.commitMessageTemplate
          ? merged.commitMessageTemplate
              .replaceAll('{iteration}', String(i))
              .replaceAll('{metric}', String(metric))
          : `research: improve metric to ${metric} (iter ${i})`;

        await runCommand('git add -A', { cwd });
        // The message goes through a shell, so it must be fully quoted to
        // keep `$`, backticks and backslashes in a template from being expanded.
        const commitResult = await runCommand(`git commit -m ${quoteCommitMessage(commitMessage)}`, {
          cwd,
        });
        if (commitResult.code === 0) {
          console.log(`Committed: ${commitMessage}`);
        } else {
          console.log('Commit skipped (possibly no staged changes).');
        }
      }

      if (merged.onKeepCommand) {
        await runCommand(merged.onKeepCommand, { cwd, stream: true });
      }
    } else {
      console.log('Result: not improved -> reject');
      if (merged.onRejectCommand) {
        await runCommand(merged.onRejectCommand, { cwd, stream: true });
      }
    }

    await appendRunLog(cwd, runId, {
      iteration: i,
      status: improved ? 'keep' : 'reject',
      agentMode: agentStep.agentMode,
      metric,
      bestMetric,
      beforeCommit,
      afterCommit: await getGitCommit(cwd),
      timestamp: new Date().toISOString(),
    });
  }

  console.log('\n=== Run Summary ===');
  if (bestMetric == null) {
    console.log('No valid metric was recorded.');
  } else {
    console.log(`Best metric: ${bestMetric}`);
    console.log(`Best iteration: ${bestIteration}`);
  }

  console.log(`Run logs: ${CONFIG_DIR}/runs/${runId}.jsonl`);
}
|
package/src/shell.js
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import { spawn } from 'node:child_process';
|
|
2
|
+
|
|
3
|
+
/**
 * Run a command through the shell and report its outcome.
 *
 * The returned promise never rejects: failures are reported through a
 * non-zero `code`.
 *
 * @param {string} command - Command line to execute via the shell.
 * @param {object} [options={}]
 * @param {string} [options.cwd] - Working directory; defaults to `process.cwd()`.
 * @param {object} [options.env] - Extra environment variables layered over `process.env`.
 * @param {boolean} [options.stream] - When truthy, the child inherits this
 *   process's stdio and nothing is captured; otherwise stdout/stderr are collected.
 * @returns {Promise<{code: number, stdout: string, stderr: string}>} Exit code
 *   (1 when the child reported no code or could not be started) and captured
 *   output (empty strings in stream mode).
 */
export async function runCommand(command, options = {}) {
  const cwd = options.cwd || process.cwd();
  const env = { ...process.env, ...(options.env || {}) };

  return new Promise((resolve) => {
    const child = spawn(command, {
      cwd,
      env,
      shell: true,
      stdio: options.stream ? 'inherit' : 'pipe',
    });

    let stdout = '';
    let stderr = '';

    if (!options.stream) {
      child.stdout?.on('data', (chunk) => {
        stdout += chunk.toString();
      });
      child.stderr?.on('data', (chunk) => {
        stderr += chunk.toString();
      });
    }

    // Without this listener a spawn failure (e.g. a nonexistent cwd) surfaces
    // as an unhandled 'error' event and takes down the whole process.
    child.on('error', (err) => {
      const message = `Failed to run command: ${err.message}`;
      resolve({
        code: 1,
        stdout,
        stderr: stderr ? `${stderr}\n${message}` : message,
      });
    });

    // If 'error' already resolved the promise, this later resolve is a no-op.
    child.on('close', (code) => {
      resolve({
        code: code ?? 1,
        stdout,
        stderr,
      });
    });
  });
}
|