@zhijiewang/openharness 1.4.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/index.js +97 -15
- package/dist/harness/config.d.ts +10 -0
- package/dist/harness/sandbox.d.ts +34 -0
- package/dist/harness/sandbox.js +104 -0
- package/dist/harness/submit-handler.js +44 -10
- package/dist/harness/traces.d.ts +58 -0
- package/dist/harness/traces.js +183 -0
- package/dist/main.js +2 -0
- package/dist/query/context-manager.d.ts +56 -0
- package/dist/query/context-manager.js +108 -0
- package/dist/query/index.js +5 -1
- package/dist/sdk/index.d.ts +76 -0
- package/dist/sdk/index.js +146 -0
- package/dist/services/EvaluatorLoop.d.ts +61 -0
- package/dist/services/EvaluatorLoop.js +157 -0
- package/dist/services/MetaHarness.d.ts +61 -0
- package/dist/services/MetaHarness.js +216 -0
- package/dist/tools/AgentTool/index.js +8 -1
- package/dist/tools/MonitorTool/index.d.ts +2 -2
- package/package.json +6 -2
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Meta-Harness — self-optimizing agent harness.
|
|
3
|
+
*
|
|
4
|
+
* Inspired by AutoAgent (which hit #1 on SpreadsheetBench by letting
|
|
5
|
+
* the agent optimize its own harness overnight).
|
|
6
|
+
*
|
|
7
|
+
* Flow:
|
|
8
|
+
* 1. Run benchmark → get baseline score
|
|
9
|
+
* 2. Ask LLM to suggest a config change
|
|
10
|
+
* 3. Apply change → re-run benchmark
|
|
11
|
+
* 4. If score improved, keep; otherwise revert
|
|
12
|
+
* 5. Repeat for N iterations
|
|
13
|
+
*
|
|
14
|
+
* What it optimizes:
|
|
15
|
+
* - System prompt (trim, rephrase, add instructions)
|
|
16
|
+
* - Tool selection (which tools are core vs deferred)
|
|
17
|
+
* - Model router configuration
|
|
18
|
+
* - Compression strategy
|
|
19
|
+
* - Permission rules
|
|
20
|
+
*/
|
|
21
|
+
import { readOhConfig, writeOhConfig, invalidateConfigCache } from '../harness/config.js';
|
|
22
|
+
import { copyFileSync, existsSync } from 'node:fs';
|
|
23
|
+
import { join } from 'node:path';
|
|
24
|
+
import { execSync } from 'node:child_process';
|
|
25
|
+
// ── Benchmark Runner ──
|
|
26
|
+
/**
 * Run a benchmark command and derive a score from its textual output.
 *
 * The command is executed synchronously through `execSync` with a 5-minute
 * timeout; the function is `async` only so callers can `await` it.
 * The score comes from `extractScore` applied to the captured output.
 * When the command exits non-zero (or times out), any positive score parsed
 * from the failure output is halved.
 *
 * @param command Shell command line to execute.
 * @returns An object with `score` (number), `details` (the last 500 characters
 *          of the output that was scored) and `durationMs` (wall-clock time).
 */
export async function runBenchmark(command) {
    const start = Date.now();
    try {
        const output = execSync(command, {
            encoding: 'utf-8',
            timeout: 300_000, // 5 minute max
            stdio: ['pipe', 'pipe', 'pipe'],
        });
        // Parse test results to extract score
        const score = extractScore(output);
        return {
            score,
            details: output.slice(-500),
            durationMs: Date.now() - start,
        };
    }
    catch (err) {
        // With `encoding` set, a failed run reports an untouched stdout as ''
        // (not null/undefined), so a `??` chain would stop at the empty string
        // and never look at stderr. Take the first source that has real content,
        // keeping the original priority: stdout, then stderr, then the message.
        const output = [err?.stdout, err?.stderr, err?.message]
            .map((part) => (part == null ? '' : String(part)))
            .find((text) => text.trim().length > 0) ?? '';
        const score = extractScore(output);
        return {
            score: score > 0 ? score * 0.5 : 0, // Penalty for non-zero exit
            details: output.slice(-500),
            durationMs: Date.now() - start,
        };
    }
}
|
|
56
|
+
/**
 * Extract a 0-1 score from test output.
 *
 * Resolution order:
 *  1. A "<N> pass..." count (case-insensitive), paired with an optional
 *     "<N> fail..." count → passed / (passed + failed).
 *  2. TAP-style "# pass N" / "# fail N" summary lines → same ratio.
 *  3. Only a failure count is present and it is positive → 0.
 *  4. No counts at all → keyword heuristic: 0.3 if the text contains
 *     "error" or "FAIL" (case-sensitive), otherwise 0.8.
 *
 * @param output Raw text captured from the benchmark command.
 * @returns A score between 0 and 1.
 */
function extractScore(output) {
    // Always parse as base 10; a missing match counts as zero.
    const count = (match) => (match ? parseInt(match[1], 10) : 0);
    const ratio = (passed, failed) => {
        const total = passed + failed;
        return total > 0 ? passed / total : 0;
    };
    // "X passed, Y failed" → pass_rate
    const passMatch = output.match(/(\d+)\s+pass/i);
    const failMatch = output.match(/(\d+)\s+fail/i);
    if (passMatch) {
        return ratio(count(passMatch), count(failMatch));
    }
    // "# pass N" (TAP format)
    const tapPass = output.match(/# pass\s+(\d+)/);
    const tapFail = output.match(/# fail\s+(\d+)/);
    if (tapPass) {
        return ratio(count(tapPass), count(tapFail));
    }
    // Failures were reported but no pass count: nothing passed. Without this
    // check a report such as "3 failed" would fall through to the keyword
    // heuristic below and could be scored as 0.8.
    if (count(failMatch) > 0 || count(tapFail) > 0) {
        return 0;
    }
    // No recognizable counts: rough guess from keywords in the output.
    return output.includes('error') || output.includes('FAIL') ? 0.3 : 0.8;
}
|
|
80
|
+
// ── Meta-Harness ──
|
|
81
|
+
/**
 * Greedy, benchmark-driven optimizer for the harness configuration.
 *
 * Each iteration asks the LLM provider for one config change, applies it,
 * re-runs the benchmark command, and keeps the change only when the score
 * strictly improves on the best score seen so far.
 */
export class MetaHarness {
    provider;
    benchmarkCommand;
    model;
    /**
     * @param provider Object exposing `complete(messages, systemPrompt, tools, model)`;
     *                 the response's `content` is read as a string.
     * @param benchmarkCommand Shell command passed to `runBenchmark`.
     * @param model Model identifier forwarded to `provider.complete`.
     */
    constructor(provider, benchmarkCommand, model) {
        this.provider = provider;
        this.benchmarkCommand = benchmarkCommand;
        this.model = model;
    }
    /**
     * Run the optimization loop.
     *
     * @param iterations Number of suggestion rounds to attempt. Rounds in which
     *                   no usable suggestion is produced still count.
     * @returns `{ initialScore, finalScore, iterations, changes, totalDurationMs }`
     *          where `changes` lists the kept suggestions with their `impact`.
     */
    async optimize(iterations) {
        const totalStart = Date.now();
        const changes = [];
        // Snapshot the current config so rejected changes can be rolled back.
        const configPath = join('.oh', 'config.yaml');
        const backupPath = join('.oh', 'config.yaml.backup');
        if (existsSync(configPath)) {
            copyFileSync(configPath, backupPath);
        }
        // Get baseline score
        const baseline = await runBenchmark(this.benchmarkCommand);
        let bestScore = baseline.score;
        for (let i = 0; i < iterations; i++) {
            // Ask LLM to suggest an optimization
            const suggestion = await this.suggestChange(bestScore, changes);
            if (!suggestion)
                continue;
            // Apply the change
            this.applyChange(suggestion);
            // Re-benchmark
            const result = await runBenchmark(this.benchmarkCommand);
            if (result.score > bestScore) {
                // Keep the change
                const impact = result.score - bestScore;
                changes.push({ ...suggestion, impact });
                bestScore = result.score;
                // Refresh the snapshot so a later revert restores the best
                // known config. Without this, reverting would copy back the
                // pre-optimization file and silently drop every accepted
                // change while `changes`/`bestScore` still report them.
                if (existsSync(configPath)) {
                    copyFileSync(configPath, backupPath);
                }
            }
            else {
                // Revert
                this.revertChange(suggestion);
            }
        }
        return {
            initialScore: baseline.score,
            finalScore: bestScore,
            iterations,
            changes,
            totalDurationMs: Date.now() - totalStart,
        };
    }
    /**
     * Ask the provider for a single configuration change.
     *
     * @param currentScore Best score so far; shown in the prompt.
     * @param previousChanges Kept changes (`description`, `impact`) listed in the prompt.
     * @returns `{ description, field, oldValue, newValue }`, or `null` when the
     *          provider call fails, the reply has no parseable JSON object, or
     *          the reply names no usable target field.
     */
    async suggestChange(currentScore, previousChanges) {
        // Fall back to an empty object (as applyChange does): JSON.stringify(undefined)
        // yields undefined and the `.slice` below would throw outside the try block.
        const config = readOhConfig() ?? {};
        const configStr = JSON.stringify(config, null, 2);
        const prevChangesStr = previousChanges.length > 0
            ? `\nPrevious successful changes:\n${previousChanges.map(c => `- ${c.description} (+${c.impact.toFixed(3)})`).join('\n')}`
            : '';
        const prompt = `You are optimizing an AI agent harness configuration. Current score: ${currentScore.toFixed(3)}/1.0.
${prevChangesStr}

Current config:
${configStr.slice(0, 2000)}

Suggest ONE specific configuration change that might improve the benchmark score. Focus on:
- System prompt optimization
- Tool selection (which tools are core)
- Permission rules that speed up automation
- Verification configuration

Respond with JSON: {"description": "what to change", "field": "config.path", "newValue": "the new value"}`;
        try {
            const response = await this.provider.complete([{ role: 'user', content: prompt, uuid: `meta-${Date.now()}`, timestamp: Date.now() }], 'You are a harness optimization engine. Respond ONLY with valid JSON.', undefined, this.model);
            // Grab the outermost {...} span; models often wrap JSON in extra text.
            const jsonMatch = response.content.match(/\{[\s\S]*\}/);
            if (!jsonMatch)
                return null;
            const parsed = JSON.parse(jsonMatch[0]);
            // A suggestion without a concrete target cannot be applied, so skip
            // it instead of spending a full benchmark run on it. `__proto__` is
            // rejected because assigning it would replace the config's prototype.
            const target = typeof parsed.field === 'string' ? parsed.field.replace(/^config\./, '') : '';
            if (target === '' || target === '__proto__')
                return null;
            return {
                description: parsed.description ?? 'unknown change',
                field: parsed.field,
                oldValue: undefined, // filled in by applyChange
                newValue: parsed.newValue,
            };
        }
        catch {
            return null;
        }
    }
    /**
     * Write `change.newValue` into the config under `change.field`
     * (an optional leading "config." is stripped). Only top-level keys are
     * supported; a dotted path is used verbatim as a single key.
     * Records the previous value on `change.oldValue`.
     */
    applyChange(change) {
        invalidateConfigCache();
        // Apply change to config by reading, modifying, and writing back
        const config = readOhConfig() ?? {};
        try {
            // Simple top-level field update (nested paths would need lodash.set)
            const field = change.field.replace(/^config\./, '');
            if (field === '__proto__')
                return; // would swap the object's prototype rather than set a key
            change.oldValue = config[field];
            config[field] = change.newValue;
            writeOhConfig(config);
        }
        catch { /* best effort: on failure nothing is written and the caller's revert path runs if the score does not improve */ }
    }
    /**
     * Undo a rejected change.
     *
     * Restores `.oh/config.yaml` from `.oh/config.yaml.backup` when the backup
     * exists. Otherwise falls back to putting `change.oldValue` back (or
     * deleting the key when there was no previous value).
     */
    revertChange(change) {
        invalidateConfigCache();
        const backupPath = join('.oh', 'config.yaml.backup');
        const configPath = join('.oh', 'config.yaml');
        if (existsSync(backupPath)) {
            copyFileSync(backupPath, configPath);
            invalidateConfigCache();
            return;
        }
        // No snapshot was ever taken (no config file existed when optimize
        // started), so undo only the field that applyChange touched.
        try {
            const field = typeof change?.field === 'string' ? change.field.replace(/^config\./, '') : '';
            if (field === '' || field === '__proto__')
                return;
            const config = readOhConfig() ?? {};
            if (change.oldValue === undefined) {
                delete config[field];
            }
            else {
                config[field] = change.oldValue;
            }
            writeOhConfig(config);
            invalidateConfigCache();
        }
        catch { /* best effort: nothing more can be restored */ }
    }
}
|
|
192
|
+
/**
 * Render an optimization result as a plain-text report.
 *
 * The report has a title, a 40-character rule, the initial and final scores
 * (the final score carries the relative change in percent, or "0" when the
 * initial score is not positive), the iteration count, the duration in whole
 * seconds, and then either the list of applied changes or a note that no
 * improvement was found.
 *
 * @param result Object with `initialScore`, `finalScore`, `iterations`,
 *               `totalDurationMs` and `changes` (each with `impact`, `description`).
 * @returns The report as a single newline-joined string.
 */
export function formatOptimizationResult(result) {
    const delta = result.finalScore - result.initialScore;
    let percent = '0';
    if (result.initialScore > 0) {
        percent = (delta / result.initialScore * 100).toFixed(1);
    }
    // Negative percentages already carry their own minus sign.
    const sign = delta >= 0 ? '+' : '';
    const seconds = Math.round(result.totalDurationMs / 1000);
    const report = [
        'Meta-Harness Optimization Complete',
        '─'.repeat(40),
        `Initial score: ${result.initialScore.toFixed(3)}`,
        `Final score: ${result.finalScore.toFixed(3)} (${sign}${percent}%)`,
        `Iterations: ${result.iterations}`,
        `Duration: ${seconds}s`,
    ];
    const applied = result.changes;
    if (applied.length > 0) {
        report.push('', 'Applied changes:');
        applied.forEach((entry) => {
            report.push(` +${entry.impact.toFixed(3)} ${entry.description}`);
        });
    }
    else {
        report.push('', 'No improvements found in this run.');
    }
    return report.join('\n');
}
|
|
216
|
+
//# sourceMappingURL=MetaHarness.js.map
|
|
@@ -194,7 +194,14 @@ export const AgentTool = {
|
|
|
194
194
|
}
|
|
195
195
|
}
|
|
196
196
|
emitHook("subagentStop", { agentId });
|
|
197
|
-
|
|
197
|
+
// Context folding: collapse long sub-agent output to summary
|
|
198
|
+
let output = finalText || "(sub-agent completed with no text output)";
|
|
199
|
+
if (output.length > 2000) {
|
|
200
|
+
const { ContextManager } = await import("../../query/context-manager.js");
|
|
201
|
+
const cm = new ContextManager();
|
|
202
|
+
output = cm.foldSubagentResult(agentId, output);
|
|
203
|
+
}
|
|
204
|
+
return { output, isError: false };
|
|
198
205
|
},
|
|
199
206
|
prompt() {
|
|
200
207
|
return `Spawn a sub-agent with its own tool-use loop to handle a delegated task autonomously. The sub-agent runs in an isolated git worktree to prevent file conflicts. Parameters:
|
|
@@ -7,13 +7,13 @@ declare const inputSchema: z.ZodObject<{
|
|
|
7
7
|
maxLines: z.ZodOptional<z.ZodNumber>;
|
|
8
8
|
}, "strip", z.ZodTypeAny, {
|
|
9
9
|
command: string;
|
|
10
|
-
pattern?: string | undefined;
|
|
11
10
|
timeout?: number | undefined;
|
|
11
|
+
pattern?: string | undefined;
|
|
12
12
|
maxLines?: number | undefined;
|
|
13
13
|
}, {
|
|
14
14
|
command: string;
|
|
15
|
-
pattern?: string | undefined;
|
|
16
15
|
timeout?: number | undefined;
|
|
16
|
+
pattern?: string | undefined;
|
|
17
17
|
maxLines?: number | undefined;
|
|
18
18
|
}>;
|
|
19
19
|
export declare const MonitorTool: Tool<typeof inputSchema>;
|
package/package.json
CHANGED
|
@@ -1,13 +1,17 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@zhijiewang/openharness",
|
|
3
|
-
"version": "1.4.0",
|
|
3
|
+
"version": "2.1.0",
|
|
4
4
|
"description": "Open-source terminal coding agent. Works with any LLM.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
7
7
|
"openharness": "./dist/main.js",
|
|
8
8
|
"oh": "./dist/main.js"
|
|
9
9
|
},
|
|
10
|
-
"main": "./dist/
|
|
10
|
+
"main": "./dist/sdk/index.js",
|
|
11
|
+
"exports": {
|
|
12
|
+
".": "./dist/sdk/index.js",
|
|
13
|
+
"./cli": "./dist/main.js"
|
|
14
|
+
},
|
|
11
15
|
"files": [
|
|
12
16
|
"dist/**/*.js",
|
|
13
17
|
"dist/**/*.d.ts",
|