@zhijiewang/openharness 1.4.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,216 @@
1
+ /**
2
+ * Meta-Harness — self-optimizing agent harness.
3
+ *
4
+ * Inspired by AutoAgent (which hit #1 on SpreadsheetBench by letting
5
+ * the agent optimize its own harness overnight).
6
+ *
7
+ * Flow:
8
+ * 1. Run benchmark → get baseline score
9
+ * 2. Ask LLM to suggest a config change
10
+ * 3. Apply change → re-run benchmark
11
+ * 4. If score improved, keep; otherwise revert
12
+ * 5. Repeat for N iterations
13
+ *
14
+ * What it optimizes:
15
+ * - System prompt (trim, rephrase, add instructions)
16
+ * - Tool selection (which tools are core vs deferred)
17
+ * - Model router configuration
18
+ * - Compression strategy
19
+ * - Permission rules
20
+ */
21
+ import { readOhConfig, writeOhConfig, invalidateConfigCache } from '../harness/config.js';
22
+ import { copyFileSync, existsSync } from 'node:fs';
23
+ import { join } from 'node:path';
24
+ import { execSync } from 'node:child_process';
25
+ // ── Benchmark Runner ──
26
/**
 * Run a benchmark command and derive a score from its captured output.
 *
 * The command is executed synchronously via `execSync` with a 5 minute
 * timeout. The score comes from `extractScore` applied to the captured text;
 * no timing component is added to it (duration is reported separately).
 *
 * When the command fails (non-zero exit, timeout, spawn error) the score
 * parsed from whatever output is available is halved as a penalty.
 *
 * @param command Shell command line to execute.
 * @returns An object with `score`, `details` (the last 500 characters of the
 *          captured output) and `durationMs` (wall-clock run time).
 */
export async function runBenchmark(command) {
    const start = Date.now();
    try {
        const output = execSync(command, {
            encoding: 'utf-8',
            timeout: 300_000, // 5 minute max
            stdio: ['pipe', 'pipe', 'pipe'],
        });
        // Parse test results to extract score
        const score = extractScore(output);
        return {
            score,
            details: output.slice(-500),
            durationMs: Date.now() - start,
        };
    }
    catch (err) {
        // With `encoding` set, a failed execSync exposes stdout/stderr as
        // strings that are frequently empty. `??` only skips null/undefined,
        // so an empty stdout used to hide stderr and the error message.
        // `||` falls through empty strings as well.
        const output = String(err?.stdout || err?.stderr || err?.message || '');
        const score = extractScore(output);
        return {
            score: score > 0 ? score * 0.5 : 0, // Penalty for non-zero exit
            details: output.slice(-500),
            durationMs: Date.now() - start,
        };
    }
}
56
/**
 * Derive a score between 0 and 1 from raw test-runner output.
 *
 * Two report formats are tried in order, and the first one whose "pass"
 * count is present decides the result:
 *   1. "<N> pass…" / "<N> fail…" (case-insensitive), e.g. "3 passed, 1 failed"
 *   2. TAP summary lines "# pass <N>" / "# fail <N>"
 * The score is passed / (passed + failed); a missing fail count is treated
 * as 0, and a total of 0 yields a score of 0.
 *
 * When neither format matches, a heuristic applies: 0.3 if the output
 * contains "error" or "FAIL", otherwise 0.8.
 */
function extractScore(output) {
    // Each entry is [pattern for the pass count, pattern for the fail count].
    const reportFormats = [
        [/(\d+)\s+pass/i, /(\d+)\s+fail/i],
        [/# pass\s+(\d+)/, /# fail\s+(\d+)/],
    ];
    for (const [passPattern, failPattern] of reportFormats) {
        const passHit = output.match(passPattern);
        if (passHit === null) {
            continue;
        }
        const failHit = output.match(failPattern);
        const passedCount = Number.parseInt(passHit[1], 10);
        const failedCount = failHit === null ? 0 : Number.parseInt(failHit[1], 10);
        const ranCount = passedCount + failedCount;
        if (ranCount > 0) {
            return passedCount / ranCount;
        }
        return 0;
    }
    // No recognizable counts: guess from failure markers in the text.
    const mentionsFailure = output.includes('error') || output.includes('FAIL');
    if (mentionsFailure) {
        return 0.3;
    }
    return 0.8;
}
80
+ // ── Meta-Harness ──
81
/**
 * Iteratively asks an LLM for single config changes and keeps only those
 * that raise the benchmark score.
 *
 * The on-disk file `.oh/config.yaml.backup` acts as the "last known good"
 * checkpoint: it is written before the loop starts and refreshed after every
 * accepted change, so rejecting a later suggestion rolls back to the best
 * configuration found so far rather than to the pre-optimization file.
 */
export class MetaHarness {
    /** LLM client; `suggestChange` calls its `complete(messages, systemPrompt, undefined, model)`. */
    provider;
    /** Shell command handed to `runBenchmark` for every measurement. */
    benchmarkCommand;
    /** Model identifier forwarded to `provider.complete`. */
    model;
    constructor(provider, benchmarkCommand, model) {
        this.provider = provider;
        this.benchmarkCommand = benchmarkCommand;
        this.model = model;
    }
    /**
     * Run the optimization loop.
     *
     * @param iterations Number of suggestion rounds to attempt. Rounds where
     *        the LLM yields no usable suggestion are skipped but still counted.
     * @returns `{ initialScore, finalScore, iterations, changes, totalDurationMs }`
     *          where `changes` lists the accepted suggestions with their `impact`.
     */
    async optimize(iterations) {
        const totalStart = Date.now();
        const changes = [];
        // Checkpoint the current config so rejected changes can be rolled back.
        // NOTE(review): when no config file exists yet there is nothing to
        // checkpoint, so a rejected first change cannot be rolled back.
        const configPath = join('.oh', 'config.yaml');
        const backupPath = join('.oh', 'config.yaml.backup');
        if (existsSync(configPath)) {
            copyFileSync(configPath, backupPath);
        }
        // Get baseline score
        const baseline = await runBenchmark(this.benchmarkCommand);
        let bestScore = baseline.score;
        for (let i = 0; i < iterations; i++) {
            // Ask LLM to suggest an optimization
            const suggestion = await this.suggestChange(bestScore, changes);
            if (!suggestion)
                continue;
            // Apply the change; if it could not be written, restore the
            // checkpoint and skip the (expensive) benchmark run, whose result
            // would otherwise be attributed to a change that never took effect.
            if (this.applyChange(suggestion) === false) {
                this.revertChange(suggestion);
                continue;
            }
            // Re-benchmark
            const result = await runBenchmark(this.benchmarkCommand);
            if (result.score > bestScore) {
                // Keep the change
                const impact = result.score - bestScore;
                changes.push({ ...suggestion, impact });
                bestScore = result.score;
                // Advance the checkpoint. Without this, a later revert would
                // restore the original file and silently discard this change
                // while `bestScore` and `changes` still reported it.
                if (existsSync(configPath)) {
                    copyFileSync(configPath, backupPath);
                }
            }
            else {
                // Revert to the last checkpoint
                this.revertChange(suggestion);
            }
        }
        return {
            initialScore: baseline.score,
            finalScore: bestScore,
            iterations,
            changes,
            totalDurationMs: Date.now() - totalStart,
        };
    }
    /**
     * Ask the LLM for one configuration change.
     *
     * The prompt contains the current score, the previously accepted changes
     * and the first 2000 characters of the JSON-serialized config.
     *
     * @param currentScore Best score so far.
     * @param previousChanges Accepted changes (`description`, `impact`) to show the LLM.
     * @returns `{ description, field, oldValue, newValue }`, or `null` when the
     *          request fails or the response contains no parseable JSON object.
     */
    async suggestChange(currentScore, previousChanges) {
        const config = readOhConfig();
        const configStr = JSON.stringify(config, null, 2);
        const prevChangesStr = previousChanges.length > 0
            ? `\nPrevious successful changes:\n${previousChanges.map(c => `- ${c.description} (+${c.impact.toFixed(3)})`).join('\n')}`
            : '';
        const prompt = `You are optimizing an AI agent harness configuration. Current score: ${currentScore.toFixed(3)}/1.0.
${prevChangesStr}

Current config:
${configStr.slice(0, 2000)}

Suggest ONE specific configuration change that might improve the benchmark score. Focus on:
- System prompt optimization
- Tool selection (which tools are core)
- Permission rules that speed up automation
- Verification configuration

Respond with JSON: {"description": "what to change", "field": "config.path", "newValue": "the new value"}`;
        try {
            const response = await this.provider.complete([{ role: 'user', content: prompt, uuid: `meta-${Date.now()}`, timestamp: Date.now() }], 'You are a harness optimization engine. Respond ONLY with valid JSON.', undefined, this.model);
            // Take everything from the first "{" to the last "}" so JSON
            // wrapped in prose or code fences is still picked up.
            const jsonMatch = response.content.match(/\{[\s\S]*\}/);
            if (!jsonMatch)
                return null;
            const parsed = JSON.parse(jsonMatch[0]);
            return {
                description: parsed.description ?? 'unknown change',
                field: parsed.field ?? 'unknown',
                oldValue: undefined,
                newValue: parsed.newValue,
            };
        }
        catch {
            // Provider errors and malformed JSON both mean "no suggestion".
            return null;
        }
    }
    /**
     * Write a suggested change into the config.
     *
     * Only top-level fields are supported: a leading "config." is stripped and
     * the remainder is used verbatim as the key (a dotted path is not expanded).
     *
     * @param change Suggestion carrying `field` and `newValue`.
     * @returns `true` when the config was written, `false` when writing failed.
     */
    applyChange(change) {
        invalidateConfigCache();
        // Apply change to config by reading, modifying, and writing back
        const config = readOhConfig() ?? {};
        try {
            // Simple top-level field update (nested paths would need lodash.set)
            const field = change.field.replace(/^config\./, '');
            config[field] = change.newValue;
            writeOhConfig(config);
            return true;
        }
        catch {
            // Report the failure instead of hiding it; the caller restores
            // the checkpoint.
            return false;
        }
    }
    /**
     * Restore the config from the checkpoint file, if one exists.
     *
     * @param change Unused; the whole file is restored rather than a single field.
     */
    revertChange(change) {
        invalidateConfigCache();
        // Revert by re-reading the backup config
        const backupPath = join('.oh', 'config.yaml.backup');
        const configPath = join('.oh', 'config.yaml');
        if (existsSync(backupPath)) {
            copyFileSync(backupPath, configPath);
            invalidateConfigCache();
        }
    }
}
192
/**
 * Render an optimization result as a multi-line, human-readable report.
 *
 * The report shows the initial and final scores (with the relative change in
 * percent, reported as "0" when the initial score is not positive), the
 * iteration count, the duration in whole seconds, and then either the list
 * of applied changes or a note that nothing improved.
 *
 * @param result Object with `initialScore`, `finalScore`, `iterations`,
 *        `totalDurationMs` and `changes` (each with `impact` and `description`).
 * @returns The report as a single newline-joined string.
 */
export function formatOptimizationResult(result) {
    const { initialScore, finalScore, iterations, totalDurationMs, changes } = result;
    const delta = finalScore - initialScore;
    let percentText = '0';
    if (initialScore > 0) {
        percentText = (delta / initialScore * 100).toFixed(1);
    }
    // Negative percentages already carry their own minus sign.
    const signPrefix = delta >= 0 ? '+' : '';
    const seconds = Math.round(totalDurationMs / 1000);
    const report = [
        `Meta-Harness Optimization Complete`,
        `${'─'.repeat(40)}`,
        `Initial score: ${initialScore.toFixed(3)}`,
        `Final score: ${finalScore.toFixed(3)} (${signPrefix}${percentText}%)`,
        `Iterations: ${iterations}`,
        `Duration: ${seconds}s`,
        '',
    ];
    if (changes.length > 0) {
        report.push('Applied changes:');
        changes.forEach((entry) => {
            report.push(` +${entry.impact.toFixed(3)} ${entry.description}`);
        });
    }
    else {
        report.push('No improvements found in this run.');
    }
    return report.join('\n');
}
216
+ //# sourceMappingURL=MetaHarness.js.map
@@ -194,7 +194,14 @@ export const AgentTool = {
194
194
  }
195
195
  }
196
196
  emitHook("subagentStop", { agentId });
197
- return { output: finalText || "(sub-agent completed with no text output)", isError: false };
197
+ // Context folding: collapse long sub-agent output to summary
198
+ let output = finalText || "(sub-agent completed with no text output)";
199
+ if (output.length > 2000) {
200
+ const { ContextManager } = await import("../../query/context-manager.js");
201
+ const cm = new ContextManager();
202
+ output = cm.foldSubagentResult(agentId, output);
203
+ }
204
+ return { output, isError: false };
198
205
  },
199
206
  prompt() {
200
207
  return `Spawn a sub-agent with its own tool-use loop to handle a delegated task autonomously. The sub-agent runs in an isolated git worktree to prevent file conflicts. Parameters:
@@ -7,13 +7,13 @@ declare const inputSchema: z.ZodObject<{
7
7
  maxLines: z.ZodOptional<z.ZodNumber>;
8
8
  }, "strip", z.ZodTypeAny, {
9
9
  command: string;
10
- pattern?: string | undefined;
11
10
  timeout?: number | undefined;
11
+ pattern?: string | undefined;
12
12
  maxLines?: number | undefined;
13
13
  }, {
14
14
  command: string;
15
- pattern?: string | undefined;
16
15
  timeout?: number | undefined;
16
+ pattern?: string | undefined;
17
17
  maxLines?: number | undefined;
18
18
  }>;
19
19
  export declare const MonitorTool: Tool<typeof inputSchema>;
package/package.json CHANGED
@@ -1,13 +1,17 @@
1
1
  {
2
2
  "name": "@zhijiewang/openharness",
3
- "version": "1.4.0",
3
+ "version": "2.1.0",
4
4
  "description": "Open-source terminal coding agent. Works with any LLM.",
5
5
  "type": "module",
6
6
  "bin": {
7
7
  "openharness": "./dist/main.js",
8
8
  "oh": "./dist/main.js"
9
9
  },
10
- "main": "./dist/main.js",
10
+ "main": "./dist/sdk/index.js",
11
+ "exports": {
12
+ ".": "./dist/sdk/index.js",
13
+ "./cli": "./dist/main.js"
14
+ },
11
15
  "files": [
12
16
  "dist/**/*.js",
13
17
  "dist/**/*.d.ts",