@zhijiewang/openharness 1.4.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,56 @@
1
+ /**
2
+ * Active Context Management — proactive control of the context window.
3
+ *
4
+ * Unlike reactive compression (trigger at 80% full), active management:
5
+ * - Enforces per-tool token budgets (no single tool consumes the window)
6
+ * - Folds sub-agent output to summaries (parent sees prompt + result only)
7
+ * - Pre-compresses before large tool calls
8
+ * - Auto-summarizes when approaching limits
9
+ *
10
+ * Based on the "context engineering" pattern from Anthropic's harness research.
11
+ */
12
+ import type { Message } from '../types/message.js';
13
export type ContextBudget = {
    /** Max tokens allowed for a single tool output before truncation */
    toolOutputMax: number;
    /** Per-tool overrides keyed by tool name; falls back to `toolOutputMax` */
    perTool: Record<string, number>;
    /** Whether to auto-fold sub-agent results into head/tail summaries */
    autoFold: boolean;
    /** Context usage threshold to trigger proactive compression (0-1) */
    proactiveThreshold: number;
};
export declare class ContextManager {
    private budget;
    private model;
    constructor(budget?: Partial<ContextBudget>, model?: string);
    /** Get the token budget for a specific tool (per-tool override, else the global max) */
    getToolBudget(toolName: string): number;
    /** Set a per-tool token budget override */
    setToolBudget(toolName: string, maxTokens: number): void;
    /**
     * Truncate tool output to its budget.
     * Keeps the first portion and last portion, with a truncation marker
     * in between; sizes are derived from a rough 4-chars-per-token estimate.
     */
    enforceToolBudget(toolName: string, output: string): string;
    /**
     * Fold a sub-agent's full output into a concise summary.
     * Keeps the first 500 chars (task context) and the last 500 chars
     * (conclusion) around a folding marker; short outputs pass through.
     */
    foldSubagentResult(agentId: string, fullOutput: string): string;
    /**
     * Check if we should proactively compress before a tool call.
     * Returns true if (current tokens + estimated output tokens) divided by
     * the model's context window exceeds the proactive threshold.
     */
    shouldPreCompress(messages: Message[], estimatedOutputTokens: number): boolean;
    /**
     * Estimate how many tokens a tool call might produce.
     * Based on fixed historical averages per tool type; unknown tools get a default.
     */
    estimateToolOutputTokens(toolName: string): number;
    /** Whether auto-folding of sub-agent results is enabled */
    get autoFoldEnabled(): boolean;
    /** Get a copy of the full budget config */
    get config(): ContextBudget;
}
56
+ //# sourceMappingURL=context-manager.d.ts.map
@@ -0,0 +1,108 @@
1
+ /**
2
+ * Active Context Management — proactive control of the context window.
3
+ *
4
+ * Unlike reactive compression (trigger at 80% full), active management:
5
+ * - Enforces per-tool token budgets (no single tool consumes the window)
6
+ * - Folds sub-agent output to summaries (parent sees prompt + result only)
7
+ * - Pre-compresses before large tool calls
8
+ * - Auto-summarizes when approaching limits
9
+ *
10
+ * Based on the "context engineering" pattern from Anthropic's harness research.
11
+ */
12
+ import { getContextWindow } from '../harness/cost.js';
13
+ import { estimateMessagesTokens } from './compress.js';
14
const DEFAULT_BUDGET = {
    toolOutputMax: 10_000,
    perTool: {},
    autoFold: true,
    proactiveThreshold: 0.6,
};
// ── Context Manager ──
export class ContextManager {
    budget;
    model;
    /**
     * @param budget Partial overrides, merged over DEFAULT_BUDGET.
     * @param model Optional model id, used to size the context window.
     */
    constructor(budget, model) {
        this.budget = { ...DEFAULT_BUDGET, ...budget };
        this.model = model;
    }
    /** Get the token budget for a specific tool (per-tool override, else the global max). */
    getToolBudget(toolName) {
        return this.budget.perTool[toolName] ?? this.budget.toolOutputMax;
    }
    /** Set a per-tool token budget override. */
    setToolBudget(toolName, maxTokens) {
        this.budget.perTool[toolName] = maxTokens;
    }
    /**
     * Truncate tool output to its budget.
     * Keeps ~70% of the char allowance from the head and ~20% from the tail,
     * with a truncation marker in between.
     */
    enforceToolBudget(toolName, output) {
        const budget = this.getToolBudget(toolName);
        // Rough estimate: 4 chars ≈ 1 token
        const maxChars = budget * 4;
        if (output.length <= maxChars)
            return output;
        const keepHead = Math.floor(maxChars * 0.7);
        const keepTail = Math.floor(maxChars * 0.2);
        const truncated = output.length - keepHead - keepTail;
        // BUG FIX: when keepTail is 0 (tiny budgets), output.slice(-0) is
        // output.slice(0) — the ENTIRE string — so the budget was not enforced.
        const tail = keepTail > 0 ? output.slice(-keepTail) : '';
        return output.slice(0, keepHead)
            + `\n\n[...${truncated.toLocaleString()} chars truncated (budget: ${budget} tokens)...]\n\n`
            + tail;
    }
    /**
     * Fold a sub-agent's full output into a concise summary.
     * Keeps the first 500 chars (task context) + last 500 chars (conclusion);
     * outputs under 2000 chars, or with folding disabled, pass through unchanged.
     */
    foldSubagentResult(agentId, fullOutput) {
        if (!this.budget.autoFold)
            return fullOutput;
        // Short outputs don't need folding
        if (fullOutput.length < 2000)
            return fullOutput;
        const head = fullOutput.slice(0, 500);
        const tail = fullOutput.slice(-500);
        const foldedChars = fullOutput.length - 1000;
        return `${head}\n\n[...${foldedChars} chars folded from sub-agent ${agentId}...]\n\n${tail}`;
    }
    /**
     * Check if we should proactively compress before a tool call.
     * True when (current tokens + estimated output tokens) exceeds the
     * proactive threshold fraction of the model's context window.
     */
    shouldPreCompress(messages, estimatedOutputTokens) {
        const contextWindow = getContextWindow(this.model);
        const currentTokens = estimateMessagesTokens(messages);
        const projected = currentTokens + estimatedOutputTokens;
        return projected / contextWindow > this.budget.proactiveThreshold;
    }
    /**
     * Estimate how many tokens a tool call might produce.
     * Fixed per-tool averages; unknown tools default to 1000.
     */
    estimateToolOutputTokens(toolName) {
        const estimates = {
            Bash: 2000,
            Read: 3000,
            Grep: 1500,
            Glob: 500,
            LS: 300,
            Edit: 200,
            Write: 200,
            Agent: 5000,
            Pipeline: 3000,
            WebFetch: 4000,
            WebSearch: 1000,
        };
        return estimates[toolName] ?? 1000;
    }
    /** Whether auto-folding is enabled */
    get autoFoldEnabled() {
        return this.budget.autoFold;
    }
    /** Get a defensive copy of the full budget config */
    get config() {
        return { ...this.budget };
    }
}
108
+ //# sourceMappingURL=context-manager.js.map
@@ -9,6 +9,7 @@
9
9
  */
10
10
  import { toolToAPIFormat } from "../Tool.js";
11
11
  import { DeferredTool } from "../DeferredTool.js";
12
+ import { ContextManager } from "./context-manager.js";
12
13
  import { createAssistantMessage, createUserMessage } from "../types/message.js";
13
14
  import { StreamingToolExecutor } from "../services/StreamingToolExecutor.js";
14
15
  import { getContextWindow } from "../harness/cost.js";
@@ -31,6 +32,7 @@ export async function* query(userMessage, config, existingMessages = []) {
31
32
  askUserQuestion: config.askUserQuestion,
32
33
  };
33
34
  const estimateTokens = makeTokenEstimator(config.provider);
35
+ const contextManager = new ContextManager(undefined, config.model);
34
36
  // Check provider capabilities
35
37
  const modelInfo = config.provider.getModelInfo?.(config.model ?? '');
36
38
  const toolsSupported = !modelInfo || modelInfo.supportsTools;
@@ -193,7 +195,9 @@ export async function* query(userMessage, config, existingMessages = []) {
193
195
  }
194
196
  for (const { toolCall: tc, result } of completedResults) {
195
197
  yield { type: "tool_call_end", callId: tc.id, output: result.output, isError: result.isError };
196
- state.messages.push(createToolResultMessage({ callId: tc.id, output: result.output, isError: result.isError }));
198
+ // Apply context budget to tool output
199
+ const budgetedOutput = contextManager.enforceToolBudget(tc.toolName, result.output);
200
+ state.messages.push(createToolResultMessage({ callId: tc.id, output: budgetedOutput, isError: result.isError }));
197
201
  }
198
202
  // Execute remaining tools not started during streaming
199
203
  const remaining = toolCalls.filter(tc => !executedIds.has(tc.id));
@@ -0,0 +1,76 @@
1
+ /**
2
+ * openHarness Agent SDK — programmatic API for building AI agents.
3
+ *
4
+ * Usage:
5
+ * import { createAgent } from '@zhijiewang/openharness';
6
+ *
7
+ * const agent = createAgent({
8
+ * provider: 'anthropic',
9
+ * model: 'claude-sonnet-4-6',
10
+ * apiKey: process.env.ANTHROPIC_API_KEY,
11
+ * });
12
+ *
13
+ * const result = await agent.run('Fix the failing tests');
14
+ * console.log(result.text);
15
+ */
16
+ import type { StreamEvent } from '../types/events.js';
17
+ import type { PermissionMode } from '../types/permissions.js';
18
export type AgentConfig = {
    /** Provider name: 'anthropic', 'openai', 'ollama', 'openrouter', etc. */
    provider: string;
    /** Model identifier */
    model: string;
    /** API key (or use environment variable) */
    apiKey?: string;
    /** Custom base URL for the provider endpoint */
    baseUrl?: string;
    /** Tools to include: 'all', 'read-only', or array of tool names (matched case-insensitively) */
    tools?: 'all' | 'read-only' | string[];
    /** Permission mode (default: 'trust') */
    permissionMode?: PermissionMode;
    /** Custom system prompt (default: a generic coding-agent prompt) */
    systemPrompt?: string;
    /** Max turns per run (default: 20) */
    maxTurns?: number;
    /** Working directory the agent chdirs into while running */
    cwd?: string;
};
export type AgentResult = {
    /** Final text output (concatenated text deltas) */
    text: string;
    /** Tool calls made during execution */
    toolCalls: Array<{
        toolName: string;
        output: string;
        isError: boolean;
    }>;
    /** Total cost in USD */
    cost: number;
    /** Total input tokens */
    inputTokens: number;
    /** Total output tokens */
    outputTokens: number;
    /** Number of turns taken */
    turns: number;
};
export declare class Agent {
    private provider;
    private tools;
    private config;
    private initPromise;
    constructor(config: AgentConfig);
    /** Initialize provider and tools (lazy, on first use). Race-safe via promise guard. */
    private init;
    private _doInit;
    /** Run a single prompt to completion and return the aggregated result */
    run(prompt: string): Promise<AgentResult>;
    /** Stream raw events from a prompt instead of aggregating them */
    stream(prompt: string): AsyncGenerator<StreamEvent, void>;
    /** Stop the agent (cleanup); the next use re-initializes lazily */
    stop(): void;
}
/** Create a new agent instance */
export declare function createAgent(config: AgentConfig): Agent;
74
+ export type { StreamEvent } from '../types/events.js';
75
+ export type { PermissionMode } from '../types/permissions.js';
76
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1,146 @@
1
+ /**
2
+ * openHarness Agent SDK — programmatic API for building AI agents.
3
+ *
4
+ * Usage:
5
+ * import { createAgent } from '@zhijiewang/openharness';
6
+ *
7
+ * const agent = createAgent({
8
+ * provider: 'anthropic',
9
+ * model: 'claude-sonnet-4-6',
10
+ * apiKey: process.env.ANTHROPIC_API_KEY,
11
+ * });
12
+ *
13
+ * const result = await agent.run('Fix the failing tests');
14
+ * console.log(result.text);
15
+ */
16
// ── Agent Class ──
export class Agent {
    provider = null;
    tools = null;
    config;
    initPromise = null;
    constructor(config) {
        // Defaults: trusting permission mode, 20-turn cap; caller values win.
        this.config = {
            permissionMode: 'trust',
            maxTurns: 20,
            ...config,
        };
    }
    /** Initialize provider and tools (lazy, on first use). Race-safe via promise guard. */
    init() {
        return (this.initPromise ??= this._doInit());
    }
    async _doInit() {
        const { createProvider } = await import('../providers/index.js');
        const { getAllTools } = await import('../tools.js');
        const overrides = {};
        if (this.config.apiKey)
            overrides.apiKey = this.config.apiKey;
        if (this.config.baseUrl)
            overrides.baseUrl = this.config.baseUrl;
        const { provider } = await createProvider(this.config.model, Object.keys(overrides).length > 0 ? overrides : undefined);
        this.provider = provider;
        // Filter tools: 'read-only' preset, explicit (case-insensitive) list, or all.
        let tools = getAllTools();
        if (this.config.tools === 'read-only') {
            const readOnlyNames = new Set(['Read', 'Glob', 'Grep', 'LS', 'ImageRead', 'WebSearch', 'WebFetch']);
            tools = tools.filter(t => readOnlyNames.has(t.name));
        }
        else if (Array.isArray(this.config.tools)) {
            const allowed = new Set(this.config.tools.map(n => n.toLowerCase()));
            tools = tools.filter(t => allowed.has(t.name.toLowerCase()));
        }
        this.tools = tools;
    }
    /** Build the query() config shared by run() and stream(). */
    _queryConfig() {
        return {
            provider: this.provider,
            tools: this.tools,
            systemPrompt: this.config.systemPrompt ?? 'You are a helpful coding agent.',
            permissionMode: this.config.permissionMode,
            model: this.config.model,
            maxTurns: this.config.maxTurns,
        };
    }
    /** Best-effort chdir into config.cwd; returns the dir to restore, or null. */
    _enterCwd() {
        if (!this.config.cwd)
            return null;
        const original = process.cwd();
        try {
            process.chdir(this.config.cwd);
            return original;
        }
        catch {
            return null; /* bad cwd — stay where we are */
        }
    }
    /** Restore the cwd returned by _enterCwd() (no-op for null). */
    _leaveCwd(restoreDir) {
        if (!restoreDir)
            return;
        try {
            process.chdir(restoreDir);
        }
        catch { /* ignore */ }
    }
    /** Run a single prompt and aggregate events into an AgentResult. */
    async run(prompt) {
        await this.init();
        const { query } = await import('../query.js');
        const restoreDir = this._enterCwd();
        const config = this._queryConfig();
        let text = '';
        const toolCalls = [];
        let cost = 0;
        let inputTokens = 0;
        let outputTokens = 0;
        let turns = 0;
        try {
            for await (const event of query(prompt, config)) {
                switch (event.type) {
                    case 'text_delta':
                        text += event.content;
                        break;
                    case 'tool_call_end':
                        // NOTE(review): query() may not attach toolName to this event —
                        // verify upstream; 'unknown' is the fallback either way.
                        toolCalls.push({
                            toolName: event.toolName ?? 'unknown',
                            output: event.output ?? '',
                            isError: event.isError ?? false,
                        });
                        break;
                    case 'cost_update':
                        cost += event.cost ?? 0;
                        inputTokens += event.inputTokens ?? 0;
                        outputTokens += event.outputTokens ?? 0;
                        break;
                    case 'turn_complete':
                        turns++;
                        break;
                }
            }
        }
        finally {
            this._leaveCwd(restoreDir);
        }
        return { text, toolCalls, cost, inputTokens, outputTokens, turns };
    }
    /** Stream events from a prompt. */
    async *stream(prompt) {
        await this.init();
        const { query } = await import('../query.js');
        // BUG FIX: stream() previously chdir'd into config.cwd but never
        // restored the original directory (run() did). Restore on completion
        // or error, matching run()'s behavior.
        const restoreDir = this._enterCwd();
        try {
            yield* query(prompt, this._queryConfig());
        }
        finally {
            this._leaveCwd(restoreDir);
        }
    }
    /** Stop the agent (cleanup); drops cached provider/tools so next use re-inits. */
    stop() {
        this.provider = null;
        this.tools = null;
        this.initPromise = null;
    }
}
141
// ── Factory ──
/** Create a new agent instance */
export const createAgent = (config) => new Agent(config);
146
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1,61 @@
1
+ /**
2
+ * GAN-Style Evaluator Loop — Generator→Evaluator adversarial refinement.
3
+ *
4
+ * Inspired by Anthropic's three-agent harness architecture:
5
+ * "AI models are inherently poor at self-critique; they tend to rate
6
+ * their own work favorably." Externalizing critique to a separate
7
+ * Evaluator agent produces measurably better output.
8
+ *
9
+ * Flow:
10
+ * 1. Generator produces initial output
11
+ * 2. Evaluator scores against rubric criteria
12
+ * 3. If below threshold, Generator refines based on feedback
13
+ * 4. Repeat until pass or max iterations reached
14
+ */
15
+ import type { Provider } from '../providers/base.js';
16
+ import type { Tools } from '../Tool.js';
17
+ import type { PermissionMode } from '../types/permissions.js';
18
/** A single scoring dimension in the evaluation rubric. */
export type EvaluationCriterion = {
    /** Criterion identifier, e.g. 'correctness' */
    name: string;
    /** Relative weight used when averaging scores (default rubric's weights sum to 1) */
    weight: number;
    /** What the evaluator should look for when scoring this criterion */
    description: string;
};
/** Rubric the Evaluator agent scores against. */
export type EvaluationRubric = {
    criteria: EvaluationCriterion[];
    /** Weighted score required to stop iterating */
    passThreshold: number;
};
/** One criterion's score plus the evaluator's feedback. */
export type EvaluationScore = {
    criterion: string;
    /** Score on a 0-1 scale */
    score: number;
    feedback: string;
};
/** Outcome of a full Generator→Evaluator→Refine run. */
export type EvaluatorResult = {
    /** Final generator output (from the last iteration) */
    output: string;
    /** Per-criterion scores from the last evaluation */
    scores: EvaluationScore[];
    /** Weight-averaged score */
    weightedScore: number;
    /** True when weightedScore reached the rubric's pass threshold */
    passed: boolean;
    /** Number of generate/evaluate iterations performed */
    iterations: number;
    /** Human-readable log of below-threshold iterations */
    refinements: string[];
};
/** Default rubric: correctness/completeness/quality/safety, pass at 0.7. */
export declare const DEFAULT_RUBRIC: EvaluationRubric;
export declare class EvaluatorLoop {
    private provider;
    private tools;
    private systemPrompt;
    private permissionMode;
    private model?;
    private rubric;
    private maxIterations;
    constructor(provider: Provider, tools: Tools, systemPrompt: string, permissionMode: PermissionMode, model?: string | undefined, rubric?: EvaluationRubric, maxIterations?: number);
    /**
     * Run the full Generator→Evaluator→Refine cycle.
     * Returns early with passed=true once the weighted score reaches the
     * rubric threshold; otherwise returns the last output after maxIterations.
     */
    run(task: string): Promise<EvaluatorResult>;
    private generate;
    private evaluate;
    private calculateWeightedScore;
    private defaultScores;
}
/** Format evaluator results for display */
export declare function formatEvaluatorResult(result: EvaluatorResult): string;
61
+ //# sourceMappingURL=EvaluatorLoop.d.ts.map
@@ -0,0 +1,157 @@
1
+ /**
2
+ * GAN-Style Evaluator Loop — Generator→Evaluator adversarial refinement.
3
+ *
4
+ * Inspired by Anthropic's three-agent harness architecture:
5
+ * "AI models are inherently poor at self-critique; they tend to rate
6
+ * their own work favorably." Externalizing critique to a separate
7
+ * Evaluator agent produces measurably better output.
8
+ *
9
+ * Flow:
10
+ * 1. Generator produces initial output
11
+ * 2. Evaluator scores against rubric criteria
12
+ * 3. If below threshold, Generator refines based on feedback
13
+ * 4. Repeat until pass or max iterations reached
14
+ */
15
// ── Default Rubric ──
export const DEFAULT_RUBRIC = {
    criteria: [
        { name: 'correctness', weight: 0.4, description: 'Does the output correctly address the task? Are there logical errors?' },
        { name: 'completeness', weight: 0.3, description: 'Is the solution complete? Any missing edge cases or requirements?' },
        { name: 'quality', weight: 0.2, description: 'Is the code clean, well-structured, and following best practices?' },
        { name: 'safety', weight: 0.1, description: 'Are there security issues, unsafe patterns, or potential bugs?' },
    ],
    passThreshold: 0.7,
};
// ── Evaluator Loop ──
export class EvaluatorLoop {
    provider;
    tools;
    systemPrompt;
    permissionMode;
    model;
    rubric;
    maxIterations;
    constructor(provider, tools, systemPrompt, permissionMode, model, rubric = DEFAULT_RUBRIC, maxIterations = 3) {
        this.provider = provider;
        this.tools = tools;
        this.systemPrompt = systemPrompt;
        this.permissionMode = permissionMode;
        this.model = model;
        this.rubric = rubric;
        this.maxIterations = maxIterations;
    }
    /**
     * Run the full Generator→Evaluator→Refine cycle.
     * Stops early once the weighted score reaches the rubric's pass threshold;
     * otherwise returns the last output after maxIterations with passed=false.
     */
    async run(task) {
        const refinements = [];
        let currentOutput = '';
        let scores = [];
        let weightedScore = 0;
        for (let iteration = 1; iteration <= this.maxIterations; iteration++) {
            // ── Generate (iteration 1: raw task; later: task + evaluator feedback) ──
            const generatorPrompt = iteration === 1
                ? task
                : `${task}\n\n[Evaluator feedback from iteration ${iteration - 1}]:\n${scores.map(s => `${s.criterion}: ${s.score}/1.0 — ${s.feedback}`).join('\n')}\n\nPlease refine your output based on this feedback.`;
            currentOutput = await this.generate(generatorPrompt);
            // ── Evaluate ──
            scores = await this.evaluate(task, currentOutput);
            weightedScore = this.calculateWeightedScore(scores);
            if (weightedScore >= this.rubric.passThreshold) {
                return {
                    output: currentOutput,
                    scores,
                    weightedScore,
                    passed: true,
                    iterations: iteration,
                    refinements,
                };
            }
            refinements.push(`Iteration ${iteration}: score ${weightedScore.toFixed(2)} — refining`);
        }
        // Max iterations reached — return best effort
        return {
            output: currentOutput,
            scores,
            weightedScore,
            passed: false,
            iterations: this.maxIterations,
            refinements,
        };
    }
    /** Run the Generator agent on a prompt and collect its text output. */
    async generate(prompt) {
        const { query } = await import('../query.js');
        const config = {
            provider: this.provider,
            tools: this.tools,
            systemPrompt: this.systemPrompt,
            permissionMode: this.permissionMode,
            model: this.model,
            maxTurns: 15,
        };
        let output = '';
        for await (const event of query(prompt, config)) {
            if (event.type === 'text_delta')
                output += event.content;
        }
        return output;
    }
    /**
     * Ask the provider to score `output` against the rubric.
     * Falls back to neutral defaultScores() when the response has no parseable
     * JSON array. Parsed scores are validated and clamped to the 0-1 scale —
     * LLMs sometimes return out-of-range values, which would otherwise crash
     * downstream consumers (e.g. the display bar renderer).
     */
    async evaluate(task, output) {
        const evaluationPrompt = `You are a code evaluator. Score the following output on a 0-1 scale for each criterion.

Task: ${task.slice(0, 500)}

Output to evaluate:
${output.slice(0, 3000)}

Criteria:
${this.rubric.criteria.map(c => `- ${c.name} (weight: ${c.weight}): ${c.description}`).join('\n')}

Respond ONLY with a JSON array: [{"criterion": "name", "score": 0.8, "feedback": "brief explanation"}, ...]`;
        const response = await this.provider.complete([{ role: 'user', content: evaluationPrompt, uuid: `eval-${Date.now()}`, timestamp: Date.now() }], 'You are a strict code evaluator. Respond ONLY with valid JSON. Be critical and specific.', undefined, this.model);
        try {
            const jsonMatch = response.content.match(/\[[\s\S]*\]/);
            if (!jsonMatch)
                return this.defaultScores();
            const parsed = JSON.parse(jsonMatch[0]);
            if (!Array.isArray(parsed))
                return this.defaultScores();
            return parsed
                .filter(s => s && typeof s.criterion === 'string' && typeof s.score === 'number' && Number.isFinite(s.score))
                .map(s => ({
                    criterion: s.criterion,
                    score: Math.min(1, Math.max(0, s.score)),
                    feedback: typeof s.feedback === 'string' ? s.feedback : '',
                }));
        }
        catch {
            return this.defaultScores();
        }
    }
    /** Weight-average the scores; criteria missing from `scores` count as 0.5. */
    calculateWeightedScore(scores) {
        let total = 0;
        for (const criterion of this.rubric.criteria) {
            const score = scores.find(s => s.criterion === criterion.name);
            total += (score?.score ?? 0.5) * criterion.weight;
        }
        return total;
    }
    /** Neutral 0.5 score for every rubric criterion (evaluation fallback). */
    defaultScores() {
        return this.rubric.criteria.map(c => ({
            criterion: c.name,
            score: 0.5,
            feedback: 'Could not evaluate (parsing error)',
        }));
    }
}
139
/**
 * Format evaluator results for display: pass/fail header, a 10-cell score
 * bar per criterion, and the refinement log (if any).
 */
export function formatEvaluatorResult(result) {
    const lines = [];
    lines.push(`Evaluator: ${result.passed ? 'PASSED' : 'NEEDS IMPROVEMENT'} (${result.weightedScore.toFixed(2)}/${1.0})`);
    lines.push(`Iterations: ${result.iterations}`);
    lines.push('');
    for (const s of result.scores) {
        // BUG FIX: clamp the filled-cell count to [0, 10]. String.repeat throws
        // a RangeError on negative counts, which an out-of-range score from the
        // evaluator (e.g. 1.2 or -0.1) previously triggered.
        const filled = Math.min(10, Math.max(0, Math.round(s.score * 10)));
        const bar = '█'.repeat(filled) + '░'.repeat(10 - filled);
        lines.push(`  ${s.criterion.padEnd(15)} ${bar} ${s.score.toFixed(1)} — ${s.feedback}`);
    }
    if (result.refinements.length > 0) {
        lines.push('');
        lines.push('Refinements:');
        for (const r of result.refinements)
            lines.push(`  ${r}`);
    }
    return lines.join('\n');
}
157
+ //# sourceMappingURL=EvaluatorLoop.js.map
@@ -0,0 +1,61 @@
1
+ /**
2
+ * Meta-Harness — self-optimizing agent harness.
3
+ *
4
+ * Inspired by AutoAgent (which hit #1 on SpreadsheetBench by letting
5
+ * the agent optimize its own harness overnight).
6
+ *
7
+ * Flow:
8
+ * 1. Run benchmark → get baseline score
9
+ * 2. Ask LLM to suggest a config change
10
+ * 3. Apply change → re-run benchmark
11
+ * 4. If score improved, keep; otherwise revert
12
+ * 5. Repeat for N iterations
13
+ *
14
+ * What it optimizes:
15
+ * - System prompt (trim, rephrase, add instructions)
16
+ * - Tool selection (which tools are core vs deferred)
17
+ * - Model router configuration
18
+ * - Compression strategy
19
+ * - Permission rules
20
+ */
21
+ import type { Provider } from '../providers/base.js';
22
/** Outcome of one benchmark run. */
export type BenchmarkResult = {
    /** Numeric score extracted from the benchmark output */
    score: number;
    /** Raw details describing the run */
    details: string;
    /** Wall-clock duration of the benchmark in milliseconds */
    durationMs: number;
};
/** One configuration change tried during optimization. */
export type OptimizationChange = {
    /** Human-readable summary of the change */
    description: string;
    /** Config field that was modified */
    field: string;
    oldValue: unknown;
    newValue: unknown;
    /** Score delta attributed to this change — sign convention per implementation; confirm in MetaHarness.js */
    impact: number;
};
/** Summary of a full optimization loop. */
export type OptimizationResult = {
    /** Benchmark score before any changes */
    initialScore: number;
    /** Benchmark score after the last iteration */
    finalScore: number;
    /** Number of suggest→apply→benchmark iterations performed */
    iterations: number;
    /** Changes recorded during the run — presumably only the kept (improving) ones; verify against implementation */
    changes: OptimizationChange[];
    totalDurationMs: number;
};
/**
 * Run a benchmark command and extract a score.
 * Score is derived from test results: pass_rate + speed_bonus.
 */
export declare function runBenchmark(command: string): Promise<BenchmarkResult>;
export declare class MetaHarness {
    private provider;
    private benchmarkCommand;
    private model?;
    constructor(provider: Provider, benchmarkCommand: string, model?: string | undefined);
    /**
     * Run the optimization loop: benchmark → LLM-suggested config change →
     * re-benchmark, keeping improvements and reverting regressions, for
     * `iterations` rounds (per the flow described in the file header).
     */
    optimize(iterations: number): Promise<OptimizationResult>;
    private suggestChange;
    private applyChange;
    private revertChange;
}
/** Format optimization results for display */
export declare function formatOptimizationResult(result: OptimizationResult): string;
61
+ //# sourceMappingURL=MetaHarness.d.ts.map