@zhijiewang/openharness 1.4.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,58 @@
1
+ /**
2
+ * Session Traces — structured observability for agent sessions.
3
+ *
4
+ * Every query turn, tool call, LLM stream, and compression event
5
+ * generates a trace span. Traces enable debugging, replay, and
6
+ * performance analysis.
7
+ *
8
+ * Compatible with OpenTelemetry export format.
9
+ */
10
+ export type TraceSpan = {
11
+ spanId: string;
12
+ parentSpanId?: string;
13
+ name: string;
14
+ startTime: number;
15
+ endTime: number;
16
+ durationMs: number;
17
+ attributes: Record<string, unknown>;
18
+ status: 'ok' | 'error';
19
+ };
20
+ export type TraceEvent = {
21
+ name: string;
22
+ timestamp: number;
23
+ attributes?: Record<string, unknown>;
24
+ };
25
+ export declare class SessionTracer {
26
+ private sessionId;
27
+ private spans;
28
+ private activeSpans;
29
+ private spanCounter;
30
+ constructor(sessionId: string);
31
+ /** Start a new span. Returns the span ID. */
32
+ startSpan(name: string, attributes?: Record<string, unknown>, parentSpanId?: string): string;
33
+ /** End a span and record it. */
34
+ endSpan(spanId: string, status?: 'ok' | 'error', extraAttributes?: Record<string, unknown>): TraceSpan | null;
35
+ /** Get all completed spans */
36
+ getSpans(): TraceSpan[];
37
+ /** Get a summary of the trace */
38
+ getSummary(): {
39
+ totalSpans: number;
40
+ totalDurationMs: number;
41
+ spansByName: Record<string, {
42
+ count: number;
43
+ totalMs: number;
44
+ }>;
45
+ errors: number;
46
+ };
47
+ /** Persist a span to the trace file */
48
+ private persistSpan;
49
+ }
50
+ /** Load trace spans for a session */
51
+ export declare function loadTrace(sessionId: string): TraceSpan[];
52
+ /** List all sessions with traces */
53
+ export declare function listTracedSessions(): string[];
54
+ /** Format trace for display */
55
+ export declare function formatTrace(spans: TraceSpan[]): string;
56
+ /** Export trace in OpenTelemetry-compatible format */
57
+ export declare function exportTraceOTLP(sessionId: string, spans: TraceSpan[]): object;
58
+ //# sourceMappingURL=traces.d.ts.map
@@ -0,0 +1,178 @@
1
+ /**
2
+ * Session Traces — structured observability for agent sessions.
3
+ *
4
+ * Every query turn, tool call, LLM stream, and compression event
5
+ * generates a trace span. Traces enable debugging, replay, and
6
+ * performance analysis.
7
+ *
8
+ * Compatible with OpenTelemetry export format.
9
+ */
10
+ import { appendFileSync, mkdirSync, existsSync, readFileSync, readdirSync } from 'node:fs';
11
+ import { join } from 'node:path';
12
+ import { homedir } from 'node:os';
13
+ const TRACE_DIR = join(homedir(), '.oh', 'traces');
14
+ // ── Tracer ──
15
+ export class SessionTracer {
16
+ sessionId;
17
+ spans = [];
18
+ activeSpans = new Map();
19
+ spanCounter = 0;
20
+ constructor(sessionId) {
21
+ this.sessionId = sessionId;
22
+ }
23
+ /** Start a new span. Returns the span ID. */
24
+ startSpan(name, attributes = {}, parentSpanId) {
25
+ const spanId = `span-${++this.spanCounter}`;
26
+ this.activeSpans.set(spanId, { name, startTime: Date.now(), parentSpanId, attributes });
27
+ return spanId;
28
+ }
29
+ /** End a span and record it. */
30
+ endSpan(spanId, status = 'ok', extraAttributes) {
31
+ const active = this.activeSpans.get(spanId);
32
+ if (!active)
33
+ return null;
34
+ this.activeSpans.delete(spanId);
35
+ const endTime = Date.now();
36
+ const span = {
37
+ spanId,
38
+ parentSpanId: active.parentSpanId,
39
+ name: active.name,
40
+ startTime: active.startTime,
41
+ endTime,
42
+ durationMs: endTime - active.startTime,
43
+ attributes: { ...active.attributes, ...extraAttributes },
44
+ status,
45
+ };
46
+ this.spans.push(span);
47
+ this.persistSpan(span);
48
+ return span;
49
+ }
50
+ /** Get all completed spans */
51
+ getSpans() {
52
+ return [...this.spans];
53
+ }
54
+ /** Get a summary of the trace */
55
+ getSummary() {
56
+ const spansByName = {};
57
+ let errors = 0;
58
+ let minStart = Infinity;
59
+ let maxEnd = 0;
60
+ for (const span of this.spans) {
61
+ const entry = spansByName[span.name] ?? { count: 0, totalMs: 0 };
62
+ entry.count++;
63
+ entry.totalMs += span.durationMs;
64
+ spansByName[span.name] = entry;
65
+ if (span.status === 'error')
66
+ errors++;
67
+ if (span.startTime < minStart)
68
+ minStart = span.startTime;
69
+ if (span.endTime > maxEnd)
70
+ maxEnd = span.endTime;
71
+ }
72
+ return {
73
+ totalSpans: this.spans.length,
74
+ totalDurationMs: maxEnd > minStart ? maxEnd - minStart : 0,
75
+ spansByName,
76
+ errors,
77
+ };
78
+ }
79
+ /** Persist a span to the trace file */
80
+ persistSpan(span) {
81
+ try {
82
+ mkdirSync(TRACE_DIR, { recursive: true });
83
+ const file = join(TRACE_DIR, `${this.sessionId}.jsonl`);
84
+ appendFileSync(file, JSON.stringify(span) + '\n');
85
+ }
86
+ catch { /* never crash on tracing failure */ }
87
+ }
88
+ }
89
+ // ── Trace Loading ──
90
+ /** Load trace spans for a session */
91
+ export function loadTrace(sessionId) {
92
+ const file = join(TRACE_DIR, `${sessionId}.jsonl`);
93
+ if (!existsSync(file))
94
+ return [];
95
+ try {
96
+ return readFileSync(file, 'utf-8')
97
+ .split('\n')
98
+ .filter(Boolean)
99
+ .map(line => JSON.parse(line));
100
+ }
101
+ catch {
102
+ return [];
103
+ }
104
+ }
105
+ /** List all sessions with traces */
106
+ export function listTracedSessions() {
107
+ if (!existsSync(TRACE_DIR))
108
+ return [];
109
+ return readdirSync(TRACE_DIR)
110
+ .filter(f => f.endsWith('.jsonl'))
111
+ .map(f => f.replace('.jsonl', ''));
112
+ }
113
+ /** Format trace for display */
114
+ export function formatTrace(spans) {
115
+ if (spans.length === 0)
116
+ return 'No trace spans recorded.';
117
+ const lines = [`Trace (${spans.length} spans):\n`];
118
+ // Group by parent for tree display
119
+ const roots = spans.filter(s => !s.parentSpanId);
120
+ const children = new Map();
121
+ for (const s of spans) {
122
+ if (s.parentSpanId) {
123
+ const list = children.get(s.parentSpanId) ?? [];
124
+ list.push(s);
125
+ children.set(s.parentSpanId, list);
126
+ }
127
+ }
128
+ function renderSpan(span, indent) {
129
+ const status = span.status === 'error' ? '✗' : '✓';
130
+ const pad = ' '.repeat(indent);
131
+ const attrs = Object.entries(span.attributes)
132
+ .filter(([, v]) => v !== undefined)
133
+ .map(([k, v]) => `${k}=${String(v).slice(0, 30)}`)
134
+ .join(' ');
135
+ lines.push(`${pad}${status} ${span.name} (${span.durationMs}ms) ${attrs}`);
136
+ const kids = children.get(span.spanId) ?? [];
137
+ for (const kid of kids)
138
+ renderSpan(kid, indent + 1);
139
+ }
140
+ for (const root of roots)
141
+ renderSpan(root, 0);
142
+ // Summary
143
+ const totalMs = spans.reduce((sum, s) => sum + s.durationMs, 0);
144
+ const errors = spans.filter(s => s.status === 'error').length;
145
+ lines.push('');
146
+ lines.push(`Total: ${spans.length} spans, ${totalMs}ms, ${errors} errors`);
147
+ return lines.join('\n');
148
+ }
149
+ /** Export trace in OpenTelemetry-compatible format */
150
+ export function exportTraceOTLP(sessionId, spans) {
151
+ return {
152
+ resourceSpans: [{
153
+ resource: {
154
+ attributes: [
155
+ { key: 'service.name', value: { stringValue: 'openharness' } },
156
+ { key: 'session.id', value: { stringValue: sessionId } },
157
+ ],
158
+ },
159
+ scopeSpans: [{
160
+ scope: { name: 'openharness.agent' },
161
+ spans: spans.map(s => ({
162
+ traceId: sessionId.padEnd(32, '0').slice(0, 32),
163
+ spanId: s.spanId.padEnd(16, '0').slice(0, 16),
164
+ parentSpanId: s.parentSpanId?.padEnd(16, '0').slice(0, 16),
165
+ name: s.name,
166
+ startTimeUnixNano: s.startTime * 1_000_000,
167
+ endTimeUnixNano: s.endTime * 1_000_000,
168
+ attributes: Object.entries(s.attributes).map(([k, v]) => ({
169
+ key: k,
170
+ value: { stringValue: String(v) },
171
+ })),
172
+ status: { code: s.status === 'ok' ? 1 : 2 },
173
+ })),
174
+ }],
175
+ }],
176
+ };
177
+ }
178
+ //# sourceMappingURL=traces.js.map
@@ -0,0 +1,56 @@
1
+ /**
2
+ * Active Context Management — proactive control of the context window.
3
+ *
4
+ * Unlike reactive compression (trigger at 80% full), active management:
5
+ * - Enforces per-tool token budgets (no single tool consumes the window)
6
+ * - Folds sub-agent output to summaries (parent sees prompt + result only)
7
+ * - Pre-compresses before large tool calls
8
+ * - Auto-summarizes when approaching limits
9
+ *
10
+ * Based on the "context engineering" pattern from Anthropic's harness research.
11
+ */
12
+ import type { Message } from '../types/message.js';
13
+ export type ContextBudget = {
14
+ /** Max tokens for a single tool output */
15
+ toolOutputMax: number;
16
+ /** Per-tool overrides */
17
+ perTool: Record<string, number>;
18
+ /** Whether to auto-fold sub-agent results */
19
+ autoFold: boolean;
20
+ /** Context usage threshold to trigger proactive compression (0-1) */
21
+ proactiveThreshold: number;
22
+ };
23
+ export declare class ContextManager {
24
+ private budget;
25
+ private model;
26
+ constructor(budget?: Partial<ContextBudget>, model?: string);
27
+ /** Get the token budget for a specific tool */
28
+ getToolBudget(toolName: string): number;
29
+ /** Set a per-tool token budget */
30
+ setToolBudget(toolName: string, maxTokens: number): void;
31
+ /**
32
+ * Truncate tool output to its budget.
33
+ * Keeps the first portion and last portion, with a truncation marker.
34
+ */
35
+ enforceToolBudget(toolName: string, output: string): string;
36
+ /**
37
+ * Fold a sub-agent's full output into a concise summary.
38
+ * Keeps the first 200 chars as context + truncates the rest.
39
+ */
40
+ foldSubagentResult(agentId: string, fullOutput: string): string;
41
+ /**
42
+ * Check if we should proactively compress before a tool call.
43
+ * Returns true if estimated context usage exceeds the proactive threshold.
44
+ */
45
+ shouldPreCompress(messages: Message[], estimatedOutputTokens: number, estimateTokens: (text: string) => number): boolean;
46
+ /**
47
+ * Estimate how many tokens a tool call might produce.
48
+ * Based on historical averages for each tool type.
49
+ */
50
+ estimateToolOutputTokens(toolName: string): number;
51
+ /** Whether auto-folding is enabled */
52
+ get autoFoldEnabled(): boolean;
53
+ /** Get the full budget config */
54
+ get config(): ContextBudget;
55
+ }
56
+ //# sourceMappingURL=context-manager.d.ts.map
@@ -0,0 +1,111 @@
1
+ /**
2
+ * Active Context Management — proactive control of the context window.
3
+ *
4
+ * Unlike reactive compression (trigger at 80% full), active management:
5
+ * - Enforces per-tool token budgets (no single tool consumes the window)
6
+ * - Folds sub-agent output to summaries (parent sees prompt + result only)
7
+ * - Pre-compresses before large tool calls
8
+ * - Auto-summarizes when approaching limits
9
+ *
10
+ * Based on the "context engineering" pattern from Anthropic's harness research.
11
+ */
12
+ import { getContextWindow } from '../harness/cost.js';
13
+ const DEFAULT_BUDGET = {
14
+ toolOutputMax: 10_000,
15
+ perTool: {},
16
+ autoFold: true,
17
+ proactiveThreshold: 0.6,
18
+ };
19
+ // ── Context Manager ──
20
+ export class ContextManager {
21
+ budget;
22
+ model;
23
+ constructor(budget, model) {
24
+ this.budget = { ...DEFAULT_BUDGET, ...budget };
25
+ this.model = model;
26
+ }
27
+ /** Get the token budget for a specific tool */
28
+ getToolBudget(toolName) {
29
+ return this.budget.perTool[toolName] ?? this.budget.toolOutputMax;
30
+ }
31
+ /** Set a per-tool token budget */
32
+ setToolBudget(toolName, maxTokens) {
33
+ this.budget.perTool[toolName] = maxTokens;
34
+ }
35
+ /**
36
+ * Truncate tool output to its budget.
37
+ * Keeps the first portion and last portion, with a truncation marker.
38
+ */
39
+ enforceToolBudget(toolName, output) {
40
+ const budget = this.getToolBudget(toolName);
41
+ // Rough estimate: 4 chars ≈ 1 token
42
+ const maxChars = budget * 4;
43
+ if (output.length <= maxChars)
44
+ return output;
45
+ const keepHead = Math.floor(maxChars * 0.7);
46
+ const keepTail = Math.floor(maxChars * 0.2);
47
+ const truncated = output.length - keepHead - keepTail;
48
+ return output.slice(0, keepHead)
49
+ + `\n\n[...${truncated.toLocaleString()} chars truncated (budget: ${budget} tokens)...]\n\n`
50
+ + output.slice(-keepTail);
51
+ }
52
+ /**
53
+ * Fold a sub-agent's full output into a concise summary.
54
+ * Keeps the first 200 chars as context + truncates the rest.
55
+ */
56
+ foldSubagentResult(agentId, fullOutput) {
57
+ if (!this.budget.autoFold)
58
+ return fullOutput;
59
+ // Short outputs don't need folding
60
+ if (fullOutput.length < 2000)
61
+ return fullOutput;
62
+ // Keep first ~500 chars (task context) + last ~500 chars (conclusion)
63
+ const head = fullOutput.slice(0, 500);
64
+ const tail = fullOutput.slice(-500);
65
+ const foldedChars = fullOutput.length - 1000;
66
+ return `${head}\n\n[...${foldedChars} chars folded from sub-agent ${agentId}...]\n\n${tail}`;
67
+ }
68
+ /**
69
+ * Check if we should proactively compress before a tool call.
70
+ * Returns true if estimated context usage exceeds the proactive threshold.
71
+ */
72
+ shouldPreCompress(messages, estimatedOutputTokens, estimateTokens) {
73
+ const contextWindow = getContextWindow(this.model);
74
+ let currentTokens = 0;
75
+ for (const m of messages) {
76
+ currentTokens += estimateTokens(m.content) + 10;
77
+ }
78
+ const projected = currentTokens + estimatedOutputTokens;
79
+ const usage = projected / contextWindow;
80
+ return usage > this.budget.proactiveThreshold;
81
+ }
82
+ /**
83
+ * Estimate how many tokens a tool call might produce.
84
+ * Based on historical averages for each tool type.
85
+ */
86
+ estimateToolOutputTokens(toolName) {
87
+ const estimates = {
88
+ Bash: 2000,
89
+ Read: 3000,
90
+ Grep: 1500,
91
+ Glob: 500,
92
+ LS: 300,
93
+ Edit: 200,
94
+ Write: 200,
95
+ Agent: 5000,
96
+ Pipeline: 3000,
97
+ WebFetch: 4000,
98
+ WebSearch: 1000,
99
+ };
100
+ return estimates[toolName] ?? 1000;
101
+ }
102
+ /** Whether auto-folding is enabled */
103
+ get autoFoldEnabled() {
104
+ return this.budget.autoFold;
105
+ }
106
+ /** Get the full budget config */
107
+ get config() {
108
+ return { ...this.budget };
109
+ }
110
+ }
111
+ //# sourceMappingURL=context-manager.js.map
@@ -9,6 +9,7 @@
9
9
  */
10
10
  import { toolToAPIFormat } from "../Tool.js";
11
11
  import { DeferredTool } from "../DeferredTool.js";
12
+ import { ContextManager } from "./context-manager.js";
12
13
  import { createAssistantMessage, createUserMessage } from "../types/message.js";
13
14
  import { StreamingToolExecutor } from "../services/StreamingToolExecutor.js";
14
15
  import { getContextWindow } from "../harness/cost.js";
@@ -31,6 +32,7 @@ export async function* query(userMessage, config, existingMessages = []) {
31
32
  askUserQuestion: config.askUserQuestion,
32
33
  };
33
34
  const estimateTokens = makeTokenEstimator(config.provider);
35
+ const contextManager = new ContextManager(undefined, config.model);
34
36
  // Check provider capabilities
35
37
  const modelInfo = config.provider.getModelInfo?.(config.model ?? '');
36
38
  const toolsSupported = !modelInfo || modelInfo.supportsTools;
@@ -193,7 +195,9 @@ export async function* query(userMessage, config, existingMessages = []) {
193
195
  }
194
196
  for (const { toolCall: tc, result } of completedResults) {
195
197
  yield { type: "tool_call_end", callId: tc.id, output: result.output, isError: result.isError };
196
- state.messages.push(createToolResultMessage({ callId: tc.id, output: result.output, isError: result.isError }));
198
+ // Apply context budget to tool output
199
+ const budgetedOutput = contextManager.enforceToolBudget(tc.toolName, result.output);
200
+ state.messages.push(createToolResultMessage({ callId: tc.id, output: budgetedOutput, isError: result.isError }));
197
201
  }
198
202
  // Execute remaining tools not started during streaming
199
203
  const remaining = toolCalls.filter(tc => !executedIds.has(tc.id));
@@ -0,0 +1,75 @@
1
+ /**
2
+ * openHarness Agent SDK — programmatic API for building AI agents.
3
+ *
4
+ * Usage:
5
+ * import { createAgent } from '@zhijiewang/openharness';
6
+ *
7
+ * const agent = createAgent({
8
+ * provider: 'anthropic',
9
+ * model: 'claude-sonnet-4-6',
10
+ * apiKey: process.env.ANTHROPIC_API_KEY,
11
+ * });
12
+ *
13
+ * const result = await agent.run('Fix the failing tests');
14
+ * console.log(result.text);
15
+ */
16
+ import type { StreamEvent } from '../types/events.js';
17
+ import type { PermissionMode } from '../types/permissions.js';
18
+ export type AgentConfig = {
19
+ /** Provider name: 'anthropic', 'openai', 'ollama', 'openrouter', etc. */
20
+ provider: string;
21
+ /** Model identifier */
22
+ model: string;
23
+ /** API key (or use environment variable) */
24
+ apiKey?: string;
25
+ /** Custom base URL */
26
+ baseUrl?: string;
27
+ /** Tools to include: 'all', 'read-only', or array of tool names */
28
+ tools?: 'all' | 'read-only' | string[];
29
+ /** Permission mode (default: 'trust') */
30
+ permissionMode?: PermissionMode;
31
+ /** Custom system prompt */
32
+ systemPrompt?: string;
33
+ /** Max turns per run */
34
+ maxTurns?: number;
35
+ /** Working directory */
36
+ cwd?: string;
37
+ };
38
+ export type AgentResult = {
39
+ /** Final text output */
40
+ text: string;
41
+ /** Tool calls made during execution */
42
+ toolCalls: Array<{
43
+ toolName: string;
44
+ output: string;
45
+ isError: boolean;
46
+ }>;
47
+ /** Total cost in USD */
48
+ cost: number;
49
+ /** Total input tokens */
50
+ inputTokens: number;
51
+ /** Total output tokens */
52
+ outputTokens: number;
53
+ /** Number of turns taken */
54
+ turns: number;
55
+ };
56
+ export declare class Agent {
57
+ private provider;
58
+ private tools;
59
+ private config;
60
+ private initialized;
61
+ constructor(config: AgentConfig);
62
+ /** Initialize provider and tools (lazy, on first use) */
63
+ private init;
64
+ /** Run a single prompt and return the result */
65
+ run(prompt: string): Promise<AgentResult>;
66
+ /** Stream events from a prompt */
67
+ stream(prompt: string): AsyncGenerator<StreamEvent, void>;
68
+ /** Stop the agent (cleanup) */
69
+ stop(): void;
70
+ }
71
+ /** Create a new agent instance */
72
+ export declare function createAgent(config: AgentConfig): Agent;
73
+ export type { StreamEvent } from '../types/events.js';
74
+ export type { PermissionMode } from '../types/permissions.js';
75
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1,135 @@
1
+ /**
2
+ * openHarness Agent SDK — programmatic API for building AI agents.
3
+ *
4
+ * Usage:
5
+ * import { createAgent } from '@zhijiewang/openharness';
6
+ *
7
+ * const agent = createAgent({
8
+ * provider: 'anthropic',
9
+ * model: 'claude-sonnet-4-6',
10
+ * apiKey: process.env.ANTHROPIC_API_KEY,
11
+ * });
12
+ *
13
+ * const result = await agent.run('Fix the failing tests');
14
+ * console.log(result.text);
15
+ */
16
+ // ── Agent Class ──
17
+ export class Agent {
18
+ provider = null;
19
+ tools = null;
20
+ config;
21
+ initialized = false;
22
+ constructor(config) {
23
+ this.config = {
24
+ permissionMode: 'trust',
25
+ maxTurns: 20,
26
+ ...config,
27
+ };
28
+ }
29
+ /** Initialize provider and tools (lazy, on first use) */
30
+ async init() {
31
+ if (this.initialized)
32
+ return;
33
+ const { createProvider } = await import('../providers/index.js');
34
+ const { getAllTools } = await import('../tools.js');
35
+ const overrides = {};
36
+ if (this.config.apiKey)
37
+ overrides.apiKey = this.config.apiKey;
38
+ if (this.config.baseUrl)
39
+ overrides.baseUrl = this.config.baseUrl;
40
+ const { provider } = await createProvider(this.config.model, Object.keys(overrides).length > 0 ? overrides : undefined);
41
+ this.provider = provider;
42
+ // Filter tools
43
+ let tools = getAllTools();
44
+ if (this.config.tools === 'read-only') {
45
+ const readOnlyNames = new Set(['Read', 'Glob', 'Grep', 'LS', 'ImageRead', 'WebSearch', 'WebFetch']);
46
+ tools = tools.filter(t => readOnlyNames.has(t.name));
47
+ }
48
+ else if (Array.isArray(this.config.tools)) {
49
+ const allowed = new Set(this.config.tools.map(n => n.toLowerCase()));
50
+ tools = tools.filter(t => allowed.has(t.name.toLowerCase()));
51
+ }
52
+ this.tools = tools;
53
+ this.initialized = true;
54
+ }
55
+ /** Run a single prompt and return the result */
56
+ async run(prompt) {
57
+ await this.init();
58
+ const { query } = await import('../query.js');
59
+ if (this.config.cwd) {
60
+ try {
61
+ process.chdir(this.config.cwd);
62
+ }
63
+ catch { /* ignore */ }
64
+ }
65
+ const config = {
66
+ provider: this.provider,
67
+ tools: this.tools,
68
+ systemPrompt: this.config.systemPrompt ?? 'You are a helpful coding agent.',
69
+ permissionMode: this.config.permissionMode,
70
+ model: this.config.model,
71
+ maxTurns: this.config.maxTurns,
72
+ };
73
+ let text = '';
74
+ const toolCalls = [];
75
+ let cost = 0;
76
+ let inputTokens = 0;
77
+ let outputTokens = 0;
78
+ let turns = 0;
79
+ for await (const event of query(prompt, config)) {
80
+ switch (event.type) {
81
+ case 'text_delta':
82
+ text += event.content;
83
+ break;
84
+ case 'tool_call_end':
85
+ toolCalls.push({
86
+ toolName: event.toolName ?? 'unknown',
87
+ output: event.output ?? '',
88
+ isError: event.isError ?? false,
89
+ });
90
+ break;
91
+ case 'cost_update':
92
+ cost += event.cost ?? 0;
93
+ inputTokens += event.inputTokens ?? 0;
94
+ outputTokens += event.outputTokens ?? 0;
95
+ break;
96
+ case 'turn_complete':
97
+ turns++;
98
+ break;
99
+ }
100
+ }
101
+ return { text, toolCalls, cost, inputTokens, outputTokens, turns };
102
+ }
103
+ /** Stream events from a prompt */
104
+ async *stream(prompt) {
105
+ await this.init();
106
+ const { query } = await import('../query.js');
107
+ if (this.config.cwd) {
108
+ try {
109
+ process.chdir(this.config.cwd);
110
+ }
111
+ catch { /* ignore */ }
112
+ }
113
+ const config = {
114
+ provider: this.provider,
115
+ tools: this.tools,
116
+ systemPrompt: this.config.systemPrompt ?? 'You are a helpful coding agent.',
117
+ permissionMode: this.config.permissionMode,
118
+ model: this.config.model,
119
+ maxTurns: this.config.maxTurns,
120
+ };
121
+ yield* query(prompt, config);
122
+ }
123
+ /** Stop the agent (cleanup) */
124
+ stop() {
125
+ this.provider = null;
126
+ this.tools = null;
127
+ this.initialized = false;
128
+ }
129
+ }
130
+ // ── Factory ──
131
+ /** Create a new agent instance */
132
+ export function createAgent(config) {
133
+ return new Agent(config);
134
+ }
135
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1,61 @@
1
+ /**
2
+ * GAN-Style Evaluator Loop — Generator→Evaluator adversarial refinement.
3
+ *
4
+ * Inspired by Anthropic's three-agent harness architecture:
5
+ * "AI models are inherently poor at self-critique; they tend to rate
6
+ * their own work favorably." Externalizing critique to a separate
7
+ * Evaluator agent produces measurably better output.
8
+ *
9
+ * Flow:
10
+ * 1. Generator produces initial output
11
+ * 2. Evaluator scores against rubric criteria
12
+ * 3. If below threshold, Generator refines based on feedback
13
+ * 4. Repeat until pass or max iterations reached
14
+ */
15
+ import type { Provider } from '../providers/base.js';
16
+ import type { Tools } from '../Tool.js';
17
+ import type { PermissionMode } from '../types/permissions.js';
18
+ export type EvaluationCriterion = {
19
+ name: string;
20
+ weight: number;
21
+ description: string;
22
+ };
23
+ export type EvaluationRubric = {
24
+ criteria: EvaluationCriterion[];
25
+ passThreshold: number;
26
+ };
27
+ export type EvaluationScore = {
28
+ criterion: string;
29
+ score: number;
30
+ feedback: string;
31
+ };
32
+ export type EvaluatorResult = {
33
+ output: string;
34
+ scores: EvaluationScore[];
35
+ weightedScore: number;
36
+ passed: boolean;
37
+ iterations: number;
38
+ refinements: string[];
39
+ };
40
+ export declare const DEFAULT_RUBRIC: EvaluationRubric;
41
+ export declare class EvaluatorLoop {
42
+ private provider;
43
+ private tools;
44
+ private systemPrompt;
45
+ private permissionMode;
46
+ private model?;
47
+ private rubric;
48
+ private maxIterations;
49
+ constructor(provider: Provider, tools: Tools, systemPrompt: string, permissionMode: PermissionMode, model?: string | undefined, rubric?: EvaluationRubric, maxIterations?: number);
50
+ /**
51
+ * Run the full Generator→Evaluator→Refine cycle.
52
+ */
53
+ run(task: string): Promise<EvaluatorResult>;
54
+ private generate;
55
+ private evaluate;
56
+ private calculateWeightedScore;
57
+ private defaultScores;
58
+ }
59
+ /** Format evaluator results for display */
60
+ export declare function formatEvaluatorResult(result: EvaluatorResult): string;
61
+ //# sourceMappingURL=EvaluatorLoop.d.ts.map
@@ -0,0 +1,157 @@
1
+ /**
2
+ * GAN-Style Evaluator Loop — Generator→Evaluator adversarial refinement.
3
+ *
4
+ * Inspired by Anthropic's three-agent harness architecture:
5
+ * "AI models are inherently poor at self-critique; they tend to rate
6
+ * their own work favorably." Externalizing critique to a separate
7
+ * Evaluator agent produces measurably better output.
8
+ *
9
+ * Flow:
10
+ * 1. Generator produces initial output
11
+ * 2. Evaluator scores against rubric criteria
12
+ * 3. If below threshold, Generator refines based on feedback
13
+ * 4. Repeat until pass or max iterations reached
14
+ */
15
+ // ── Default Rubric ──
16
+ export const DEFAULT_RUBRIC = {
17
+ criteria: [
18
+ { name: 'correctness', weight: 0.4, description: 'Does the output correctly address the task? Are there logical errors?' },
19
+ { name: 'completeness', weight: 0.3, description: 'Is the solution complete? Any missing edge cases or requirements?' },
20
+ { name: 'quality', weight: 0.2, description: 'Is the code clean, well-structured, and following best practices?' },
21
+ { name: 'safety', weight: 0.1, description: 'Are there security issues, unsafe patterns, or potential bugs?' },
22
+ ],
23
+ passThreshold: 0.7,
24
+ };
25
+ // ── Evaluator Loop ──
26
+ export class EvaluatorLoop {
27
+ provider;
28
+ tools;
29
+ systemPrompt;
30
+ permissionMode;
31
+ model;
32
+ rubric;
33
+ maxIterations;
34
+ constructor(provider, tools, systemPrompt, permissionMode, model, rubric = DEFAULT_RUBRIC, maxIterations = 3) {
35
+ this.provider = provider;
36
+ this.tools = tools;
37
+ this.systemPrompt = systemPrompt;
38
+ this.permissionMode = permissionMode;
39
+ this.model = model;
40
+ this.rubric = rubric;
41
+ this.maxIterations = maxIterations;
42
+ }
43
+ /**
44
+ * Run the full Generator→Evaluator→Refine cycle.
45
+ */
46
+ async run(task) {
47
+ const refinements = [];
48
+ let currentOutput = '';
49
+ let scores = [];
50
+ let weightedScore = 0;
51
+ for (let iteration = 1; iteration <= this.maxIterations; iteration++) {
52
+ // ── Generate ──
53
+ const generatorPrompt = iteration === 1
54
+ ? task
55
+ : `${task}\n\n[Evaluator feedback from iteration ${iteration - 1}]:\n${scores.map(s => `${s.criterion}: ${s.score}/1.0 — ${s.feedback}`).join('\n')}\n\nPlease refine your output based on this feedback.`;
56
+ currentOutput = await this.generate(generatorPrompt);
57
+ // ── Evaluate ──
58
+ scores = await this.evaluate(task, currentOutput);
59
+ weightedScore = this.calculateWeightedScore(scores);
60
+ if (weightedScore >= this.rubric.passThreshold) {
61
+ return {
62
+ output: currentOutput,
63
+ scores,
64
+ weightedScore,
65
+ passed: true,
66
+ iterations: iteration,
67
+ refinements,
68
+ };
69
+ }
70
+ refinements.push(`Iteration ${iteration}: score ${weightedScore.toFixed(2)} — refining`);
71
+ }
72
+ // Max iterations reached — return best effort
73
+ return {
74
+ output: currentOutput,
75
+ scores,
76
+ weightedScore,
77
+ passed: false,
78
+ iterations: this.maxIterations,
79
+ refinements,
80
+ };
81
+ }
82
+ async generate(prompt) {
83
+ const { query } = await import('../query.js');
84
+ const config = {
85
+ provider: this.provider,
86
+ tools: this.tools,
87
+ systemPrompt: this.systemPrompt,
88
+ permissionMode: this.permissionMode,
89
+ model: this.model,
90
+ maxTurns: 15,
91
+ };
92
+ let output = '';
93
+ for await (const event of query(prompt, config)) {
94
+ if (event.type === 'text_delta')
95
+ output += event.content;
96
+ }
97
+ return output;
98
+ }
99
+ async evaluate(task, output) {
100
+ const evaluationPrompt = `You are a code evaluator. Score the following output on a 0-1 scale for each criterion.
101
+
102
+ Task: ${task.slice(0, 500)}
103
+
104
+ Output to evaluate:
105
+ ${output.slice(0, 3000)}
106
+
107
+ Criteria:
108
+ ${this.rubric.criteria.map(c => `- ${c.name} (weight: ${c.weight}): ${c.description}`).join('\n')}
109
+
110
+ Respond ONLY with a JSON array: [{"criterion": "name", "score": 0.8, "feedback": "brief explanation"}, ...]`;
111
+ const response = await this.provider.complete([{ role: 'user', content: evaluationPrompt, uuid: `eval-${Date.now()}`, timestamp: Date.now() }], 'You are a strict code evaluator. Respond ONLY with valid JSON. Be critical and specific.', undefined, this.model);
112
+ try {
113
+ const jsonMatch = response.content.match(/\[[\s\S]*\]/);
114
+ if (!jsonMatch)
115
+ return this.defaultScores();
116
+ const parsed = JSON.parse(jsonMatch[0]);
117
+ return parsed.filter(s => s.criterion && typeof s.score === 'number');
118
+ }
119
+ catch {
120
+ return this.defaultScores();
121
+ }
122
+ }
123
+ calculateWeightedScore(scores) {
124
+ let total = 0;
125
+ for (const criterion of this.rubric.criteria) {
126
+ const score = scores.find(s => s.criterion === criterion.name);
127
+ total += (score?.score ?? 0.5) * criterion.weight;
128
+ }
129
+ return total;
130
+ }
131
+ defaultScores() {
132
+ return this.rubric.criteria.map(c => ({
133
+ criterion: c.name,
134
+ score: 0.5,
135
+ feedback: 'Could not evaluate (parsing error)',
136
+ }));
137
+ }
138
+ }
139
+ /** Format evaluator results for display */
140
+ export function formatEvaluatorResult(result) {
141
+ const lines = [];
142
+ lines.push(`Evaluator: ${result.passed ? 'PASSED' : 'NEEDS IMPROVEMENT'} (${result.weightedScore.toFixed(2)}/${1.0})`);
143
+ lines.push(`Iterations: ${result.iterations}`);
144
+ lines.push('');
145
+ for (const s of result.scores) {
146
+ const bar = '█'.repeat(Math.round(s.score * 10)) + '░'.repeat(10 - Math.round(s.score * 10));
147
+ lines.push(` ${s.criterion.padEnd(15)} ${bar} ${s.score.toFixed(1)} — ${s.feedback}`);
148
+ }
149
+ if (result.refinements.length > 0) {
150
+ lines.push('');
151
+ lines.push('Refinements:');
152
+ for (const r of result.refinements)
153
+ lines.push(` ${r}`);
154
+ }
155
+ return lines.join('\n');
156
+ }
157
+ //# sourceMappingURL=EvaluatorLoop.js.map
@@ -0,0 +1,61 @@
1
/**
 * Meta-Harness — self-optimizing agent harness.
 *
 * Inspired by AutoAgent (which hit #1 on SpreadsheetBench by letting
 * the agent optimize its own harness overnight).
 *
 * Flow:
 *  1. Run benchmark → get baseline score
 *  2. Ask LLM to suggest a config change
 *  3. Apply change → re-run benchmark
 *  4. If score improved, keep; otherwise revert
 *  5. Repeat for N iterations
 *
 * What it optimizes:
 *  - System prompt (trim, rephrase, add instructions)
 *  - Tool selection (which tools are core vs deferred)
 *  - Model router configuration
 *  - Compression strategy
 *  - Permission rules
 */
import type { Provider } from '../providers/base.js';
/** Outcome of a single benchmark command run. */
export type BenchmarkResult = {
    /** Normalized score in [0, 1] extracted from the benchmark output. */
    score: number;
    /** Tail of the raw benchmark output, kept for inspection/debugging. */
    details: string;
    /** Wall-clock duration of the benchmark run, in milliseconds. */
    durationMs: number;
};
/** One LLM-suggested configuration change that was kept because it improved the score. */
export type OptimizationChange = {
    /** Human-readable summary of what was changed. */
    description: string;
    /** Config field path the change targets (e.g. "config.path"). */
    field: string;
    /** Value before the change — NOTE(review): the implementation currently records `undefined` here. */
    oldValue: unknown;
    /** Value after the change, as suggested by the LLM. */
    newValue: unknown;
    /** Score improvement attributed to this change (new score − previous best). */
    impact: number;
};
/** Aggregate result of a full optimization run. */
export type OptimizationResult = {
    /** Baseline benchmark score before any change was attempted. */
    initialScore: number;
    /** Best score achieved after all iterations. */
    finalScore: number;
    /** Number of suggest/benchmark cycles that were run. */
    iterations: number;
    /** Only the changes that improved the score (reverted ones are omitted). */
    changes: OptimizationChange[];
    /** Total wall-clock time for the whole optimization loop, in milliseconds. */
    totalDurationMs: number;
};
/**
 * Run a benchmark command and extract a score.
 * Score is derived from test results: pass_rate + speed_bonus.
 * NOTE(review): the current implementation derives score from pass rate
 * and heuristics only — no speed bonus is applied; confirm intent.
 */
export declare function runBenchmark(command: string): Promise<BenchmarkResult>;
export declare class MetaHarness {
    private provider;
    private benchmarkCommand;
    private model?;
    constructor(provider: Provider, benchmarkCommand: string, model?: string | undefined);
    /**
     * Run the optimization loop.
     */
    optimize(iterations: number): Promise<OptimizationResult>;
    private suggestChange;
    private applyChange;
    private revertChange;
}
/** Format optimization results for display */
export declare function formatOptimizationResult(result: OptimizationResult): string;
61
+ //# sourceMappingURL=MetaHarness.d.ts.map
@@ -0,0 +1,210 @@
1
+ /**
2
+ * Meta-Harness — self-optimizing agent harness.
3
+ *
4
+ * Inspired by AutoAgent (which hit #1 on SpreadsheetBench by letting
5
+ * the agent optimize its own harness overnight).
6
+ *
7
+ * Flow:
8
+ * 1. Run benchmark → get baseline score
9
+ * 2. Ask LLM to suggest a config change
10
+ * 3. Apply change → re-run benchmark
11
+ * 4. If score improved, keep; otherwise revert
12
+ * 5. Repeat for N iterations
13
+ *
14
+ * What it optimizes:
15
+ * - System prompt (trim, rephrase, add instructions)
16
+ * - Tool selection (which tools are core vs deferred)
17
+ * - Model router configuration
18
+ * - Compression strategy
19
+ * - Permission rules
20
+ */
21
+ import { readOhConfig, invalidateConfigCache } from '../harness/config.js';
22
+ import { copyFileSync, existsSync } from 'node:fs';
23
+ import { join } from 'node:path';
24
+ import { execSync } from 'node:child_process';
25
+ // ── Benchmark Runner ──
26
/**
 * Run a benchmark command and derive a 0–1 score from its output.
 *
 * A command that exits non-zero still yields a score, but any value
 * extracted from its output is halved as a penalty (or forced to 0
 * when nothing positive could be extracted).
 */
export async function runBenchmark(command) {
    const startedAt = Date.now();
    // Build the result record from raw output, applying the failure penalty when needed.
    const finish = (rawOutput, failed) => {
        const extracted = extractScore(rawOutput);
        return {
            score: failed ? (extracted > 0 ? extracted * 0.5 : 0) : extracted,
            details: rawOutput.slice(-500),
            durationMs: Date.now() - startedAt,
        };
    };
    try {
        const stdout = execSync(command, {
            encoding: 'utf-8',
            timeout: 300_000, // 5 minute max
            stdio: ['pipe', 'pipe', 'pipe'],
        });
        return finish(stdout, false);
    }
    catch (err) {
        // execSync throws on non-zero exit; salvage whatever output is attached.
        return finish(String(err.stdout ?? err.stderr ?? err.message ?? ''), true);
    }
}
56
/**
 * Derive a 0–1 score from raw test-runner output.
 *
 * Tries, in order:
 *  1. "X passed / Y failed" style summaries → pass rate
 *  2. TAP "# pass N" / "# fail N" totals    → pass rate
 *  3. Heuristic fallback: 0.3 if the output mentions 'error'/'FAIL', else 0.8
 */
function extractScore(output) {
    // Pass rate with a zero-total guard.
    const passRate = (passed, failed) => {
        const total = passed + failed;
        return total > 0 ? passed / total : 0;
    };
    const summaryPass = output.match(/(\d+)\s+pass/i);
    if (summaryPass) {
        const summaryFail = output.match(/(\d+)\s+fail/i);
        return passRate(parseInt(summaryPass[1], 10), summaryFail ? parseInt(summaryFail[1], 10) : 0);
    }
    const tapPassed = output.match(/# pass\s+(\d+)/);
    if (tapPassed) {
        const tapFailed = output.match(/# fail\s+(\d+)/);
        return passRate(parseInt(tapPassed[1], 10), tapFailed ? parseInt(tapFailed[1], 10) : 0);
    }
    return output.includes('error') || output.includes('FAIL') ? 0.3 : 0.8;
}
80
+ // ── Meta-Harness ──
81
/**
 * Self-optimizing harness driver.
 *
 * Holds an LLM provider, a benchmark shell command, and an optional model
 * override. `optimize()` runs a suggest → apply → re-benchmark →
 * keep-or-revert loop for a fixed number of iterations.
 *
 * NOTE(review): `applyChange` is currently a placeholder — it only
 * invalidates the config cache and never mutates the config file — so
 * score differences between iterations reflect benchmark variance rather
 * than real config changes. Confirm before relying on results.
 */
export class MetaHarness {
    provider;          // LLM provider used to generate change suggestions
    benchmarkCommand;  // shell command whose output is scored by runBenchmark()
    model;             // optional model name forwarded to provider.complete()
    constructor(provider, benchmarkCommand, model) {
        this.provider = provider;
        this.benchmarkCommand = benchmarkCommand;
        this.model = model;
    }
    /**
     * Run the optimization loop.
     *
     * Backs up `.oh/config.yaml` (if present), measures a baseline score,
     * then for each iteration asks the LLM for one config change,
     * re-benchmarks, and keeps the change only on a strict score
     * improvement (otherwise reverts from the backup).
     *
     * NOTE(review): the `.oh/config.yaml.backup` file is left behind after
     * the loop finishes — confirm whether cleanup is intended elsewhere.
     */
    async optimize(iterations) {
        const totalStart = Date.now();
        const changes = []; // only improvements are recorded here
        // Backup current config so revertChange() can restore it later
        const configPath = join('.oh', 'config.yaml');
        const backupPath = join('.oh', 'config.yaml.backup');
        if (existsSync(configPath)) {
            copyFileSync(configPath, backupPath);
        }
        // Get baseline score before any change is attempted
        const baseline = await runBenchmark(this.benchmarkCommand);
        let bestScore = baseline.score;
        for (let i = 0; i < iterations; i++) {
            // Ask LLM to suggest an optimization; a null suggestion burns the iteration
            const suggestion = await this.suggestChange(bestScore, changes);
            if (!suggestion)
                continue;
            // Apply the change (placeholder — see class note)
            this.applyChange(suggestion);
            // Re-benchmark with the change "applied"
            const result = await runBenchmark(this.benchmarkCommand);
            if (result.score > bestScore) {
                // Keep the change and attribute the score delta to it
                const impact = result.score - bestScore;
                changes.push({ ...suggestion, impact });
                bestScore = result.score;
            }
            else {
                // Revert: restore config from the backup taken above
                this.revertChange(suggestion);
            }
        }
        return {
            initialScore: baseline.score,
            finalScore: bestScore,
            iterations,
            changes,
            totalDurationMs: Date.now() - totalStart,
        };
    }
    /**
     * Ask the LLM for one configuration change.
     *
     * Sends the current config (truncated to 2000 chars) plus a summary of
     * previously kept changes, and expects a single JSON object back.
     * Returns null when the response has no parseable JSON or the provider
     * call throws. `oldValue` is always undefined (not captured here).
     */
    async suggestChange(currentScore, previousChanges) {
        const config = readOhConfig();
        const configStr = JSON.stringify(config, null, 2);
        const prevChangesStr = previousChanges.length > 0
            ? `\nPrevious successful changes:\n${previousChanges.map(c => `- ${c.description} (+${c.impact.toFixed(3)})`).join('\n')}`
            : '';
        const prompt = `You are optimizing an AI agent harness configuration. Current score: ${currentScore.toFixed(3)}/1.0.
${prevChangesStr}

Current config:
${configStr.slice(0, 2000)}

Suggest ONE specific configuration change that might improve the benchmark score. Focus on:
- System prompt optimization
- Tool selection (which tools are core)
- Permission rules that speed up automation
- Verification configuration

Respond with JSON: {"description": "what to change", "field": "config.path", "newValue": "the new value"}`;
        try {
            const response = await this.provider.complete([{ role: 'user', content: prompt, uuid: `meta-${Date.now()}`, timestamp: Date.now() }], 'You are a harness optimization engine. Respond ONLY with valid JSON.', undefined, this.model);
            // Grab the first {...} span; the model is instructed to emit JSON only
            const jsonMatch = response.content.match(/\{[\s\S]*\}/);
            if (!jsonMatch)
                return null;
            const parsed = JSON.parse(jsonMatch[0]);
            return {
                description: parsed.description ?? 'unknown change',
                field: parsed.field ?? 'unknown',
                oldValue: undefined,
                newValue: parsed.newValue,
            };
        }
        catch {
            // Provider error or invalid JSON — skip this iteration
            return null;
        }
    }
    /**
     * Apply a suggested change. Currently a no-op placeholder: it only
     * invalidates the config cache and does not write to the config file.
     */
    applyChange(change) {
        invalidateConfigCache();
        // For now, log the change. Full config mutation would require
        // a safe config updater that handles nested paths.
        // This is a placeholder — real implementation would use lodash.set or similar.
    }
    /**
     * Revert a change by restoring `.oh/config.yaml` from the backup taken
     * at the start of optimize(), then invalidating the config cache again.
     * NOTE(review): if no backup exists (e.g. no config file at start),
     * this silently does nothing beyond the cache invalidation.
     */
    revertChange(change) {
        invalidateConfigCache();
        // Revert by re-reading the backup config
        const backupPath = join('.oh', 'config.yaml.backup');
        const configPath = join('.oh', 'config.yaml');
        if (existsSync(backupPath)) {
            copyFileSync(backupPath, configPath);
            invalidateConfigCache();
        }
    }
}
186
/**
 * Render an OptimizationResult as a human-readable multi-line report:
 * header, score delta (with percentage change), iteration/duration
 * stats, and the list of kept changes — or a note that none improved
 * the score.
 */
export function formatOptimizationResult(result) {
    const delta = result.finalScore - result.initialScore;
    // Percentage change relative to the baseline; '0' when baseline is zero.
    const pct = result.initialScore > 0 ? (delta / result.initialScore * 100).toFixed(1) : '0';
    const report = [
        `Meta-Harness Optimization Complete`,
        `${'─'.repeat(40)}`,
        `Initial score: ${result.initialScore.toFixed(3)}`,
        `Final score: ${result.finalScore.toFixed(3)} (${delta >= 0 ? '+' : ''}${pct}%)`,
        `Iterations: ${result.iterations}`,
        `Duration: ${Math.round(result.totalDurationMs / 1000)}s`,
        '',
    ];
    if (result.changes.length > 0) {
        report.push('Applied changes:');
        for (const change of result.changes) {
            report.push(` +${change.impact.toFixed(3)} ${change.description}`);
        }
    }
    else {
        report.push('No improvements found in this run.');
    }
    return report.join('\n');
}
210
+ //# sourceMappingURL=MetaHarness.js.map
@@ -194,7 +194,14 @@ export const AgentTool = {
194
194
  }
195
195
  }
196
196
  emitHook("subagentStop", { agentId });
197
- return { output: finalText || "(sub-agent completed with no text output)", isError: false };
197
+ // Context folding: collapse long sub-agent output to summary
198
+ let output = finalText || "(sub-agent completed with no text output)";
199
+ if (output.length > 2000) {
200
+ const { ContextManager } = await import("../../query/context-manager.js");
201
+ const cm = new ContextManager();
202
+ output = cm.foldSubagentResult(agentId, output);
203
+ }
204
+ return { output, isError: false };
198
205
  },
199
206
  prompt() {
200
207
  return `Spawn a sub-agent with its own tool-use loop to handle a delegated task autonomously. The sub-agent runs in an isolated git worktree to prevent file conflicts. Parameters:
package/package.json CHANGED
@@ -1,13 +1,17 @@
1
1
  {
2
2
  "name": "@zhijiewang/openharness",
3
- "version": "1.4.0",
3
+ "version": "2.0.0",
4
4
  "description": "Open-source terminal coding agent. Works with any LLM.",
5
5
  "type": "module",
6
6
  "bin": {
7
7
  "openharness": "./dist/main.js",
8
8
  "oh": "./dist/main.js"
9
9
  },
10
- "main": "./dist/main.js",
10
+ "main": "./dist/sdk/index.js",
11
+ "exports": {
12
+ ".": "./dist/sdk/index.js",
13
+ "./cli": "./dist/main.js"
14
+ },
11
15
  "files": [
12
16
  "dist/**/*.js",
13
17
  "dist/**/*.d.ts",