@zhijiewang/openharness 1.4.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,58 @@
1
+ /**
2
+ * Session Traces — structured observability for agent sessions.
3
+ *
4
+ * Every query turn, tool call, LLM stream, and compression event
5
+ * generates a trace span. Traces enable debugging, replay, and
6
+ * performance analysis.
7
+ *
8
+ * Compatible with OpenTelemetry export format.
9
+ */
10
+ export type TraceSpan = {
11
+ spanId: string;
12
+ parentSpanId?: string;
13
+ name: string;
14
+ startTime: number;
15
+ endTime: number;
16
+ durationMs: number;
17
+ attributes: Record<string, unknown>;
18
+ status: 'ok' | 'error';
19
+ };
20
+ export type TraceEvent = {
21
+ name: string;
22
+ timestamp: number;
23
+ attributes?: Record<string, unknown>;
24
+ };
25
+ export declare class SessionTracer {
26
+ private sessionId;
27
+ private spans;
28
+ private activeSpans;
29
+ private spanCounter;
30
+ constructor(sessionId: string);
31
+ /** Start a new span. Returns the span ID. */
32
+ startSpan(name: string, attributes?: Record<string, unknown>, parentSpanId?: string): string;
33
+ /** End a span and record it. */
34
+ endSpan(spanId: string, status?: 'ok' | 'error', extraAttributes?: Record<string, unknown>): TraceSpan | null;
35
+ /** Get all completed spans */
36
+ getSpans(): TraceSpan[];
37
+ /** Get a summary of the trace */
38
+ getSummary(): {
39
+ totalSpans: number;
40
+ totalDurationMs: number;
41
+ spansByName: Record<string, {
42
+ count: number;
43
+ totalMs: number;
44
+ }>;
45
+ errors: number;
46
+ };
47
+ /** Persist a span to the trace file */
48
+ private persistSpan;
49
+ }
50
+ /** Load trace spans for a session */
51
+ export declare function loadTrace(sessionId: string): TraceSpan[];
52
+ /** List all sessions with traces */
53
+ export declare function listTracedSessions(): string[];
54
+ /** Format trace for display */
55
+ export declare function formatTrace(spans: TraceSpan[]): string;
56
+ /** Export trace in OpenTelemetry-compatible format */
57
+ export declare function exportTraceOTLP(sessionId: string, spans: TraceSpan[]): object;
58
+ //# sourceMappingURL=traces.d.ts.map
@@ -0,0 +1,178 @@
1
+ /**
2
+ * Session Traces — structured observability for agent sessions.
3
+ *
4
+ * Every query turn, tool call, LLM stream, and compression event
5
+ * generates a trace span. Traces enable debugging, replay, and
6
+ * performance analysis.
7
+ *
8
+ * Compatible with OpenTelemetry export format.
9
+ */
10
+ import { appendFileSync, mkdirSync, existsSync, readFileSync, readdirSync } from 'node:fs';
11
+ import { join } from 'node:path';
12
+ import { homedir } from 'node:os';
13
+ const TRACE_DIR = join(homedir(), '.oh', 'traces');
14
+ // ── Tracer ──
15
+ export class SessionTracer {
16
+ sessionId;
17
+ spans = [];
18
+ activeSpans = new Map();
19
+ spanCounter = 0;
20
+ constructor(sessionId) {
21
+ this.sessionId = sessionId;
22
+ }
23
+ /** Start a new span. Returns the span ID. */
24
+ startSpan(name, attributes = {}, parentSpanId) {
25
+ const spanId = `span-${++this.spanCounter}`;
26
+ this.activeSpans.set(spanId, { name, startTime: Date.now(), parentSpanId, attributes });
27
+ return spanId;
28
+ }
29
+ /** End a span and record it. */
30
+ endSpan(spanId, status = 'ok', extraAttributes) {
31
+ const active = this.activeSpans.get(spanId);
32
+ if (!active)
33
+ return null;
34
+ this.activeSpans.delete(spanId);
35
+ const endTime = Date.now();
36
+ const span = {
37
+ spanId,
38
+ parentSpanId: active.parentSpanId,
39
+ name: active.name,
40
+ startTime: active.startTime,
41
+ endTime,
42
+ durationMs: endTime - active.startTime,
43
+ attributes: { ...active.attributes, ...extraAttributes },
44
+ status,
45
+ };
46
+ this.spans.push(span);
47
+ this.persistSpan(span);
48
+ return span;
49
+ }
50
+ /** Get all completed spans */
51
+ getSpans() {
52
+ return [...this.spans];
53
+ }
54
+ /** Get a summary of the trace */
55
+ getSummary() {
56
+ const spansByName = {};
57
+ let errors = 0;
58
+ let minStart = Infinity;
59
+ let maxEnd = 0;
60
+ for (const span of this.spans) {
61
+ const entry = spansByName[span.name] ?? { count: 0, totalMs: 0 };
62
+ entry.count++;
63
+ entry.totalMs += span.durationMs;
64
+ spansByName[span.name] = entry;
65
+ if (span.status === 'error')
66
+ errors++;
67
+ if (span.startTime < minStart)
68
+ minStart = span.startTime;
69
+ if (span.endTime > maxEnd)
70
+ maxEnd = span.endTime;
71
+ }
72
+ return {
73
+ totalSpans: this.spans.length,
74
+ totalDurationMs: maxEnd > minStart ? maxEnd - minStart : 0,
75
+ spansByName,
76
+ errors,
77
+ };
78
+ }
79
+ /** Persist a span to the trace file */
80
+ persistSpan(span) {
81
+ try {
82
+ mkdirSync(TRACE_DIR, { recursive: true });
83
+ const file = join(TRACE_DIR, `${this.sessionId}.jsonl`);
84
+ appendFileSync(file, JSON.stringify(span) + '\n');
85
+ }
86
+ catch { /* never crash on tracing failure */ }
87
+ }
88
+ }
89
+ // ── Trace Loading ──
90
+ /** Load trace spans for a session */
91
+ export function loadTrace(sessionId) {
92
+ const file = join(TRACE_DIR, `${sessionId}.jsonl`);
93
+ if (!existsSync(file))
94
+ return [];
95
+ try {
96
+ return readFileSync(file, 'utf-8')
97
+ .split('\n')
98
+ .filter(Boolean)
99
+ .map(line => JSON.parse(line));
100
+ }
101
+ catch {
102
+ return [];
103
+ }
104
+ }
105
+ /** List all sessions with traces */
106
+ export function listTracedSessions() {
107
+ if (!existsSync(TRACE_DIR))
108
+ return [];
109
+ return readdirSync(TRACE_DIR)
110
+ .filter(f => f.endsWith('.jsonl'))
111
+ .map(f => f.replace('.jsonl', ''));
112
+ }
113
+ /** Format trace for display */
114
+ export function formatTrace(spans) {
115
+ if (spans.length === 0)
116
+ return 'No trace spans recorded.';
117
+ const lines = [`Trace (${spans.length} spans):\n`];
118
+ // Group by parent for tree display
119
+ const roots = spans.filter(s => !s.parentSpanId);
120
+ const children = new Map();
121
+ for (const s of spans) {
122
+ if (s.parentSpanId) {
123
+ const list = children.get(s.parentSpanId) ?? [];
124
+ list.push(s);
125
+ children.set(s.parentSpanId, list);
126
+ }
127
+ }
128
+ function renderSpan(span, indent) {
129
+ const status = span.status === 'error' ? '✗' : '✓';
130
+ const pad = ' '.repeat(indent);
131
+ const attrs = Object.entries(span.attributes)
132
+ .filter(([, v]) => v !== undefined)
133
+ .map(([k, v]) => `${k}=${String(v).slice(0, 30)}`)
134
+ .join(' ');
135
+ lines.push(`${pad}${status} ${span.name} (${span.durationMs}ms) ${attrs}`);
136
+ const kids = children.get(span.spanId) ?? [];
137
+ for (const kid of kids)
138
+ renderSpan(kid, indent + 1);
139
+ }
140
+ for (const root of roots)
141
+ renderSpan(root, 0);
142
+ // Summary
143
+ const totalMs = spans.reduce((sum, s) => sum + s.durationMs, 0);
144
+ const errors = spans.filter(s => s.status === 'error').length;
145
+ lines.push('');
146
+ lines.push(`Total: ${spans.length} spans, ${totalMs}ms, ${errors} errors`);
147
+ return lines.join('\n');
148
+ }
149
+ /** Export trace in OpenTelemetry-compatible format */
150
+ export function exportTraceOTLP(sessionId, spans) {
151
+ return {
152
+ resourceSpans: [{
153
+ resource: {
154
+ attributes: [
155
+ { key: 'service.name', value: { stringValue: 'openharness' } },
156
+ { key: 'session.id', value: { stringValue: sessionId } },
157
+ ],
158
+ },
159
+ scopeSpans: [{
160
+ scope: { name: 'openharness.agent' },
161
+ spans: spans.map(s => ({
162
+ traceId: sessionId.padEnd(32, '0').slice(0, 32),
163
+ spanId: s.spanId.padEnd(16, '0').slice(0, 16),
164
+ parentSpanId: s.parentSpanId?.padEnd(16, '0').slice(0, 16),
165
+ name: s.name,
166
+ startTimeUnixNano: s.startTime * 1_000_000,
167
+ endTimeUnixNano: s.endTime * 1_000_000,
168
+ attributes: Object.entries(s.attributes).map(([k, v]) => ({
169
+ key: k,
170
+ value: { stringValue: String(v) },
171
+ })),
172
+ status: { code: s.status === 'ok' ? 1 : 2 },
173
+ })),
174
+ }],
175
+ }],
176
+ };
177
+ }
178
+ //# sourceMappingURL=traces.js.map
@@ -0,0 +1,56 @@
1
+ /**
2
+ * Active Context Management — proactive control of the context window.
3
+ *
4
+ * Unlike reactive compression (trigger at 80% full), active management:
5
+ * - Enforces per-tool token budgets (no single tool consumes the window)
6
+ * - Folds sub-agent output to summaries (parent sees prompt + result only)
7
+ * - Pre-compresses before large tool calls
8
+ * - Auto-summarizes when approaching limits
9
+ *
10
+ * Based on the "context engineering" pattern from Anthropic's harness research.
11
+ */
12
+ import type { Message } from '../types/message.js';
13
+ export type ContextBudget = {
14
+ /** Max tokens for a single tool output */
15
+ toolOutputMax: number;
16
+ /** Per-tool overrides */
17
+ perTool: Record<string, number>;
18
+ /** Whether to auto-fold sub-agent results */
19
+ autoFold: boolean;
20
+ /** Context usage threshold to trigger proactive compression (0-1) */
21
+ proactiveThreshold: number;
22
+ };
23
+ export declare class ContextManager {
24
+ private budget;
25
+ private model;
26
+ constructor(budget?: Partial<ContextBudget>, model?: string);
27
+ /** Get the token budget for a specific tool */
28
+ getToolBudget(toolName: string): number;
29
+ /** Set a per-tool token budget */
30
+ setToolBudget(toolName: string, maxTokens: number): void;
31
+ /**
32
+ * Truncate tool output to its budget.
33
+ * Keeps the first portion and last portion, with a truncation marker.
34
+ */
35
+ enforceToolBudget(toolName: string, output: string): string;
36
+ /**
37
+ * Fold a sub-agent's full output into a concise summary.
38
+ * Keeps the first 200 chars as context + truncates the rest.
39
+ */
40
+ foldSubagentResult(agentId: string, fullOutput: string): string;
41
+ /**
42
+ * Check if we should proactively compress before a tool call.
43
+ * Returns true if estimated context usage exceeds the proactive threshold.
44
+ */
45
+ shouldPreCompress(messages: Message[], estimatedOutputTokens: number, estimateTokens: (text: string) => number): boolean;
46
+ /**
47
+ * Estimate how many tokens a tool call might produce.
48
+ * Based on historical averages for each tool type.
49
+ */
50
+ estimateToolOutputTokens(toolName: string): number;
51
+ /** Whether auto-folding is enabled */
52
+ get autoFoldEnabled(): boolean;
53
+ /** Get the full budget config */
54
+ get config(): ContextBudget;
55
+ }
56
+ //# sourceMappingURL=context-manager.d.ts.map
@@ -0,0 +1,111 @@
1
+ /**
2
+ * Active Context Management — proactive control of the context window.
3
+ *
4
+ * Unlike reactive compression (trigger at 80% full), active management:
5
+ * - Enforces per-tool token budgets (no single tool consumes the window)
6
+ * - Folds sub-agent output to summaries (parent sees prompt + result only)
7
+ * - Pre-compresses before large tool calls
8
+ * - Auto-summarizes when approaching limits
9
+ *
10
+ * Based on the "context engineering" pattern from Anthropic's harness research.
11
+ */
12
+ import { getContextWindow } from '../harness/cost.js';
13
+ const DEFAULT_BUDGET = {
14
+ toolOutputMax: 10_000,
15
+ perTool: {},
16
+ autoFold: true,
17
+ proactiveThreshold: 0.6,
18
+ };
19
+ // ── Context Manager ──
20
+ export class ContextManager {
21
+ budget;
22
+ model;
23
+ constructor(budget, model) {
24
+ this.budget = { ...DEFAULT_BUDGET, ...budget };
25
+ this.model = model;
26
+ }
27
+ /** Get the token budget for a specific tool */
28
+ getToolBudget(toolName) {
29
+ return this.budget.perTool[toolName] ?? this.budget.toolOutputMax;
30
+ }
31
+ /** Set a per-tool token budget */
32
+ setToolBudget(toolName, maxTokens) {
33
+ this.budget.perTool[toolName] = maxTokens;
34
+ }
35
+ /**
36
+ * Truncate tool output to its budget.
37
+ * Keeps the first portion and last portion, with a truncation marker.
38
+ */
39
+ enforceToolBudget(toolName, output) {
40
+ const budget = this.getToolBudget(toolName);
41
+ // Rough estimate: 4 chars ≈ 1 token
42
+ const maxChars = budget * 4;
43
+ if (output.length <= maxChars)
44
+ return output;
45
+ const keepHead = Math.floor(maxChars * 0.7);
46
+ const keepTail = Math.floor(maxChars * 0.2);
47
+ const truncated = output.length - keepHead - keepTail;
48
+ return output.slice(0, keepHead)
49
+ + `\n\n[...${truncated.toLocaleString()} chars truncated (budget: ${budget} tokens)...]\n\n`
50
+ + output.slice(-keepTail);
51
+ }
52
+ /**
53
+ * Fold a sub-agent's full output into a concise summary.
54
+ * Keeps the first 200 chars as context + truncates the rest.
55
+ */
56
+ foldSubagentResult(agentId, fullOutput) {
57
+ if (!this.budget.autoFold)
58
+ return fullOutput;
59
+ // Short outputs don't need folding
60
+ if (fullOutput.length < 2000)
61
+ return fullOutput;
62
+ // Keep first ~500 chars (task context) + last ~500 chars (conclusion)
63
+ const head = fullOutput.slice(0, 500);
64
+ const tail = fullOutput.slice(-500);
65
+ const foldedChars = fullOutput.length - 1000;
66
+ return `${head}\n\n[...${foldedChars} chars folded from sub-agent ${agentId}...]\n\n${tail}`;
67
+ }
68
+ /**
69
+ * Check if we should proactively compress before a tool call.
70
+ * Returns true if estimated context usage exceeds the proactive threshold.
71
+ */
72
+ shouldPreCompress(messages, estimatedOutputTokens, estimateTokens) {
73
+ const contextWindow = getContextWindow(this.model);
74
+ let currentTokens = 0;
75
+ for (const m of messages) {
76
+ currentTokens += estimateTokens(m.content) + 10;
77
+ }
78
+ const projected = currentTokens + estimatedOutputTokens;
79
+ const usage = projected / contextWindow;
80
+ return usage > this.budget.proactiveThreshold;
81
+ }
82
+ /**
83
+ * Estimate how many tokens a tool call might produce.
84
+ * Based on historical averages for each tool type.
85
+ */
86
+ estimateToolOutputTokens(toolName) {
87
+ const estimates = {
88
+ Bash: 2000,
89
+ Read: 3000,
90
+ Grep: 1500,
91
+ Glob: 500,
92
+ LS: 300,
93
+ Edit: 200,
94
+ Write: 200,
95
+ Agent: 5000,
96
+ Pipeline: 3000,
97
+ WebFetch: 4000,
98
+ WebSearch: 1000,
99
+ };
100
+ return estimates[toolName] ?? 1000;
101
+ }
102
+ /** Whether auto-folding is enabled */
103
+ get autoFoldEnabled() {
104
+ return this.budget.autoFold;
105
+ }
106
+ /** Get the full budget config */
107
+ get config() {
108
+ return { ...this.budget };
109
+ }
110
+ }
111
+ //# sourceMappingURL=context-manager.js.map
@@ -9,6 +9,7 @@
9
9
  */
10
10
  import { toolToAPIFormat } from "../Tool.js";
11
11
  import { DeferredTool } from "../DeferredTool.js";
12
+ import { ContextManager } from "./context-manager.js";
12
13
  import { createAssistantMessage, createUserMessage } from "../types/message.js";
13
14
  import { StreamingToolExecutor } from "../services/StreamingToolExecutor.js";
14
15
  import { getContextWindow } from "../harness/cost.js";
@@ -31,6 +32,7 @@ export async function* query(userMessage, config, existingMessages = []) {
31
32
  askUserQuestion: config.askUserQuestion,
32
33
  };
33
34
  const estimateTokens = makeTokenEstimator(config.provider);
35
+ const contextManager = new ContextManager(undefined, config.model);
34
36
  // Check provider capabilities
35
37
  const modelInfo = config.provider.getModelInfo?.(config.model ?? '');
36
38
  const toolsSupported = !modelInfo || modelInfo.supportsTools;
@@ -193,7 +195,9 @@ export async function* query(userMessage, config, existingMessages = []) {
193
195
  }
194
196
  for (const { toolCall: tc, result } of completedResults) {
195
197
  yield { type: "tool_call_end", callId: tc.id, output: result.output, isError: result.isError };
196
- state.messages.push(createToolResultMessage({ callId: tc.id, output: result.output, isError: result.isError }));
198
+ // Apply context budget to tool output
199
+ const budgetedOutput = contextManager.enforceToolBudget(tc.toolName, result.output);
200
+ state.messages.push(createToolResultMessage({ callId: tc.id, output: budgetedOutput, isError: result.isError }));
197
201
  }
198
202
  // Execute remaining tools not started during streaming
199
203
  const remaining = toolCalls.filter(tc => !executedIds.has(tc.id));
@@ -0,0 +1,75 @@
1
+ /**
2
+ * openHarness Agent SDK — programmatic API for building AI agents.
3
+ *
4
+ * Usage:
5
+ * import { createAgent } from '@zhijiewang/openharness';
6
+ *
7
+ * const agent = createAgent({
8
+ * provider: 'anthropic',
9
+ * model: 'claude-sonnet-4-6',
10
+ * apiKey: process.env.ANTHROPIC_API_KEY,
11
+ * });
12
+ *
13
+ * const result = await agent.run('Fix the failing tests');
14
+ * console.log(result.text);
15
+ */
16
+ import type { StreamEvent } from '../types/events.js';
17
+ import type { PermissionMode } from '../types/permissions.js';
18
+ export type AgentConfig = {
19
+ /** Provider name: 'anthropic', 'openai', 'ollama', 'openrouter', etc. */
20
+ provider: string;
21
+ /** Model identifier */
22
+ model: string;
23
+ /** API key (or use environment variable) */
24
+ apiKey?: string;
25
+ /** Custom base URL */
26
+ baseUrl?: string;
27
+ /** Tools to include: 'all', 'read-only', or array of tool names */
28
+ tools?: 'all' | 'read-only' | string[];
29
+ /** Permission mode (default: 'trust') */
30
+ permissionMode?: PermissionMode;
31
+ /** Custom system prompt */
32
+ systemPrompt?: string;
33
+ /** Max turns per run */
34
+ maxTurns?: number;
35
+ /** Working directory */
36
+ cwd?: string;
37
+ };
38
+ export type AgentResult = {
39
+ /** Final text output */
40
+ text: string;
41
+ /** Tool calls made during execution */
42
+ toolCalls: Array<{
43
+ toolName: string;
44
+ output: string;
45
+ isError: boolean;
46
+ }>;
47
+ /** Total cost in USD */
48
+ cost: number;
49
+ /** Total input tokens */
50
+ inputTokens: number;
51
+ /** Total output tokens */
52
+ outputTokens: number;
53
+ /** Number of turns taken */
54
+ turns: number;
55
+ };
56
+ export declare class Agent {
57
+ private provider;
58
+ private tools;
59
+ private config;
60
+ private initialized;
61
+ constructor(config: AgentConfig);
62
+ /** Initialize provider and tools (lazy, on first use) */
63
+ private init;
64
+ /** Run a single prompt and return the result */
65
+ run(prompt: string): Promise<AgentResult>;
66
+ /** Stream events from a prompt */
67
+ stream(prompt: string): AsyncGenerator<StreamEvent, void>;
68
+ /** Stop the agent (cleanup) */
69
+ stop(): void;
70
+ }
71
+ /** Create a new agent instance */
72
+ export declare function createAgent(config: AgentConfig): Agent;
73
+ export type { StreamEvent } from '../types/events.js';
74
+ export type { PermissionMode } from '../types/permissions.js';
75
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1,135 @@
1
+ /**
2
+ * openHarness Agent SDK — programmatic API for building AI agents.
3
+ *
4
+ * Usage:
5
+ * import { createAgent } from '@zhijiewang/openharness';
6
+ *
7
+ * const agent = createAgent({
8
+ * provider: 'anthropic',
9
+ * model: 'claude-sonnet-4-6',
10
+ * apiKey: process.env.ANTHROPIC_API_KEY,
11
+ * });
12
+ *
13
+ * const result = await agent.run('Fix the failing tests');
14
+ * console.log(result.text);
15
+ */
16
+ // ── Agent Class ──
17
+ export class Agent {
18
+ provider = null;
19
+ tools = null;
20
+ config;
21
+ initialized = false;
22
+ constructor(config) {
23
+ this.config = {
24
+ permissionMode: 'trust',
25
+ maxTurns: 20,
26
+ ...config,
27
+ };
28
+ }
29
+ /** Initialize provider and tools (lazy, on first use) */
30
+ async init() {
31
+ if (this.initialized)
32
+ return;
33
+ const { createProvider } = await import('../providers/index.js');
34
+ const { getAllTools } = await import('../tools.js');
35
+ const overrides = {};
36
+ if (this.config.apiKey)
37
+ overrides.apiKey = this.config.apiKey;
38
+ if (this.config.baseUrl)
39
+ overrides.baseUrl = this.config.baseUrl;
40
+ const { provider } = await createProvider(this.config.model, Object.keys(overrides).length > 0 ? overrides : undefined);
41
+ this.provider = provider;
42
+ // Filter tools
43
+ let tools = getAllTools();
44
+ if (this.config.tools === 'read-only') {
45
+ const readOnlyNames = new Set(['Read', 'Glob', 'Grep', 'LS', 'ImageRead', 'WebSearch', 'WebFetch']);
46
+ tools = tools.filter(t => readOnlyNames.has(t.name));
47
+ }
48
+ else if (Array.isArray(this.config.tools)) {
49
+ const allowed = new Set(this.config.tools.map(n => n.toLowerCase()));
50
+ tools = tools.filter(t => allowed.has(t.name.toLowerCase()));
51
+ }
52
+ this.tools = tools;
53
+ this.initialized = true;
54
+ }
55
+ /** Run a single prompt and return the result */
56
+ async run(prompt) {
57
+ await this.init();
58
+ const { query } = await import('../query.js');
59
+ if (this.config.cwd) {
60
+ try {
61
+ process.chdir(this.config.cwd);
62
+ }
63
+ catch { /* ignore */ }
64
+ }
65
+ const config = {
66
+ provider: this.provider,
67
+ tools: this.tools,
68
+ systemPrompt: this.config.systemPrompt ?? 'You are a helpful coding agent.',
69
+ permissionMode: this.config.permissionMode,
70
+ model: this.config.model,
71
+ maxTurns: this.config.maxTurns,
72
+ };
73
+ let text = '';
74
+ const toolCalls = [];
75
+ let cost = 0;
76
+ let inputTokens = 0;
77
+ let outputTokens = 0;
78
+ let turns = 0;
79
+ for await (const event of query(prompt, config)) {
80
+ switch (event.type) {
81
+ case 'text_delta':
82
+ text += event.content;
83
+ break;
84
+ case 'tool_call_end':
85
+ toolCalls.push({
86
+ toolName: event.toolName ?? 'unknown',
87
+ output: event.output ?? '',
88
+ isError: event.isError ?? false,
89
+ });
90
+ break;
91
+ case 'cost_update':
92
+ cost += event.cost ?? 0;
93
+ inputTokens += event.inputTokens ?? 0;
94
+ outputTokens += event.outputTokens ?? 0;
95
+ break;
96
+ case 'turn_complete':
97
+ turns++;
98
+ break;
99
+ }
100
+ }
101
+ return { text, toolCalls, cost, inputTokens, outputTokens, turns };
102
+ }
103
+ /** Stream events from a prompt */
104
+ async *stream(prompt) {
105
+ await this.init();
106
+ const { query } = await import('../query.js');
107
+ if (this.config.cwd) {
108
+ try {
109
+ process.chdir(this.config.cwd);
110
+ }
111
+ catch { /* ignore */ }
112
+ }
113
+ const config = {
114
+ provider: this.provider,
115
+ tools: this.tools,
116
+ systemPrompt: this.config.systemPrompt ?? 'You are a helpful coding agent.',
117
+ permissionMode: this.config.permissionMode,
118
+ model: this.config.model,
119
+ maxTurns: this.config.maxTurns,
120
+ };
121
+ yield* query(prompt, config);
122
+ }
123
+ /** Stop the agent (cleanup) */
124
+ stop() {
125
+ this.provider = null;
126
+ this.tools = null;
127
+ this.initialized = false;
128
+ }
129
+ }
130
+ // ── Factory ──
131
+ /** Create a new agent instance */
132
+ export function createAgent(config) {
133
+ return new Agent(config);
134
+ }
135
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1,61 @@
1
+ /**
2
+ * GAN-Style Evaluator Loop — Generator→Evaluator adversarial refinement.
3
+ *
4
+ * Inspired by Anthropic's three-agent harness architecture:
5
+ * "AI models are inherently poor at self-critique; they tend to rate
6
+ * their own work favorably." Externalizing critique to a separate
7
+ * Evaluator agent produces measurably better output.
8
+ *
9
+ * Flow:
10
+ * 1. Generator produces initial output
11
+ * 2. Evaluator scores against rubric criteria
12
+ * 3. If below threshold, Generator refines based on feedback
13
+ * 4. Repeat until pass or max iterations reached
14
+ */
15
+ import type { Provider } from '../providers/base.js';
16
+ import type { Tools } from '../Tool.js';
17
+ import type { PermissionMode } from '../types/permissions.js';
18
+ export type EvaluationCriterion = {
19
+ name: string;
20
+ weight: number;
21
+ description: string;
22
+ };
23
+ export type EvaluationRubric = {
24
+ criteria: EvaluationCriterion[];
25
+ passThreshold: number;
26
+ };
27
+ export type EvaluationScore = {
28
+ criterion: string;
29
+ score: number;
30
+ feedback: string;
31
+ };
32
+ export type EvaluatorResult = {
33
+ output: string;
34
+ scores: EvaluationScore[];
35
+ weightedScore: number;
36
+ passed: boolean;
37
+ iterations: number;
38
+ refinements: string[];
39
+ };
40
+ export declare const DEFAULT_RUBRIC: EvaluationRubric;
41
+ export declare class EvaluatorLoop {
42
+ private provider;
43
+ private tools;
44
+ private systemPrompt;
45
+ private permissionMode;
46
+ private model?;
47
+ private rubric;
48
+ private maxIterations;
49
+ constructor(provider: Provider, tools: Tools, systemPrompt: string, permissionMode: PermissionMode, model?: string | undefined, rubric?: EvaluationRubric, maxIterations?: number);
50
+ /**
51
+ * Run the full Generator→Evaluator→Refine cycle.
52
+ */
53
+ run(task: string): Promise<EvaluatorResult>;
54
+ private generate;
55
+ private evaluate;
56
+ private calculateWeightedScore;
57
+ private defaultScores;
58
+ }
59
+ /** Format evaluator results for display */
60
+ export declare function formatEvaluatorResult(result: EvaluatorResult): string;
61
+ //# sourceMappingURL=EvaluatorLoop.d.ts.map
@@ -0,0 +1,157 @@
1
+ /**
2
+ * GAN-Style Evaluator Loop — Generator→Evaluator adversarial refinement.
3
+ *
4
+ * Inspired by Anthropic's three-agent harness architecture:
5
+ * "AI models are inherently poor at self-critique; they tend to rate
6
+ * their own work favorably." Externalizing critique to a separate
7
+ * Evaluator agent produces measurably better output.
8
+ *
9
+ * Flow:
10
+ * 1. Generator produces initial output
11
+ * 2. Evaluator scores against rubric criteria
12
+ * 3. If below threshold, Generator refines based on feedback
13
+ * 4. Repeat until pass or max iterations reached
14
+ */
15
+ // ── Default Rubric ──
16
+ export const DEFAULT_RUBRIC = {
17
+ criteria: [
18
+ { name: 'correctness', weight: 0.4, description: 'Does the output correctly address the task? Are there logical errors?' },
19
+ { name: 'completeness', weight: 0.3, description: 'Is the solution complete? Any missing edge cases or requirements?' },
20
+ { name: 'quality', weight: 0.2, description: 'Is the code clean, well-structured, and following best practices?' },
21
+ { name: 'safety', weight: 0.1, description: 'Are there security issues, unsafe patterns, or potential bugs?' },
22
+ ],
23
+ passThreshold: 0.7,
24
+ };
25
+ // ── Evaluator Loop ──
26
+ export class EvaluatorLoop {
27
+ provider;
28
+ tools;
29
+ systemPrompt;
30
+ permissionMode;
31
+ model;
32
+ rubric;
33
+ maxIterations;
34
+ constructor(provider, tools, systemPrompt, permissionMode, model, rubric = DEFAULT_RUBRIC, maxIterations = 3) {
35
+ this.provider = provider;
36
+ this.tools = tools;
37
+ this.systemPrompt = systemPrompt;
38
+ this.permissionMode = permissionMode;
39
+ this.model = model;
40
+ this.rubric = rubric;
41
+ this.maxIterations = maxIterations;
42
+ }
43
+ /**
44
+ * Run the full Generator→Evaluator→Refine cycle.
45
+ */
46
+ async run(task) {
47
+ const refinements = [];
48
+ let currentOutput = '';
49
+ let scores = [];
50
+ let weightedScore = 0;
51
+ for (let iteration = 1; iteration <= this.maxIterations; iteration++) {
52
+ // ── Generate ──
53
+ const generatorPrompt = iteration === 1
54
+ ? task
55
+ : `${task}\n\n[Evaluator feedback from iteration ${iteration - 1}]:\n${scores.map(s => `${s.criterion}: ${s.score}/1.0 — ${s.feedback}`).join('\n')}\n\nPlease refine your output based on this feedback.`;
56
+ currentOutput = await this.generate(generatorPrompt);
57
+ // ── Evaluate ──
58
+ scores = await this.evaluate(task, currentOutput);
59
+ weightedScore = this.calculateWeightedScore(scores);
60
+ if (weightedScore >= this.rubric.passThreshold) {
61
+ return {
62
+ output: currentOutput,
63
+ scores,
64
+ weightedScore,
65
+ passed: true,
66
+ iterations: iteration,
67
+ refinements,
68
+ };
69
+ }
70
+ refinements.push(`Iteration ${iteration}: score ${weightedScore.toFixed(2)} — refining`);
71
+ }
72
+ // Max iterations reached — return best effort
73
+ return {
74
+ output: currentOutput,
75
+ scores,
76
+ weightedScore,
77
+ passed: false,
78
+ iterations: this.maxIterations,
79
+ refinements,
80
+ };
81
+ }
82
+ async generate(prompt) {
83
+ const { query } = await import('../query.js');
84
+ const config = {
85
+ provider: this.provider,
86
+ tools: this.tools,
87
+ systemPrompt: this.systemPrompt,
88
+ permissionMode: this.permissionMode,
89
+ model: this.model,
90
+ maxTurns: 15,
91
+ };
92
+ let output = '';
93
+ for await (const event of query(prompt, config)) {
94
+ if (event.type === 'text_delta')
95
+ output += event.content;
96
+ }
97
+ return output;
98
+ }
99
+ async evaluate(task, output) {
100
+ const evaluationPrompt = `You are a code evaluator. Score the following output on a 0-1 scale for each criterion.
101
+
102
+ Task: ${task.slice(0, 500)}
103
+
104
+ Output to evaluate:
105
+ ${output.slice(0, 3000)}
106
+
107
+ Criteria:
108
+ ${this.rubric.criteria.map(c => `- ${c.name} (weight: ${c.weight}): ${c.description}`).join('\n')}
109
+
110
+ Respond ONLY with a JSON array: [{"criterion": "name", "score": 0.8, "feedback": "brief explanation"}, ...]`;
111
+ const response = await this.provider.complete([{ role: 'user', content: evaluationPrompt, uuid: `eval-${Date.now()}`, timestamp: Date.now() }], 'You are a strict code evaluator. Respond ONLY with valid JSON. Be critical and specific.', undefined, this.model);
112
+ try {
113
+ const jsonMatch = response.content.match(/\[[\s\S]*\]/);
114
+ if (!jsonMatch)
115
+ return this.defaultScores();
116
+ const parsed = JSON.parse(jsonMatch[0]);
117
+ return parsed.filter(s => s.criterion && typeof s.score === 'number');
118
+ }
119
+ catch {
120
+ return this.defaultScores();
121
+ }
122
+ }
123
+ calculateWeightedScore(scores) {
124
+ let total = 0;
125
+ for (const criterion of this.rubric.criteria) {
126
+ const score = scores.find(s => s.criterion === criterion.name);
127
+ total += (score?.score ?? 0.5) * criterion.weight;
128
+ }
129
+ return total;
130
+ }
131
+ defaultScores() {
132
+ return this.rubric.criteria.map(c => ({
133
+ criterion: c.name,
134
+ score: 0.5,
135
+ feedback: 'Could not evaluate (parsing error)',
136
+ }));
137
+ }
138
+ }
139
+ /** Format evaluator results for display */
140
+ export function formatEvaluatorResult(result) {
141
+ const lines = [];
142
+ lines.push(`Evaluator: ${result.passed ? 'PASSED' : 'NEEDS IMPROVEMENT'} (${result.weightedScore.toFixed(2)}/${1.0})`);
143
+ lines.push(`Iterations: ${result.iterations}`);
144
+ lines.push('');
145
+ for (const s of result.scores) {
146
+ const bar = '█'.repeat(Math.round(s.score * 10)) + '░'.repeat(10 - Math.round(s.score * 10));
147
+ lines.push(` ${s.criterion.padEnd(15)} ${bar} ${s.score.toFixed(1)} — ${s.feedback}`);
148
+ }
149
+ if (result.refinements.length > 0) {
150
+ lines.push('');
151
+ lines.push('Refinements:');
152
+ for (const r of result.refinements)
153
+ lines.push(` ${r}`);
154
+ }
155
+ return lines.join('\n');
156
+ }
157
+ //# sourceMappingURL=EvaluatorLoop.js.map
@@ -0,0 +1,61 @@
1
/**
 * Meta-Harness — self-optimizing agent harness.
 *
 * Inspired by AutoAgent (which hit #1 on SpreadsheetBench by letting
 * the agent optimize its own harness overnight).
 *
 * Flow:
 *  1. Run benchmark → get baseline score
 *  2. Ask LLM to suggest a config change
 *  3. Apply change → re-run benchmark
 *  4. If score improved, keep; otherwise revert
 *  5. Repeat for N iterations
 *
 * What it optimizes:
 *  - System prompt (trim, rephrase, add instructions)
 *  - Tool selection (which tools are core vs deferred)
 *  - Model router configuration
 *  - Compression strategy
 *  - Permission rules
 */
import type { Provider } from '../providers/base.js';
/** Outcome of a single benchmark command run. */
export type BenchmarkResult = {
    /** Normalized score in [0, 1] extracted from the benchmark output. */
    score: number;
    /** Tail of the raw benchmark output, kept for inspection/debugging. */
    details: string;
    /** Wall-clock duration of the benchmark run, in milliseconds. */
    durationMs: number;
};
/** One LLM-suggested configuration change that was kept because it improved the score. */
export type OptimizationChange = {
    /** Human-readable summary of what was changed. */
    description: string;
    /** Config field path the change targets (e.g. "config.path"). */
    field: string;
    /** Value before the change — NOTE(review): the implementation currently records `undefined` here. */
    oldValue: unknown;
    /** Value after the change, as suggested by the LLM. */
    newValue: unknown;
    /** Score improvement attributed to this change (new score − previous best). */
    impact: number;
};
/** Aggregate result of a full optimization run. */
export type OptimizationResult = {
    /** Baseline benchmark score before any change was attempted. */
    initialScore: number;
    /** Best score achieved after all iterations. */
    finalScore: number;
    /** Number of suggest/benchmark cycles that were run. */
    iterations: number;
    /** Only the changes that improved the score (reverted ones are omitted). */
    changes: OptimizationChange[];
    /** Total wall-clock time for the whole optimization loop, in milliseconds. */
    totalDurationMs: number;
};
/**
 * Run a benchmark command and extract a score.
 * Score is derived from test results: pass_rate + speed_bonus.
 * NOTE(review): the current implementation derives score from pass rate
 * and heuristics only — no speed bonus is applied; confirm intent.
 */
export declare function runBenchmark(command: string): Promise<BenchmarkResult>;
export declare class MetaHarness {
    private provider;
    private benchmarkCommand;
    private model?;
    constructor(provider: Provider, benchmarkCommand: string, model?: string | undefined);
    /**
     * Run the optimization loop.
     */
    optimize(iterations: number): Promise<OptimizationResult>;
    private suggestChange;
    private applyChange;
    private revertChange;
}
/** Format optimization results for display */
export declare function formatOptimizationResult(result: OptimizationResult): string;
61
+ //# sourceMappingURL=MetaHarness.d.ts.map
@@ -0,0 +1,210 @@
1
+ /**
2
+ * Meta-Harness — self-optimizing agent harness.
3
+ *
4
+ * Inspired by AutoAgent (which hit #1 on SpreadsheetBench by letting
5
+ * the agent optimize its own harness overnight).
6
+ *
7
+ * Flow:
8
+ * 1. Run benchmark → get baseline score
9
+ * 2. Ask LLM to suggest a config change
10
+ * 3. Apply change → re-run benchmark
11
+ * 4. If score improved, keep; otherwise revert
12
+ * 5. Repeat for N iterations
13
+ *
14
+ * What it optimizes:
15
+ * - System prompt (trim, rephrase, add instructions)
16
+ * - Tool selection (which tools are core vs deferred)
17
+ * - Model router configuration
18
+ * - Compression strategy
19
+ * - Permission rules
20
+ */
21
+ import { readOhConfig, invalidateConfigCache } from '../harness/config.js';
22
+ import { copyFileSync, existsSync } from 'node:fs';
23
+ import { join } from 'node:path';
24
+ import { execSync } from 'node:child_process';
25
+ // ── Benchmark Runner ──
26
/**
 * Run a benchmark command and derive a 0–1 score from its output.
 *
 * A command that exits non-zero still yields a score, but any value
 * extracted from its output is halved as a penalty (or forced to 0
 * when nothing positive could be extracted).
 */
export async function runBenchmark(command) {
    const startedAt = Date.now();
    // Build the result record from raw output, applying the failure penalty when needed.
    const finish = (rawOutput, failed) => {
        const extracted = extractScore(rawOutput);
        return {
            score: failed ? (extracted > 0 ? extracted * 0.5 : 0) : extracted,
            details: rawOutput.slice(-500),
            durationMs: Date.now() - startedAt,
        };
    };
    try {
        const stdout = execSync(command, {
            encoding: 'utf-8',
            timeout: 300_000, // 5 minute max
            stdio: ['pipe', 'pipe', 'pipe'],
        });
        return finish(stdout, false);
    }
    catch (err) {
        // execSync throws on non-zero exit; salvage whatever output is attached.
        return finish(String(err.stdout ?? err.stderr ?? err.message ?? ''), true);
    }
}
56
/**
 * Derive a 0–1 score from raw test-runner output.
 *
 * Tries, in order:
 *  1. "X passed / Y failed" style summaries → pass rate
 *  2. TAP "# pass N" / "# fail N" totals    → pass rate
 *  3. Heuristic fallback: 0.3 if the output mentions 'error'/'FAIL', else 0.8
 */
function extractScore(output) {
    // Pass rate with a zero-total guard.
    const passRate = (passed, failed) => {
        const total = passed + failed;
        return total > 0 ? passed / total : 0;
    };
    const summaryPass = output.match(/(\d+)\s+pass/i);
    if (summaryPass) {
        const summaryFail = output.match(/(\d+)\s+fail/i);
        return passRate(parseInt(summaryPass[1], 10), summaryFail ? parseInt(summaryFail[1], 10) : 0);
    }
    const tapPassed = output.match(/# pass\s+(\d+)/);
    if (tapPassed) {
        const tapFailed = output.match(/# fail\s+(\d+)/);
        return passRate(parseInt(tapPassed[1], 10), tapFailed ? parseInt(tapFailed[1], 10) : 0);
    }
    return output.includes('error') || output.includes('FAIL') ? 0.3 : 0.8;
}
80
+ // ── Meta-Harness ──
81
/**
 * Self-optimizing harness driver.
 *
 * Holds an LLM provider, a benchmark shell command, and an optional model
 * override. `optimize()` runs a suggest → apply → re-benchmark →
 * keep-or-revert loop for a fixed number of iterations.
 *
 * NOTE(review): `applyChange` is currently a placeholder — it only
 * invalidates the config cache and never mutates the config file — so
 * score differences between iterations reflect benchmark variance rather
 * than real config changes. Confirm before relying on results.
 */
export class MetaHarness {
    provider;          // LLM provider used to generate change suggestions
    benchmarkCommand;  // shell command whose output is scored by runBenchmark()
    model;             // optional model name forwarded to provider.complete()
    constructor(provider, benchmarkCommand, model) {
        this.provider = provider;
        this.benchmarkCommand = benchmarkCommand;
        this.model = model;
    }
    /**
     * Run the optimization loop.
     *
     * Backs up `.oh/config.yaml` (if present), measures a baseline score,
     * then for each iteration asks the LLM for one config change,
     * re-benchmarks, and keeps the change only on a strict score
     * improvement (otherwise reverts from the backup).
     *
     * NOTE(review): the `.oh/config.yaml.backup` file is left behind after
     * the loop finishes — confirm whether cleanup is intended elsewhere.
     */
    async optimize(iterations) {
        const totalStart = Date.now();
        const changes = []; // only improvements are recorded here
        // Backup current config so revertChange() can restore it later
        const configPath = join('.oh', 'config.yaml');
        const backupPath = join('.oh', 'config.yaml.backup');
        if (existsSync(configPath)) {
            copyFileSync(configPath, backupPath);
        }
        // Get baseline score before any change is attempted
        const baseline = await runBenchmark(this.benchmarkCommand);
        let bestScore = baseline.score;
        for (let i = 0; i < iterations; i++) {
            // Ask LLM to suggest an optimization; a null suggestion burns the iteration
            const suggestion = await this.suggestChange(bestScore, changes);
            if (!suggestion)
                continue;
            // Apply the change (placeholder — see class note)
            this.applyChange(suggestion);
            // Re-benchmark with the change "applied"
            const result = await runBenchmark(this.benchmarkCommand);
            if (result.score > bestScore) {
                // Keep the change and attribute the score delta to it
                const impact = result.score - bestScore;
                changes.push({ ...suggestion, impact });
                bestScore = result.score;
            }
            else {
                // Revert: restore config from the backup taken above
                this.revertChange(suggestion);
            }
        }
        return {
            initialScore: baseline.score,
            finalScore: bestScore,
            iterations,
            changes,
            totalDurationMs: Date.now() - totalStart,
        };
    }
    /**
     * Ask the LLM for one configuration change.
     *
     * Sends the current config (truncated to 2000 chars) plus a summary of
     * previously kept changes, and expects a single JSON object back.
     * Returns null when the response has no parseable JSON or the provider
     * call throws. `oldValue` is always undefined (not captured here).
     */
    async suggestChange(currentScore, previousChanges) {
        const config = readOhConfig();
        const configStr = JSON.stringify(config, null, 2);
        const prevChangesStr = previousChanges.length > 0
            ? `\nPrevious successful changes:\n${previousChanges.map(c => `- ${c.description} (+${c.impact.toFixed(3)})`).join('\n')}`
            : '';
        const prompt = `You are optimizing an AI agent harness configuration. Current score: ${currentScore.toFixed(3)}/1.0.
${prevChangesStr}

Current config:
${configStr.slice(0, 2000)}

Suggest ONE specific configuration change that might improve the benchmark score. Focus on:
- System prompt optimization
- Tool selection (which tools are core)
- Permission rules that speed up automation
- Verification configuration

Respond with JSON: {"description": "what to change", "field": "config.path", "newValue": "the new value"}`;
        try {
            const response = await this.provider.complete([{ role: 'user', content: prompt, uuid: `meta-${Date.now()}`, timestamp: Date.now() }], 'You are a harness optimization engine. Respond ONLY with valid JSON.', undefined, this.model);
            // Grab the first {...} span; the model is instructed to emit JSON only
            const jsonMatch = response.content.match(/\{[\s\S]*\}/);
            if (!jsonMatch)
                return null;
            const parsed = JSON.parse(jsonMatch[0]);
            return {
                description: parsed.description ?? 'unknown change',
                field: parsed.field ?? 'unknown',
                oldValue: undefined,
                newValue: parsed.newValue,
            };
        }
        catch {
            // Provider error or invalid JSON — skip this iteration
            return null;
        }
    }
    /**
     * Apply a suggested change. Currently a no-op placeholder: it only
     * invalidates the config cache and does not write to the config file.
     */
    applyChange(change) {
        invalidateConfigCache();
        // For now, log the change. Full config mutation would require
        // a safe config updater that handles nested paths.
        // This is a placeholder — real implementation would use lodash.set or similar.
    }
    /**
     * Revert a change by restoring `.oh/config.yaml` from the backup taken
     * at the start of optimize(), then invalidating the config cache again.
     * NOTE(review): if no backup exists (e.g. no config file at start),
     * this silently does nothing beyond the cache invalidation.
     */
    revertChange(change) {
        invalidateConfigCache();
        // Revert by re-reading the backup config
        const backupPath = join('.oh', 'config.yaml.backup');
        const configPath = join('.oh', 'config.yaml');
        if (existsSync(backupPath)) {
            copyFileSync(backupPath, configPath);
            invalidateConfigCache();
        }
    }
}
186
/**
 * Render an OptimizationResult as a human-readable multi-line report:
 * header, score delta (with percentage change), iteration/duration
 * stats, and the list of kept changes — or a note that none improved
 * the score.
 */
export function formatOptimizationResult(result) {
    const delta = result.finalScore - result.initialScore;
    // Percentage change relative to the baseline; '0' when baseline is zero.
    const pct = result.initialScore > 0 ? (delta / result.initialScore * 100).toFixed(1) : '0';
    const report = [
        `Meta-Harness Optimization Complete`,
        `${'─'.repeat(40)}`,
        `Initial score: ${result.initialScore.toFixed(3)}`,
        `Final score: ${result.finalScore.toFixed(3)} (${delta >= 0 ? '+' : ''}${pct}%)`,
        `Iterations: ${result.iterations}`,
        `Duration: ${Math.round(result.totalDurationMs / 1000)}s`,
        '',
    ];
    if (result.changes.length > 0) {
        report.push('Applied changes:');
        for (const change of result.changes) {
            report.push(` +${change.impact.toFixed(3)} ${change.description}`);
        }
    }
    else {
        report.push('No improvements found in this run.');
    }
    return report.join('\n');
}
210
+ //# sourceMappingURL=MetaHarness.js.map
@@ -194,7 +194,14 @@ export const AgentTool = {
194
194
  }
195
195
  }
196
196
  emitHook("subagentStop", { agentId });
197
- return { output: finalText || "(sub-agent completed with no text output)", isError: false };
197
+ // Context folding: collapse long sub-agent output to summary
198
+ let output = finalText || "(sub-agent completed with no text output)";
199
+ if (output.length > 2000) {
200
+ const { ContextManager } = await import("../../query/context-manager.js");
201
+ const cm = new ContextManager();
202
+ output = cm.foldSubagentResult(agentId, output);
203
+ }
204
+ return { output, isError: false };
198
205
  },
199
206
  prompt() {
200
207
  return `Spawn a sub-agent with its own tool-use loop to handle a delegated task autonomously. The sub-agent runs in an isolated git worktree to prevent file conflicts. Parameters:
package/package.json CHANGED
@@ -1,13 +1,17 @@
1
1
  {
2
2
  "name": "@zhijiewang/openharness",
3
- "version": "1.4.0",
3
+ "version": "2.0.0",
4
4
  "description": "Open-source terminal coding agent. Works with any LLM.",
5
5
  "type": "module",
6
6
  "bin": {
7
7
  "openharness": "./dist/main.js",
8
8
  "oh": "./dist/main.js"
9
9
  },
10
- "main": "./dist/main.js",
10
+ "main": "./dist/sdk/index.js",
11
+ "exports": {
12
+ ".": "./dist/sdk/index.js",
13
+ "./cli": "./dist/main.js"
14
+ },
11
15
  "files": [
12
16
  "dist/**/*.js",
13
17
  "dist/**/*.d.ts",