@zhijiewang/openharness 1.3.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,6 +4,7 @@
4
4
  */
5
5
  import { createUserMessage } from "../types/message.js";
6
6
  import { defaultEstimateTokens } from "../providers/base.js";
7
+ import { emitHook } from "../harness/hooks.js";
7
8
  const DEFAULT_KEEP_LAST = 10;
8
9
  /**
9
10
  * Semantic importance scoring for messages.
@@ -61,6 +62,7 @@ export function estimateMessagesTokens(messages, estimateTokens = (t) => Math.ce
61
62
  export function compressMessages(messages, targetTokens) {
62
63
  if (messages.length <= 2)
63
64
  return messages;
65
+ emitHook("preCompact", {});
64
66
  const result = [...messages];
65
67
  const keepLast = DEFAULT_KEEP_LAST;
66
68
  // MicroCompact: Truncate long tool results and assistant messages
@@ -114,12 +116,14 @@ export function compressMessages(messages, targetTokens) {
114
116
  validCallIds.add(tc.id);
115
117
  }
116
118
  }
117
- return result.filter((msg) => {
119
+ const filtered = result.filter((msg) => {
118
120
  if (msg.role !== "tool")
119
121
  return true;
120
122
  return (msg.toolResults?.length ?? 0) > 0 &&
121
123
  msg.toolResults.every((tr) => validCallIds.has(tr.callId));
122
124
  });
125
+ emitHook("postCompact", {});
126
+ return filtered;
123
127
  }
124
128
  /**
125
129
  * LLM-assisted summarization of older messages.
@@ -0,0 +1,56 @@
1
+ /**
2
+ * Active Context Management — proactive control of the context window.
3
+ *
4
+ * Unlike reactive compression (trigger at 80% full), active management:
5
+ * - Enforces per-tool token budgets (no single tool consumes the window)
6
+ * - Folds sub-agent output to summaries (parent sees prompt + result only)
7
+ * - Pre-compresses before large tool calls
8
+ * - Auto-summarizes when approaching limits
9
+ *
10
+ * Based on the "context engineering" pattern from Anthropic's harness research.
11
+ */
12
+ import type { Message } from '../types/message.js';
13
+ export type ContextBudget = {
14
+ /** Max tokens for a single tool output */
15
+ toolOutputMax: number;
16
+ /** Per-tool overrides */
17
+ perTool: Record<string, number>;
18
+ /** Whether to auto-fold sub-agent results */
19
+ autoFold: boolean;
20
+ /** Context usage threshold to trigger proactive compression (0-1) */
21
+ proactiveThreshold: number;
22
+ };
23
+ export declare class ContextManager {
24
+ private budget;
25
+ private model;
26
+ constructor(budget?: Partial<ContextBudget>, model?: string);
27
+ /** Get the token budget for a specific tool */
28
+ getToolBudget(toolName: string): number;
29
+ /** Set a per-tool token budget */
30
+ setToolBudget(toolName: string, maxTokens: number): void;
31
+ /**
32
+ * Truncate tool output to its budget.
33
+ * Keeps the first portion and last portion, with a truncation marker.
34
+ */
35
+ enforceToolBudget(toolName: string, output: string): string;
36
+ /**
37
+ * Fold a sub-agent's full output into a concise summary.
38
+ * Keeps the first ~500 chars (task context) and the last ~500 chars (conclusion), folding the middle.
39
+ */
40
+ foldSubagentResult(agentId: string, fullOutput: string): string;
41
+ /**
42
+ * Check if we should proactively compress before a tool call.
43
+ * Returns true if estimated context usage exceeds the proactive threshold.
44
+ */
45
+ shouldPreCompress(messages: Message[], estimatedOutputTokens: number, estimateTokens: (text: string) => number): boolean;
46
+ /**
47
+ * Estimate how many tokens a tool call might produce.
48
+ * Based on historical averages for each tool type.
49
+ */
50
+ estimateToolOutputTokens(toolName: string): number;
51
+ /** Whether auto-folding is enabled */
52
+ get autoFoldEnabled(): boolean;
53
+ /** Get the full budget config */
54
+ get config(): ContextBudget;
55
+ }
56
+ //# sourceMappingURL=context-manager.d.ts.map
@@ -0,0 +1,111 @@
1
+ /**
2
+ * Active Context Management — proactive control of the context window.
3
+ *
4
+ * Unlike reactive compression (trigger at 80% full), active management:
5
+ * - Enforces per-tool token budgets (no single tool consumes the window)
6
+ * - Folds sub-agent output to summaries (parent sees prompt + result only)
7
+ * - Pre-compresses before large tool calls
8
+ * - Auto-summarizes when approaching limits
9
+ *
10
+ * Based on the "context engineering" pattern from Anthropic's harness research.
11
+ */
12
+ import { getContextWindow } from '../harness/cost.js';
13
+ const DEFAULT_BUDGET = {
14
+ toolOutputMax: 10_000,
15
+ perTool: {},
16
+ autoFold: true,
17
+ proactiveThreshold: 0.6,
18
+ };
19
+ // ── Context Manager ──
20
+ export class ContextManager {
21
+ budget;
22
+ model;
23
+ constructor(budget, model) {
24
+ this.budget = { ...DEFAULT_BUDGET, ...budget };
25
+ this.model = model;
26
+ }
27
+ /** Get the token budget for a specific tool */
28
+ getToolBudget(toolName) {
29
+ return this.budget.perTool[toolName] ?? this.budget.toolOutputMax;
30
+ }
31
+ /** Set a per-tool token budget */
32
+ setToolBudget(toolName, maxTokens) {
33
+ this.budget.perTool[toolName] = maxTokens;
34
+ }
35
+ /**
36
+ * Truncate tool output to its budget.
37
+ * Keeps the first portion and last portion, with a truncation marker.
38
+ */
39
+ enforceToolBudget(toolName, output) {
40
+ const budget = this.getToolBudget(toolName);
41
+ // Rough estimate: 4 chars ≈ 1 token
42
+ const maxChars = budget * 4;
43
+ if (output.length <= maxChars)
44
+ return output;
45
+ const keepHead = Math.floor(maxChars * 0.7);
46
+ const keepTail = Math.floor(maxChars * 0.2);
47
+ const truncated = output.length - keepHead - keepTail;
48
+ return output.slice(0, keepHead)
49
+ + `\n\n[...${truncated.toLocaleString()} chars truncated (budget: ${budget} tokens)...]\n\n`
50
+ + output.slice(-keepTail);
51
+ }
52
+ /**
53
+ * Fold a sub-agent's full output into a concise summary.
54
+ * Keeps the first ~500 chars (task context) and the last ~500 chars (conclusion), folding the middle.
55
+ */
56
+ foldSubagentResult(agentId, fullOutput) {
57
+ if (!this.budget.autoFold)
58
+ return fullOutput;
59
+ // Short outputs don't need folding
60
+ if (fullOutput.length < 2000)
61
+ return fullOutput;
62
+ // Keep first ~500 chars (task context) + last ~500 chars (conclusion)
63
+ const head = fullOutput.slice(0, 500);
64
+ const tail = fullOutput.slice(-500);
65
+ const foldedChars = fullOutput.length - 1000;
66
+ return `${head}\n\n[...${foldedChars} chars folded from sub-agent ${agentId}...]\n\n${tail}`;
67
+ }
68
+ /**
69
+ * Check if we should proactively compress before a tool call.
70
+ * Returns true if estimated context usage exceeds the proactive threshold.
71
+ */
72
+ shouldPreCompress(messages, estimatedOutputTokens, estimateTokens) {
73
+ const contextWindow = getContextWindow(this.model);
74
+ let currentTokens = 0;
75
+ for (const m of messages) {
76
+ currentTokens += estimateTokens(m.content) + 10;
77
+ }
78
+ const projected = currentTokens + estimatedOutputTokens;
79
+ const usage = projected / contextWindow;
80
+ return usage > this.budget.proactiveThreshold;
81
+ }
82
+ /**
83
+ * Estimate how many tokens a tool call might produce.
84
+ * Based on historical averages for each tool type.
85
+ */
86
+ estimateToolOutputTokens(toolName) {
87
+ const estimates = {
88
+ Bash: 2000,
89
+ Read: 3000,
90
+ Grep: 1500,
91
+ Glob: 500,
92
+ LS: 300,
93
+ Edit: 200,
94
+ Write: 200,
95
+ Agent: 5000,
96
+ Pipeline: 3000,
97
+ WebFetch: 4000,
98
+ WebSearch: 1000,
99
+ };
100
+ return estimates[toolName] ?? 1000;
101
+ }
102
+ /** Whether auto-folding is enabled */
103
+ get autoFoldEnabled() {
104
+ return this.budget.autoFold;
105
+ }
106
+ /** Get the full budget config */
107
+ get config() {
108
+ return { ...this.budget };
109
+ }
110
+ }
111
+ //# sourceMappingURL=context-manager.js.map
@@ -9,6 +9,7 @@
9
9
  */
10
10
  import { toolToAPIFormat } from "../Tool.js";
11
11
  import { DeferredTool } from "../DeferredTool.js";
12
+ import { ContextManager } from "./context-manager.js";
12
13
  import { createAssistantMessage, createUserMessage } from "../types/message.js";
13
14
  import { StreamingToolExecutor } from "../services/StreamingToolExecutor.js";
14
15
  import { getContextWindow } from "../harness/cost.js";
@@ -31,6 +32,7 @@ export async function* query(userMessage, config, existingMessages = []) {
31
32
  askUserQuestion: config.askUserQuestion,
32
33
  };
33
34
  const estimateTokens = makeTokenEstimator(config.provider);
35
+ const contextManager = new ContextManager(undefined, config.model);
34
36
  // Check provider capabilities
35
37
  const modelInfo = config.provider.getModelInfo?.(config.model ?? '');
36
38
  const toolsSupported = !modelInfo || modelInfo.supportsTools;
@@ -193,7 +195,9 @@ export async function* query(userMessage, config, existingMessages = []) {
193
195
  }
194
196
  for (const { toolCall: tc, result } of completedResults) {
195
197
  yield { type: "tool_call_end", callId: tc.id, output: result.output, isError: result.isError };
196
- state.messages.push(createToolResultMessage({ callId: tc.id, output: result.output, isError: result.isError }));
198
+ // Apply context budget to tool output
199
+ const budgetedOutput = contextManager.enforceToolBudget(tc.toolName, result.output);
200
+ state.messages.push(createToolResultMessage({ callId: tc.id, output: budgetedOutput, isError: result.isError }));
197
201
  }
198
202
  // Execute remaining tools not started during streaming
199
203
  const remaining = toolCalls.filter(tc => !executedIds.has(tc.id));
@@ -85,6 +85,13 @@ export async function executeSingleTool(toolCall, tools, context, permissionMode
85
85
  toolArgs: JSON.stringify(toolCall.arguments).slice(0, 1000),
86
86
  toolOutput: result.output.slice(0, 1000),
87
87
  });
88
+ // Emit fileChanged hook for file-modifying tools
89
+ if (!result.isError && ['Edit', 'Write', 'MultiEdit'].includes(tool.name)) {
90
+ const filePaths = getAffectedFiles(tool.name, parsed.data);
91
+ for (const fp of filePaths) {
92
+ emitHook("fileChanged", { filePath: fp, toolName: tool.name });
93
+ }
94
+ }
88
95
  // Verification loop: auto-run lint/typecheck after file-modifying tools
89
96
  let verificationSuffix = '';
90
97
  if (!result.isError && ['Edit', 'Write', 'MultiEdit'].includes(tool.name)) {
@@ -0,0 +1,75 @@
1
+ /**
2
+ * openHarness Agent SDK — programmatic API for building AI agents.
3
+ *
4
+ * Usage:
5
+ * import { createAgent } from '@zhijiewang/openharness';
6
+ *
7
+ * const agent = createAgent({
8
+ * provider: 'anthropic',
9
+ * model: 'claude-sonnet-4-6',
10
+ * apiKey: process.env.ANTHROPIC_API_KEY,
11
+ * });
12
+ *
13
+ * const result = await agent.run('Fix the failing tests');
14
+ * console.log(result.text);
15
+ */
16
+ import type { StreamEvent } from '../types/events.js';
17
+ import type { PermissionMode } from '../types/permissions.js';
18
+ export type AgentConfig = {
19
+ /** Provider name: 'anthropic', 'openai', 'ollama', 'openrouter', etc. */
20
+ provider: string;
21
+ /** Model identifier */
22
+ model: string;
23
+ /** API key (or use environment variable) */
24
+ apiKey?: string;
25
+ /** Custom base URL */
26
+ baseUrl?: string;
27
+ /** Tools to include: 'all', 'read-only', or array of tool names */
28
+ tools?: 'all' | 'read-only' | string[];
29
+ /** Permission mode (default: 'trust') */
30
+ permissionMode?: PermissionMode;
31
+ /** Custom system prompt */
32
+ systemPrompt?: string;
33
+ /** Max turns per run */
34
+ maxTurns?: number;
35
+ /** Working directory */
36
+ cwd?: string;
37
+ };
38
+ export type AgentResult = {
39
+ /** Final text output */
40
+ text: string;
41
+ /** Tool calls made during execution */
42
+ toolCalls: Array<{
43
+ toolName: string;
44
+ output: string;
45
+ isError: boolean;
46
+ }>;
47
+ /** Total cost in USD */
48
+ cost: number;
49
+ /** Total input tokens */
50
+ inputTokens: number;
51
+ /** Total output tokens */
52
+ outputTokens: number;
53
+ /** Number of turns taken */
54
+ turns: number;
55
+ };
56
+ export declare class Agent {
57
+ private provider;
58
+ private tools;
59
+ private config;
60
+ private initialized;
61
+ constructor(config: AgentConfig);
62
+ /** Initialize provider and tools (lazy, on first use) */
63
+ private init;
64
+ /** Run a single prompt and return the result */
65
+ run(prompt: string): Promise<AgentResult>;
66
+ /** Stream events from a prompt */
67
+ stream(prompt: string): AsyncGenerator<StreamEvent, void>;
68
+ /** Stop the agent (cleanup) */
69
+ stop(): void;
70
+ }
71
+ /** Create a new agent instance */
72
+ export declare function createAgent(config: AgentConfig): Agent;
73
+ export type { StreamEvent } from '../types/events.js';
74
+ export type { PermissionMode } from '../types/permissions.js';
75
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1,135 @@
1
+ /**
2
+ * openHarness Agent SDK — programmatic API for building AI agents.
3
+ *
4
+ * Usage:
5
+ * import { createAgent } from '@zhijiewang/openharness';
6
+ *
7
+ * const agent = createAgent({
8
+ * provider: 'anthropic',
9
+ * model: 'claude-sonnet-4-6',
10
+ * apiKey: process.env.ANTHROPIC_API_KEY,
11
+ * });
12
+ *
13
+ * const result = await agent.run('Fix the failing tests');
14
+ * console.log(result.text);
15
+ */
16
+ // ── Agent Class ──
17
+ export class Agent {
18
+ provider = null;
19
+ tools = null;
20
+ config;
21
+ initialized = false;
22
+ constructor(config) {
23
+ this.config = {
24
+ permissionMode: 'trust',
25
+ maxTurns: 20,
26
+ ...config,
27
+ };
28
+ }
29
+ /** Initialize provider and tools (lazy, on first use) */
30
+ async init() {
31
+ if (this.initialized)
32
+ return;
33
+ const { createProvider } = await import('../providers/index.js');
34
+ const { getAllTools } = await import('../tools.js');
35
+ const overrides = {};
36
+ if (this.config.apiKey)
37
+ overrides.apiKey = this.config.apiKey;
38
+ if (this.config.baseUrl)
39
+ overrides.baseUrl = this.config.baseUrl;
40
+ const { provider } = await createProvider(this.config.model, Object.keys(overrides).length > 0 ? overrides : undefined);
41
+ this.provider = provider;
42
+ // Filter tools
43
+ let tools = getAllTools();
44
+ if (this.config.tools === 'read-only') {
45
+ const readOnlyNames = new Set(['Read', 'Glob', 'Grep', 'LS', 'ImageRead', 'WebSearch', 'WebFetch']);
46
+ tools = tools.filter(t => readOnlyNames.has(t.name));
47
+ }
48
+ else if (Array.isArray(this.config.tools)) {
49
+ const allowed = new Set(this.config.tools.map(n => n.toLowerCase()));
50
+ tools = tools.filter(t => allowed.has(t.name.toLowerCase()));
51
+ }
52
+ this.tools = tools;
53
+ this.initialized = true;
54
+ }
55
+ /** Run a single prompt and return the result */
56
+ async run(prompt) {
57
+ await this.init();
58
+ const { query } = await import('../query.js');
59
+ if (this.config.cwd) {
60
+ try {
61
+ process.chdir(this.config.cwd);
62
+ }
63
+ catch { /* ignore */ }
64
+ }
65
+ const config = {
66
+ provider: this.provider,
67
+ tools: this.tools,
68
+ systemPrompt: this.config.systemPrompt ?? 'You are a helpful coding agent.',
69
+ permissionMode: this.config.permissionMode,
70
+ model: this.config.model,
71
+ maxTurns: this.config.maxTurns,
72
+ };
73
+ let text = '';
74
+ const toolCalls = [];
75
+ let cost = 0;
76
+ let inputTokens = 0;
77
+ let outputTokens = 0;
78
+ let turns = 0;
79
+ for await (const event of query(prompt, config)) {
80
+ switch (event.type) {
81
+ case 'text_delta':
82
+ text += event.content;
83
+ break;
84
+ case 'tool_call_end':
85
+ toolCalls.push({
86
+ toolName: event.toolName ?? 'unknown',
87
+ output: event.output ?? '',
88
+ isError: event.isError ?? false,
89
+ });
90
+ break;
91
+ case 'cost_update':
92
+ cost += event.cost ?? 0;
93
+ inputTokens += event.inputTokens ?? 0;
94
+ outputTokens += event.outputTokens ?? 0;
95
+ break;
96
+ case 'turn_complete':
97
+ turns++;
98
+ break;
99
+ }
100
+ }
101
+ return { text, toolCalls, cost, inputTokens, outputTokens, turns };
102
+ }
103
+ /** Stream events from a prompt */
104
+ async *stream(prompt) {
105
+ await this.init();
106
+ const { query } = await import('../query.js');
107
+ if (this.config.cwd) {
108
+ try {
109
+ process.chdir(this.config.cwd);
110
+ }
111
+ catch { /* ignore */ }
112
+ }
113
+ const config = {
114
+ provider: this.provider,
115
+ tools: this.tools,
116
+ systemPrompt: this.config.systemPrompt ?? 'You are a helpful coding agent.',
117
+ permissionMode: this.config.permissionMode,
118
+ model: this.config.model,
119
+ maxTurns: this.config.maxTurns,
120
+ };
121
+ yield* query(prompt, config);
122
+ }
123
+ /** Stop the agent (cleanup) */
124
+ stop() {
125
+ this.provider = null;
126
+ this.tools = null;
127
+ this.initialized = false;
128
+ }
129
+ }
130
+ // ── Factory ──
131
+ /** Create a new agent instance */
132
+ export function createAgent(config) {
133
+ return new Agent(config);
134
+ }
135
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1,61 @@
1
+ /**
2
+ * GAN-Style Evaluator Loop — Generator→Evaluator adversarial refinement.
3
+ *
4
+ * Inspired by Anthropic's three-agent harness architecture:
5
+ * "AI models are inherently poor at self-critique; they tend to rate
6
+ * their own work favorably." Externalizing critique to a separate
7
+ * Evaluator agent produces measurably better output.
8
+ *
9
+ * Flow:
10
+ * 1. Generator produces initial output
11
+ * 2. Evaluator scores against rubric criteria
12
+ * 3. If below threshold, Generator refines based on feedback
13
+ * 4. Repeat until pass or max iterations reached
14
+ */
15
+ import type { Provider } from '../providers/base.js';
16
+ import type { Tools } from '../Tool.js';
17
+ import type { PermissionMode } from '../types/permissions.js';
18
+ export type EvaluationCriterion = {
19
+ name: string;
20
+ weight: number;
21
+ description: string;
22
+ };
23
+ export type EvaluationRubric = {
24
+ criteria: EvaluationCriterion[];
25
+ passThreshold: number;
26
+ };
27
+ export type EvaluationScore = {
28
+ criterion: string;
29
+ score: number;
30
+ feedback: string;
31
+ };
32
+ export type EvaluatorResult = {
33
+ output: string;
34
+ scores: EvaluationScore[];
35
+ weightedScore: number;
36
+ passed: boolean;
37
+ iterations: number;
38
+ refinements: string[];
39
+ };
40
+ export declare const DEFAULT_RUBRIC: EvaluationRubric;
41
+ export declare class EvaluatorLoop {
42
+ private provider;
43
+ private tools;
44
+ private systemPrompt;
45
+ private permissionMode;
46
+ private model?;
47
+ private rubric;
48
+ private maxIterations;
49
+ constructor(provider: Provider, tools: Tools, systemPrompt: string, permissionMode: PermissionMode, model?: string | undefined, rubric?: EvaluationRubric, maxIterations?: number);
50
+ /**
51
+ * Run the full Generator→Evaluator→Refine cycle.
52
+ */
53
+ run(task: string): Promise<EvaluatorResult>;
54
+ private generate;
55
+ private evaluate;
56
+ private calculateWeightedScore;
57
+ private defaultScores;
58
+ }
59
+ /** Format evaluator results for display */
60
+ export declare function formatEvaluatorResult(result: EvaluatorResult): string;
61
+ //# sourceMappingURL=EvaluatorLoop.d.ts.map
@@ -0,0 +1,157 @@
1
+ /**
2
+ * GAN-Style Evaluator Loop — Generator→Evaluator adversarial refinement.
3
+ *
4
+ * Inspired by Anthropic's three-agent harness architecture:
5
+ * "AI models are inherently poor at self-critique; they tend to rate
6
+ * their own work favorably." Externalizing critique to a separate
7
+ * Evaluator agent produces measurably better output.
8
+ *
9
+ * Flow:
10
+ * 1. Generator produces initial output
11
+ * 2. Evaluator scores against rubric criteria
12
+ * 3. If below threshold, Generator refines based on feedback
13
+ * 4. Repeat until pass or max iterations reached
14
+ */
15
+ // ── Default Rubric ──
16
+ export const DEFAULT_RUBRIC = {
17
+ criteria: [
18
+ { name: 'correctness', weight: 0.4, description: 'Does the output correctly address the task? Are there logical errors?' },
19
+ { name: 'completeness', weight: 0.3, description: 'Is the solution complete? Any missing edge cases or requirements?' },
20
+ { name: 'quality', weight: 0.2, description: 'Is the code clean, well-structured, and following best practices?' },
21
+ { name: 'safety', weight: 0.1, description: 'Are there security issues, unsafe patterns, or potential bugs?' },
22
+ ],
23
+ passThreshold: 0.7,
24
+ };
25
+ // ── Evaluator Loop ──
26
+ export class EvaluatorLoop {
27
+ provider;
28
+ tools;
29
+ systemPrompt;
30
+ permissionMode;
31
+ model;
32
+ rubric;
33
+ maxIterations;
34
+ constructor(provider, tools, systemPrompt, permissionMode, model, rubric = DEFAULT_RUBRIC, maxIterations = 3) {
35
+ this.provider = provider;
36
+ this.tools = tools;
37
+ this.systemPrompt = systemPrompt;
38
+ this.permissionMode = permissionMode;
39
+ this.model = model;
40
+ this.rubric = rubric;
41
+ this.maxIterations = maxIterations;
42
+ }
43
+ /**
44
+ * Run the full Generator→Evaluator→Refine cycle.
45
+ */
46
+ async run(task) {
47
+ const refinements = [];
48
+ let currentOutput = '';
49
+ let scores = [];
50
+ let weightedScore = 0;
51
+ for (let iteration = 1; iteration <= this.maxIterations; iteration++) {
52
+ // ── Generate ──
53
+ const generatorPrompt = iteration === 1
54
+ ? task
55
+ : `${task}\n\n[Evaluator feedback from iteration ${iteration - 1}]:\n${scores.map(s => `${s.criterion}: ${s.score}/1.0 — ${s.feedback}`).join('\n')}\n\nPlease refine your output based on this feedback.`;
56
+ currentOutput = await this.generate(generatorPrompt);
57
+ // ── Evaluate ──
58
+ scores = await this.evaluate(task, currentOutput);
59
+ weightedScore = this.calculateWeightedScore(scores);
60
+ if (weightedScore >= this.rubric.passThreshold) {
61
+ return {
62
+ output: currentOutput,
63
+ scores,
64
+ weightedScore,
65
+ passed: true,
66
+ iterations: iteration,
67
+ refinements,
68
+ };
69
+ }
70
+ refinements.push(`Iteration ${iteration}: score ${weightedScore.toFixed(2)} — refining`);
71
+ }
72
+ // Max iterations reached — return best effort
73
+ return {
74
+ output: currentOutput,
75
+ scores,
76
+ weightedScore,
77
+ passed: false,
78
+ iterations: this.maxIterations,
79
+ refinements,
80
+ };
81
+ }
82
+ async generate(prompt) {
83
+ const { query } = await import('../query.js');
84
+ const config = {
85
+ provider: this.provider,
86
+ tools: this.tools,
87
+ systemPrompt: this.systemPrompt,
88
+ permissionMode: this.permissionMode,
89
+ model: this.model,
90
+ maxTurns: 15,
91
+ };
92
+ let output = '';
93
+ for await (const event of query(prompt, config)) {
94
+ if (event.type === 'text_delta')
95
+ output += event.content;
96
+ }
97
+ return output;
98
+ }
99
+ async evaluate(task, output) {
100
+ const evaluationPrompt = `You are a code evaluator. Score the following output on a 0-1 scale for each criterion.
101
+
102
+ Task: ${task.slice(0, 500)}
103
+
104
+ Output to evaluate:
105
+ ${output.slice(0, 3000)}
106
+
107
+ Criteria:
108
+ ${this.rubric.criteria.map(c => `- ${c.name} (weight: ${c.weight}): ${c.description}`).join('\n')}
109
+
110
+ Respond ONLY with a JSON array: [{"criterion": "name", "score": 0.8, "feedback": "brief explanation"}, ...]`;
111
+ const response = await this.provider.complete([{ role: 'user', content: evaluationPrompt, uuid: `eval-${Date.now()}`, timestamp: Date.now() }], 'You are a strict code evaluator. Respond ONLY with valid JSON. Be critical and specific.', undefined, this.model);
112
+ try {
113
+ const jsonMatch = response.content.match(/\[[\s\S]*\]/);
114
+ if (!jsonMatch)
115
+ return this.defaultScores();
116
+ const parsed = JSON.parse(jsonMatch[0]);
117
+ return parsed.filter(s => s.criterion && typeof s.score === 'number');
118
+ }
119
+ catch {
120
+ return this.defaultScores();
121
+ }
122
+ }
123
+ calculateWeightedScore(scores) {
124
+ let total = 0;
125
+ for (const criterion of this.rubric.criteria) {
126
+ const score = scores.find(s => s.criterion === criterion.name);
127
+ total += (score?.score ?? 0.5) * criterion.weight;
128
+ }
129
+ return total;
130
+ }
131
+ defaultScores() {
132
+ return this.rubric.criteria.map(c => ({
133
+ criterion: c.name,
134
+ score: 0.5,
135
+ feedback: 'Could not evaluate (parsing error)',
136
+ }));
137
+ }
138
+ }
139
+ /** Format evaluator results for display */
140
+ export function formatEvaluatorResult(result) {
141
+ const lines = [];
142
+ lines.push(`Evaluator: ${result.passed ? 'PASSED' : 'NEEDS IMPROVEMENT'} (${result.weightedScore.toFixed(2)}/${1.0})`);
143
+ lines.push(`Iterations: ${result.iterations}`);
144
+ lines.push('');
145
+ for (const s of result.scores) {
146
+ const bar = '█'.repeat(Math.round(s.score * 10)) + '░'.repeat(10 - Math.round(s.score * 10));
147
+ lines.push(` ${s.criterion.padEnd(15)} ${bar} ${s.score.toFixed(1)} — ${s.feedback}`);
148
+ }
149
+ if (result.refinements.length > 0) {
150
+ lines.push('');
151
+ lines.push('Refinements:');
152
+ for (const r of result.refinements)
153
+ lines.push(` ${r}`);
154
+ }
155
+ return lines.join('\n');
156
+ }
157
+ //# sourceMappingURL=EvaluatorLoop.js.map