@zhijiewang/openharness 1.3.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,6 +4,7 @@
4
4
  */
5
5
  import { createUserMessage } from "../types/message.js";
6
6
  import { defaultEstimateTokens } from "../providers/base.js";
7
+ import { emitHook } from "../harness/hooks.js";
7
8
  const DEFAULT_KEEP_LAST = 10;
8
9
  /**
9
10
  * Semantic importance scoring for messages.
@@ -61,6 +62,7 @@ export function estimateMessagesTokens(messages, estimateTokens = (t) => Math.ce
61
62
  export function compressMessages(messages, targetTokens) {
62
63
  if (messages.length <= 2)
63
64
  return messages;
65
+ emitHook("preCompact", {});
64
66
  const result = [...messages];
65
67
  const keepLast = DEFAULT_KEEP_LAST;
66
68
  // MicroCompact: Truncate long tool results and assistant messages
@@ -114,12 +116,14 @@ export function compressMessages(messages, targetTokens) {
114
116
  validCallIds.add(tc.id);
115
117
  }
116
118
  }
117
- return result.filter((msg) => {
119
+ const filtered = result.filter((msg) => {
118
120
  if (msg.role !== "tool")
119
121
  return true;
120
122
  return (msg.toolResults?.length ?? 0) > 0 &&
121
123
  msg.toolResults.every((tr) => validCallIds.has(tr.callId));
122
124
  });
125
+ emitHook("postCompact", {});
126
+ return filtered;
123
127
  }
124
128
  /**
125
129
  * LLM-assisted summarization of older messages.
@@ -0,0 +1,56 @@
1
+ /**
2
+ * Active Context Management — proactive control of the context window.
3
+ *
4
+ * Unlike reactive compression (trigger at 80% full), active management:
5
+ * - Enforces per-tool token budgets (no single tool consumes the window)
6
+ * - Folds sub-agent output to summaries (parent sees prompt + result only)
7
+ * - Pre-compresses before large tool calls
8
+ * - Auto-summarizes when approaching limits
9
+ *
10
+ * Based on the "context engineering" pattern from Anthropic's harness research.
11
+ */
12
+ import type { Message } from '../types/message.js';
13
+ export type ContextBudget = {
14
+ /** Max tokens for a single tool output */
15
+ toolOutputMax: number;
16
+ /** Per-tool overrides */
17
+ perTool: Record<string, number>;
18
+ /** Whether to auto-fold sub-agent results */
19
+ autoFold: boolean;
20
+ /** Context usage threshold to trigger proactive compression (0-1) */
21
+ proactiveThreshold: number;
22
+ };
23
+ export declare class ContextManager {
24
+ private budget;
25
+ private model;
26
+ constructor(budget?: Partial<ContextBudget>, model?: string);
27
+ /** Get the token budget for a specific tool */
28
+ getToolBudget(toolName: string): number;
29
+ /** Set a per-tool token budget */
30
+ setToolBudget(toolName: string, maxTokens: number): void;
31
+ /**
32
+ * Truncate tool output to its budget.
33
+ * Keeps the first portion and last portion, with a truncation marker.
34
+ */
35
+ enforceToolBudget(toolName: string, output: string): string;
36
+ /**
37
+ * Fold a sub-agent's full output into a concise summary.
38
+ * Keeps the first ~500 chars (task context) and the last ~500 chars (conclusion), folding the middle.
39
+ */
40
+ foldSubagentResult(agentId: string, fullOutput: string): string;
41
+ /**
42
+ * Check if we should proactively compress before a tool call.
43
+ * Returns true if estimated context usage exceeds the proactive threshold.
44
+ */
45
+ shouldPreCompress(messages: Message[], estimatedOutputTokens: number, estimateTokens: (text: string) => number): boolean;
46
+ /**
47
+ * Estimate how many tokens a tool call might produce.
48
+ * Based on historical averages for each tool type.
49
+ */
50
+ estimateToolOutputTokens(toolName: string): number;
51
+ /** Whether auto-folding is enabled */
52
+ get autoFoldEnabled(): boolean;
53
+ /** Get the full budget config */
54
+ get config(): ContextBudget;
55
+ }
56
+ //# sourceMappingURL=context-manager.d.ts.map
@@ -0,0 +1,111 @@
1
+ /**
2
+ * Active Context Management — proactive control of the context window.
3
+ *
4
+ * Unlike reactive compression (trigger at 80% full), active management:
5
+ * - Enforces per-tool token budgets (no single tool consumes the window)
6
+ * - Folds sub-agent output to summaries (parent sees prompt + result only)
7
+ * - Pre-compresses before large tool calls
8
+ * - Auto-summarizes when approaching limits
9
+ *
10
+ * Based on the "context engineering" pattern from Anthropic's harness research.
11
+ */
12
+ import { getContextWindow } from '../harness/cost.js';
13
+ const DEFAULT_BUDGET = {
14
+ toolOutputMax: 10_000,
15
+ perTool: {},
16
+ autoFold: true,
17
+ proactiveThreshold: 0.6,
18
+ };
19
+ // ── Context Manager ──
20
+ export class ContextManager {
21
+ budget;
22
+ model;
23
+ constructor(budget, model) {
24
+ this.budget = { ...DEFAULT_BUDGET, ...budget };
25
+ this.model = model;
26
+ }
27
+ /** Get the token budget for a specific tool */
28
+ getToolBudget(toolName) {
29
+ return this.budget.perTool[toolName] ?? this.budget.toolOutputMax;
30
+ }
31
+ /** Set a per-tool token budget */
32
+ setToolBudget(toolName, maxTokens) {
33
+ this.budget.perTool[toolName] = maxTokens;
34
+ }
35
+ /**
36
+ * Truncate tool output to its budget.
37
+ * Keeps the first portion and last portion, with a truncation marker.
38
+ */
39
+ enforceToolBudget(toolName, output) {
40
+ const budget = this.getToolBudget(toolName);
41
+ // Rough estimate: 4 chars ≈ 1 token
42
+ const maxChars = budget * 4;
43
+ if (output.length <= maxChars)
44
+ return output;
45
+ const keepHead = Math.floor(maxChars * 0.7);
46
+ const keepTail = Math.floor(maxChars * 0.2);
47
+ const truncated = output.length - keepHead - keepTail;
48
+ return output.slice(0, keepHead)
49
+ + `\n\n[...${truncated.toLocaleString()} chars truncated (budget: ${budget} tokens)...]\n\n`
50
+ + output.slice(-keepTail);
51
+ }
52
+ /**
53
+ * Fold a sub-agent's full output into a concise summary.
54
+ * Keeps the first ~500 chars (task context) and the last ~500 chars (conclusion), folding the middle.
55
+ */
56
+ foldSubagentResult(agentId, fullOutput) {
57
+ if (!this.budget.autoFold)
58
+ return fullOutput;
59
+ // Short outputs don't need folding
60
+ if (fullOutput.length < 2000)
61
+ return fullOutput;
62
+ // Keep first ~500 chars (task context) + last ~500 chars (conclusion)
63
+ const head = fullOutput.slice(0, 500);
64
+ const tail = fullOutput.slice(-500);
65
+ const foldedChars = fullOutput.length - 1000;
66
+ return `${head}\n\n[...${foldedChars} chars folded from sub-agent ${agentId}...]\n\n${tail}`;
67
+ }
68
+ /**
69
+ * Check if we should proactively compress before a tool call.
70
+ * Returns true if estimated context usage exceeds the proactive threshold.
71
+ */
72
+ shouldPreCompress(messages, estimatedOutputTokens, estimateTokens) {
73
+ const contextWindow = getContextWindow(this.model);
74
+ let currentTokens = 0;
75
+ for (const m of messages) {
76
+ currentTokens += estimateTokens(m.content) + 10;
77
+ }
78
+ const projected = currentTokens + estimatedOutputTokens;
79
+ const usage = projected / contextWindow;
80
+ return usage > this.budget.proactiveThreshold;
81
+ }
82
+ /**
83
+ * Estimate how many tokens a tool call might produce.
84
+ * Based on historical averages for each tool type.
85
+ */
86
+ estimateToolOutputTokens(toolName) {
87
+ const estimates = {
88
+ Bash: 2000,
89
+ Read: 3000,
90
+ Grep: 1500,
91
+ Glob: 500,
92
+ LS: 300,
93
+ Edit: 200,
94
+ Write: 200,
95
+ Agent: 5000,
96
+ Pipeline: 3000,
97
+ WebFetch: 4000,
98
+ WebSearch: 1000,
99
+ };
100
+ return estimates[toolName] ?? 1000;
101
+ }
102
+ /** Whether auto-folding is enabled */
103
+ get autoFoldEnabled() {
104
+ return this.budget.autoFold;
105
+ }
106
+ /** Get the full budget config */
107
+ get config() {
108
+ return { ...this.budget };
109
+ }
110
+ }
111
+ //# sourceMappingURL=context-manager.js.map
@@ -9,6 +9,7 @@
9
9
  */
10
10
  import { toolToAPIFormat } from "../Tool.js";
11
11
  import { DeferredTool } from "../DeferredTool.js";
12
+ import { ContextManager } from "./context-manager.js";
12
13
  import { createAssistantMessage, createUserMessage } from "../types/message.js";
13
14
  import { StreamingToolExecutor } from "../services/StreamingToolExecutor.js";
14
15
  import { getContextWindow } from "../harness/cost.js";
@@ -31,6 +32,7 @@ export async function* query(userMessage, config, existingMessages = []) {
31
32
  askUserQuestion: config.askUserQuestion,
32
33
  };
33
34
  const estimateTokens = makeTokenEstimator(config.provider);
35
+ const contextManager = new ContextManager(undefined, config.model);
34
36
  // Check provider capabilities
35
37
  const modelInfo = config.provider.getModelInfo?.(config.model ?? '');
36
38
  const toolsSupported = !modelInfo || modelInfo.supportsTools;
@@ -193,7 +195,9 @@ export async function* query(userMessage, config, existingMessages = []) {
193
195
  }
194
196
  for (const { toolCall: tc, result } of completedResults) {
195
197
  yield { type: "tool_call_end", callId: tc.id, output: result.output, isError: result.isError };
196
- state.messages.push(createToolResultMessage({ callId: tc.id, output: result.output, isError: result.isError }));
198
+ // Apply context budget to tool output
199
+ const budgetedOutput = contextManager.enforceToolBudget(tc.toolName, result.output);
200
+ state.messages.push(createToolResultMessage({ callId: tc.id, output: budgetedOutput, isError: result.isError }));
197
201
  }
198
202
  // Execute remaining tools not started during streaming
199
203
  const remaining = toolCalls.filter(tc => !executedIds.has(tc.id));
@@ -85,6 +85,13 @@ export async function executeSingleTool(toolCall, tools, context, permissionMode
85
85
  toolArgs: JSON.stringify(toolCall.arguments).slice(0, 1000),
86
86
  toolOutput: result.output.slice(0, 1000),
87
87
  });
88
+ // Emit fileChanged hook for file-modifying tools
89
+ if (!result.isError && ['Edit', 'Write', 'MultiEdit'].includes(tool.name)) {
90
+ const filePaths = getAffectedFiles(tool.name, parsed.data);
91
+ for (const fp of filePaths) {
92
+ emitHook("fileChanged", { filePath: fp, toolName: tool.name });
93
+ }
94
+ }
88
95
  // Verification loop: auto-run lint/typecheck after file-modifying tools
89
96
  let verificationSuffix = '';
90
97
  if (!result.isError && ['Edit', 'Write', 'MultiEdit'].includes(tool.name)) {
@@ -0,0 +1,75 @@
1
+ /**
2
+ * openHarness Agent SDK — programmatic API for building AI agents.
3
+ *
4
+ * Usage:
5
+ * import { createAgent } from '@zhijiewang/openharness';
6
+ *
7
+ * const agent = createAgent({
8
+ * provider: 'anthropic',
9
+ * model: 'claude-sonnet-4-6',
10
+ * apiKey: process.env.ANTHROPIC_API_KEY,
11
+ * });
12
+ *
13
+ * const result = await agent.run('Fix the failing tests');
14
+ * console.log(result.text);
15
+ */
16
+ import type { StreamEvent } from '../types/events.js';
17
+ import type { PermissionMode } from '../types/permissions.js';
18
+ export type AgentConfig = {
19
+ /** Provider name: 'anthropic', 'openai', 'ollama', 'openrouter', etc. */
20
+ provider: string;
21
+ /** Model identifier */
22
+ model: string;
23
+ /** API key (or use environment variable) */
24
+ apiKey?: string;
25
+ /** Custom base URL */
26
+ baseUrl?: string;
27
+ /** Tools to include: 'all', 'read-only', or array of tool names */
28
+ tools?: 'all' | 'read-only' | string[];
29
+ /** Permission mode (default: 'trust') */
30
+ permissionMode?: PermissionMode;
31
+ /** Custom system prompt */
32
+ systemPrompt?: string;
33
+ /** Max turns per run */
34
+ maxTurns?: number;
35
+ /** Working directory */
36
+ cwd?: string;
37
+ };
38
+ export type AgentResult = {
39
+ /** Final text output */
40
+ text: string;
41
+ /** Tool calls made during execution */
42
+ toolCalls: Array<{
43
+ toolName: string;
44
+ output: string;
45
+ isError: boolean;
46
+ }>;
47
+ /** Total cost in USD */
48
+ cost: number;
49
+ /** Total input tokens */
50
+ inputTokens: number;
51
+ /** Total output tokens */
52
+ outputTokens: number;
53
+ /** Number of turns taken */
54
+ turns: number;
55
+ };
56
+ export declare class Agent {
57
+ private provider;
58
+ private tools;
59
+ private config;
60
+ private initialized;
61
+ constructor(config: AgentConfig);
62
+ /** Initialize provider and tools (lazy, on first use) */
63
+ private init;
64
+ /** Run a single prompt and return the result */
65
+ run(prompt: string): Promise<AgentResult>;
66
+ /** Stream events from a prompt */
67
+ stream(prompt: string): AsyncGenerator<StreamEvent, void>;
68
+ /** Stop the agent (cleanup) */
69
+ stop(): void;
70
+ }
71
+ /** Create a new agent instance */
72
+ export declare function createAgent(config: AgentConfig): Agent;
73
+ export type { StreamEvent } from '../types/events.js';
74
+ export type { PermissionMode } from '../types/permissions.js';
75
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1,135 @@
1
+ /**
2
+ * openHarness Agent SDK — programmatic API for building AI agents.
3
+ *
4
+ * Usage:
5
+ * import { createAgent } from '@zhijiewang/openharness';
6
+ *
7
+ * const agent = createAgent({
8
+ * provider: 'anthropic',
9
+ * model: 'claude-sonnet-4-6',
10
+ * apiKey: process.env.ANTHROPIC_API_KEY,
11
+ * });
12
+ *
13
+ * const result = await agent.run('Fix the failing tests');
14
+ * console.log(result.text);
15
+ */
16
+ // ── Agent Class ──
17
+ export class Agent {
18
+ provider = null;
19
+ tools = null;
20
+ config;
21
+ initialized = false;
22
+ constructor(config) {
23
+ this.config = {
24
+ permissionMode: 'trust',
25
+ maxTurns: 20,
26
+ ...config,
27
+ };
28
+ }
29
+ /** Initialize provider and tools (lazy, on first use) */
30
+ async init() {
31
+ if (this.initialized)
32
+ return;
33
+ const { createProvider } = await import('../providers/index.js');
34
+ const { getAllTools } = await import('../tools.js');
35
+ const overrides = {};
36
+ if (this.config.apiKey)
37
+ overrides.apiKey = this.config.apiKey;
38
+ if (this.config.baseUrl)
39
+ overrides.baseUrl = this.config.baseUrl;
40
+ const { provider } = await createProvider(this.config.model, Object.keys(overrides).length > 0 ? overrides : undefined);
41
+ this.provider = provider;
42
+ // Filter tools
43
+ let tools = getAllTools();
44
+ if (this.config.tools === 'read-only') {
45
+ const readOnlyNames = new Set(['Read', 'Glob', 'Grep', 'LS', 'ImageRead', 'WebSearch', 'WebFetch']);
46
+ tools = tools.filter(t => readOnlyNames.has(t.name));
47
+ }
48
+ else if (Array.isArray(this.config.tools)) {
49
+ const allowed = new Set(this.config.tools.map(n => n.toLowerCase()));
50
+ tools = tools.filter(t => allowed.has(t.name.toLowerCase()));
51
+ }
52
+ this.tools = tools;
53
+ this.initialized = true;
54
+ }
55
+ /** Run a single prompt and return the result */
56
+ async run(prompt) {
57
+ await this.init();
58
+ const { query } = await import('../query.js');
59
+ if (this.config.cwd) {
60
+ try {
61
+ process.chdir(this.config.cwd);
62
+ }
63
+ catch { /* ignore */ }
64
+ }
65
+ const config = {
66
+ provider: this.provider,
67
+ tools: this.tools,
68
+ systemPrompt: this.config.systemPrompt ?? 'You are a helpful coding agent.',
69
+ permissionMode: this.config.permissionMode,
70
+ model: this.config.model,
71
+ maxTurns: this.config.maxTurns,
72
+ };
73
+ let text = '';
74
+ const toolCalls = [];
75
+ let cost = 0;
76
+ let inputTokens = 0;
77
+ let outputTokens = 0;
78
+ let turns = 0;
79
+ for await (const event of query(prompt, config)) {
80
+ switch (event.type) {
81
+ case 'text_delta':
82
+ text += event.content;
83
+ break;
84
+ case 'tool_call_end':
85
+ toolCalls.push({
86
+ toolName: event.toolName ?? 'unknown',
87
+ output: event.output ?? '',
88
+ isError: event.isError ?? false,
89
+ });
90
+ break;
91
+ case 'cost_update':
92
+ cost += event.cost ?? 0;
93
+ inputTokens += event.inputTokens ?? 0;
94
+ outputTokens += event.outputTokens ?? 0;
95
+ break;
96
+ case 'turn_complete':
97
+ turns++;
98
+ break;
99
+ }
100
+ }
101
+ return { text, toolCalls, cost, inputTokens, outputTokens, turns };
102
+ }
103
+ /** Stream events from a prompt */
104
+ async *stream(prompt) {
105
+ await this.init();
106
+ const { query } = await import('../query.js');
107
+ if (this.config.cwd) {
108
+ try {
109
+ process.chdir(this.config.cwd);
110
+ }
111
+ catch { /* ignore */ }
112
+ }
113
+ const config = {
114
+ provider: this.provider,
115
+ tools: this.tools,
116
+ systemPrompt: this.config.systemPrompt ?? 'You are a helpful coding agent.',
117
+ permissionMode: this.config.permissionMode,
118
+ model: this.config.model,
119
+ maxTurns: this.config.maxTurns,
120
+ };
121
+ yield* query(prompt, config);
122
+ }
123
+ /** Stop the agent (cleanup) */
124
+ stop() {
125
+ this.provider = null;
126
+ this.tools = null;
127
+ this.initialized = false;
128
+ }
129
+ }
130
+ // ── Factory ──
131
+ /** Create a new agent instance */
132
+ export function createAgent(config) {
133
+ return new Agent(config);
134
+ }
135
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1,61 @@
1
+ /**
2
+ * GAN-Style Evaluator Loop — Generator→Evaluator adversarial refinement.
3
+ *
4
+ * Inspired by Anthropic's three-agent harness architecture:
5
+ * "AI models are inherently poor at self-critique; they tend to rate
6
+ * their own work favorably." Externalizing critique to a separate
7
+ * Evaluator agent produces measurably better output.
8
+ *
9
+ * Flow:
10
+ * 1. Generator produces initial output
11
+ * 2. Evaluator scores against rubric criteria
12
+ * 3. If below threshold, Generator refines based on feedback
13
+ * 4. Repeat until pass or max iterations reached
14
+ */
15
+ import type { Provider } from '../providers/base.js';
16
+ import type { Tools } from '../Tool.js';
17
+ import type { PermissionMode } from '../types/permissions.js';
18
+ export type EvaluationCriterion = {
19
+ name: string;
20
+ weight: number;
21
+ description: string;
22
+ };
23
+ export type EvaluationRubric = {
24
+ criteria: EvaluationCriterion[];
25
+ passThreshold: number;
26
+ };
27
+ export type EvaluationScore = {
28
+ criterion: string;
29
+ score: number;
30
+ feedback: string;
31
+ };
32
+ export type EvaluatorResult = {
33
+ output: string;
34
+ scores: EvaluationScore[];
35
+ weightedScore: number;
36
+ passed: boolean;
37
+ iterations: number;
38
+ refinements: string[];
39
+ };
40
+ export declare const DEFAULT_RUBRIC: EvaluationRubric;
41
+ export declare class EvaluatorLoop {
42
+ private provider;
43
+ private tools;
44
+ private systemPrompt;
45
+ private permissionMode;
46
+ private model?;
47
+ private rubric;
48
+ private maxIterations;
49
+ constructor(provider: Provider, tools: Tools, systemPrompt: string, permissionMode: PermissionMode, model?: string | undefined, rubric?: EvaluationRubric, maxIterations?: number);
50
+ /**
51
+ * Run the full Generator→Evaluator→Refine cycle.
52
+ */
53
+ run(task: string): Promise<EvaluatorResult>;
54
+ private generate;
55
+ private evaluate;
56
+ private calculateWeightedScore;
57
+ private defaultScores;
58
+ }
59
+ /** Format evaluator results for display */
60
+ export declare function formatEvaluatorResult(result: EvaluatorResult): string;
61
+ //# sourceMappingURL=EvaluatorLoop.d.ts.map
@@ -0,0 +1,157 @@
1
+ /**
2
+ * GAN-Style Evaluator Loop — Generator→Evaluator adversarial refinement.
3
+ *
4
+ * Inspired by Anthropic's three-agent harness architecture:
5
+ * "AI models are inherently poor at self-critique; they tend to rate
6
+ * their own work favorably." Externalizing critique to a separate
7
+ * Evaluator agent produces measurably better output.
8
+ *
9
+ * Flow:
10
+ * 1. Generator produces initial output
11
+ * 2. Evaluator scores against rubric criteria
12
+ * 3. If below threshold, Generator refines based on feedback
13
+ * 4. Repeat until pass or max iterations reached
14
+ */
15
+ // ── Default Rubric ──
16
+ export const DEFAULT_RUBRIC = {
17
+ criteria: [
18
+ { name: 'correctness', weight: 0.4, description: 'Does the output correctly address the task? Are there logical errors?' },
19
+ { name: 'completeness', weight: 0.3, description: 'Is the solution complete? Any missing edge cases or requirements?' },
20
+ { name: 'quality', weight: 0.2, description: 'Is the code clean, well-structured, and following best practices?' },
21
+ { name: 'safety', weight: 0.1, description: 'Are there security issues, unsafe patterns, or potential bugs?' },
22
+ ],
23
+ passThreshold: 0.7,
24
+ };
25
+ // ── Evaluator Loop ──
26
+ export class EvaluatorLoop {
27
+ provider;
28
+ tools;
29
+ systemPrompt;
30
+ permissionMode;
31
+ model;
32
+ rubric;
33
+ maxIterations;
34
+ constructor(provider, tools, systemPrompt, permissionMode, model, rubric = DEFAULT_RUBRIC, maxIterations = 3) {
35
+ this.provider = provider;
36
+ this.tools = tools;
37
+ this.systemPrompt = systemPrompt;
38
+ this.permissionMode = permissionMode;
39
+ this.model = model;
40
+ this.rubric = rubric;
41
+ this.maxIterations = maxIterations;
42
+ }
43
+ /**
44
+ * Run the full Generator→Evaluator→Refine cycle.
45
+ */
46
+ async run(task) {
47
+ const refinements = [];
48
+ let currentOutput = '';
49
+ let scores = [];
50
+ let weightedScore = 0;
51
+ for (let iteration = 1; iteration <= this.maxIterations; iteration++) {
52
+ // ── Generate ──
53
+ const generatorPrompt = iteration === 1
54
+ ? task
55
+ : `${task}\n\n[Evaluator feedback from iteration ${iteration - 1}]:\n${scores.map(s => `${s.criterion}: ${s.score}/1.0 — ${s.feedback}`).join('\n')}\n\nPlease refine your output based on this feedback.`;
56
+ currentOutput = await this.generate(generatorPrompt);
57
+ // ── Evaluate ──
58
+ scores = await this.evaluate(task, currentOutput);
59
+ weightedScore = this.calculateWeightedScore(scores);
60
+ if (weightedScore >= this.rubric.passThreshold) {
61
+ return {
62
+ output: currentOutput,
63
+ scores,
64
+ weightedScore,
65
+ passed: true,
66
+ iterations: iteration,
67
+ refinements,
68
+ };
69
+ }
70
+ refinements.push(`Iteration ${iteration}: score ${weightedScore.toFixed(2)} — refining`);
71
+ }
72
+ // Max iterations reached — return best effort
73
+ return {
74
+ output: currentOutput,
75
+ scores,
76
+ weightedScore,
77
+ passed: false,
78
+ iterations: this.maxIterations,
79
+ refinements,
80
+ };
81
+ }
82
+ async generate(prompt) {
83
+ const { query } = await import('../query.js');
84
+ const config = {
85
+ provider: this.provider,
86
+ tools: this.tools,
87
+ systemPrompt: this.systemPrompt,
88
+ permissionMode: this.permissionMode,
89
+ model: this.model,
90
+ maxTurns: 15,
91
+ };
92
+ let output = '';
93
+ for await (const event of query(prompt, config)) {
94
+ if (event.type === 'text_delta')
95
+ output += event.content;
96
+ }
97
+ return output;
98
+ }
99
+ async evaluate(task, output) {
100
+ const evaluationPrompt = `You are a code evaluator. Score the following output on a 0-1 scale for each criterion.
101
+
102
+ Task: ${task.slice(0, 500)}
103
+
104
+ Output to evaluate:
105
+ ${output.slice(0, 3000)}
106
+
107
+ Criteria:
108
+ ${this.rubric.criteria.map(c => `- ${c.name} (weight: ${c.weight}): ${c.description}`).join('\n')}
109
+
110
+ Respond ONLY with a JSON array: [{"criterion": "name", "score": 0.8, "feedback": "brief explanation"}, ...]`;
111
+ const response = await this.provider.complete([{ role: 'user', content: evaluationPrompt, uuid: `eval-${Date.now()}`, timestamp: Date.now() }], 'You are a strict code evaluator. Respond ONLY with valid JSON. Be critical and specific.', undefined, this.model);
112
+ try {
113
+ const jsonMatch = response.content.match(/\[[\s\S]*\]/);
114
+ if (!jsonMatch)
115
+ return this.defaultScores();
116
+ const parsed = JSON.parse(jsonMatch[0]);
117
+ return parsed.filter(s => s.criterion && typeof s.score === 'number');
118
+ }
119
+ catch {
120
+ return this.defaultScores();
121
+ }
122
+ }
123
+ calculateWeightedScore(scores) {
124
+ let total = 0;
125
+ for (const criterion of this.rubric.criteria) {
126
+ const score = scores.find(s => s.criterion === criterion.name);
127
+ total += (score?.score ?? 0.5) * criterion.weight;
128
+ }
129
+ return total;
130
+ }
131
+ defaultScores() {
132
+ return this.rubric.criteria.map(c => ({
133
+ criterion: c.name,
134
+ score: 0.5,
135
+ feedback: 'Could not evaluate (parsing error)',
136
+ }));
137
+ }
138
+ }
139
+ /** Format evaluator results for display */
140
+ export function formatEvaluatorResult(result) {
141
+ const lines = [];
142
+ lines.push(`Evaluator: ${result.passed ? 'PASSED' : 'NEEDS IMPROVEMENT'} (${result.weightedScore.toFixed(2)}/${1.0})`);
143
+ lines.push(`Iterations: ${result.iterations}`);
144
+ lines.push('');
145
+ for (const s of result.scores) {
146
+ const bar = '█'.repeat(Math.round(s.score * 10)) + '░'.repeat(10 - Math.round(s.score * 10));
147
+ lines.push(` ${s.criterion.padEnd(15)} ${bar} ${s.score.toFixed(1)} — ${s.feedback}`);
148
+ }
149
+ if (result.refinements.length > 0) {
150
+ lines.push('');
151
+ lines.push('Refinements:');
152
+ for (const r of result.refinements)
153
+ lines.push(` ${r}`);
154
+ }
155
+ return lines.join('\n');
156
+ }
157
+ //# sourceMappingURL=EvaluatorLoop.js.map