jiva-core 0.3.42-dev.c44c725 → 0.3.43-dev.00a530a
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/code/agent.d.ts +2 -0
- package/dist/code/agent.d.ts.map +1 -1
- package/dist/code/agent.js +11 -4
- package/dist/code/agent.js.map +1 -1
- package/dist/core/agent-interface.d.ts +5 -0
- package/dist/core/agent-interface.d.ts.map +1 -1
- package/dist/core/agent-spawner.d.ts.map +1 -1
- package/dist/core/agent-spawner.js +2 -1
- package/dist/core/agent-spawner.js.map +1 -1
- package/dist/core/conversation-manager.d.ts +17 -3
- package/dist/core/conversation-manager.d.ts.map +1 -1
- package/dist/core/conversation-manager.js +21 -8
- package/dist/core/conversation-manager.js.map +1 -1
- package/dist/core/dual-agent.d.ts +16 -0
- package/dist/core/dual-agent.d.ts.map +1 -1
- package/dist/core/dual-agent.js +35 -7
- package/dist/core/dual-agent.js.map +1 -1
- package/dist/core/manager-agent.d.ts.map +1 -1
- package/dist/core/manager-agent.js +19 -8
- package/dist/core/manager-agent.js.map +1 -1
- package/dist/core/worker-agent.d.ts.map +1 -1
- package/dist/core/worker-agent.js +59 -9
- package/dist/core/worker-agent.js.map +1 -1
- package/dist/evaluator/directive-adapter.d.ts +15 -0
- package/dist/evaluator/directive-adapter.d.ts.map +1 -0
- package/dist/evaluator/directive-adapter.js +75 -0
- package/dist/evaluator/directive-adapter.js.map +1 -0
- package/dist/evaluator/evaluator-agent.d.ts +35 -0
- package/dist/evaluator/evaluator-agent.d.ts.map +1 -0
- package/dist/evaluator/evaluator-agent.js +369 -0
- package/dist/evaluator/evaluator-agent.js.map +1 -0
- package/dist/evaluator/harness.d.ts +31 -0
- package/dist/evaluator/harness.d.ts.map +1 -0
- package/dist/evaluator/harness.js +70 -0
- package/dist/evaluator/harness.js.map +1 -0
- package/dist/evaluator/index.d.ts +47 -0
- package/dist/evaluator/index.d.ts.map +1 -0
- package/dist/evaluator/index.js +79 -0
- package/dist/evaluator/index.js.map +1 -0
- package/dist/evaluator/tools/agent-tools.d.ts +23 -0
- package/dist/evaluator/tools/agent-tools.d.ts.map +1 -0
- package/dist/evaluator/tools/agent-tools.js +111 -0
- package/dist/evaluator/tools/agent-tools.js.map +1 -0
- package/dist/evaluator/types.d.ts +82 -0
- package/dist/evaluator/types.d.ts.map +1 -0
- package/dist/evaluator/types.js +9 -0
- package/dist/evaluator/types.js.map +1 -0
- package/dist/interfaces/cli/index.js +36 -13
- package/dist/interfaces/cli/index.js.map +1 -1
- package/dist/interfaces/cli/repl.d.ts +6 -0
- package/dist/interfaces/cli/repl.d.ts.map +1 -1
- package/dist/interfaces/cli/repl.js +153 -28
- package/dist/interfaces/cli/repl.js.map +1 -1
- package/dist/interfaces/http/middleware/auth.d.ts.map +1 -1
- package/dist/interfaces/http/middleware/auth.js +2 -1
- package/dist/interfaces/http/middleware/auth.js.map +1 -1
- package/dist/interfaces/http/routes/chat.d.ts.map +1 -1
- package/dist/interfaces/http/routes/chat.js +78 -0
- package/dist/interfaces/http/routes/chat.js.map +1 -1
- package/dist/interfaces/http/routes/session.d.ts.map +1 -1
- package/dist/interfaces/http/routes/session.js +22 -0
- package/dist/interfaces/http/routes/session.js.map +1 -1
- package/dist/interfaces/http/session-manager.d.ts.map +1 -1
- package/dist/interfaces/http/session-manager.js +4 -0
- package/dist/interfaces/http/session-manager.js.map +1 -1
- package/dist/mcp/client.d.ts.map +1 -1
- package/dist/mcp/client.js +9 -1
- package/dist/mcp/client.js.map +1 -1
- package/dist/models/harmony.d.ts.map +1 -1
- package/dist/models/harmony.js +31 -1
- package/dist/models/harmony.js.map +1 -1
- package/dist/models/model-client.d.ts.map +1 -1
- package/dist/models/model-client.js +5 -3
- package/dist/models/model-client.js.map +1 -1
- package/dist/models/orchestrator.d.ts +6 -0
- package/dist/models/orchestrator.d.ts.map +1 -1
- package/dist/models/orchestrator.js +20 -6
- package/dist/models/orchestrator.js.map +1 -1
- package/dist/models/token-tracker.d.ts +72 -0
- package/dist/models/token-tracker.d.ts.map +1 -0
- package/dist/models/token-tracker.js +110 -0
- package/dist/models/token-tracker.js.map +1 -0
- package/dist/storage/types.d.ts +6 -0
- package/dist/storage/types.d.ts.map +1 -1
- package/dist/storage/types.js.map +1 -1
- package/package.json +2 -1
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* EvaluatorAgent — autonomous supervisor for Jiva tasks.
|
|
3
|
+
*
|
|
4
|
+
* Architecture mirrors CodeAgent's single tool-calling loop, but instead of
|
|
5
|
+
* writing code the evaluator reads workspace files, validates completion against
|
|
6
|
+
* the directive, and uses interact_with_agent to nudge the main agent when gaps
|
|
7
|
+
* are found.
|
|
8
|
+
*
|
|
9
|
+
* The evaluator has completely isolated LLM state — its own ModelOrchestrator,
|
|
10
|
+
* its own MCPServerManager, and its own message history. It never shares context
|
|
11
|
+
* with the agent it supervises.
|
|
12
|
+
*/
|
|
13
|
+
import type { EvaluatorConfig, EvaluationContext, EvaluationResult } from './types.js';
|
|
14
|
+
/**
 * Type declaration for the evaluator that supervises another Jiva agent.
 *
 * All state is private; the public surface is the constructor,
 * {@link EvaluatorAgent.evaluate} and
 * {@link EvaluatorAgent.getOrchestratorTokenUsage}.
 */
export declare class EvaluatorAgent {
    /** Orchestrator used for the evaluator's own model calls. */
    private orchestrator;
    /** MCP manager whose client supplies and executes the evaluator's MCP tools. */
    private mcpManager;
    /** Workspace that provides the directive prompt and workspace directory. */
    private workspace;
    /** The agent under evaluation. */
    private targetAgent;
    /** Upper bound on model round-trips inside one evaluation cycle. */
    private maxIterations;
    /** Upper bound on evaluation cycles per `evaluate` call. */
    private maxCycles;
    /** Evaluator-only tools that are checked before falling through to MCP tools. */
    private virtualTools;
    constructor(config: EvaluatorConfig);
    /**
     * Token usage accumulated by the evaluator's isolated orchestrator.
     *
     * @returns A snapshot of the orchestrator's token counters.
     */
    getOrchestratorTokenUsage(): import("../models/token-tracker.js").TokenUsageSnapshot;
    /**
     * Judge whether the main agent has achieved the goal in `ctx.userMessage`.
     *
     * Runs the evaluator's own agentic loop: read files, assess completion,
     * nudge the main agent through `interact_with_agent`, then re-check until
     * the evaluation passes or the cycle limit is reached.
     *
     * @param ctx - Evaluation input (goal text and optional target conversation).
     * @returns The structured evaluation outcome.
     */
    evaluate(ctx: EvaluationContext): Promise<EvaluationResult>;
    /** Runs one evaluation cycle with a fresh message history. */
    private runEvaluationCycle;
}
|
|
35
|
+
//# sourceMappingURL=evaluator-agent.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"evaluator-agent.d.ts","sourceRoot":"","sources":["../../src/evaluator/evaluator-agent.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAUH,OAAO,KAAK,EACV,eAAe,EACf,iBAAiB,EACjB,gBAAgB,EAEjB,MAAM,YAAY,CAAC;AAiHpB,qBAAa,cAAc;IACzB,OAAO,CAAC,YAAY,CAAoB;IACxC,OAAO,CAAC,UAAU,CAAmB;IACrC,OAAO,CAAC,SAAS,CAAmB;IACpC,OAAO,CAAC,WAAW,CAA8C;IACjE,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,SAAS,CAAS;IAC1B,OAAO,CAAC,YAAY,CAAmB;gBAE3B,MAAM,EAAE,eAAe;IAUnC,+EAA+E;IAC/E,yBAAyB;IAIzB;;;;;;OAMG;IACG,QAAQ,CAAC,GAAG,EAAE,iBAAiB,GAAG,OAAO,CAAC,gBAAgB,CAAC;YAuFnD,kBAAkB;CA0MjC"}
|
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* EvaluatorAgent — autonomous supervisor for Jiva tasks.
|
|
3
|
+
*
|
|
4
|
+
* Architecture mirrors CodeAgent's single tool-calling loop, but instead of
|
|
5
|
+
* writing code the evaluator reads workspace files, validates completion against
|
|
6
|
+
* the directive, and uses interact_with_agent to nudge the main agent when gaps
|
|
7
|
+
* are found.
|
|
8
|
+
*
|
|
9
|
+
* The evaluator has completely isolated LLM state — its own ModelOrchestrator,
|
|
10
|
+
* its own MCPServerManager, and its own message history. It never shares context
|
|
11
|
+
* with the agent it supervises.
|
|
12
|
+
*/
|
|
13
|
+
import { formatToolResult } from '../models/harmony.js';
|
|
14
|
+
import { logger } from '../utils/logger.js';
|
|
15
|
+
import { deriveEvaluatorDirective } from './directive-adapter.js';
|
|
16
|
+
import { EVALUATOR_VIRTUAL_TOOLS } from './tools/agent-tools.js';
|
|
17
|
+
// ─── Constants ────────────────────────────────────────────────────────────────
/** Default cap on model round-trips within a single evaluation cycle. */
const DEFAULT_MAX_ITERATIONS = 30;
/** Default cap on evaluation cycles per evaluate() call. */
const DEFAULT_MAX_CYCLES = 5;
/** Number of consecutive identical tool calls that counts as a doom loop. */
const DOOM_LOOP_THRESHOLD = 3;
// ─── Result parsing ───────────────────────────────────────────────────────────
/** Strip Harmony/XML tool-call markup so it doesn't pollute the summary text. */
function stripToolCallMarkup(text) {
    return text
        .replace(/<tool_call>[\s\S]*?<\/tool_call>/gi, '')
        .replace(/<\|call\|>[\s\S]*?<\|return\|>/g, '')
        .replace(/<\|channel\|>\w+/g, '')
        .trim();
}
/**
 * Extract a structured EvaluationResult from the evaluator's final LLM response.
 * Looks for a JSON code block; falls back to a best-effort keyword parse.
 *
 * @param text - Raw final text produced by the evaluator.
 * @param nudgesSent - Running count of nudges, copied into the result.
 * @param cyclesRan - Number of cycles run so far, copied into the result.
 * @param evidence - Evidence paths, copied into the result.
 * @returns An object with `passed`, `gaps`, `nudgesSent`, `cyclesRan`,
 *   `evidence` and `summary`.
 */
function parseEvaluationResult(text, nudgesSent, cyclesRan, evidence) {
    // Strip any tool-call markup before attempting to parse
    const stripped = stripToolCallMarkup(text);
    try {
        // Look for ```json ... ``` block (prefer stripped, fall back to raw)
        const jsonMatch = stripped.match(/```json\s*([\s\S]+?)\s*```/i) ||
            text.match(/```json\s*([\s\S]+?)\s*```/i);
        const raw = jsonMatch ? jsonMatch[1] : stripped;
        const parsed = JSON.parse(raw.trim());
        // A bare JSON scalar/array/null is not a verdict object; route it to
        // the free-form fallback instead of reading properties off it.
        if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
            throw new SyntaxError('Evaluation verdict is not a JSON object');
        }
        // Only a real `true` (or the string "true") is a pass. Plain truthiness
        // would turn `"passed": "false"` into a pass.
        const verdict = parsed.passed;
        const passed = verdict === true ||
            (typeof verdict === 'string' && verdict.trim().toLowerCase() === 'true');
        // Keep every gap, but make sure callers always receive strings.
        const gaps = Array.isArray(parsed.gaps)
            ? parsed.gaps.map((gap) => (typeof gap === 'string' ? gap : JSON.stringify(gap)))
            : [];
        return {
            passed,
            gaps,
            nudgesSent,
            cyclesRan,
            evidence,
            summary: typeof parsed.summary === 'string' ? parsed.summary : stripped.substring(0, 300),
        };
    }
    catch {
        // Free-form response — determine pass/fail from keywords in the stripped text
        const lower = stripped.toLowerCase();
        const passed = lower.includes('evaluation passed') ||
            lower.includes('all tasks complete') ||
            lower.includes('work is complete') ||
            lower.includes('"passed": true');
        // If the stripped text is empty (was entirely tool calls) or too short to be
        // a real verdict, treat this as inconclusive rather than a definitive failure.
        const inconclusive = stripped.length < 20;
        return {
            passed: inconclusive ? false : passed,
            gaps: inconclusive
                ? ['Evaluation inconclusive — evaluator did not produce a final verdict']
                : [],
            nudgesSent,
            cyclesRan,
            evidence,
            summary: inconclusive
                ? 'The evaluator did not produce a parseable verdict. Review the agent logs for details.'
                : stripped.substring(0, 300),
        };
    }
}
|
|
76
|
+
// ─── System prompt ────────────────────────────────────────────────────────────
|
|
77
|
+
/**
 * Assemble the developer/system prompt for the evaluator.
 *
 * The prompt states the evaluator's role, the workspace directory, the
 * evaluation approach, which tool to use for what, the required final JSON
 * format, and ends with the evaluator directive.
 *
 * @param workspaceDir - Directory interpolated after "WORKSPACE: ".
 * @param evaluatorDirective - Directive text appended as the last line(s).
 * @returns The complete prompt text.
 */
function buildSystemPrompt(workspaceDir, evaluatorDirective) {
    const promptLines = [
        'You are an autonomous evaluation agent operating in evaluation mode.',
        'You DO NOT perform tasks — you VALIDATE whether tasks have been completed.',
        '',
        `WORKSPACE: ${workspaceDir}`,
        'All relative paths are resolved relative to the workspace directory above.',
        'Use absolute paths for all file operations.',
        '',
        'YOUR EVALUATION APPROACH:',
        '1. Read workspace files to understand what has been produced.',
        '2. Compare the actual output against the requirements in your directive.',
        '3. If gaps exist, send targeted instructions to the main agent using interact_with_agent.',
        '4. Re-read files after each nudge to confirm corrections were applied.',
        '5. When you are satisfied (or exhausted your options), output your final JSON assessment.',
        '',
        'TOOL SELECTION:',
        '- To READ files → use the filesystem MCP tools (filesystem__read_file, filesystem__search_files, etc.)',
        '- To SEND instructions to the main agent → use interact_with_agent',
        '- To LIST available conversations → use list_agent_conversations',
        '- To INSPECT conversation history → use get_conversation_history',
        '',
        'FINAL OUTPUT — when evaluation is complete, respond with ONLY this JSON (no other text):',
        '```json',
        '{',
        '"passed": true | false,',
        '"gaps": ["specific gap 1", "specific gap 2"],',
        '"summary": "1-3 sentence assessment."',
        '}',
        '```',
        '',
        `${evaluatorDirective}`,
    ];
    return promptLines.join('\n');
}
|
|
109
|
+
// ─── EvaluatorAgent ───────────────────────────────────────────────────────────
|
|
110
|
+
/**
 * Autonomous supervisor that checks whether another agent finished its task.
 *
 * The evaluator drives its own tool-calling loop: it reads workspace files via
 * MCP tools, uses virtual tools to interact with the target agent, and ends
 * each cycle with a JSON verdict that is parsed into an evaluation result.
 */
export class EvaluatorAgent {
    /** Orchestrator used for every evaluator model call and for token accounting. */
    orchestrator;
    /** MCP manager; its client lists and executes the non-virtual tools. */
    mcpManager;
    /** Workspace providing the directive prompt and the workspace directory. */
    workspace;
    /** Agent under evaluation; handed to virtual tools through the tool context. */
    targetAgent;
    /** Maximum model round-trips per evaluation cycle. */
    maxIterations;
    /** Maximum evaluation cycles per evaluate() call. */
    maxCycles;
    /** Evaluator-only tools, looked up by name before falling through to MCP. */
    virtualTools;
    /**
     * @param config - Supplies orchestrator, mcpManager, workspace, targetAgent
     *   and the optional limits `maxIterationsPerCycle` / `maxEvaluationCycles`.
     */
    constructor(config) {
        this.orchestrator = config.orchestrator;
        this.mcpManager = config.mcpManager;
        this.workspace = config.workspace;
        this.targetAgent = config.targetAgent;
        this.maxIterations = config.maxIterationsPerCycle ?? DEFAULT_MAX_ITERATIONS;
        this.maxCycles = config.maxEvaluationCycles ?? DEFAULT_MAX_CYCLES;
        this.virtualTools = EVALUATOR_VIRTUAL_TOOLS;
    }
    /** Return token usage accumulated by the evaluator's isolated orchestrator. */
    getOrchestratorTokenUsage() {
        return this.orchestrator.getTokenUsage();
    }
    /**
     * Evaluate whether the main agent has completed the goal described in userMessage.
     *
     * The evaluator runs its own agentic loop: it reads files, assesses completion,
     * sends nudges to the main agent via interact_with_agent, and re-checks until
     * satisfied or the cycle limit is reached.
     *
     * @param ctx - Evaluation context; `userMessage` is the goal and
     *   `targetConversationId`, when set, is loaded into the target agent first.
     * @returns The result of the last cycle that ran, or a failure result when
     *   no cycle ran at all.
     */
    async evaluate(ctx) {
        logger.info('[Evaluator] Starting evaluation');
        // Optionally load a specific conversation into the target agent
        if (ctx.targetConversationId) {
            logger.info(`[Evaluator] Loading conversation: ${ctx.targetConversationId}`);
            await this.targetAgent.loadConversation(ctx.targetConversationId);
        }
        let nudgesSent = 0;
        let cyclesRan = 0;
        const evidence = [];
        // The directive for the evaluator — wraps the workspace directive with evaluation framing
        const rawDirective = this.workspace.getDirectivePrompt() ?? '';
        const evaluatorDirective = deriveEvaluatorDirective(rawDirective);
        const workspaceDir = this.workspace.getWorkspaceDir();
        const systemPrompt = buildSystemPrompt(workspaceDir, evaluatorDirective);
        // Build the unified tool list: MCP tools + virtual tools
        const mcpToolDefs = this.mcpManager
            .getClient()
            .getAllTools()
            .map((t) => ({
            name: t.name,
            description: t.description,
            parameters: t.inputSchema ?? { type: 'object', properties: {} },
        }));
        const virtualToolDefs = this.virtualTools.map((t) => ({
            name: t.name,
            description: t.description,
            parameters: t.parameters,
        }));
        const allToolDefs = [...mcpToolDefs, ...virtualToolDefs];
        // Evaluation loop — each cycle runs the full agentic loop once.
        // The evaluator terminates early if it produces a JSON result with "passed": true.
        let lastResult = null;
        for (let cycle = 0; cycle < this.maxCycles; cycle++) {
            cyclesRan = cycle + 1;
            logger.info(`[Evaluator] Cycle ${cyclesRan}/${this.maxCycles}`);
            const cycleResult = await this.runEvaluationCycle({
                systemPrompt,
                userMessage: ctx.userMessage,
                allToolDefs,
                workspaceDir,
                // Seeded with the running total so the cycle returns an updated total.
                nudgesSentRef: { value: nudgesSent },
                evidence,
            });
            nudgesSent = cycleResult.nudgesSentAfter;
            lastResult = parseEvaluationResult(cycleResult.finalText, nudgesSent, cyclesRan, [...evidence]);
            logger.info(`[Evaluator] Cycle ${cyclesRan} complete — passed: ${lastResult.passed}, gaps: ${lastResult.gaps.length}`);
            if (lastResult.passed) {
                break;
            }
        }
        // Only reachable when the loop body never ran (maxCycles <= 0).
        if (!lastResult) {
            return {
                passed: false,
                gaps: ['Evaluation could not be completed — max cycles reached without a result'],
                nudgesSent,
                cyclesRan,
                evidence,
                summary: 'Evaluation did not produce a result within the allowed cycle budget.',
            };
        }
        return lastResult;
    }
    // ─── Private: single cycle ──────────────────────────────────────────────────
    /**
     * Run one evaluation cycle with a fresh message history.
     *
     * @param params - `systemPrompt`, `userMessage`, `allToolDefs`,
     *   `workspaceDir`, `nudgesSentRef` (mutable counter) and `evidence`
     *   (shared array that collected paths are appended to).
     * @returns `finalText` (the evaluator's last response or a bracketed
     *   sentinel) and `nudgesSentAfter` (the counter value after the cycle).
     */
    async runEvaluationCycle(params) {
        const { systemPrompt, userMessage, allToolDefs, workspaceDir, nudgesSentRef, evidence } = params;
        // Fresh isolated message history for each cycle
        const messages = [
            { role: 'developer', content: systemPrompt },
            {
                role: 'user',
                content: `Evaluate whether the following goal has been fully and correctly completed:\n\n` +
                    `GOAL: ${userMessage}\n\n` +
                    `Start by reading the relevant workspace files, then assess completion against the ` +
                    `original directive. If gaps exist, use interact_with_agent to guide the main agent. ` +
                    `When done, output your JSON assessment.`,
            },
        ];
        const toolContext = {
            workspaceDir,
            targetAgent: this.targetAgent,
            onNudgeSent: () => {
                nudgesSentRef.value++;
            },
            onEvidenceFound: (path) => {
                if (!evidence.includes(path))
                    evidence.push(path);
            },
        };
        // Doom loop detection
        const recentCalls = [];
        let consecutiveApiErrors = 0;
        const MAX_API_ERRORS = 3;
        let finalText = '';
        let iterations = 0;
        // Explicit flags so each notice is sent once. Scanning message text is
        // unreliable: the near-limit notice never contains "FINAL PHASE", and
        // unrelated content (directive, file contents) might.
        let nearLimitNoticeSent = false;
        let finalPhaseNoticeSent = false;
        for (let i = 0; i < this.maxIterations; i++) {
            iterations = i + 1;
            const iterPct = i / this.maxIterations;
            const isFinalPhase = iterPct >= 0.90;
            // Near-limit nudge
            if (iterPct >= 0.80 && !isFinalPhase && i > 0 && !nearLimitNoticeSent) {
                nearLimitNoticeSent = true;
                messages.push({
                    role: 'user',
                    content: `You are approaching the iteration limit (${iterations}/${this.maxIterations}). ` +
                        `Finish any remaining checks and produce your final JSON assessment.`,
                });
            }
            if (isFinalPhase && !finalPhaseNoticeSent) {
                finalPhaseNoticeSent = true;
                messages.push({
                    role: 'user',
                    content: 'FINAL PHASE: Tools are disabled. You MUST respond with ONLY the JSON assessment block now.\n\n' +
                        '```json\n{"passed": true|false, "gaps": [], "summary": "..."}\n```',
                });
            }
            let response;
            try {
                response = await this.orchestrator.chatWithFallback({
                    messages,
                    // No tools in the final phase, so the model can only answer.
                    tools: isFinalPhase ? [] : allToolDefs,
                    temperature: 0.1,
                }, false);
                consecutiveApiErrors = 0;
            }
            catch (error) {
                consecutiveApiErrors++;
                const msg = error instanceof Error ? error.message : String(error);
                logger.error(`[Evaluator] Model error (${consecutiveApiErrors}/${MAX_API_ERRORS}): ${msg}`);
                if (consecutiveApiErrors >= MAX_API_ERRORS) {
                    finalText = '[Evaluation aborted due to repeated model errors]';
                    break;
                }
                // Add placeholder so conversation structure remains valid
                const lastMsg = messages[messages.length - 1];
                if (lastMsg && lastMsg.role !== 'assistant') {
                    messages.push({ role: 'assistant', content: '' });
                }
                messages.push({
                    role: 'user',
                    content: 'The previous request failed. Please continue the evaluation and output your JSON assessment.',
                });
                continue;
            }
            const hasToolCalls = Boolean(response.toolCalls && response.toolCalls.length > 0);
            // Record assistant message
            if (hasToolCalls) {
                messages.push({
                    role: 'assistant',
                    content: response.content || null,
                    tool_calls: response.toolCalls,
                });
            }
            else {
                messages.push({ role: 'assistant', content: response.content });
            }
            // No tool calls → evaluator is done
            if (!hasToolCalls) {
                finalText = response.content || '[No evaluation response]';
                break;
            }
            // Execute tool calls. Every call in the assistant message gets a tool
            // result, even when skipped, so no tool_call is left unanswered.
            let doomLoopTool = null;
            for (const toolCall of response.toolCalls) {
                const toolName = toolCall.function.name;
                if (doomLoopTool !== null) {
                    // The doom-loop guard already fired in this batch: answer, don't run.
                    messages.push(formatToolResult(toolCall.id, toolName, 'Skipped: the evaluator was told to stop repeating tool calls before this call ran.'));
                    continue;
                }
                let toolArgs = {};
                try {
                    const parsedArgs = JSON.parse(toolCall.function.arguments);
                    // Arguments such as `null` would break the property reads below.
                    if (parsedArgs !== null && typeof parsedArgs === 'object') {
                        toolArgs = parsedArgs;
                    }
                }
                catch {
                    // malformed args — continue with empty args
                    logger.warn(`[Evaluator] Malformed arguments for tool: ${toolName}; using empty arguments`);
                }
                // Doom loop guard
                const callSig = `${toolName}:${JSON.stringify(toolArgs)}`;
                recentCalls.push(callSig);
                if (recentCalls.length > DOOM_LOOP_THRESHOLD)
                    recentCalls.shift();
                if (recentCalls.length === DOOM_LOOP_THRESHOLD &&
                    recentCalls.every((c) => c === recentCalls[0])) {
                    logger.warn(`[Evaluator] Doom loop detected for tool: ${toolName}`);
                    doomLoopTool = toolName;
                    messages.push(formatToolResult(toolCall.id, toolName, `Skipped: \`${toolName}\` was called repeatedly with identical arguments.`));
                    continue;
                }
                logger.info(`[Evaluator] Tool: ${toolName}`);
                let toolResult;
                // Check virtual tools first, then fall through to MCP
                const virtualTool = this.virtualTools.find((t) => t.name === toolName);
                if (virtualTool) {
                    try {
                        toolResult = await virtualTool.execute(toolArgs, toolContext);
                    }
                    catch (e) {
                        toolResult = `Error executing ${toolName}: ${e instanceof Error ? e.message : String(e)}`;
                    }
                }
                else {
                    // MCP tool
                    try {
                        const result = await this.mcpManager.getClient().executeTool(toolName, toolArgs);
                        toolResult = typeof result === 'string' ? result : JSON.stringify(result);
                        // Track filesystem reads as evidence
                        if (toolName.includes('read') ||
                            toolName.includes('search') ||
                            toolName.includes('list')) {
                            const pathArg = toolArgs.path ||
                                toolArgs.query ||
                                toolArgs.pattern;
                            if (pathArg)
                                toolContext.onEvidenceFound(pathArg);
                        }
                    }
                    catch (e) {
                        toolResult = `Error executing ${toolName}: ${e instanceof Error ? e.message : String(e)}`;
                    }
                }
                const toolMessage = formatToolResult(toolCall.id, toolName, toolResult);
                messages.push(toolMessage);
            }
            if (doomLoopTool !== null) {
                // Sent after all tool results so the user turn follows a complete
                // tool-call/tool-result exchange.
                messages.push({
                    role: 'user',
                    content: `STOP: You are calling \`${doomLoopTool}\` with the same arguments repeatedly. ` +
                        `Move on and output your final JSON assessment based on what you have found so far.`,
                });
            }
        }
        if (!finalText) {
            finalText = `[Evaluator reached max iterations (${iterations}) without producing a final assessment]`;
        }
        return { finalText, nudgesSentAfter: nudgesSentRef.value };
    }
}
|
|
369
|
+
//# sourceMappingURL=evaluator-agent.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"evaluator-agent.js","sourceRoot":"","sources":["../../src/evaluator/evaluator-agent.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAKH,OAAO,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AAExD,OAAO,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAC5C,OAAO,EAAE,wBAAwB,EAAE,MAAM,wBAAwB,CAAC;AAClE,OAAO,EAAE,uBAAuB,EAAuB,MAAM,wBAAwB,CAAC;AAQtF,iFAAiF;AAEjF,MAAM,sBAAsB,GAAG,EAAE,CAAC;AAClC,MAAM,kBAAkB,GAAG,CAAC,CAAC;AAC7B,MAAM,mBAAmB,GAAG,CAAC,CAAC;AAE9B,iFAAiF;AAEjF;;;GAGG;AACH,iFAAiF;AACjF,SAAS,mBAAmB,CAAC,IAAY;IACvC,OAAO,IAAI;SACR,OAAO,CAAC,oCAAoC,EAAE,EAAE,CAAC;SACjD,OAAO,CAAC,iCAAiC,EAAE,EAAE,CAAC;SAC9C,OAAO,CAAC,mBAAmB,EAAE,EAAE,CAAC;SAChC,IAAI,EAAE,CAAC;AACZ,CAAC;AAED,SAAS,qBAAqB,CAC5B,IAAY,EACZ,UAAkB,EAClB,SAAiB,EACjB,QAAkB;IAElB,wDAAwD;IACxD,MAAM,QAAQ,GAAG,mBAAmB,CAAC,IAAI,CAAC,CAAC;IAE3C,IAAI,CAAC;QACH,qEAAqE;QACrE,MAAM,SAAS,GACb,QAAQ,CAAC,KAAK,CAAC,6BAA6B,CAAC;YAC7C,IAAI,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAC;QAC5C,MAAM,GAAG,GAAG,SAAS,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC;QAChD,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;QAEtC,OAAO;YACL,MAAM,EAAE,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC;YAC9B,IAAI,EAAE,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE;YACnD,UAAU;YACV,SAAS;YACT,QAAQ;YACR,OAAO,EAAE,OAAO,MAAM,CAAC,OAAO,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC;SAC1F,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,8EAA8E;QAC9E,MAAM,KAAK,GAAG,QAAQ,CAAC,WAAW,EAAE,CAAC;QACrC,MAAM,MAAM,GACV,KAAK,CAAC,QAAQ,CAAC,mBAAmB,CAAC;YACnC,KAAK,CAAC,QAAQ,CAAC,oBAAoB,CAAC;YACpC,KAAK,CAAC,QAAQ,CAAC,kBAAkB,CAAC;YAClC,KAAK,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CAAC;QAEnC,6EAA6E;QAC7E,+EAA+E;QAC/E,MAAM,YAAY,GAAG,QAAQ,CAAC,MAAM,GAAG,EAAE,CAAC;QAE1C,OAAO;YACL,MAAM,EAAE,YAAY,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM;YACrC,IAAI,EAAE,YAAY;gBAChB,CAAC,CAAC,CAAC,qEAAqE,CAAC;gBACzE,CAAC,CAAC,EAAE;YACN,UAAU;YACV,SAAS;YACT,QAAQ;YACR,OAAO,EAAE,YAAY;gBACnB,CAAC,CAAC,uFAAuF;gBACzF,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,C
AAC,EAAE,GAAG,CAAC;SAC/B,CAAC;IACJ,CAAC;AACH,CAAC;AAED,iFAAiF;AAEjF,SAAS,iBAAiB,CAAC,YAAoB,EAAE,kBAA0B;IACzE,OAAO;;;aAGI,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;EA0BvB,kBAAkB,EAAE,CAAC;AACvB,CAAC;AAED,iFAAiF;AAEjF,MAAM,OAAO,cAAc;IACjB,YAAY,CAAoB;IAChC,UAAU,CAAmB;IAC7B,SAAS,CAAmB;IAC5B,WAAW,CAA8C;IACzD,aAAa,CAAS;IACtB,SAAS,CAAS;IAClB,YAAY,CAAmB;IAEvC,YAAY,MAAuB;QACjC,IAAI,CAAC,YAAY,GAAG,MAAM,CAAC,YAAY,CAAC;QACxC,IAAI,CAAC,UAAU,GAAG,MAAM,CAAC,UAAU,CAAC;QACpC,IAAI,CAAC,SAAS,GAAG,MAAM,CAAC,SAAS,CAAC;QAClC,IAAI,CAAC,WAAW,GAAG,MAAM,CAAC,WAAW,CAAC;QACtC,IAAI,CAAC,aAAa,GAAG,MAAM,CAAC,qBAAqB,IAAI,sBAAsB,CAAC;QAC5E,IAAI,CAAC,SAAS,GAAG,MAAM,CAAC,mBAAmB,IAAI,kBAAkB,CAAC;QAClE,IAAI,CAAC,YAAY,GAAG,uBAAuB,CAAC;IAC9C,CAAC;IAED,+EAA+E;IAC/E,yBAAyB;QACvB,OAAO,IAAI,CAAC,YAAY,CAAC,aAAa,EAAE,CAAC;IAC3C,CAAC;IAED;;;;;;OAMG;IACH,KAAK,CAAC,QAAQ,CAAC,GAAsB;QACnC,MAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;QAE/C,gEAAgE;QAChE,IAAI,GAAG,CAAC,oBAAoB,EAAE,CAAC;YAC7B,MAAM,CAAC,IAAI,CAAC,qCAAqC,GAAG,CAAC,oBAAoB,EAAE,CAAC,CAAC;YAC7E,MAAM,IAAI,CAAC,WAAW,CAAC,gBAAgB,CAAC,GAAG,CAAC,oBAAoB,CAAC,CAAC;QACpE,CAAC;QAED,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,MAAM,QAAQ,GAAa,EAAE,CAAC;QAE9B,0FAA0F;QAC1F,MAAM,YAAY,GAAG,IAAI,CAAC,SAAS,CAAC,kBAAkB,EAAE,IAAI,EAAE,CAAC;QAC/D,MAAM,kBAAkB,GAAG,wBAAwB,CAAC,YAAY,CAAC,CAAC;QAClE,MAAM,YAAY,GAAG,IAAI,CAAC,SAAS,CAAC,eAAe,EAAE,CAAC;QACtD,MAAM,YAAY,GAAG,iBAAiB,CAAC,YAAY,EAAE,kBAAkB,CAAC,CAAC;QAEzE,yDAAyD;QACzD,MAAM,WAAW,GAAG,IAAI,CAAC,UAAU;aAChC,SAAS,EAAE;aACX,WAAW,EAAE;aACb,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACX,IAAI,EAAE,CAAC,CAAC,IAAI;YACZ,WAAW,EAAE,CAAC,CAAC,WAAW;YAC1B,UAAU,EAAG,CAAS,CAAC,WAAW,IAAI,EAAE,IAAI,EAAE,QAAiB,EAAE,UAAU,EAAE,EAAE,EAAE;SAClF,CAAC,CAAW,CAAC;QAEhB,MAAM,eAAe,GAAW,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YAC5D,IAAI,EAAE,CAAC,CAAC,IAAI;YACZ,WAAW,EAAE,CAAC,CAAC,WAAW;YAC1B,UAAU,EAAE,CAAC,CAAC,UAAU;SACzB,CAAC,CAAC,CAAC;QAEJ,MAAM,WAAW,GAAW,CAAC,GAAG,WAAW,EAAE,GAAG,eAAe,CAAC,CAAC;QAEjE,gEAAgE;QAChE,mFAAmF;QACnF,IAAI,UAAU,GAA
4B,IAAI,CAAC;QAE/C,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,GAAG,IAAI,CAAC,SAAS,EAAE,KAAK,EAAE,EAAE,CAAC;YACpD,SAAS,GAAG,KAAK,GAAG,CAAC,CAAC;YACtB,MAAM,CAAC,IAAI,CAAC,qBAAqB,SAAS,IAAI,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC;YAEhE,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,kBAAkB,CAAC;gBAChD,YAAY;gBACZ,WAAW,EAAE,GAAG,CAAC,WAAW;gBAC5B,WAAW;gBACX,YAAY;gBACZ,aAAa,EAAE,EAAE,KAAK,EAAE,UAAU,EAAE;gBACpC,QAAQ;aACT,CAAC,CAAC;YAEH,UAAU,GAAG,WAAW,CAAC,eAAe,CAAC;YACzC,UAAU,GAAG,qBAAqB,CAChC,WAAW,CAAC,SAAS,EACrB,UAAU,EACV,SAAS,EACT,CAAC,GAAG,QAAQ,CAAC,CACd,CAAC;YAEF,MAAM,CAAC,IAAI,CACT,qBAAqB,SAAS,uBAAuB,UAAU,CAAC,MAAM,WAAW,UAAU,CAAC,IAAI,CAAC,MAAM,EAAE,CAC1G,CAAC;YAEF,IAAI,UAAU,CAAC,MAAM,EAAE,CAAC;gBACtB,MAAM;YACR,CAAC;QACH,CAAC;QAED,IAAI,CAAC,UAAU,EAAE,CAAC;YAChB,OAAO;gBACL,MAAM,EAAE,KAAK;gBACb,IAAI,EAAE,CAAC,yEAAyE,CAAC;gBACjF,UAAU;gBACV,SAAS;gBACT,QAAQ;gBACR,OAAO,EAAE,sEAAsE;aAChF,CAAC;QACJ,CAAC;QAED,OAAO,UAAU,CAAC;IACpB,CAAC;IAED,+EAA+E;IAEvE,KAAK,CAAC,kBAAkB,CAAC,MAOhC;QACC,MAAM,EAAE,YAAY,EAAE,WAAW,EAAE,WAAW,EAAE,YAAY,EAAE,aAAa,EAAE,QAAQ,EAAE,GACrF,MAAM,CAAC;QAET,gDAAgD;QAChD,MAAM,QAAQ,GAAc;YAC1B,EAAE,IAAI,EAAE,WAAkB,EAAE,OAAO,EAAE,YAAY,EAAE;YACnD;gBACE,IAAI,EAAE,MAAM;gBACZ,OAAO,EACL,iFAAiF;oBACjF,SAAS,WAAW,MAAM;oBAC1B,oFAAoF;oBACpF,sFAAsF;oBACtF,yCAAyC;aAC5C;SACF,CAAC;QAEF,MAAM,WAAW,GAAyB;YACxC,YAAY;YACZ,WAAW,EAAE,IAAI,CAAC,WAAW;YAC7B,WAAW,EAAE,GAAG,EAAE;gBAChB,aAAa,CAAC,KAAK,EAAE,CAAC;YACxB,CAAC;YACD,eAAe,EAAE,CAAC,IAAY,EAAE,EAAE;gBAChC,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC;oBAAE,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACpD,CAAC;SACF,CAAC;QAEF,sBAAsB;QACtB,MAAM,WAAW,GAAa,EAAE,CAAC;QACjC,IAAI,oBAAoB,GAAG,CAAC,CAAC;QAC7B,MAAM,cAAc,GAAG,CAAC,CAAC;QACzB,IAAI,SAAS,GAAG,EAAE,CAAC;QACnB,IAAI,UAAU,GAAG,CAAC,CAAC;QAEnB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5C,UAAU,GAAG,CAAC,GAAG,CAAC,CAAC;YACnB,MAAM,OAAO,GAAG,CAAC,GAAG,IAAI,CAAC,aAAa,CAAC;YACvC,MAAM,YAAY,GAAG,OAAO,IAAI,IAAI,CAAC;YAErC,mBAAmB;YACnB,IAAI,OAAO,IAAI,IAAI,IAAI,OAAO,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;
gBAC/C,MAAM,aAAa,GAAG,QAAQ,CAAC,IAAI,CACjC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,CAAC,OAAO,KAAK,QAAQ,IAAI,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAC,CAC1E,CAAC;gBACF,IAAI,CAAC,aAAa,EAAE,CAAC;oBACnB,QAAQ,CAAC,IAAI,CAAC;wBACZ,IAAI,EAAE,MAAM;wBACZ,OAAO,EACL,4CAA4C,UAAU,IAAI,IAAI,CAAC,aAAa,KAAK;4BACjF,qEAAqE;qBACxE,CAAC,CAAC;gBACL,CAAC;YACH,CAAC;YAED,IAAI,YAAY,EAAE,CAAC;gBACjB,MAAM,eAAe,GAAG,QAAQ,CAAC,IAAI,CACnC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,CAAC,OAAO,KAAK,QAAQ,IAAI,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAC,CAC1E,CAAC;gBACF,IAAI,CAAC,eAAe,EAAE,CAAC;oBACrB,QAAQ,CAAC,IAAI,CAAC;wBACZ,IAAI,EAAE,MAAM;wBACZ,OAAO,EACL,gGAAgG;4BAChG,oEAAoE;qBACvE,CAAC,CAAC;gBACL,CAAC;YACH,CAAC;YAED,IAAI,QAAQ,CAAC;YACb,IAAI,CAAC;gBACH,QAAQ,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,gBAAgB,CACjD;oBACE,QAAQ;oBACR,KAAK,EAAE,YAAY,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,WAAW;oBACtC,WAAW,EAAE,GAAG;iBACjB,EACD,KAAK,CACN,CAAC;gBACF,oBAAoB,GAAG,CAAC,CAAC;YAC3B,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,oBAAoB,EAAE,CAAC;gBACvB,MAAM,GAAG,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;gBACnE,MAAM,CAAC,KAAK,CAAC,4BAA4B,oBAAoB,IAAI,cAAc,MAAM,GAAG,EAAE,CAAC,CAAC;gBAE5F,IAAI,oBAAoB,IAAI,cAAc,EAAE,CAAC;oBAC3C,SAAS,GAAG,mDAAmD,CAAC;oBAChE,MAAM;gBACR,CAAC;gBAED,0DAA0D;gBAC1D,MAAM,OAAO,GAAG,QAAQ,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;gBAC9C,IAAI,OAAO,IAAI,OAAO,CAAC,IAAI,KAAK,WAAW,EAAE,CAAC;oBAC5C,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,WAAW,EAAE,OAAO,EAAE,EAAE,EAAE,CAAC,CAAC;gBACpD,CAAC;gBACD,QAAQ,CAAC,IAAI,CAAC;oBACZ,IAAI,EAAE,MAAM;oBACZ,OAAO,EAAE,8FAA8F;iBACxG,CAAC,CAAC;gBACH,SAAS;YACX,CAAC;YAED,2BAA2B;YAC3B,IAAI,QAAQ,CAAC,SAAS,IAAI,QAAQ,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACxD,QAAQ,CAAC,IAAI,CAAC;oBACZ,IAAI,EAAE,WAAW;oBACjB,OAAO,EAAE,QAAQ,CAAC,OAAO,IAAI,IAAI;oBACjC,UAAU,EAAE,QAAQ,CAAC,SAAS;iBACxB,CAAC,CAAC;YACZ,CAAC;iBAAM,CAAC;gBACN,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,WAAW,EAAE,OAAO,EAAE,QAAQ,CAAC,OAAO,EAAE,CAAC,CAAC;YAClE,CAAC;YAED,oCAAoC;YACpC,IAAI,CAAC,QAAQ,CAAC,SAAS,IAAI,QAAQ,CAAC,SAAS,CAAC,M
AAM,KAAK,CAAC,EAAE,CAAC;gBAC3D,SAAS,GAAG,QAAQ,CAAC,OAAO,IAAI,0BAA0B,CAAC;gBAC3D,MAAM;YACR,CAAC;YAED,qBAAqB;YACrB,KAAK,MAAM,QAAQ,IAAI,QAAQ,CAAC,SAAS,EAAE,CAAC;gBAC1C,MAAM,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC;gBACxC,IAAI,QAAQ,GAA4B,EAAE,CAAC;gBAC3C,IAAI,CAAC;oBACH,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC;gBACrD,CAAC;gBAAC,MAAM,CAAC;oBACP,4CAA4C;gBAC9C,CAAC;gBAED,kBAAkB;gBAClB,MAAM,OAAO,GAAG,GAAG,QAAQ,IAAI,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAC1D,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;gBAC1B,IAAI,WAAW,CAAC,MAAM,GAAG,mBAAmB;oBAAE,WAAW,CAAC,KAAK,EAAE,CAAC;gBAElE,IACE,WAAW,CAAC,MAAM,KAAK,mBAAmB;oBAC1C,WAAW,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,KAAK,WAAW,CAAC,CAAC,CAAC,CAAC,EAC9C,CAAC;oBACD,MAAM,CAAC,IAAI,CAAC,4CAA4C,QAAQ,EAAE,CAAC,CAAC;oBACpE,QAAQ,CAAC,IAAI,CAAC;wBACZ,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE,2BAA2B,QAAQ,yCAAyC;4BACnF,oFAAoF;qBACvF,CAAC,CAAC;oBACH,MAAM;gBACR,CAAC;gBAED,MAAM,CAAC,IAAI,CAAC,qBAAqB,QAAQ,EAAE,CAAC,CAAC;gBAE7C,IAAI,UAAkB,CAAC;gBAEvB,sDAAsD;gBACtD,MAAM,WAAW,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC;gBAEvE,IAAI,WAAW,EAAE,CAAC;oBAChB,IAAI,CAAC;wBACH,UAAU,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC,QAAQ,EAAE,WAAW,CAAC,CAAC;oBAChE,CAAC;oBAAC,OAAO,CAAC,EAAE,CAAC;wBACX,UAAU,GAAG,mBAAmB,QAAQ,KAAK,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC;oBAC5F,CAAC;gBACH,CAAC;qBAAM,CAAC;oBACN,WAAW;oBACX,IAAI,CAAC;wBACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,SAAS,EAAE,CAAC,WAAW,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;wBACjF,UAAU,GAAG,OAAO,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;wBAE1E,qCAAqC;wBACrC,IACE,QAAQ,CAAC,QAAQ,CAAC,MAAM,CAAC;4BACzB,QAAQ,CAAC,QAAQ,CAAC,QAAQ,CAAC;4BAC3B,QAAQ,CAAC,QAAQ,CAAC,MAAM,CAAC,EACzB,CAAC;4BACD,MAAM,OAAO,GACV,QAAQ,CAAC,IAAe;gCACxB,QAAQ,CAAC,KAAgB;gCACzB,QAAQ,CAAC,OAAkB,CAAC;4BAC/B,IAAI,OAAO;gCAAE,WAAW,CAAC,eAAe,CAAC,OAAO,CAAC,CAAC;wBACpD,CAAC;oBACH,CAAC;oBAAC,OAAO,CAAC,EAAE,CAAC;wBACX,UAAU,GAAG,
mBAAmB,QAAQ,KAAK,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC;oBAC5F,CAAC;gBACH,CAAC;gBAED,MAAM,WAAW,GAAG,gBAAgB,CAAC,QAAQ,CAAC,EAAE,EAAE,QAAQ,EAAE,UAAU,CAAC,CAAC;gBACxE,QAAQ,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;YAC7B,CAAC;QACH,CAAC;QAED,IAAI,CAAC,SAAS,EAAE,CAAC;YACf,SAAS,GAAG,sCAAsC,UAAU,yCAAyC,CAAC;QACxG,CAAC;QAED,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,aAAa,CAAC,KAAK,EAAE,CAAC;IAC7D,CAAC;CACF"}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* EvaluatorHarness — coordinates the main Jiva agent and the evaluator.
|
|
3
|
+
*
|
|
4
|
+
* Flow per user message:
|
|
5
|
+
* 1. Run the main agent (mainAgent.chat(userMessage))
|
|
6
|
+
* 2. Run the evaluator (evaluatorAgent.evaluate({ userMessage }))
|
|
7
|
+
* — the evaluator reads workspace files, identifies gaps, and nudges the
|
|
8
|
+
* main agent as needed via its interact_with_agent tool
|
|
9
|
+
* 3. Return a HarnessResult containing both the main response and the
|
|
10
|
+
* evaluation outcome
|
|
11
|
+
*/
|
|
12
|
+
import { EvaluatorAgent } from './evaluator-agent.js';
|
|
13
|
+
import type { IAgent } from '../core/agent-interface.js';
|
|
14
|
+
import type { EvaluationContext, HarnessOptions, HarnessResult } from './types.js';
|
|
15
|
+
export declare class EvaluatorHarness {
|
|
16
|
+
private mainAgent;
|
|
17
|
+
private evaluatorAgent;
|
|
18
|
+
private verbose;
|
|
19
|
+
constructor(mainAgent: IAgent, evaluatorAgent: EvaluatorAgent, options?: HarnessOptions);
|
|
20
|
+
/**
|
|
21
|
+
* Process a user message through the harness:
|
|
22
|
+
* 1. Main agent processes the request.
|
|
23
|
+
* 2. Evaluator validates and guides completion.
*
* @param userMessage The user request; passed to the main agent's chat() and to the evaluator.
* @param evalCtx Optional context; the implementation forwards only `targetConversationId` to the evaluator.
* @returns The main agent's response content and iteration count, the evaluation result, and both agents' token usage.
|
|
24
|
+
*/
|
|
25
|
+
run(userMessage: string, evalCtx?: Partial<EvaluationContext>): Promise<HarnessResult>;
|
|
26
|
+
/** Stop the main agent cooperatively. NOTE(review): the implementation only calls mainAgent.stop(); the evaluator is not signalled — confirm whether stopping both agents was intended. */
|
|
27
|
+
stop(): void;
|
|
28
|
+
/** Cleanup both agents' resources: awaits the main agent's cleanup(), then the cleanup() of the evaluator's MCP manager. */
|
|
29
|
+
cleanup(): Promise<void>;
|
|
30
|
+
}
|
|
31
|
+
//# sourceMappingURL=harness.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"harness.d.ts","sourceRoot":"","sources":["../../src/evaluator/harness.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAGH,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,4BAA4B,CAAC;AACzD,OAAO,KAAK,EAAE,iBAAiB,EAAE,cAAc,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAEnF,qBAAa,gBAAgB;IAC3B,OAAO,CAAC,SAAS,CAAS;IAC1B,OAAO,CAAC,cAAc,CAAiB;IACvC,OAAO,CAAC,OAAO,CAAU;gBAGvB,SAAS,EAAE,MAAM,EACjB,cAAc,EAAE,cAAc,EAC9B,OAAO,GAAE,cAAmB;IAO9B;;;;OAIG;IACG,GAAG,CAAC,WAAW,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,OAAO,CAAC,iBAAiB,CAAC,GAAG,OAAO,CAAC,aAAa,CAAC;IA2C5F,sCAAsC;IACtC,IAAI,IAAI,IAAI;IAIZ,sCAAsC;IAChC,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAI/B"}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* EvaluatorHarness — coordinates the main Jiva agent and the evaluator.
|
|
3
|
+
*
|
|
4
|
+
* Flow per user message:
|
|
5
|
+
* 1. Run the main agent (mainAgent.chat(userMessage))
|
|
6
|
+
* 2. Run the evaluator (evaluatorAgent.evaluate({ userMessage }))
|
|
7
|
+
* — the evaluator reads workspace files, identifies gaps, and nudges the
|
|
8
|
+
* main agent as needed via its interact_with_agent tool
|
|
9
|
+
* 3. Return a HarnessResult containing both the main response and the
|
|
10
|
+
* evaluation outcome
|
|
11
|
+
*/
|
|
12
|
+
import { logger } from '../utils/logger.js';
|
|
13
|
+
/**
 * Coordinates a main agent and an evaluator agent for a single user message:
 * the main agent answers first, then the evaluator validates the outcome.
 */
export class EvaluatorHarness {
    mainAgent;
    evaluatorAgent;
    verbose;
    /**
     * @param mainAgent      Agent that handles the user request.
     * @param evaluatorAgent Evaluator that validates the main agent's work.
     * @param options        Harness options; `verbose` defaults to true when not set.
     */
    constructor(mainAgent, evaluatorAgent, options = {}) {
        this.mainAgent = mainAgent;
        this.evaluatorAgent = evaluatorAgent;
        this.verbose = options.verbose ?? true;
    }
    /**
     * Process a user message through the harness:
     *   1. Main agent processes the request.
     *   2. Evaluator validates and guides completion.
     *
     * @param userMessage The user request; passed to the main agent and to the evaluator.
     * @param evalCtx     Optional context; only `targetConversationId` is forwarded to the evaluator.
     * @returns The main agent's response content and iteration count, the
     *          evaluation result, and both agents' token usage.
     */
    async run(userMessage, evalCtx) {
        // ── Step 1: Main agent ─────────────────────────────────────────────────
        if (this.verbose) {
            logger.info('[Harness] Main agent processing user request');
        }
        const mainResponse = await this.mainAgent.chat(userMessage);
        if (this.verbose) {
            logger.info(`[Harness] Main agent done — ${mainResponse.iterations} iterations, ` +
                `${mainResponse.toolsUsed.length} tools used`);
        }
        // ── Step 2: Evaluator ──────────────────────────────────────────────────
        if (this.verbose) {
            logger.info('[Harness] Evaluator starting validation');
        }
        const evaluationContext = {
            userMessage,
            targetConversationId: evalCtx?.targetConversationId,
        };
        const evaluation = await this.evaluatorAgent.evaluate(evaluationContext);
        if (this.verbose) {
            const outcome = evaluation.passed
                ? `✓ Passed (${evaluation.cyclesRan} cycles, ${evaluation.nudgesSent} nudges)`
                : `✗ Failed — ${evaluation.gaps.length} gap(s) remain after ${evaluation.cyclesRan} cycles`;
            logger.info(`[Harness] Evaluation complete — ${outcome}`);
        }
        // Token usage is read after the evaluator has run, so the snapshot is
        // taken once both steps are finished.
        return {
            mainAgentResponse: mainResponse.content,
            mainAgentIterations: mainResponse.iterations,
            evaluation,
            mainAgentTokenUsage: this.mainAgent.getTokenUsage(),
            evaluatorTokenUsage: this.evaluatorAgent.getOrchestratorTokenUsage(),
        };
    }
    /**
     * Request a cooperative stop of the main agent.
     * NOTE(review): only the main agent is signalled; the evaluator is not
     * stopped here — confirm whether it should be.
     */
    stop() {
        this.mainAgent.stop();
    }
    /**
     * Cleanup both agents' resources.
     *
     * The evaluator's MCP manager is cleaned up even when the main agent's
     * cleanup rejects, so a failure on one side does not leak the other side's
     * resources. The main agent's error still propagates to the caller unless
     * the evaluator cleanup itself throws, in which case that error is raised.
     */
    async cleanup() {
        try {
            await this.mainAgent.cleanup();
        }
        finally {
            await this.evaluatorAgent['mcpManager'].cleanup();
        }
    }
}
|
|
70
|
+
//# sourceMappingURL=harness.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"harness.js","sourceRoot":"","sources":["../../src/evaluator/harness.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAK5C,MAAM,OAAO,gBAAgB;IACnB,SAAS,CAAS;IAClB,cAAc,CAAiB;IAC/B,OAAO,CAAU;IAEzB,YACE,SAAiB,EACjB,cAA8B,EAC9B,UAA0B,EAAE;QAE5B,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;QAC3B,IAAI,CAAC,cAAc,GAAG,cAAc,CAAC;QACrC,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,IAAI,CAAC;IACzC,CAAC;IAED;;;;OAIG;IACH,KAAK,CAAC,GAAG,CAAC,WAAmB,EAAE,OAAoC;QACjE,0EAA0E;QAC1E,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,CAAC,IAAI,CAAC,8CAA8C,CAAC,CAAC;QAC9D,CAAC;QAED,MAAM,YAAY,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QAE5D,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,CAAC,IAAI,CACT,+BAA+B,YAAY,CAAC,UAAU,eAAe;gBACrE,GAAG,YAAY,CAAC,SAAS,CAAC,MAAM,aAAa,CAC9C,CAAC;QACJ,CAAC;QAED,0EAA0E;QAC1E,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,CAAC,IAAI,CAAC,yCAAyC,CAAC,CAAC;QACzD,CAAC;QAED,MAAM,iBAAiB,GAAsB;YAC3C,WAAW;YACX,oBAAoB,EAAE,OAAO,EAAE,oBAAoB;SACpD,CAAC;QAEF,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,cAAc,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC;QAEzE,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,OAAO,GAAG,UAAU,CAAC,MAAM;gBAC/B,CAAC,CAAC,aAAa,UAAU,CAAC,SAAS,YAAY,UAAU,CAAC,UAAU,UAAU;gBAC9E,CAAC,CAAC,cAAc,UAAU,CAAC,IAAI,CAAC,MAAM,wBAAwB,UAAU,CAAC,SAAS,SAAS,CAAC;YAC9F,MAAM,CAAC,IAAI,CAAC,mCAAmC,OAAO,EAAE,CAAC,CAAC;QAC5D,CAAC;QAED,OAAO;YACL,iBAAiB,EAAE,YAAY,CAAC,OAAO;YACvC,mBAAmB,EAAE,YAAY,CAAC,UAAU;YAC5C,UAAU;YACV,mBAAmB,EAAE,IAAI,CAAC,SAAS,CAAC,aAAa,EAAE;YACnD,mBAAmB,EAAE,IAAI,CAAC,cAAc,CAAC,yBAAyB,EAAE;SACrE,CAAC;IACJ,CAAC;IAED,sCAAsC;IACtC,IAAI;QACF,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC;IACxB,CAAC;IAED,sCAAsC;IACtC,KAAK,CAAC,OAAO;QACX,MAAM,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,CAAC;QAC/B,MAAM,IAAI,CAAC,cAAc,CAAC,YAAY,CAAC,CAAC,OAAO,EAAE,CAAC;IACpD,CAAC;CACF"}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluator module — public API.
|
|
3
|
+
*
|
|
4
|
+
* Usage (CLI):
|
|
5
|
+
* const harness = await createEvaluatorHarness(mainAgent, mcpServers, orchestratorConfig);
|
|
6
|
+
* const result = await harness.run(userMessage);
|
|
7
|
+
*
|
|
8
|
+
* Usage (HTTP):
|
|
9
|
+
* const harness = await createEvaluatorHarness(mainAgent, mcpServers, orchestratorConfig);
|
|
10
|
+
* app.post('/api/chat/harness', async (req, res) => {
|
|
11
|
+
* const result = await harness.run(req.body.message);
|
|
12
|
+
* res.json(result);
|
|
13
|
+
* });
|
|
14
|
+
*/
|
|
15
|
+
export { EvaluatorAgent } from './evaluator-agent.js';
|
|
16
|
+
export { EvaluatorHarness } from './harness.js';
|
|
17
|
+
export { deriveEvaluatorDirective } from './directive-adapter.js';
|
|
18
|
+
export { InteractWithAgentTool, ListAgentConversationsTool, GetConversationHistoryTool, EVALUATOR_VIRTUAL_TOOLS, } from './tools/agent-tools.js';
|
|
19
|
+
export type { EvaluatorConfig, EvaluationContext, EvaluationResult, HarnessOptions, HarnessResult, EvaluatorToolContext, } from './types.js';
|
|
20
|
+
import { EvaluatorHarness } from './harness.js';
|
|
21
|
+
import type { IAgent } from '../core/agent-interface.js';
|
|
22
|
+
import type { MCPServerConfig } from '../core/config.js';
|
|
23
|
+
import type { HarnessOptions } from './types.js';
|
|
24
|
+
/** Model connection details for the evaluator's orchestrator; the type of createEvaluatorHarness's `orchestratorCfg` parameter. */
export interface EvaluatorOrchestratorConfig {
|
|
25
|
+
endpoint: string;
|
|
26
|
+
apiKey: string;
|
|
27
|
+
model: string;
|
|
28
|
+
useHarmonyFormat?: boolean;
|
|
29
|
+
/** Optional tool-calling model (same env vars as main agent). */
|
|
30
|
+
toolCallingEndpoint?: string;
|
|
31
|
+
toolCallingApiKey?: string;
|
|
32
|
+
toolCallingModel?: string;
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Create an EvaluatorHarness that pairs the given main agent with a fresh,
|
|
36
|
+
* isolated EvaluatorAgent.
|
|
37
|
+
*
|
|
38
|
+
* The evaluator uses the same model config and MCP servers as the main agent
|
|
39
|
+
* but runs with completely separate LLM state.
|
|
40
|
+
*
|
|
41
|
+
* @param mainAgent The IAgent instance to supervise.
|
|
42
|
+
* @param mcpServers MCP server configs (same set as the main agent uses).
|
|
43
|
+
* @param orchestratorCfg Model connection details for the evaluator's orchestrator.
|
|
44
|
+
* @param harnessOptions Optional harness behaviour tweaks.
* @returns A promise that resolves to the EvaluatorHarness pairing mainAgent with the newly created evaluator.
|
|
45
|
+
*/
|
|
46
|
+
export declare function createEvaluatorHarness(mainAgent: IAgent, mcpServers: Record<string, MCPServerConfig>, orchestratorCfg: EvaluatorOrchestratorConfig, harnessOptions?: HarnessOptions): Promise<EvaluatorHarness>;
|
|
47
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/evaluator/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAChD,OAAO,EAAE,wBAAwB,EAAE,MAAM,wBAAwB,CAAC;AAClE,OAAO,EACL,qBAAqB,EACrB,0BAA0B,EAC1B,0BAA0B,EAC1B,uBAAuB,GACxB,MAAM,wBAAwB,CAAC;AAChC,YAAY,EACV,eAAe,EACf,iBAAiB,EACjB,gBAAgB,EAChB,cAAc,EACd,aAAa,EACb,oBAAoB,GACrB,MAAM,YAAY,CAAC;AAQpB,OAAO,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAChD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,4BAA4B,CAAC;AACzD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AACzD,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAEjD,MAAM,WAAW,2BAA2B;IAC1C,QAAQ,EAAE,MAAM,CAAC;IACjB,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;IACd,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,iEAAiE;IACjE,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED;;;;;;;;;;;GAWG;AACH,wBAAsB,sBAAsB,CAC1C,SAAS,EAAE,MAAM,EACjB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,eAAe,CAAC,EAC3C,eAAe,EAAE,2BAA2B,EAC5C,cAAc,GAAE,cAAmB,GAClC,OAAO,CAAC,gBAAgB,CAAC,CAiD3B"}
|