jiva-core 0.3.1 → 0.3.3-dev.24b219d
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm-publish-dev.yml +39 -0
- package/.github/workflows/npm-publish.yml +31 -0
- package/Dockerfile +12 -7
- package/README.md +32 -0
- package/cloud-run.yaml +1 -1
- package/cloud-run.yaml.template +1 -1
- package/dist/core/agent-spawner.d.ts.map +1 -1
- package/dist/core/agent-spawner.js +37 -21
- package/dist/core/agent-spawner.js.map +1 -1
- package/dist/core/agent.js +1 -1
- package/dist/core/agent.js.map +1 -1
- package/dist/core/client-agent.d.ts +59 -11
- package/dist/core/client-agent.d.ts.map +1 -1
- package/dist/core/client-agent.js +519 -176
- package/dist/core/client-agent.js.map +1 -1
- package/dist/core/config.d.ts.map +1 -1
- package/dist/core/config.js +11 -5
- package/dist/core/config.js.map +1 -1
- package/dist/core/conversation-manager.js +3 -3
- package/dist/core/conversation-manager.js.map +1 -1
- package/dist/core/dual-agent.d.ts +20 -0
- package/dist/core/dual-agent.d.ts.map +1 -1
- package/dist/core/dual-agent.js +188 -33
- package/dist/core/dual-agent.js.map +1 -1
- package/dist/core/manager-agent.d.ts +21 -2
- package/dist/core/manager-agent.d.ts.map +1 -1
- package/dist/core/manager-agent.js +150 -52
- package/dist/core/manager-agent.js.map +1 -1
- package/dist/core/types/agent-context.d.ts +30 -0
- package/dist/core/types/agent-context.d.ts.map +1 -0
- package/dist/core/types/agent-context.js +8 -0
- package/dist/core/types/agent-context.js.map +1 -0
- package/dist/core/types/completion-signal.d.ts +17 -0
- package/dist/core/types/completion-signal.d.ts.map +1 -0
- package/dist/core/types/completion-signal.js +8 -0
- package/dist/core/types/completion-signal.js.map +1 -0
- package/dist/core/utils/serialize-agent-context.d.ts +23 -0
- package/dist/core/utils/serialize-agent-context.d.ts.map +1 -0
- package/dist/core/utils/serialize-agent-context.js +73 -0
- package/dist/core/utils/serialize-agent-context.js.map +1 -0
- package/dist/core/worker-agent.d.ts +2 -1
- package/dist/core/worker-agent.d.ts.map +1 -1
- package/dist/core/worker-agent.js +18 -4
- package/dist/core/worker-agent.js.map +1 -1
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/dist/interfaces/cli/index.js +35 -16
- package/dist/interfaces/cli/index.js.map +1 -1
- package/dist/interfaces/cli/setup-wizard.js +1 -1
- package/dist/interfaces/cli/setup-wizard.js.map +1 -1
- package/dist/interfaces/http/session-manager.d.ts.map +1 -1
- package/dist/interfaces/http/session-manager.js +30 -10
- package/dist/interfaces/http/session-manager.js.map +1 -1
- package/dist/models/krutrim.js +1 -1
- package/dist/models/krutrim.js.map +1 -1
- package/dist/personas/persona-manager.d.ts +14 -3
- package/dist/personas/persona-manager.d.ts.map +1 -1
- package/dist/personas/persona-manager.js +74 -17
- package/dist/personas/persona-manager.js.map +1 -1
- package/dist/storage/gcp-bucket-provider.d.ts +1 -0
- package/dist/storage/gcp-bucket-provider.d.ts.map +1 -1
- package/dist/storage/gcp-bucket-provider.js +18 -11
- package/dist/storage/gcp-bucket-provider.js.map +1 -1
- package/dist/storage/local-provider.d.ts +1 -0
- package/dist/storage/local-provider.d.ts.map +1 -1
- package/dist/storage/local-provider.js +9 -0
- package/dist/storage/local-provider.js.map +1 -1
- package/dist/storage/provider.d.ts +5 -0
- package/dist/storage/provider.d.ts.map +1 -1
- package/dist/storage/provider.js.map +1 -1
- package/dist/utils/logger.d.ts +19 -0
- package/dist/utils/logger.d.ts.map +1 -1
- package/dist/utils/logger.js +49 -1
- package/dist/utils/logger.js.map +1 -1
- package/dist/utils/orchestration-logger.d.ts +27 -3
- package/dist/utils/orchestration-logger.d.ts.map +1 -1
- package/dist/utils/orchestration-logger.js +110 -6
- package/dist/utils/orchestration-logger.js.map +1 -1
- package/dist/utils/platform.d.ts +13 -0
- package/dist/utils/platform.d.ts.map +1 -0
- package/dist/utils/platform.js +23 -0
- package/dist/utils/platform.js.map +1 -0
- package/package.json +7 -8
- package/.claude/settings.local.json +0 -18
|
@@ -9,137 +9,204 @@
|
|
|
9
9
|
* - STANDARD: Creation requests → file existence + basic validation
|
|
10
10
|
* - THOROUGH: Complex/testing requests OR failures → full E2E validation with tools
|
|
11
11
|
*/
|
|
12
|
+
import { serializeAgentContext } from './utils/serialize-agent-context.js';
|
|
12
13
|
import { logger } from '../utils/logger.js';
|
|
14
|
+
import { orchestrationLogger } from '../utils/orchestration-logger.js';
|
|
13
15
|
// Validation depth levels: each upper-case name maps to its lower-case string value.
export var InvolvementLevel;
(function (levels) {
    levels.MINIMAL = 'minimal';
    levels.STANDARD = 'standard';
    levels.THOROUGH = 'thorough';
})(InvolvementLevel || (InvolvementLevel = {}));
|
|
21
|
+
/**
 * Normalize the LLM-returned mustUseTools value to string[] | undefined.
 *
 * The LLM may return null, a single string, or an array, and an array is not
 * guaranteed to hold only strings. validateProcess() calls startsWith() on
 * every entry, so anything that is not a non-empty string is dropped here
 * instead of being passed through.
 *
 * @param {unknown} value - Raw `mustUseTools` field from the parsed LLM JSON.
 * @returns {string[] | undefined} The non-empty string entries when `value`
 *   is an array (possibly an empty array), a one-element array when `value`
 *   is a non-empty string, otherwise undefined.
 */
function normalizeMustUseTools(value) {
    if (Array.isArray(value)) {
        return value.filter((entry) => typeof entry === 'string' && entry !== '');
    }
    if (typeof value === 'string' && value !== '') {
        return [value];
    }
    return undefined;
}
|
|
19
32
|
export class ClientAgent {
|
|
20
33
|
orchestrator;
|
|
21
34
|
mcpManager;
|
|
22
35
|
mcpClient;
|
|
23
36
|
failureCount = 0;
|
|
24
|
-
//
|
|
25
|
-
|
|
26
|
-
'filesystem__read_text_file',
|
|
27
|
-
'filesystem__list_directory',
|
|
28
|
-
'filesystem__directory_tree',
|
|
29
|
-
'filesystem__search_files',
|
|
30
|
-
'playwright__browser_navigate',
|
|
31
|
-
'playwright__browser_console_messages',
|
|
32
|
-
'playwright__browser_take_screenshot',
|
|
33
|
-
'playwright__browser_evaluate',
|
|
34
|
-
];
|
|
37
|
+
// Lazily cached list of all available tool names (populated on first use)
|
|
38
|
+
_availableTools = null;
|
|
35
39
|
constructor(orchestrator, mcpManager) {
|
|
36
40
|
this.orchestrator = orchestrator;
|
|
37
41
|
this.mcpManager = mcpManager;
|
|
38
42
|
this.mcpClient = mcpManager.getClient();
|
|
39
43
|
}
|
|
44
|
+
// ─── Tool Discovery ───────────────────────────────────────────────────────
|
|
40
45
|
/**
|
|
41
|
-
*
|
|
46
|
+
* Returns all tool names currently available from connected MCP servers.
|
|
47
|
+
* Result is cached after the first call; call resetToolCache() if servers change.
|
|
42
48
|
*/
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
if (this.failureCount > 0) {
|
|
54
|
-
logger.debug(`[Client] THOROUGH mode: ${this.failureCount} previous failures detected`);
|
|
55
|
-
return InvolvementLevel.THOROUGH;
|
|
56
|
-
}
|
|
57
|
-
// THOROUGH: Complex multi-file operations
|
|
58
|
-
if (subtasks.length > 3 || (messageLower.includes('component') && messageLower.includes('index.html'))) {
|
|
59
|
-
logger.debug('[Client] THOROUGH mode: Complex multi-file operation');
|
|
60
|
-
return InvolvementLevel.THOROUGH;
|
|
61
|
-
}
|
|
62
|
-
// MINIMAL: Information-only requests
|
|
63
|
-
const infoKeywords = ['what', 'list', 'show', 'explain', 'describe', 'how', 'tell me'];
|
|
64
|
-
const creationKeywords = ['create', 'build', 'write', 'generate', 'make', 'add'];
|
|
65
|
-
const hasInfoKeyword = infoKeywords.some(kw => messageLower.includes(kw));
|
|
66
|
-
const hasCreationKeyword = creationKeywords.some(kw => messageLower.includes(kw));
|
|
67
|
-
if (hasInfoKeyword && !hasCreationKeyword) {
|
|
68
|
-
logger.debug('[Client] MINIMAL mode: Information request');
|
|
69
|
-
return InvolvementLevel.MINIMAL;
|
|
70
|
-
}
|
|
71
|
-
// STANDARD: Default for creation/modification tasks
|
|
72
|
-
logger.debug('[Client] STANDARD mode: Regular creation task');
|
|
73
|
-
return InvolvementLevel.STANDARD;
|
|
49
|
+
getAvailableTools() {
|
|
50
|
+
if (this._availableTools === null) {
|
|
51
|
+
this._availableTools = this.mcpClient.getAllTools().map(t => t.name);
|
|
52
|
+
logger.debug(`[Client] Discovered ${this._availableTools.length} available tools: ${this._availableTools.join(', ')}`);
|
|
53
|
+
}
|
|
54
|
+
return this._availableTools;
|
|
55
|
+
}
|
|
56
|
+
/** Reset the tool cache (e.g. after MCP server reconnects). */
|
|
57
|
+
resetToolCache() {
|
|
58
|
+
this._availableTools = null;
|
|
74
59
|
}
|
|
75
60
|
/**
|
|
76
|
-
*
|
|
61
|
+
* Find the first available tool whose name contains any of the given substrings.
|
|
62
|
+
* Returns null if no match is found.
|
|
77
63
|
*/
|
|
78
|
-
|
|
79
|
-
const
|
|
80
|
-
const
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
if (fileMatches) {
|
|
85
|
-
fileMatches.forEach(match => {
|
|
86
|
-
const filename = match.split(/\s+/).pop();
|
|
87
|
-
if (filename) {
|
|
88
|
-
requirements.push({
|
|
89
|
-
type: 'file_creation',
|
|
90
|
-
description: `Create file: ${filename}`,
|
|
91
|
-
filePath: filename,
|
|
92
|
-
});
|
|
93
|
-
}
|
|
94
|
-
});
|
|
64
|
+
findTool(...patterns) {
|
|
65
|
+
const tools = this.getAvailableTools();
|
|
66
|
+
for (const pattern of patterns) {
|
|
67
|
+
const found = tools.find(t => t.includes(pattern));
|
|
68
|
+
if (found)
|
|
69
|
+
return found;
|
|
95
70
|
}
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
71
|
+
return null;
|
|
72
|
+
}
|
|
73
|
+
/**
|
|
74
|
+
* Build a human-readable summary of available tool categories for LLM prompts.
|
|
75
|
+
*/
|
|
76
|
+
buildToolContextForPrompt() {
|
|
77
|
+
const tools = this.getAvailableTools();
|
|
78
|
+
if (tools.length === 0) {
|
|
79
|
+
return 'No MCP tools are currently available.';
|
|
104
80
|
}
|
|
105
|
-
//
|
|
106
|
-
|
|
107
|
-
const
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
type: 'verification',
|
|
113
|
-
description: 'Browser testing required for local HTML file',
|
|
114
|
-
mustUseTools: ['playwright__browser_navigate', 'playwright__browser_console_messages'],
|
|
115
|
-
});
|
|
81
|
+
// Group by server prefix (everything before __)
|
|
82
|
+
const byServer = {};
|
|
83
|
+
for (const tool of tools) {
|
|
84
|
+
const [server] = tool.split('__');
|
|
85
|
+
if (!byServer[server])
|
|
86
|
+
byServer[server] = [];
|
|
87
|
+
byServer[server].push(tool);
|
|
116
88
|
}
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
89
|
+
return Object.entries(byServer)
|
|
90
|
+
.map(([server, serverTools]) => `- ${server}: ${serverTools.join(', ')}`)
|
|
91
|
+
.join('\n');
|
|
92
|
+
}
|
|
93
|
+
// ─── Task Analysis ────────────────────────────────────────────────────────
|
|
94
|
+
/**
|
|
95
|
+
* Use LLM to analyze the task and determine involvement level + requirements.
|
|
96
|
+
* Replaces keyword-based determineInvolvementLevel() and parseRequirements()
|
|
97
|
+
* with semantic understanding that avoids false positives.
|
|
98
|
+
*
|
|
99
|
+
* NOTE: failureCount escalation has been removed. Involvement level is now
|
|
100
|
+
* determined purely by task type. Per-subtask correction is handled by
|
|
101
|
+
* CompletionSignal + DualAgent retry budget instead.
|
|
102
|
+
*/
|
|
103
|
+
async analyzeTaskRequirements(userMessage, subtasks, workerResult, agentContext) {
|
|
104
|
+
const workerContext = workerResult
|
|
105
|
+
? `\nWorker Result (first 500 chars): ${workerResult.result.substring(0, 500)}\nWorker Success: ${workerResult.success}\nTools Used: ${workerResult.toolsUsed.join(', ') || 'none'} (${workerResult.toolsUsed.length} total)`
|
|
106
|
+
: '';
|
|
107
|
+
const availableToolsContext = this.buildToolContextForPrompt();
|
|
108
|
+
// Include serialized agent context for richer analysis
|
|
109
|
+
const contextBlock = agentContext
|
|
110
|
+
? `\nAGENT CONTEXT:\n${serializeAgentContext(agentContext, 'client')}`
|
|
111
|
+
: '';
|
|
112
|
+
const analysisPrompt = `You are a task analyst for a software agent system. Analyze the user's request to determine:
|
|
113
|
+
1. How deeply to validate the Worker's output (involvement level)
|
|
114
|
+
2. What specific requirements the task implies
|
|
115
|
+
|
|
116
|
+
USER MESSAGE: ${userMessage}
|
|
117
|
+
|
|
118
|
+
SUBTASKS: ${JSON.stringify(subtasks)}
|
|
119
|
+
${workerContext}
|
|
120
|
+
${contextBlock}
|
|
121
|
+
|
|
122
|
+
AVAILABLE TOOLS (what the Worker and Client can actually use):
|
|
123
|
+
${availableToolsContext}
|
|
124
|
+
|
|
125
|
+
Respond ONLY with valid JSON in this exact format (no other text):
|
|
126
|
+
{
|
|
127
|
+
"involvementLevel": "<MINIMAL | STANDARD | THOROUGH>",
|
|
128
|
+
"involvementReasoning": "<brief explanation of why this level>",
|
|
129
|
+
"requirements": [
|
|
130
|
+
{
|
|
131
|
+
"type": "<file_creation | file_modification | testing | verification | information | other>",
|
|
132
|
+
"description": "<what this requirement entails>",
|
|
133
|
+
"filePath": null,
|
|
134
|
+
"mustUseTools": null
|
|
135
|
+
}
|
|
136
|
+
]
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
CRITICAL RULES for involvementLevel:
|
|
140
|
+
- THOROUGH: ONLY when the user EXPLICITLY asks to test or verify something, OR for complex multi-file operations (>3 subtasks)
|
|
141
|
+
- MINIMAL: Information-only requests (listing files, explaining code, describing something, answering questions) where no files are created or modified
|
|
142
|
+
- STANDARD: Default for creation, modification, or action tasks
|
|
143
|
+
|
|
144
|
+
CRITICAL RULES for requirements:
|
|
145
|
+
- Only set mustUseTools to tool names listed in AVAILABLE TOOLS above — do NOT reference tools that are not available
|
|
146
|
+
- If the required tool is not in AVAILABLE TOOLS, set mustUseTools to null
|
|
147
|
+
- "testing" type should only be set when the user explicitly wants something executed and verified
|
|
148
|
+
- If no specific tools are required, set mustUseTools to null
|
|
149
|
+
- Always include at least one requirement entry`;
|
|
150
|
+
try {
|
|
151
|
+
const response = await this.orchestrator.chat({
|
|
152
|
+
messages: [
|
|
153
|
+
{ role: 'system', content: 'You are a strict task analyst. Respond only with valid JSON.' },
|
|
154
|
+
{ role: 'user', content: analysisPrompt },
|
|
155
|
+
],
|
|
156
|
+
temperature: 0.1,
|
|
123
157
|
});
|
|
158
|
+
const jsonMatch = response.content.match(/\{[\s\S]*\}/);
|
|
159
|
+
if (jsonMatch) {
|
|
160
|
+
const analysis = JSON.parse(jsonMatch[0]);
|
|
161
|
+
// Map string to enum
|
|
162
|
+
let level;
|
|
163
|
+
switch (analysis.involvementLevel?.toUpperCase()) {
|
|
164
|
+
case 'THOROUGH':
|
|
165
|
+
level = InvolvementLevel.THOROUGH;
|
|
166
|
+
break;
|
|
167
|
+
case 'MINIMAL':
|
|
168
|
+
level = InvolvementLevel.MINIMAL;
|
|
169
|
+
break;
|
|
170
|
+
default:
|
|
171
|
+
level = InvolvementLevel.STANDARD;
|
|
172
|
+
}
|
|
173
|
+
// NOTE: failureCount escalation removed — involvement level is determined
|
|
174
|
+
// purely by task type. Per-subtask correction is handled by CompletionSignal.
|
|
175
|
+
const requirements = (analysis.requirements || []).map((req) => ({
|
|
176
|
+
type: req.type || 'other',
|
|
177
|
+
description: req.description || 'General task completion',
|
|
178
|
+
filePath: req.filePath || undefined,
|
|
179
|
+
mustUseTools: normalizeMustUseTools(req.mustUseTools),
|
|
180
|
+
}));
|
|
181
|
+
// Ensure at least one requirement
|
|
182
|
+
if (requirements.length === 0) {
|
|
183
|
+
requirements.push({ type: 'other', description: 'General task completion' });
|
|
184
|
+
}
|
|
185
|
+
logger.info(`[Client] LLM task analysis: ${level.toUpperCase()} involvement — ${analysis.involvementReasoning || 'no reasoning provided'}`);
|
|
186
|
+
logger.debug(`[Client] Requirements: ${JSON.stringify(requirements.map(r => ({ type: r.type, desc: r.description })))}`);
|
|
187
|
+
return { level, requirements };
|
|
188
|
+
}
|
|
124
189
|
}
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
requirements.push({
|
|
128
|
-
type: 'other',
|
|
129
|
-
description: 'General task completion',
|
|
130
|
-
});
|
|
190
|
+
catch (error) {
|
|
191
|
+
logger.warn(`[Client] LLM task analysis failed: ${error}, falling back to STANDARD`);
|
|
131
192
|
}
|
|
132
|
-
|
|
193
|
+
// Fallback: STANDARD with generic requirement (no failureCount escalation)
|
|
194
|
+
return {
|
|
195
|
+
level: InvolvementLevel.STANDARD,
|
|
196
|
+
requirements: [{ type: 'other', description: 'General task completion' }],
|
|
197
|
+
};
|
|
133
198
|
}
|
|
199
|
+
// ─── Main Validation Entry Point ──────────────────────────────────────────
|
|
134
200
|
/**
|
|
135
201
|
* Validate Worker's work at appropriate involvement level
|
|
136
202
|
*/
|
|
137
|
-
async validate(userMessage, subtasks, workerResult, involvementLevel) {
|
|
138
|
-
|
|
139
|
-
const requirements = this.
|
|
203
|
+
async validate(userMessage, subtasks, workerResult, involvementLevel, agentContext) {
|
|
204
|
+
// Use LLM-based analysis instead of keyword matching
|
|
205
|
+
const { level: analyzedLevel, requirements } = await this.analyzeTaskRequirements(userMessage, subtasks, workerResult, agentContext);
|
|
206
|
+
let level = involvementLevel || analyzedLevel;
|
|
140
207
|
// CRITICAL: Use LLM to check for unjustified failure claims BEFORE other validation
|
|
141
208
|
// Even in MINIMAL mode, we must catch agents giving up without trying
|
|
142
|
-
const failureAnalysis = await this.analyzeForUnjustifiedFailure(userMessage, workerResult);
|
|
209
|
+
const failureAnalysis = await this.analyzeForUnjustifiedFailure(userMessage, workerResult, agentContext);
|
|
143
210
|
if (failureAnalysis.claimsFailure && !failureAnalysis.hasEvidence) {
|
|
144
211
|
logger.info(`[Client] Detected unjustified failure claim - escalating from ${level} to STANDARD`);
|
|
145
212
|
logger.info(`[Client] LLM reasoning: ${failureAnalysis.reasoning}`);
|
|
@@ -150,6 +217,8 @@ export class ClientAgent {
|
|
|
150
217
|
}
|
|
151
218
|
}
|
|
152
219
|
logger.info(`[Client] Validating with ${level.toUpperCase()} involvement`);
|
|
220
|
+
// Log the analysis for orchestration tracing
|
|
221
|
+
orchestrationLogger.logClientAnalysis(level, requirements.length, `Requirements: ${requirements.map(r => r.type).join(', ')}`);
|
|
153
222
|
const result = {
|
|
154
223
|
approved: false,
|
|
155
224
|
requirementsMet: false,
|
|
@@ -162,8 +231,26 @@ export class ClientAgent {
|
|
|
162
231
|
`REJECTED: Worker claims failure without sufficient evidence. ${failureAnalysis.reasoning}`;
|
|
163
232
|
result.issues.push(failureIssue);
|
|
164
233
|
}
|
|
234
|
+
// Layer 0.5: Result-vs-Evidence Coherence Check (always done, catches hallucinated accomplishments)
|
|
235
|
+
// This detects when the Worker claims to have done things its tool usage doesn't support
|
|
236
|
+
const coherenceAnalysis = await this.analyzeResultCoherence(userMessage, workerResult, agentContext);
|
|
237
|
+
orchestrationLogger.logClientCoherenceCheck(coherenceAnalysis.isCoherent, coherenceAnalysis.unsupportedClaims, coherenceAnalysis.reasoning);
|
|
238
|
+
if (!coherenceAnalysis.isCoherent) {
|
|
239
|
+
logger.info(`[Client] Detected incoherent result — Worker claims not supported by tool usage`);
|
|
240
|
+
logger.info(`[Client] Unsupported claims: ${coherenceAnalysis.unsupportedClaims.join('; ')}`);
|
|
241
|
+
logger.info(`[Client] Coherence reasoning: ${coherenceAnalysis.reasoning}`);
|
|
242
|
+
const coherenceIssue = coherenceAnalysis.suggestedAction ||
|
|
243
|
+
`REJECTED: Worker's result contains claims not supported by its actual tool usage. ${coherenceAnalysis.reasoning}`;
|
|
244
|
+
result.issues.push(coherenceIssue);
|
|
245
|
+
// Escalate involvement level — the Worker is hallucinating, we need stricter validation
|
|
246
|
+
if (level === InvolvementLevel.MINIMAL) {
|
|
247
|
+
level = InvolvementLevel.STANDARD;
|
|
248
|
+
result.involvementLevel = level;
|
|
249
|
+
logger.info(`[Client] Escalating to STANDARD due to incoherent result`);
|
|
250
|
+
}
|
|
251
|
+
}
|
|
165
252
|
// Layer 1: Process Validation (always done, no tools needed)
|
|
166
|
-
const processValidation = this.validateProcess(requirements, workerResult);
|
|
253
|
+
const processValidation = this.validateProcess(requirements, workerResult, level);
|
|
167
254
|
if (processValidation.issues.length > 0) {
|
|
168
255
|
result.issues.push(...processValidation.issues);
|
|
169
256
|
}
|
|
@@ -178,24 +265,31 @@ export class ClientAgent {
|
|
|
178
265
|
result.requirementsMet = result.issues.length === 0;
|
|
179
266
|
result.approved = result.requirementsMet;
|
|
180
267
|
if (!result.approved && result.issues.length > 0) {
|
|
181
|
-
|
|
182
|
-
this.
|
|
268
|
+
// Generate an actionable correction instruction via LLM instead of echoing raw validation issues
|
|
269
|
+
result.nextAction = await this.generateCorrectionInstruction(userMessage, subtasks.join('; '), result.issues, workerResult, agentContext);
|
|
270
|
+
this.failureCount++; // Telemetry only — no longer drives escalation
|
|
183
271
|
}
|
|
184
272
|
else {
|
|
185
|
-
this.failureCount = 0;
|
|
273
|
+
this.failureCount = 0;
|
|
186
274
|
}
|
|
275
|
+
// Analyze CompletionSignal (LLM-based per-subtask assessment)
|
|
276
|
+
result.completionSignal = await this.analyzeCompletionSignal(userMessage, workerResult, result.issues, agentContext);
|
|
277
|
+
// Log the validation outcome
|
|
278
|
+
orchestrationLogger.logClientValidation(result.approved, result.issues, result.nextAction);
|
|
187
279
|
return result;
|
|
188
280
|
}
|
|
189
281
|
/**
|
|
190
282
|
* Use LLM to analyze worker result for unjustified failure claims
|
|
191
283
|
* This is language-agnostic and captures semantic meaning
|
|
192
284
|
*/
|
|
193
|
-
async analyzeForUnjustifiedFailure(userMessage, workerResult) {
|
|
285
|
+
async analyzeForUnjustifiedFailure(userMessage, workerResult, agentContext) {
|
|
194
286
|
const toolCount = workerResult.toolsUsed.length;
|
|
195
287
|
const toolList = workerResult.toolsUsed.join(', ') || 'none';
|
|
288
|
+
const contextBlock = agentContext ? `\n${serializeAgentContext(agentContext, 'client')}` : '';
|
|
196
289
|
const analysisPrompt = `You are a quality control agent. Analyze the following Worker response to determine if it's claiming failure and whether that failure is justified.
|
|
197
290
|
|
|
198
291
|
USER REQUEST: ${userMessage}
|
|
292
|
+
${contextBlock}
|
|
199
293
|
|
|
200
294
|
WORKER RESPONSE:
|
|
201
295
|
${workerResult.result}
|
|
@@ -245,28 +339,112 @@ Respond ONLY with the JSON, no other text.`;
|
|
|
245
339
|
reasoning: 'Analysis could not be performed',
|
|
246
340
|
};
|
|
247
341
|
}
|
|
342
|
+
/**
|
|
343
|
+
* Use LLM to cross-check the Worker's result claims against its actual tool usage.
|
|
344
|
+
* Catches hallucinated accomplishments — e.g., Worker claims "I inspected all source files
|
|
345
|
+
* and found no bugs" but only used list_directory and never read a single file.
|
|
346
|
+
* This runs at ALL involvement levels including MINIMAL.
|
|
347
|
+
*/
|
|
348
|
+
async analyzeResultCoherence(userMessage, workerResult, agentContext) {
|
|
349
|
+
// Skip coherence check if Worker used no tools (caught by zero-tools guard)
|
|
350
|
+
// or if Worker explicitly failed (caught by failure analysis)
|
|
351
|
+
if (workerResult.toolsUsed.length === 0 || !workerResult.success) {
|
|
352
|
+
return { isCoherent: true, reasoning: 'Skipped — handled by other checks', unsupportedClaims: [] };
|
|
353
|
+
}
|
|
354
|
+
const toolList = workerResult.toolsUsed.join(', ');
|
|
355
|
+
const uniqueTools = [...new Set(workerResult.toolsUsed)].join(', ');
|
|
356
|
+
const availableToolsContext = this.buildToolContextForPrompt();
|
|
357
|
+
const contextBlock = agentContext ? `\n${serializeAgentContext(agentContext, 'client')}` : '';
|
|
358
|
+
const coherencePrompt = `You are a strict quality auditor. Your job is to determine whether a Worker agent's result is SUPPORTED by the tools it actually used, or whether it fabricated/hallucinated claims.
|
|
359
|
+
|
|
360
|
+
USER REQUEST: ${userMessage}
|
|
361
|
+
${contextBlock}
|
|
362
|
+
|
|
363
|
+
WORKER RESULT:
|
|
364
|
+
${workerResult.result.substring(0, 1000)}
|
|
365
|
+
|
|
366
|
+
TOOLS ACTUALLY USED (in order): ${toolList}
|
|
367
|
+
UNIQUE TOOLS USED: ${uniqueTools}
|
|
368
|
+
TOTAL TOOL CALLS: ${workerResult.toolsUsed.length}
|
|
369
|
+
|
|
370
|
+
AVAILABLE TOOLS IN THIS SYSTEM:
|
|
371
|
+
${availableToolsContext}
|
|
372
|
+
|
|
373
|
+
CRITICAL: Analyze whether the claims in the Worker's result are supported by the tools it used.
|
|
374
|
+
|
|
375
|
+
Key tool semantics to apply:
|
|
376
|
+
- Tools with names like "list_directory", "directory_tree", "search_files" show file/folder NAMES only — they do NOT read file contents
|
|
377
|
+
- Tools with names like "read_text_file", "read_file", "get_file_content" actually read file content
|
|
378
|
+
- Tools with names like "shell_exec", "run_command", "bash", "execute" run shell commands — infer what was run from the worker's result
|
|
379
|
+
- For any other tool, infer its semantics from its name
|
|
380
|
+
|
|
381
|
+
Common hallucination patterns to detect:
|
|
382
|
+
1. Worker claims to have "inspected", "reviewed", "analyzed", or "scanned" source code but never used read_text_file — it only listed directories
|
|
383
|
+
2. Worker claims "no bugs found" or "code is correct" without reading any source files
|
|
384
|
+
3. Worker claims to have run tests or builds but no shell_exec tool was used (or the result doesn't reference actual test output)
|
|
385
|
+
4. Worker provides specific code details (line numbers, variable names, function logic) without having read the files containing them
|
|
386
|
+
5. Worker makes definitive statements about code quality, correctness, or behavior without having read the relevant code
|
|
387
|
+
|
|
388
|
+
Respond ONLY with valid JSON:
|
|
389
|
+
{
|
|
390
|
+
"isCoherent": <true if ALL claims in the result are supported by actual tool usage, false if any claims are fabricated>,
|
|
391
|
+
"reasoning": "<brief explanation of what's supported vs what's fabricated>",
|
|
392
|
+
"unsupportedClaims": ["<list each specific claim that is NOT supported by tool usage>"],
|
|
393
|
+
"suggestedAction": "<if not coherent, what the Worker should actually do — e.g., 'Read the source files using filesystem__read_text_file before claiming to have analyzed them'>"
|
|
394
|
+
}`;
|
|
395
|
+
try {
|
|
396
|
+
const response = await this.orchestrator.chat({
|
|
397
|
+
messages: [
|
|
398
|
+
{ role: 'system', content: 'You are a strict quality auditor. Respond only with valid JSON.' },
|
|
399
|
+
{ role: 'user', content: coherencePrompt },
|
|
400
|
+
],
|
|
401
|
+
temperature: 0.1,
|
|
402
|
+
});
|
|
403
|
+
const jsonMatch = response.content.match(/\{[\s\S]*\}/);
|
|
404
|
+
if (jsonMatch) {
|
|
405
|
+
const analysis = JSON.parse(jsonMatch[0]);
|
|
406
|
+
return analysis;
|
|
407
|
+
}
|
|
408
|
+
}
|
|
409
|
+
catch (error) {
|
|
410
|
+
logger.debug(`[Client] Failed to analyze result coherence: ${error}`);
|
|
411
|
+
}
|
|
412
|
+
// Default: assume coherent if analysis fails
|
|
413
|
+
return {
|
|
414
|
+
isCoherent: true,
|
|
415
|
+
reasoning: 'Coherence analysis could not be performed',
|
|
416
|
+
unsupportedClaims: [],
|
|
417
|
+
};
|
|
418
|
+
}
|
|
248
419
|
/**
|
|
249
420
|
* Layer 1: Process Validation (metadata only, no tools)
|
|
250
421
|
*/
|
|
251
|
-
validateProcess(requirements, workerResult) {
|
|
422
|
+
validateProcess(requirements, workerResult, involvementLevel) {
|
|
252
423
|
const issues = [];
|
|
424
|
+
// Zero-tools guard: if Worker used no tools at all and this is not a purely
|
|
425
|
+
// informational/conversational task, reject immediately
|
|
426
|
+
if (workerResult.toolsUsed.length === 0 && involvementLevel !== InvolvementLevel.MINIMAL) {
|
|
427
|
+
const isConversational = requirements.every(r => r.type === 'information' || r.type === 'other');
|
|
428
|
+
if (!isConversational) {
|
|
429
|
+
issues.push('Worker completed the task without using any tools. ' +
|
|
430
|
+
'The task requires actual tool usage (file operations, shell commands, browser actions, etc.) — ' +
|
|
431
|
+
'not just generating a text response. Use the available tools to actually perform the task.');
|
|
432
|
+
}
|
|
433
|
+
}
|
|
253
434
|
// Check if Worker used appropriate tools for requirements
|
|
254
435
|
for (const req of requirements) {
|
|
255
436
|
if (req.mustUseTools && req.mustUseTools.length > 0) {
|
|
256
437
|
const usedRequiredTool = req.mustUseTools.some(requiredTool => {
|
|
257
|
-
// Match if required tool is contained in actual tool name (e.g., 'playwright__browser' matches 'playwright__browser_navigate')
|
|
258
|
-
// OR if actual tool exactly matches the required tool
|
|
259
438
|
return workerResult.toolsUsed.some(actualTool => actualTool === requiredTool || actualTool.startsWith(requiredTool) || requiredTool.startsWith(actualTool.split('__')[0] + '__'));
|
|
260
439
|
});
|
|
261
440
|
if (!usedRequiredTool) {
|
|
262
|
-
issues.push(`${req.description} requires using ${req.mustUseTools.join(' or ')} but Worker did not use these tools
|
|
263
|
-
`Create a subtask specifically for ${req.description.toLowerCase()}.`);
|
|
441
|
+
issues.push(`${req.description} requires using ${req.mustUseTools.join(' or ')} but Worker did not use these tools.`);
|
|
264
442
|
}
|
|
265
443
|
}
|
|
266
444
|
}
|
|
267
445
|
// Check if Worker succeeded
|
|
268
446
|
if (!workerResult.success) {
|
|
269
|
-
issues.push(
|
|
447
|
+
issues.push('Worker did not complete the task successfully. The task needs to be retried with appropriate tool usage.');
|
|
270
448
|
}
|
|
271
449
|
return { issues };
|
|
272
450
|
}
|
|
@@ -296,111 +474,276 @@ Respond ONLY with the JSON, no other text.`;
|
|
|
296
474
|
}
|
|
297
475
|
}
|
|
298
476
|
}
|
|
299
|
-
//
|
|
477
|
+
// Shell-based deep verification (THOROUGH only)
|
|
300
478
|
if (level === InvolvementLevel.THOROUGH) {
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
workerResult.result.includes('.html')) {
|
|
304
|
-
// Extract HTML filename from result
|
|
305
|
-
const htmlMatch = workerResult.result.match(/([a-zA-Z0-9._-]+\.html)/);
|
|
306
|
-
if (htmlMatch) {
|
|
307
|
-
const htmlFile = htmlMatch[1];
|
|
308
|
-
const browserValidation = await this.validateInBrowser(htmlFile);
|
|
309
|
-
if (!browserValidation.valid) {
|
|
310
|
-
issues.push(browserValidation.issue);
|
|
311
|
-
}
|
|
312
|
-
}
|
|
313
|
-
}
|
|
314
|
-
}
|
|
479
|
+
const shellIssues = await this.validateWithShell(requirements, workerResult);
|
|
480
|
+
issues.push(...shellIssues.issues);
|
|
315
481
|
}
|
|
316
482
|
return { issues };
|
|
317
483
|
}
|
|
484
|
+
// ─── Tool-Based Verification ────────────────────────────────────────────────────
|
|
318
485
|
/**
|
|
319
|
-
* Check
|
|
486
|
+
* Check whether a file exists, using whichever MCP tool is available.
|
|
487
|
+
* Tries filesystem read tools first, then falls back to shell.
|
|
320
488
|
*/
|
|
321
489
|
async fileExists(filePath) {
|
|
490
|
+
const readTool = this.findTool('read_text_file', 'read_file', 'get_file_content');
|
|
491
|
+
if (readTool) {
|
|
492
|
+
try {
|
|
493
|
+
await this.mcpClient.executeTool(readTool, { path: filePath, head: 1 });
|
|
494
|
+
return true;
|
|
495
|
+
}
|
|
496
|
+
catch {
|
|
497
|
+
return false;
|
|
498
|
+
}
|
|
499
|
+
}
|
|
500
|
+
const shellTool = this.findTool('shell_exec', 'run_command', 'bash', 'execute');
|
|
501
|
+
if (shellTool) {
|
|
502
|
+
try {
|
|
503
|
+
const result = await this.mcpClient.executeTool(shellTool, {
|
|
504
|
+
command: `test -f "${filePath}" && echo "exists" || echo "not_found"`,
|
|
505
|
+
});
|
|
506
|
+
return String(result).includes('exists');
|
|
507
|
+
}
|
|
508
|
+
catch {
|
|
509
|
+
return false;
|
|
510
|
+
}
|
|
511
|
+
}
|
|
512
|
+
logger.debug('[Client] No tool available to verify file existence');
|
|
513
|
+
return false;
|
|
514
|
+
}
|
|
515
|
+
/**
|
|
516
|
+
* Use LLM to generate an actionable correction instruction from raw validation issues.
|
|
517
|
+
* Translates internal validation failures into concrete, tool-specific directions
|
|
518
|
+
* for the Worker, referencing only the tools actually available in the system.
|
|
519
|
+
*/
|
|
520
|
+
async generateCorrectionInstruction(userMessage, subtask, issues, workerResult, agentContext) {
|
|
521
|
+
const availableToolsContext = this.buildToolContextForPrompt();
|
|
522
|
+
const contextBlock = agentContext ? `\nAGENT CONTEXT:\n${serializeAgentContext(agentContext, 'client')}` : '';
|
|
523
|
+
const correctionPrompt = `You are generating a correction instruction for a Worker agent that failed to complete a task properly.
|
|
524
|
+
|
|
525
|
+
ORIGINAL USER REQUEST: ${userMessage}
|
|
526
|
+
|
|
527
|
+
SUBTASK THAT WAS ATTEMPTED: ${subtask}
|
|
528
|
+
|
|
529
|
+
VALIDATION ISSUES FOUND:
|
|
530
|
+
${issues.map((issue, i) => `${i + 1}. ${issue}`).join('\n')}
|
|
531
|
+
|
|
532
|
+
WORKER'S RESULT (first 300 chars): ${workerResult.result.substring(0, 300)}
|
|
533
|
+
TOOLS WORKER USED: ${workerResult.toolsUsed.join(', ') || 'none'}
|
|
534
|
+
|
|
535
|
+
AVAILABLE TOOLS THE WORKER CAN USE:
|
|
536
|
+
${availableToolsContext}
|
|
537
|
+
${contextBlock}
|
|
538
|
+
|
|
539
|
+
Generate a CLEAR, ACTIONABLE instruction that tells the Worker exactly what to do to fix the issues.
|
|
540
|
+
The instruction should:
|
|
541
|
+
- Be a direct command referencing specific available tools by name
|
|
542
|
+
- Be concise (1-2 sentences)
|
|
543
|
+
- NOT include validation jargon like "mustUseTools", "requirements", or "involvement level"
|
|
544
|
+
- NOT be a generic statement like "retry the task" — be specific about WHAT to do
|
|
545
|
+
|
|
546
|
+
Respond ONLY with the correction instruction text, nothing else.`;
|
|
322
547
|
try {
|
|
323
|
-
await this.
|
|
324
|
-
|
|
325
|
-
|
|
548
|
+
const response = await this.orchestrator.chat({
|
|
549
|
+
messages: [
|
|
550
|
+
{ role: 'system', content: 'You generate concise, actionable correction instructions for a Worker agent. Respond with only the instruction text.' },
|
|
551
|
+
{ role: 'user', content: correctionPrompt },
|
|
552
|
+
],
|
|
553
|
+
temperature: 0.1,
|
|
326
554
|
});
|
|
327
|
-
|
|
555
|
+
const instruction = response.content.trim();
|
|
556
|
+
if (instruction.length > 10 && instruction.length < 500) {
|
|
557
|
+
return instruction;
|
|
558
|
+
}
|
|
328
559
|
}
|
|
329
560
|
catch (error) {
|
|
330
|
-
|
|
561
|
+
logger.warn(`[Client] Failed to generate correction instruction: ${error}`);
|
|
331
562
|
}
|
|
563
|
+
// Fallback: use first issue with a prefix
|
|
564
|
+
return `Fix the following issue and retry: ${issues[0]}`;
|
|
332
565
|
}
|
|
333
566
|
/**
|
|
334
|
-
* Validate file contents for common issues
|
|
567
|
+
* Validate file contents for common issues, using whichever MCP tool is available.
|
|
335
568
|
*/
|
|
336
569
|
async validateFileContents(filePath) {
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
570
|
+
const readTool = this.findTool('read_text_file', 'read_file', 'get_file_content');
|
|
571
|
+
const shellTool = this.findTool('shell_exec', 'run_command', 'bash', 'execute');
|
|
572
|
+
let contentStr = null;
|
|
573
|
+
if (readTool) {
|
|
574
|
+
try {
|
|
575
|
+
const content = await this.mcpClient.executeTool(readTool, { path: filePath, head: 200 });
|
|
576
|
+
contentStr = typeof content === 'string' ? content : JSON.stringify(content);
|
|
577
|
+
}
|
|
578
|
+
catch (error) {
|
|
579
|
+
return { valid: false, issue: `Could not read file: ${error}` };
|
|
580
|
+
}
|
|
581
|
+
}
|
|
582
|
+
else if (shellTool) {
|
|
583
|
+
try {
|
|
584
|
+
const content = await this.mcpClient.executeTool(shellTool, {
|
|
585
|
+
command: `head -200 "${filePath}" 2>&1`,
|
|
586
|
+
});
|
|
587
|
+
contentStr = String(content);
|
|
588
|
+
}
|
|
589
|
+
catch (error) {
|
|
590
|
+
return { valid: false, issue: `Could not read file via shell: ${error}` };
|
|
591
|
+
}
|
|
592
|
+
}
|
|
593
|
+
if (contentStr === null) {
|
|
594
|
+
return { valid: true }; // No read tool available; skip content check
|
|
595
|
+
}
|
|
596
|
+
// Check path reference integrity in HTML files
|
|
597
|
+
if (filePath.endsWith('.html')) {
|
|
598
|
+
const hrefMatches = contentStr.match(/href="([^"]+)"/g) || [];
|
|
599
|
+
const srcMatches = contentStr.match(/src="([^"]+)"/g) || [];
|
|
600
|
+
for (const match of [...hrefMatches, ...srcMatches]) {
|
|
601
|
+
const pathMatch = match.match(/(?:href|src)="([^"]+)"/);
|
|
602
|
+
if (pathMatch) {
|
|
603
|
+
const referencedPath = pathMatch[1];
|
|
604
|
+
if (!referencedPath.startsWith('http') && !referencedPath.startsWith('data:')) {
|
|
605
|
+
const exists = await this.fileExists(referencedPath);
|
|
606
|
+
if (!exists) {
|
|
607
|
+
return {
|
|
608
|
+
valid: false,
|
|
609
|
+
issue: `HTML references non-existent file: ${referencedPath}. Fix file paths or create missing files.`,
|
|
610
|
+
};
|
|
359
611
|
}
|
|
360
612
|
}
|
|
361
613
|
}
|
|
362
614
|
}
|
|
363
|
-
return { valid: true };
|
|
364
615
|
}
|
|
365
|
-
|
|
366
|
-
|
|
616
|
+
return { valid: true };
|
|
617
|
+
}
|
|
618
|
+
/**
|
|
619
|
+
* THOROUGH-level shell-based verification: runs lightweight, read-only shell
|
|
620
|
+
* commands to confirm work was actually done (file sizes, test output presence, etc.).
|
|
621
|
+
* Skips gracefully when no shell tool is available.
|
|
622
|
+
*/
|
|
623
|
+
async validateWithShell(requirements, workerResult) {
|
|
624
|
+
const issues = [];
|
|
625
|
+
const shellTool = this.findTool('shell_exec', 'run_command', 'bash', 'execute');
|
|
626
|
+
if (!shellTool) {
|
|
627
|
+
logger.debug('[Client] No shell tool available for THOROUGH shell validation — skipping');
|
|
628
|
+
return { issues };
|
|
367
629
|
}
|
|
630
|
+
for (const req of requirements) {
|
|
631
|
+
if ((req.type === 'file_creation' || req.type === 'file_modification') && req.filePath) {
|
|
632
|
+
try {
|
|
633
|
+
const result = await this.mcpClient.executeTool(shellTool, {
|
|
634
|
+
command: `wc -c "${req.filePath}" 2>&1`,
|
|
635
|
+
});
|
|
636
|
+
const resultStr = String(result);
|
|
637
|
+
if (resultStr.includes('No such file') || resultStr.includes('cannot access')) {
|
|
638
|
+
issues.push(`Shell verification failed: ${req.filePath} does not exist on disk.`);
|
|
639
|
+
}
|
|
640
|
+
else {
|
|
641
|
+
const sizeMatch = resultStr.match(/^\s*(\d+)/);
|
|
642
|
+
if (sizeMatch && parseInt(sizeMatch[1], 10) === 0) {
|
|
643
|
+
issues.push(`File ${req.filePath} was created but is empty.`);
|
|
644
|
+
}
|
|
645
|
+
}
|
|
646
|
+
}
|
|
647
|
+
catch (error) {
|
|
648
|
+
logger.debug(`[Client] Shell validation error for ${req.filePath}: ${error}`);
|
|
649
|
+
}
|
|
650
|
+
}
|
|
651
|
+
if (req.type === 'testing') {
|
|
652
|
+
const hasTestOutput = workerResult.result.match(/passed|failed|error|PASS|FAIL|✓|✗|tests run|test suite/i);
|
|
653
|
+
if (!hasTestOutput) {
|
|
654
|
+
logger.debug('[Client] THOROUGH: testing requirement but no test output detected in worker result');
|
|
655
|
+
}
|
|
656
|
+
}
|
|
657
|
+
}
|
|
658
|
+
return { issues };
|
|
368
659
|
}
|
|
660
|
+
// ─── Completion Signal Analysis ──────────────────────────────────────────────
|
|
369
661
|
/**
|
|
370
|
-
*
|
|
662
|
+
* LLM-based per-subtask assessment of completion confidence, blocker type,
|
|
663
|
+
* and suggested corrective strategy. Called at the end of validate().
|
|
371
664
|
*/
|
|
372
|
-
async
|
|
665
|
+
async analyzeCompletionSignal(userMessage, workerResult, issues, agentContext) {
|
|
666
|
+
// If approved with no issues, high confidence
|
|
667
|
+
if (issues.length === 0) {
|
|
668
|
+
return { confidence: 'high', progressMade: true };
|
|
669
|
+
}
|
|
670
|
+
const contextBlock = agentContext ? `\n${serializeAgentContext(agentContext, 'client')}` : '';
|
|
671
|
+
const signalPrompt = `You are analyzing a subtask execution to produce a CompletionSignal.
|
|
672
|
+
|
|
673
|
+
USER REQUEST: ${userMessage}
|
|
674
|
+
${contextBlock}
|
|
675
|
+
|
|
676
|
+
WORKER RESULT (first 500 chars): ${workerResult.result.substring(0, 500)}
|
|
677
|
+
WORKER SUCCESS: ${workerResult.success}
|
|
678
|
+
TOOLS USED: ${workerResult.toolsUsed.join(', ') || 'none'} (${workerResult.toolsUsed.length} total)
|
|
679
|
+
|
|
680
|
+
VALIDATION ISSUES:
|
|
681
|
+
${issues.map((issue, i) => `${i + 1}. ${issue}`).join('\n')}
|
|
682
|
+
|
|
683
|
+
Analyze and respond ONLY with valid JSON:
|
|
684
|
+
{
|
|
685
|
+
"confidence": "<high | medium | low | none>",
|
|
686
|
+
"progressMade": <true if Worker made any measurable forward progress, false otherwise>,
|
|
687
|
+
"blockerType": "<tool_failure | hallucination | scope_drift | partial | loop | capability_gap | null>",
|
|
688
|
+
"suggestedStrategy": "<retry | rephrase | decompose | skip | escalate>"
|
|
689
|
+
}
|
|
690
|
+
|
|
691
|
+
RULES:
|
|
692
|
+
- confidence "high": all issues are minor or cosmetic
|
|
693
|
+
- confidence "medium": some progress but incomplete
|
|
694
|
+
- confidence "low": significant issues, little useful work
|
|
695
|
+
- confidence "none": no useful work done at all
|
|
696
|
+
- blockerType "tool_failure": Worker tried but tool errored
|
|
697
|
+
- blockerType "hallucination": Worker claimed work it didn't do
|
|
698
|
+
- blockerType "scope_drift": Worker did something unrelated
|
|
699
|
+
- blockerType "partial": Worker made progress but didn't finish
|
|
700
|
+
- blockerType "loop": Worker is repeating the same action
|
|
701
|
+
- blockerType "capability_gap": Task requires tools/capabilities not available
|
|
702
|
+
- suggestedStrategy: recommend the best recovery approach`;
|
|
373
703
|
try {
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
// Check for console errors
|
|
381
|
-
const errors = await this.mcpClient.executeTool('playwright__browser_console_messages', {
|
|
382
|
-
level: 'error',
|
|
704
|
+
const response = await this.orchestrator.chat({
|
|
705
|
+
messages: [
|
|
706
|
+
{ role: 'system', content: 'You are a strict completion analyst. Respond only with valid JSON.' },
|
|
707
|
+
{ role: 'user', content: signalPrompt },
|
|
708
|
+
],
|
|
709
|
+
temperature: 0.1,
|
|
383
710
|
});
|
|
384
|
-
const
|
|
385
|
-
if (
|
|
711
|
+
const jsonMatch = response.content.match(/\{[\s\S]*\}/);
|
|
712
|
+
if (jsonMatch) {
|
|
713
|
+
const parsed = JSON.parse(jsonMatch[0]);
|
|
386
714
|
return {
|
|
387
|
-
|
|
388
|
-
|
|
715
|
+
confidence: parsed.confidence || 'low',
|
|
716
|
+
progressMade: parsed.progressMade ?? false,
|
|
717
|
+
blockerType: parsed.blockerType === 'null' ? undefined : parsed.blockerType,
|
|
718
|
+
suggestedStrategy: parsed.suggestedStrategy,
|
|
389
719
|
};
|
|
390
720
|
}
|
|
391
|
-
return { valid: true };
|
|
392
721
|
}
|
|
393
722
|
catch (error) {
|
|
394
|
-
logger.debug(`[Client]
|
|
395
|
-
// Don't fail validation if browser test fails - might not have browser available
|
|
396
|
-
return { valid: true };
|
|
723
|
+
logger.debug(`[Client] Failed to analyze completion signal: ${error}`);
|
|
397
724
|
}
|
|
725
|
+
// Fallback: conservative signal
|
|
726
|
+
return {
|
|
727
|
+
confidence: 'low',
|
|
728
|
+
progressMade: workerResult.toolsUsed.length > 0,
|
|
729
|
+
blockerType: 'partial',
|
|
730
|
+
suggestedStrategy: 'retry',
|
|
731
|
+
};
|
|
398
732
|
}
|
|
733
|
+
// ─── Session Management ─────────────────────────────────────────────────────
|
|
399
734
|
/**
|
|
400
|
-
* Reset
|
|
735
|
+
* Reset session state (call at the start of each new conversation/session).
|
|
736
|
+
* Renamed from resetFailureTracking() for clarity — failureCount is now
|
|
737
|
+
* telemetry-only and does not drive involvement escalation.
|
|
401
738
|
*/
|
|
402
|
-
|
|
739
|
+
resetSessionState() {
|
|
403
740
|
this.failureCount = 0;
|
|
404
741
|
}
|
|
742
|
+
/**
|
|
743
|
+
* @deprecated Use resetSessionState() instead.
|
|
744
|
+
*/
|
|
745
|
+
resetFailureTracking() {
|
|
746
|
+
this.resetSessionState();
|
|
747
|
+
}
|
|
405
748
|
}
|
|
406
749
|
//# sourceMappingURL=client-agent.js.map
|