jiva-core 0.3.2 → 0.3.3-dev.24b219d

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/.github/workflows/npm-publish-dev.yml +39 -0
  2. package/.github/workflows/npm-publish.yml +31 -0
  3. package/Dockerfile +12 -7
  4. package/README.md +10 -0
  5. package/cloud-run.yaml +1 -1
  6. package/cloud-run.yaml.template +1 -1
  7. package/dist/core/agent-spawner.d.ts.map +1 -1
  8. package/dist/core/agent-spawner.js +3 -0
  9. package/dist/core/agent-spawner.js.map +1 -1
  10. package/dist/core/client-agent.d.ts +46 -19
  11. package/dist/core/client-agent.d.ts.map +1 -1
  12. package/dist/core/client-agent.js +306 -216
  13. package/dist/core/client-agent.js.map +1 -1
  14. package/dist/core/config.d.ts.map +1 -1
  15. package/dist/core/config.js +11 -5
  16. package/dist/core/config.js.map +1 -1
  17. package/dist/core/dual-agent.d.ts +20 -0
  18. package/dist/core/dual-agent.d.ts.map +1 -1
  19. package/dist/core/dual-agent.js +185 -32
  20. package/dist/core/dual-agent.js.map +1 -1
  21. package/dist/core/manager-agent.d.ts +8 -2
  22. package/dist/core/manager-agent.d.ts.map +1 -1
  23. package/dist/core/manager-agent.js +30 -9
  24. package/dist/core/manager-agent.js.map +1 -1
  25. package/dist/core/types/agent-context.d.ts +30 -0
  26. package/dist/core/types/agent-context.d.ts.map +1 -0
  27. package/dist/core/types/agent-context.js +8 -0
  28. package/dist/core/types/agent-context.js.map +1 -0
  29. package/dist/core/types/completion-signal.d.ts +17 -0
  30. package/dist/core/types/completion-signal.d.ts.map +1 -0
  31. package/dist/core/types/completion-signal.js +8 -0
  32. package/dist/core/types/completion-signal.js.map +1 -0
  33. package/dist/core/utils/serialize-agent-context.d.ts +23 -0
  34. package/dist/core/utils/serialize-agent-context.d.ts.map +1 -0
  35. package/dist/core/utils/serialize-agent-context.js +73 -0
  36. package/dist/core/utils/serialize-agent-context.js.map +1 -0
  37. package/dist/core/worker-agent.d.ts +2 -1
  38. package/dist/core/worker-agent.d.ts.map +1 -1
  39. package/dist/core/worker-agent.js +7 -1
  40. package/dist/core/worker-agent.js.map +1 -1
  41. package/dist/index.d.ts +3 -0
  42. package/dist/index.d.ts.map +1 -1
  43. package/dist/index.js +1 -0
  44. package/dist/index.js.map +1 -1
  45. package/dist/interfaces/cli/index.js +33 -14
  46. package/dist/interfaces/cli/index.js.map +1 -1
  47. package/dist/interfaces/cli/setup-wizard.js +1 -1
  48. package/dist/interfaces/cli/setup-wizard.js.map +1 -1
  49. package/dist/interfaces/http/session-manager.d.ts.map +1 -1
  50. package/dist/interfaces/http/session-manager.js +16 -7
  51. package/dist/interfaces/http/session-manager.js.map +1 -1
  52. package/dist/storage/gcp-bucket-provider.d.ts.map +1 -1
  53. package/dist/storage/gcp-bucket-provider.js +1 -11
  54. package/dist/storage/gcp-bucket-provider.js.map +1 -1
  55. package/dist/utils/platform.d.ts +13 -0
  56. package/dist/utils/platform.d.ts.map +1 -0
  57. package/dist/utils/platform.js +23 -0
  58. package/dist/utils/platform.js.map +1 -0
  59. package/package.json +7 -8
  60. package/.claude/settings.local.json +0 -18
@@ -9,6 +9,7 @@
9
9
  * - STANDARD: Creation requests → file existence + basic validation
10
10
  * - THOROUGH: Complex/testing requests OR failures → full E2E validation with tools
11
11
  */
12
+ import { serializeAgentContext } from './utils/serialize-agent-context.js';
12
13
  import { logger } from '../utils/logger.js';
13
14
  import { orchestrationLogger } from '../utils/orchestration-logger.js';
14
15
  export var InvolvementLevel;
@@ -17,36 +18,97 @@ export var InvolvementLevel;
17
18
  InvolvementLevel["STANDARD"] = "standard";
18
19
  InvolvementLevel["THOROUGH"] = "thorough";
19
20
  })(InvolvementLevel || (InvolvementLevel = {}));
21
+ /**
22
+ * Normalize the LLM-returned mustUseTools value to string[] | undefined.
23
+ * The LLM may return null, a string, or an array.
24
+ */
25
+ function normalizeMustUseTools(value) {
26
+ if (Array.isArray(value))
27
+ return value;
28
+ if (typeof value === 'string' && value)
29
+ return [value];
30
+ return undefined;
31
+ }
20
32
  export class ClientAgent {
21
33
  orchestrator;
22
34
  mcpManager;
23
35
  mcpClient;
24
36
  failureCount = 0;
25
- // Read-only tools Client can use for validation
26
- ALLOWED_TOOLS = [
27
- 'filesystem__read_text_file',
28
- 'filesystem__list_directory',
29
- 'filesystem__directory_tree',
30
- 'filesystem__search_files',
31
- 'playwright__browser_navigate',
32
- 'playwright__browser_console_messages',
33
- 'playwright__browser_take_screenshot',
34
- 'playwright__browser_evaluate',
35
- ];
37
+ // Lazily cached list of all available tool names (populated on first use)
38
+ _availableTools = null;
36
39
  constructor(orchestrator, mcpManager) {
37
40
  this.orchestrator = orchestrator;
38
41
  this.mcpManager = mcpManager;
39
42
  this.mcpClient = mcpManager.getClient();
40
43
  }
44
+ // ─── Tool Discovery ───────────────────────────────────────────────────────
45
+ /**
46
+ * Returns all tool names currently available from connected MCP servers.
47
+ * Result is cached after the first call; call resetToolCache() if servers change.
48
+ */
49
+ getAvailableTools() {
50
+ if (this._availableTools === null) {
51
+ this._availableTools = this.mcpClient.getAllTools().map(t => t.name);
52
+ logger.debug(`[Client] Discovered ${this._availableTools.length} available tools: ${this._availableTools.join(', ')}`);
53
+ }
54
+ return this._availableTools;
55
+ }
56
+ /** Reset the tool cache (e.g. after MCP server reconnects). */
57
+ resetToolCache() {
58
+ this._availableTools = null;
59
+ }
60
+ /**
61
+ * Find the first available tool whose name contains any of the given substrings.
62
+ * Returns null if no match is found.
63
+ */
64
+ findTool(...patterns) {
65
+ const tools = this.getAvailableTools();
66
+ for (const pattern of patterns) {
67
+ const found = tools.find(t => t.includes(pattern));
68
+ if (found)
69
+ return found;
70
+ }
71
+ return null;
72
+ }
73
+ /**
74
+ * Build a human-readable summary of available tool categories for LLM prompts.
75
+ */
76
+ buildToolContextForPrompt() {
77
+ const tools = this.getAvailableTools();
78
+ if (tools.length === 0) {
79
+ return 'No MCP tools are currently available.';
80
+ }
81
+ // Group by server prefix (everything before __)
82
+ const byServer = {};
83
+ for (const tool of tools) {
84
+ const [server] = tool.split('__');
85
+ if (!byServer[server])
86
+ byServer[server] = [];
87
+ byServer[server].push(tool);
88
+ }
89
+ return Object.entries(byServer)
90
+ .map(([server, serverTools]) => `- ${server}: ${serverTools.join(', ')}`)
91
+ .join('\n');
92
+ }
93
+ // ─── Task Analysis ────────────────────────────────────────────────────────
41
94
  /**
42
95
  * Use LLM to analyze the task and determine involvement level + requirements.
43
96
  * Replaces keyword-based determineInvolvementLevel() and parseRequirements()
44
97
  * with semantic understanding that avoids false positives.
98
+ *
99
+ * NOTE: failureCount escalation has been removed. Involvement level is now
100
+ * determined purely by task type. Per-subtask correction is handled by
101
+ * CompletionSignal + DualAgent retry budget instead.
45
102
  */
46
- async analyzeTaskRequirements(userMessage, subtasks, workerResult) {
103
+ async analyzeTaskRequirements(userMessage, subtasks, workerResult, agentContext) {
47
104
  const workerContext = workerResult
48
105
  ? `\nWorker Result (first 500 chars): ${workerResult.result.substring(0, 500)}\nWorker Success: ${workerResult.success}\nTools Used: ${workerResult.toolsUsed.join(', ') || 'none'} (${workerResult.toolsUsed.length} total)`
49
106
  : '';
107
+ const availableToolsContext = this.buildToolContextForPrompt();
108
+ // Include serialized agent context for richer analysis
109
+ const contextBlock = agentContext
110
+ ? `\nAGENT CONTEXT:\n${serializeAgentContext(agentContext, 'client')}`
111
+ : '';
50
112
  const analysisPrompt = `You are a task analyst for a software agent system. Analyze the user's request to determine:
51
113
  1. How deeply to validate the Worker's output (involvement level)
52
114
  2. What specific requirements the task implies
@@ -55,8 +117,10 @@ USER MESSAGE: ${userMessage}
55
117
 
56
118
  SUBTASKS: ${JSON.stringify(subtasks)}
57
119
  ${workerContext}
120
+ ${contextBlock}
58
121
 
59
- PREVIOUS FAILURE COUNT: ${this.failureCount}
122
+ AVAILABLE TOOLS (what the Worker and Client can actually use):
123
+ ${availableToolsContext}
60
124
 
61
125
  Respond ONLY with valid JSON in this exact format (no other text):
62
126
  {
@@ -73,19 +137,14 @@ Respond ONLY with valid JSON in this exact format (no other text):
73
137
  }
74
138
 
75
139
  CRITICAL RULES for involvementLevel:
76
- - THOROUGH: ONLY when the user EXPLICITLY asks to test or verify something in a browser/test environment, OR after previous failures (failureCount > 0), OR for complex multi-file operations (>3 subtasks)
140
+ - THOROUGH: ONLY when the user EXPLICITLY asks to test or verify something, OR for complex multi-file operations (>3 subtasks)
77
141
  - MINIMAL: Information-only requests (listing files, explaining code, describing something, answering questions) where no files are created or modified
78
142
  - STANDARD: Default for creation, modification, or action tasks
79
143
 
80
144
  CRITICAL RULES for requirements:
81
- - "testing" type with mustUseTools ["playwright__"] should ONLY be set when the user wants browser-based testing or verification of a web page/HTML/UI
82
- - Words like "check", "find", "verify" in the context of system administration (disk space, processes, configurations) are NOT browser testing — they are "information" type WITHOUT playwright tools
83
- - Examples:
84
- - "check how much space my caches use" = type "information", mustUseTools null
85
- - "find the biggest files in Downloads" = type "information", mustUseTools null
86
- - "test the login page in the browser" = type "testing", mustUseTools ["playwright__"]
87
- - "create index.html and verify it works" = type "file_creation" + type "testing" with playwright
88
- - "make sure the server is running" = type "verification", mustUseTools null
145
+ - Only set mustUseTools to tool names listed in AVAILABLE TOOLS above — do NOT reference tools that are not available
146
+ - If the required tool is not in AVAILABLE TOOLS, set mustUseTools to null
147
+ - "testing" type should only be set when the user explicitly wants something executed and verified
89
148
  - If no specific tools are required, set mustUseTools to null
90
149
  - Always include at least one requirement entry`;
91
150
  try {
@@ -111,16 +170,13 @@ CRITICAL RULES for requirements:
111
170
  default:
112
171
  level = InvolvementLevel.STANDARD;
113
172
  }
114
- // Hard override: escalate to THOROUGH after failures
115
- if (this.failureCount > 0 && level !== InvolvementLevel.THOROUGH) {
116
- logger.debug(`[Client] Escalating to THOROUGH due to ${this.failureCount} previous failures`);
117
- level = InvolvementLevel.THOROUGH;
118
- }
173
+ // NOTE: failureCount escalation removed — involvement level is determined
174
+ // purely by task type. Per-subtask correction is handled by CompletionSignal.
119
175
  const requirements = (analysis.requirements || []).map((req) => ({
120
176
  type: req.type || 'other',
121
177
  description: req.description || 'General task completion',
122
178
  filePath: req.filePath || undefined,
123
- mustUseTools: req.mustUseTools || undefined,
179
+ mustUseTools: normalizeMustUseTools(req.mustUseTools),
124
180
  }));
125
181
  // Ensure at least one requirement
126
182
  if (requirements.length === 0) {
@@ -134,118 +190,23 @@ CRITICAL RULES for requirements:
134
190
  catch (error) {
135
191
  logger.warn(`[Client] LLM task analysis failed: ${error}, falling back to STANDARD`);
136
192
  }
137
- // Fallback: STANDARD with generic requirement
193
+ // Fallback: STANDARD with generic requirement (no failureCount escalation)
138
194
  return {
139
- level: this.failureCount > 0 ? InvolvementLevel.THOROUGH : InvolvementLevel.STANDARD,
195
+ level: InvolvementLevel.STANDARD,
140
196
  requirements: [{ type: 'other', description: 'General task completion' }],
141
197
  };
142
198
  }
143
- /**
144
- * @deprecated Use analyzeTaskRequirements() instead. Kept for reference.
145
- * Determine involvement level based on user request complexity
146
- */
147
- determineInvolvementLevel(userMessage, subtasks) {
148
- const messageLower = userMessage.toLowerCase();
149
- const subtasksLower = subtasks.join(' ').toLowerCase();
150
- // THOROUGH: User explicitly requests testing/verification
151
- const testKeywords = ['test', 'verify', 'check', 'make sure', 'ensure', 'validate'];
152
- if (testKeywords.some(kw => messageLower.includes(kw))) {
153
- logger.debug('[Client] THOROUGH mode: Testing/verification requested');
154
- return InvolvementLevel.THOROUGH;
155
- }
156
- // THOROUGH: After failures (user frustrated)
157
- if (this.failureCount > 0) {
158
- logger.debug(`[Client] THOROUGH mode: ${this.failureCount} previous failures detected`);
159
- return InvolvementLevel.THOROUGH;
160
- }
161
- // THOROUGH: Complex multi-file operations
162
- if (subtasks.length > 3 || (messageLower.includes('component') && messageLower.includes('index.html'))) {
163
- logger.debug('[Client] THOROUGH mode: Complex multi-file operation');
164
- return InvolvementLevel.THOROUGH;
165
- }
166
- // MINIMAL: Information-only requests
167
- const infoKeywords = ['what', 'list', 'show', 'explain', 'describe', 'how', 'tell me'];
168
- const creationKeywords = ['create', 'build', 'write', 'generate', 'make', 'add'];
169
- const hasInfoKeyword = infoKeywords.some(kw => messageLower.includes(kw));
170
- const hasCreationKeyword = creationKeywords.some(kw => messageLower.includes(kw));
171
- if (hasInfoKeyword && !hasCreationKeyword) {
172
- logger.debug('[Client] MINIMAL mode: Information request');
173
- return InvolvementLevel.MINIMAL;
174
- }
175
- // STANDARD: Default for creation/modification tasks
176
- logger.debug('[Client] STANDARD mode: Regular creation task');
177
- return InvolvementLevel.STANDARD;
178
- }
179
- /**
180
- * @deprecated Use analyzeTaskRequirements() instead. Kept for reference.
181
- * Parse requirements from user message
182
- */
183
- parseRequirements(userMessage, subtasks) {
184
- const requirements = [];
185
- const messageLower = userMessage.toLowerCase();
186
- const combined = (messageLower + ' ' + subtasks.join(' ').toLowerCase());
187
- // Detect file creation requirements
188
- const fileMatches = userMessage.match(/(?:create|build|generate|write|save as)\s+([a-zA-Z0-9._/-]+\.(html|js|css|md|json|txt|py|ts|tsx|jsx))/gi);
189
- if (fileMatches) {
190
- fileMatches.forEach(match => {
191
- const filename = match.split(/\s+/).pop();
192
- if (filename) {
193
- requirements.push({
194
- type: 'file_creation',
195
- description: `Create file: ${filename}`,
196
- filePath: filename,
197
- });
198
- }
199
- });
200
- }
201
- // Detect testing requirements (explicit verification requests)
202
- const testKeywords = ['test', 'verify', 'check', 'make sure', 'ensure'];
203
- if (testKeywords.some(kw => combined.includes(kw))) {
204
- requirements.push({
205
- type: 'testing',
206
- description: 'Verify functionality through testing',
207
- mustUseTools: ['playwright__'],
208
- });
209
- }
210
- // Detect browser verification requirements - ONLY for file verification, not general browsing
211
- // This should trigger for "open index.html in browser" but NOT for "open linkedin.com"
212
- const isLocalFileOpen = (combined.includes('open') && combined.includes('.html')) ||
213
- (combined.includes('browser') && combined.includes('.html'));
214
- const isExternalUrl = combined.match(/open\s+(?:https?:\/\/)?(?:www\.)?[a-z0-9-]+\.[a-z]{2,}/i);
215
- if (isLocalFileOpen && !isExternalUrl) {
216
- requirements.push({
217
- type: 'verification',
218
- description: 'Browser testing required for local HTML file',
219
- mustUseTools: ['playwright__browser_navigate', 'playwright__browser_console_messages'],
220
- });
221
- }
222
- // For external URLs, don't require specific tools - just opening the page is enough
223
- if (isExternalUrl) {
224
- requirements.push({
225
- type: 'verification',
226
- description: 'Open external URL',
227
- // No mustUseTools - Worker just needs to navigate, Client shouldn't demand specific validation tools
228
- });
229
- }
230
- // Default: at least verify Worker did some work
231
- if (requirements.length === 0) {
232
- requirements.push({
233
- type: 'other',
234
- description: 'General task completion',
235
- });
236
- }
237
- return requirements;
238
- }
199
+ // ─── Main Validation Entry Point ──────────────────────────────────────────
239
200
  /**
240
201
  * Validate Worker's work at appropriate involvement level
241
202
  */
242
- async validate(userMessage, subtasks, workerResult, involvementLevel) {
203
+ async validate(userMessage, subtasks, workerResult, involvementLevel, agentContext) {
243
204
  // Use LLM-based analysis instead of keyword matching
244
- const { level: analyzedLevel, requirements } = await this.analyzeTaskRequirements(userMessage, subtasks, workerResult);
205
+ const { level: analyzedLevel, requirements } = await this.analyzeTaskRequirements(userMessage, subtasks, workerResult, agentContext);
245
206
  let level = involvementLevel || analyzedLevel;
246
207
  // CRITICAL: Use LLM to check for unjustified failure claims BEFORE other validation
247
208
  // Even in MINIMAL mode, we must catch agents giving up without trying
248
- const failureAnalysis = await this.analyzeForUnjustifiedFailure(userMessage, workerResult);
209
+ const failureAnalysis = await this.analyzeForUnjustifiedFailure(userMessage, workerResult, agentContext);
249
210
  if (failureAnalysis.claimsFailure && !failureAnalysis.hasEvidence) {
250
211
  logger.info(`[Client] Detected unjustified failure claim - escalating from ${level} to STANDARD`);
251
212
  logger.info(`[Client] LLM reasoning: ${failureAnalysis.reasoning}`);
@@ -272,7 +233,7 @@ CRITICAL RULES for requirements:
272
233
  }
273
234
  // Layer 0.5: Result-vs-Evidence Coherence Check (always done, catches hallucinated accomplishments)
274
235
  // This detects when the Worker claims to have done things its tool usage doesn't support
275
- const coherenceAnalysis = await this.analyzeResultCoherence(userMessage, workerResult);
236
+ const coherenceAnalysis = await this.analyzeResultCoherence(userMessage, workerResult, agentContext);
276
237
  orchestrationLogger.logClientCoherenceCheck(coherenceAnalysis.isCoherent, coherenceAnalysis.unsupportedClaims, coherenceAnalysis.reasoning);
277
238
  if (!coherenceAnalysis.isCoherent) {
278
239
  logger.info(`[Client] Detected incoherent result — Worker claims not supported by tool usage`);
@@ -305,12 +266,14 @@ CRITICAL RULES for requirements:
305
266
  result.approved = result.requirementsMet;
306
267
  if (!result.approved && result.issues.length > 0) {
307
268
  // Generate an actionable correction instruction via LLM instead of echoing raw validation issues
308
- result.nextAction = await this.generateCorrectionInstruction(userMessage, subtasks.join('; '), result.issues, workerResult);
309
- this.failureCount++;
269
+ result.nextAction = await this.generateCorrectionInstruction(userMessage, subtasks.join('; '), result.issues, workerResult, agentContext);
270
+ this.failureCount++; // Telemetry only — no longer drives escalation
310
271
  }
311
272
  else {
312
- this.failureCount = 0; // Reset on success
273
+ this.failureCount = 0;
313
274
  }
275
+ // Analyze CompletionSignal (LLM-based per-subtask assessment)
276
+ result.completionSignal = await this.analyzeCompletionSignal(userMessage, workerResult, result.issues, agentContext);
314
277
  // Log the validation outcome
315
278
  orchestrationLogger.logClientValidation(result.approved, result.issues, result.nextAction);
316
279
  return result;
@@ -319,12 +282,14 @@ CRITICAL RULES for requirements:
319
282
  * Use LLM to analyze worker result for unjustified failure claims
320
283
  * This is language-agnostic and captures semantic meaning
321
284
  */
322
- async analyzeForUnjustifiedFailure(userMessage, workerResult) {
285
+ async analyzeForUnjustifiedFailure(userMessage, workerResult, agentContext) {
323
286
  const toolCount = workerResult.toolsUsed.length;
324
287
  const toolList = workerResult.toolsUsed.join(', ') || 'none';
288
+ const contextBlock = agentContext ? `\n${serializeAgentContext(agentContext, 'client')}` : '';
325
289
  const analysisPrompt = `You are a quality control agent. Analyze the following Worker response to determine if it's claiming failure and whether that failure is justified.
326
290
 
327
291
  USER REQUEST: ${userMessage}
292
+ ${contextBlock}
328
293
 
329
294
  WORKER RESPONSE:
330
295
  ${workerResult.result}
@@ -380,7 +345,7 @@ Respond ONLY with the JSON, no other text.`;
380
345
  * and found no bugs" but only used list_directory and never read a single file.
381
346
  * This runs at ALL involvement levels including MINIMAL.
382
347
  */
383
- async analyzeResultCoherence(userMessage, workerResult) {
348
+ async analyzeResultCoherence(userMessage, workerResult, agentContext) {
384
349
  // Skip coherence check if Worker used no tools (caught by zero-tools guard)
385
350
  // or if Worker explicitly failed (caught by failure analysis)
386
351
  if (workerResult.toolsUsed.length === 0 || !workerResult.success) {
@@ -388,9 +353,12 @@ Respond ONLY with the JSON, no other text.`;
388
353
  }
389
354
  const toolList = workerResult.toolsUsed.join(', ');
390
355
  const uniqueTools = [...new Set(workerResult.toolsUsed)].join(', ');
356
+ const availableToolsContext = this.buildToolContextForPrompt();
357
+ const contextBlock = agentContext ? `\n${serializeAgentContext(agentContext, 'client')}` : '';
391
358
  const coherencePrompt = `You are a strict quality auditor. Your job is to determine whether a Worker agent's result is SUPPORTED by the tools it actually used, or whether it fabricated/hallucinated claims.
392
359
 
393
360
  USER REQUEST: ${userMessage}
361
+ ${contextBlock}
394
362
 
395
363
  WORKER RESULT:
396
364
  ${workerResult.result.substring(0, 1000)}
@@ -399,13 +367,16 @@ TOOLS ACTUALLY USED (in order): ${toolList}
399
367
  UNIQUE TOOLS USED: ${uniqueTools}
400
368
  TOTAL TOOL CALLS: ${workerResult.toolsUsed.length}
401
369
 
370
+ AVAILABLE TOOLS IN THIS SYSTEM:
371
+ ${availableToolsContext}
372
+
402
373
  CRITICAL: Analyze whether the claims in the Worker's result are supported by the tools it used.
403
374
 
404
- Key tool semantics:
405
- - filesystem__list_directory / filesystem__directory_tree / filesystem__search_files = only shows file/folder NAMES and structure, does NOT read file contents
406
- - filesystem__read_text_file / filesystem__read_file = actually reads file content
407
- - mcp-shell-server__shell_exec = runs a shell command (check what command was likely run based on context)
408
- - playwright__* = browser automation tools
375
+ Key tool semantics to apply:
376
+ - Tools with names like "list_directory", "directory_tree", "search_files" show file/folder NAMES only — they do NOT read file contents
377
+ - Tools with names like "read_text_file", "read_file", "get_file_content" actually read file content
378
+ - Tools with names like "shell_exec", "run_command", "bash", "execute" run shell commands — infer what was run from the worker's result
379
+ - For any other tool, infer its semantics from its name
409
380
 
410
381
  Common hallucination patterns to detect:
411
382
  1. Worker claims to have "inspected", "reviewed", "analyzed", or "scanned" source code but never used read_text_file — it only listed directories
@@ -503,46 +474,52 @@ Respond ONLY with valid JSON:
503
474
  }
504
475
  }
505
476
  }
506
- // Browser testing validation (only for THOROUGH)
477
+ // Shell-based deep verification (THOROUGH only)
507
478
  if (level === InvolvementLevel.THOROUGH) {
508
- for (const req of requirements) {
509
- if ((req.type === 'testing' || req.type === 'verification') &&
510
- workerResult.result.includes('.html')) {
511
- // Extract HTML filename from result
512
- const htmlMatch = workerResult.result.match(/([a-zA-Z0-9._-]+\.html)/);
513
- if (htmlMatch) {
514
- const htmlFile = htmlMatch[1];
515
- const browserValidation = await this.validateInBrowser(htmlFile);
516
- if (!browserValidation.valid) {
517
- issues.push(browserValidation.issue);
518
- }
519
- }
520
- }
521
- }
479
+ const shellIssues = await this.validateWithShell(requirements, workerResult);
480
+ issues.push(...shellIssues.issues);
522
481
  }
523
482
  return { issues };
524
483
  }
484
+ // ─── Tool-Based Verification ────────────────────────────────────────────────────
525
485
  /**
526
- * Check if file exists using read tool
486
+ * Check whether a file exists, using whichever MCP tool is available.
487
+ * Tries filesystem read tools first, then falls back to shell.
527
488
  */
528
489
  async fileExists(filePath) {
529
- try {
530
- await this.mcpClient.executeTool('filesystem__read_text_file', {
531
- path: filePath,
532
- head: 1,
533
- });
534
- return true;
490
+ const readTool = this.findTool('read_text_file', 'read_file', 'get_file_content');
491
+ if (readTool) {
492
+ try {
493
+ await this.mcpClient.executeTool(readTool, { path: filePath, head: 1 });
494
+ return true;
495
+ }
496
+ catch {
497
+ return false;
498
+ }
535
499
  }
536
- catch (error) {
537
- return false;
500
+ const shellTool = this.findTool('shell_exec', 'run_command', 'bash', 'execute');
501
+ if (shellTool) {
502
+ try {
503
+ const result = await this.mcpClient.executeTool(shellTool, {
504
+ command: `test -f "${filePath}" && echo "exists" || echo "not_found"`,
505
+ });
506
+ return String(result).includes('exists');
507
+ }
508
+ catch {
509
+ return false;
510
+ }
538
511
  }
512
+ logger.debug('[Client] No tool available to verify file existence');
513
+ return false;
539
514
  }
540
515
  /**
541
516
  * Use LLM to generate an actionable correction instruction from raw validation issues.
542
- * Instead of echoing "requires using playwright__ but Worker did not use these tools",
543
- * produces something like "Use the filesystem tools to list ~/Library/Caches and report sizes".
517
+ * Translates internal validation failures into concrete, tool-specific directions
518
+ * for the Worker, referencing only the tools actually available in the system.
544
519
  */
545
- async generateCorrectionInstruction(userMessage, subtask, issues, workerResult) {
520
+ async generateCorrectionInstruction(userMessage, subtask, issues, workerResult, agentContext) {
521
+ const availableToolsContext = this.buildToolContextForPrompt();
522
+ const contextBlock = agentContext ? `\nAGENT CONTEXT:\n${serializeAgentContext(agentContext, 'client')}` : '';
546
523
  const correctionPrompt = `You are generating a correction instruction for a Worker agent that failed to complete a task properly.
547
524
 
548
525
  ORIGINAL USER REQUEST: ${userMessage}
@@ -555,10 +532,13 @@ ${issues.map((issue, i) => `${i + 1}. ${issue}`).join('\n')}
555
532
  WORKER'S RESULT (first 300 chars): ${workerResult.result.substring(0, 300)}
556
533
  TOOLS WORKER USED: ${workerResult.toolsUsed.join(', ') || 'none'}
557
534
 
535
+ AVAILABLE TOOLS THE WORKER CAN USE:
536
+ ${availableToolsContext}
537
+ ${contextBlock}
538
+
558
539
  Generate a CLEAR, ACTIONABLE instruction that tells the Worker exactly what to do to fix the issues.
559
540
  The instruction should:
560
- - Be a direct command (e.g., "Use the run_process tool to execute 'du -sh ~/Library/Caches' and report the output")
561
- - Reference specific tools or actions the Worker should take
541
+ - Be a direct command referencing specific available tools by name
562
542
  - Be concise (1-2 sentences)
563
543
  - NOT include validation jargon like "mustUseTools", "requirements", or "involvement level"
564
544
  - NOT be a generic statement like "retry the task" — be specific about WHAT to do
@@ -584,76 +564,186 @@ Respond ONLY with the correction instruction text, nothing else.`;
584
564
  return `Fix the following issue and retry: ${issues[0]}`;
585
565
  }
586
566
  /**
587
- * Validate file contents for common issues
567
+ * Validate file contents for common issues, using whichever MCP tool is available.
588
568
  */
589
569
  async validateFileContents(filePath) {
590
- try {
591
- const content = await this.mcpClient.executeTool('filesystem__read_text_file', {
592
- path: filePath,
593
- head: 200,
594
- });
595
- const contentStr = typeof content === 'string' ? content : JSON.stringify(content);
596
- // Check for HTML path mismatches
597
- if (filePath.endsWith('.html')) {
598
- const hrefMatches = contentStr.match(/href="([^"]+)"/g) || [];
599
- const srcMatches = contentStr.match(/src="([^"]+)"/g) || [];
600
- for (const match of [...hrefMatches, ...srcMatches]) {
601
- const pathMatch = match.match(/(?:href|src)="([^"]+)"/);
602
- if (pathMatch) {
603
- const referencedPath = pathMatch[1];
604
- if (!referencedPath.startsWith('http') && !referencedPath.startsWith('data:')) {
605
- const exists = await this.fileExists(referencedPath);
606
- if (!exists) {
607
- return {
608
- valid: false,
609
- issue: `HTML references non-existent file: ${referencedPath}. Fix file paths or create missing files.`
610
- };
611
- }
570
+ const readTool = this.findTool('read_text_file', 'read_file', 'get_file_content');
571
+ const shellTool = this.findTool('shell_exec', 'run_command', 'bash', 'execute');
572
+ let contentStr = null;
573
+ if (readTool) {
574
+ try {
575
+ const content = await this.mcpClient.executeTool(readTool, { path: filePath, head: 200 });
576
+ contentStr = typeof content === 'string' ? content : JSON.stringify(content);
577
+ }
578
+ catch (error) {
579
+ return { valid: false, issue: `Could not read file: ${error}` };
580
+ }
581
+ }
582
+ else if (shellTool) {
583
+ try {
584
+ const content = await this.mcpClient.executeTool(shellTool, {
585
+ command: `head -200 "${filePath}" 2>&1`,
586
+ });
587
+ contentStr = String(content);
588
+ }
589
+ catch (error) {
590
+ return { valid: false, issue: `Could not read file via shell: ${error}` };
591
+ }
592
+ }
593
+ if (contentStr === null) {
594
+ return { valid: true }; // No read tool available; skip content check
595
+ }
596
+ // Check path reference integrity in HTML files
597
+ if (filePath.endsWith('.html')) {
598
+ const hrefMatches = contentStr.match(/href="([^"]+)"/g) || [];
599
+ const srcMatches = contentStr.match(/src="([^"]+)"/g) || [];
600
+ for (const match of [...hrefMatches, ...srcMatches]) {
601
+ const pathMatch = match.match(/(?:href|src)="([^"]+)"/);
602
+ if (pathMatch) {
603
+ const referencedPath = pathMatch[1];
604
+ if (!referencedPath.startsWith('http') && !referencedPath.startsWith('data:')) {
605
+ const exists = await this.fileExists(referencedPath);
606
+ if (!exists) {
607
+ return {
608
+ valid: false,
609
+ issue: `HTML references non-existent file: ${referencedPath}. Fix file paths or create missing files.`,
610
+ };
612
611
  }
613
612
  }
614
613
  }
615
614
  }
616
- return { valid: true };
617
615
  }
618
- catch (error) {
619
- return { valid: false, issue: `Could not read file: ${error}` };
616
+ return { valid: true };
617
+ }
618
+ /**
619
+ * THOROUGH-level shell-based verification: runs lightweight, read-only shell
620
+ * commands to confirm work was actually done (file sizes, test output presence, etc.).
621
+ * Skips gracefully when no shell tool is available.
622
+ */
623
+ async validateWithShell(requirements, workerResult) {
624
+ const issues = [];
625
+ const shellTool = this.findTool('shell_exec', 'run_command', 'bash', 'execute');
626
+ if (!shellTool) {
627
+ logger.debug('[Client] No shell tool available for THOROUGH shell validation — skipping');
628
+ return { issues };
629
+ }
630
+ for (const req of requirements) {
631
+ if ((req.type === 'file_creation' || req.type === 'file_modification') && req.filePath) {
632
+ try {
633
+ const result = await this.mcpClient.executeTool(shellTool, {
634
+ command: `wc -c "${req.filePath}" 2>&1`,
635
+ });
636
+ const resultStr = String(result);
637
+ if (resultStr.includes('No such file') || resultStr.includes('cannot access')) {
638
+ issues.push(`Shell verification failed: ${req.filePath} does not exist on disk.`);
639
+ }
640
+ else {
641
+ const sizeMatch = resultStr.match(/^\s*(\d+)/);
642
+ if (sizeMatch && parseInt(sizeMatch[1], 10) === 0) {
643
+ issues.push(`File ${req.filePath} was created but is empty.`);
644
+ }
645
+ }
646
+ }
647
+ catch (error) {
648
+ logger.debug(`[Client] Shell validation error for ${req.filePath}: ${error}`);
649
+ }
650
+ }
651
+ if (req.type === 'testing') {
652
+ const hasTestOutput = workerResult.result.match(/passed|failed|error|PASS|FAIL|✓|✗|tests run|test suite/i);
653
+ if (!hasTestOutput) {
654
+ logger.debug('[Client] THOROUGH: testing requirement but no test output detected in worker result');
655
+ }
656
+ }
620
657
  }
658
+ return { issues };
621
659
  }
660
+ // ─── Completion Signal Analysis ──────────────────────────────────────────────
622
661
  /**
623
- * Validate HTML file in browser
662
+ * LLM-based per-subtask assessment of completion confidence, blocker type,
663
+ * and suggested corrective strategy. Called at the end of validate().
624
664
  */
625
- async validateInBrowser(htmlFile) {
665
+ async analyzeCompletionSignal(userMessage, workerResult, issues, agentContext) {
666
+ // If approved with no issues, high confidence
667
+ if (issues.length === 0) {
668
+ return { confidence: 'high', progressMade: true };
669
+ }
670
+ const contextBlock = agentContext ? `\n${serializeAgentContext(agentContext, 'client')}` : '';
671
+ const signalPrompt = `You are analyzing a subtask execution to produce a CompletionSignal.
672
+
673
+ USER REQUEST: ${userMessage}
674
+ ${contextBlock}
675
+
676
+ WORKER RESULT (first 500 chars): ${workerResult.result.substring(0, 500)}
677
+ WORKER SUCCESS: ${workerResult.success}
678
+ TOOLS USED: ${workerResult.toolsUsed.join(', ') || 'none'} (${workerResult.toolsUsed.length} total)
679
+
680
+ VALIDATION ISSUES:
681
+ ${issues.map((issue, i) => `${i + 1}. ${issue}`).join('\n')}
682
+
683
+ Analyze and respond ONLY with valid JSON:
684
+ {
685
+ "confidence": "<high | medium | low | none>",
686
+ "progressMade": <true if Worker made any measurable forward progress, false otherwise>,
687
+ "blockerType": "<tool_failure | hallucination | scope_drift | partial | loop | capability_gap | null>",
688
+ "suggestedStrategy": "<retry | rephrase | decompose | skip | escalate>"
689
+ }
690
+
691
+ RULES:
692
+ - confidence "high": all issues are minor or cosmetic
693
+ - confidence "medium": some progress but incomplete
694
+ - confidence "low": significant issues, little useful work
695
+ - confidence "none": no useful work done at all
696
+ - blockerType "tool_failure": Worker tried but tool errored
697
+ - blockerType "hallucination": Worker claimed work it didn't do
698
+ - blockerType "scope_drift": Worker did something unrelated
699
+ - blockerType "partial": Worker made progress but didn't finish
700
+ - blockerType "loop": Worker is repeating the same action
701
+ - blockerType "capability_gap": Task requires tools/capabilities not available
702
+ - suggestedStrategy: recommend the best recovery approach`;
626
703
  try {
627
- // Navigate to file
628
- const workspaceDir = process.cwd();
629
- const fileUrl = `file://${workspaceDir}/${htmlFile}`;
630
- await this.mcpClient.executeTool('playwright__browser_navigate', {
631
- url: fileUrl,
632
- });
633
- // Check for console errors
634
- const errors = await this.mcpClient.executeTool('playwright__browser_console_messages', {
635
- level: 'error',
704
+ const response = await this.orchestrator.chat({
705
+ messages: [
706
+ { role: 'system', content: 'You are a strict completion analyst. Respond only with valid JSON.' },
707
+ { role: 'user', content: signalPrompt },
708
+ ],
709
+ temperature: 0.1,
636
710
  });
637
- const errorStr = typeof errors === 'string' ? errors : JSON.stringify(errors);
638
- if (errorStr && errorStr.length > 0 && !errorStr.includes('[]')) {
711
+ const jsonMatch = response.content.match(/\{[\s\S]*\}/);
712
+ if (jsonMatch) {
713
+ const parsed = JSON.parse(jsonMatch[0]);
639
714
  return {
640
- valid: false,
641
- issue: `Browser errors detected in ${htmlFile}: ${errorStr}. Fix these errors before delivery.`
715
+ confidence: parsed.confidence || 'low',
716
+ progressMade: parsed.progressMade ?? false,
717
+ blockerType: parsed.blockerType === 'null' ? undefined : parsed.blockerType,
718
+ suggestedStrategy: parsed.suggestedStrategy,
642
719
  };
643
720
  }
644
- return { valid: true };
645
721
  }
646
722
  catch (error) {
647
- logger.debug(`[Client] Browser validation error: ${error}`);
648
- // Don't fail validation if browser test fails - might not have browser available
649
- return { valid: true };
723
+ logger.debug(`[Client] Failed to analyze completion signal: ${error}`);
650
724
  }
725
+ // Fallback: conservative signal
726
+ return {
727
+ confidence: 'low',
728
+ progressMade: workerResult.toolsUsed.length > 0,
729
+ blockerType: 'partial',
730
+ suggestedStrategy: 'retry',
731
+ };
651
732
  }
733
+ // ─── Session Management ─────────────────────────────────────────────────────
652
734
  /**
653
- * Reset failure tracking (for new conversation/session)
735
+ * Reset session state (call at the start of each new conversation/session).
736
+ * Renamed from resetFailureTracking() for clarity — failureCount is now
737
+ * telemetry-only and does not drive involvement escalation.
654
738
  */
655
- resetFailureTracking() {
739
+ resetSessionState() {
656
740
  this.failureCount = 0;
657
741
  }
742
+ /**
743
+ * @deprecated Use resetSessionState() instead.
744
+ */
745
+ resetFailureTracking() {
746
+ this.resetSessionState();
747
+ }
658
748
  }
659
749
  //# sourceMappingURL=client-agent.js.map