jiva-core 0.2.3 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (152):
  1. package/.dockerignore +53 -0
  2. package/.gcloudignore +49 -0
  3. package/CONTRIBUTING.md +92 -0
  4. package/Dockerfile +63 -0
  5. package/README.md +250 -88
  6. package/cloud-run-deploy.yaml +135 -0
  7. package/cloud-run.yaml +135 -0
  8. package/cloud-run.yaml.template +143 -0
  9. package/deploy.sh +107 -0
  10. package/dist/core/agent-spawner.d.ts +89 -0
  11. package/dist/core/agent-spawner.d.ts.map +1 -0
  12. package/dist/core/agent-spawner.js +208 -0
  13. package/dist/core/agent-spawner.js.map +1 -0
  14. package/dist/core/agent.js +1 -1
  15. package/dist/core/agent.js.map +1 -1
  16. package/dist/core/client-agent.d.ts +103 -0
  17. package/dist/core/client-agent.d.ts.map +1 -0
  18. package/dist/core/client-agent.js +659 -0
  19. package/dist/core/client-agent.js.map +1 -0
  20. package/dist/core/config.d.ts +59 -10
  21. package/dist/core/config.d.ts.map +1 -1
  22. package/dist/core/config.js +19 -2
  23. package/dist/core/config.js.map +1 -1
  24. package/dist/core/conversation-manager.d.ts +10 -18
  25. package/dist/core/conversation-manager.d.ts.map +1 -1
  26. package/dist/core/conversation-manager.js +31 -63
  27. package/dist/core/conversation-manager.js.map +1 -1
  28. package/dist/core/dual-agent.d.ts +22 -3
  29. package/dist/core/dual-agent.d.ts.map +1 -1
  30. package/dist/core/dual-agent.js +86 -10
  31. package/dist/core/dual-agent.js.map +1 -1
  32. package/dist/core/manager-agent.d.ts +16 -1
  33. package/dist/core/manager-agent.d.ts.map +1 -1
  34. package/dist/core/manager-agent.js +127 -44
  35. package/dist/core/manager-agent.js.map +1 -1
  36. package/dist/core/worker-agent.d.ts +9 -1
  37. package/dist/core/worker-agent.d.ts.map +1 -1
  38. package/dist/core/worker-agent.js +169 -14
  39. package/dist/core/worker-agent.js.map +1 -1
  40. package/dist/core/workspace.d.ts +5 -0
  41. package/dist/core/workspace.d.ts.map +1 -1
  42. package/dist/core/workspace.js +47 -7
  43. package/dist/core/workspace.js.map +1 -1
  44. package/dist/index.d.ts +3 -0
  45. package/dist/index.d.ts.map +1 -1
  46. package/dist/index.js +4 -0
  47. package/dist/index.js.map +1 -1
  48. package/dist/interfaces/cli/index.js +349 -42
  49. package/dist/interfaces/cli/index.js.map +1 -1
  50. package/dist/interfaces/http/index.d.ts +22 -0
  51. package/dist/interfaces/http/index.d.ts.map +1 -0
  52. package/dist/interfaces/http/index.js +135 -0
  53. package/dist/interfaces/http/index.js.map +1 -0
  54. package/dist/interfaces/http/middleware/auth.d.ts +32 -0
  55. package/dist/interfaces/http/middleware/auth.d.ts.map +1 -0
  56. package/dist/interfaces/http/middleware/auth.js +176 -0
  57. package/dist/interfaces/http/middleware/auth.js.map +1 -0
  58. package/dist/interfaces/http/routes/chat.d.ts +7 -0
  59. package/dist/interfaces/http/routes/chat.d.ts.map +1 -0
  60. package/dist/interfaces/http/routes/chat.js +144 -0
  61. package/dist/interfaces/http/routes/chat.js.map +1 -0
  62. package/dist/interfaces/http/routes/health.d.ts +6 -0
  63. package/dist/interfaces/http/routes/health.d.ts.map +1 -0
  64. package/dist/interfaces/http/routes/health.js +25 -0
  65. package/dist/interfaces/http/routes/health.js.map +1 -0
  66. package/dist/interfaces/http/routes/session.d.ts +7 -0
  67. package/dist/interfaces/http/routes/session.d.ts.map +1 -0
  68. package/dist/interfaces/http/routes/session.js +114 -0
  69. package/dist/interfaces/http/routes/session.js.map +1 -0
  70. package/dist/interfaces/http/session-manager.d.ts +76 -0
  71. package/dist/interfaces/http/session-manager.d.ts.map +1 -0
  72. package/dist/interfaces/http/session-manager.js +350 -0
  73. package/dist/interfaces/http/session-manager.js.map +1 -0
  74. package/dist/interfaces/http/websocket-handler.d.ts +18 -0
  75. package/dist/interfaces/http/websocket-handler.d.ts.map +1 -0
  76. package/dist/interfaces/http/websocket-handler.js +146 -0
  77. package/dist/interfaces/http/websocket-handler.js.map +1 -0
  78. package/dist/mcp/client.d.ts +11 -2
  79. package/dist/mcp/client.d.ts.map +1 -1
  80. package/dist/mcp/client.js +44 -19
  81. package/dist/mcp/client.js.map +1 -1
  82. package/dist/mcp/server-manager.d.ts +1 -1
  83. package/dist/mcp/server-manager.d.ts.map +1 -1
  84. package/dist/mcp/server-manager.js +12 -2
  85. package/dist/mcp/server-manager.js.map +1 -1
  86. package/dist/models/krutrim.js +1 -1
  87. package/dist/models/krutrim.js.map +1 -1
  88. package/dist/personas/index.d.ts +13 -0
  89. package/dist/personas/index.d.ts.map +1 -0
  90. package/dist/personas/index.js +13 -0
  91. package/dist/personas/index.js.map +1 -0
  92. package/dist/personas/persona-loader.d.ts +30 -0
  93. package/dist/personas/persona-loader.d.ts.map +1 -0
  94. package/dist/personas/persona-loader.js +246 -0
  95. package/dist/personas/persona-loader.js.map +1 -0
  96. package/dist/personas/persona-manager.d.ts +93 -0
  97. package/dist/personas/persona-manager.d.ts.map +1 -0
  98. package/dist/personas/persona-manager.js +268 -0
  99. package/dist/personas/persona-manager.js.map +1 -0
  100. package/dist/personas/skill-loader.d.ts +35 -0
  101. package/dist/personas/skill-loader.d.ts.map +1 -0
  102. package/dist/personas/skill-loader.js +144 -0
  103. package/dist/personas/skill-loader.js.map +1 -0
  104. package/dist/personas/skill-packager.d.ts +25 -0
  105. package/dist/personas/skill-packager.d.ts.map +1 -0
  106. package/dist/personas/skill-packager.js +233 -0
  107. package/dist/personas/skill-packager.js.map +1 -0
  108. package/dist/personas/types.d.ts +134 -0
  109. package/dist/personas/types.d.ts.map +1 -0
  110. package/dist/personas/types.js +7 -0
  111. package/dist/personas/types.js.map +1 -0
  112. package/dist/personas/validator.d.ts +22 -0
  113. package/dist/personas/validator.d.ts.map +1 -0
  114. package/dist/personas/validator.js +144 -0
  115. package/dist/personas/validator.js.map +1 -0
  116. package/dist/storage/factory.d.ts +51 -0
  117. package/dist/storage/factory.d.ts.map +1 -0
  118. package/dist/storage/factory.js +154 -0
  119. package/dist/storage/factory.js.map +1 -0
  120. package/dist/storage/gcp-bucket-provider.d.ts +60 -0
  121. package/dist/storage/gcp-bucket-provider.d.ts.map +1 -0
  122. package/dist/storage/gcp-bucket-provider.js +292 -0
  123. package/dist/storage/gcp-bucket-provider.js.map +1 -0
  124. package/dist/storage/index.d.ts +33 -0
  125. package/dist/storage/index.d.ts.map +1 -0
  126. package/dist/storage/index.js +37 -0
  127. package/dist/storage/index.js.map +1 -0
  128. package/dist/storage/local-provider.d.ts +37 -0
  129. package/dist/storage/local-provider.d.ts.map +1 -0
  130. package/dist/storage/local-provider.js +228 -0
  131. package/dist/storage/local-provider.js.map +1 -0
  132. package/dist/storage/provider.d.ts +142 -0
  133. package/dist/storage/provider.d.ts.map +1 -0
  134. package/dist/storage/provider.js +136 -0
  135. package/dist/storage/provider.js.map +1 -0
  136. package/dist/storage/types.d.ts +78 -0
  137. package/dist/storage/types.d.ts.map +1 -0
  138. package/dist/storage/types.js +14 -0
  139. package/dist/storage/types.js.map +1 -0
  140. package/dist/utils/logger.d.ts +19 -0
  141. package/dist/utils/logger.d.ts.map +1 -1
  142. package/dist/utils/logger.js +49 -1
  143. package/dist/utils/logger.js.map +1 -1
  144. package/dist/utils/orchestration-logger.d.ts +27 -3
  145. package/dist/utils/orchestration-logger.d.ts.map +1 -1
  146. package/dist/utils/orchestration-logger.js +110 -6
  147. package/dist/utils/orchestration-logger.js.map +1 -1
  148. package/package.json +30 -2
  149. package/.fluen/cache/state.json +0 -7
  150. package/actions/action_registry.py +0 -75
  151. package/actions/python_coder.py +0 -470
  152. package/api/main.py +0 -269
@@ -0,0 +1,659 @@
1
+ /**
2
+ * Client Agent - Adaptive validation and quality control
3
+ *
4
+ * Acts as user advocate, validating that Manager/Worker actually deliver
5
+ * what the user requested. Uses tiered involvement levels to minimize cost:
6
+ *
7
+ * - MINIMAL: Info requests, simple queries → metadata validation only
8
+ * BUT: Detects unjustified failures and escalates to STANDARD
9
+ * - STANDARD: Creation requests → file existence + basic validation
10
+ * - THOROUGH: Complex/testing requests OR failures → full E2E validation with tools
11
+ */
12
+ import { logger } from '../utils/logger.js';
13
+ import { orchestrationLogger } from '../utils/orchestration-logger.js';
14
/**
 * Validation depth tiers for the Client agent.
 *
 * String-valued enum: each key maps to its lowercase name. There is no
 * reverse (value -> key) mapping.
 */
export var InvolvementLevel = Object.assign(InvolvementLevel || {}, {
  MINIMAL: 'minimal',
  STANDARD: 'standard',
  THOROUGH: 'thorough',
});
20
+ export class ClientAgent {
21
+ orchestrator;
22
+ mcpManager;
23
+ mcpClient;
24
+ failureCount = 0;
25
+ // Read-only tools Client can use for validation
26
+ ALLOWED_TOOLS = [
27
+ 'filesystem__read_text_file',
28
+ 'filesystem__list_directory',
29
+ 'filesystem__directory_tree',
30
+ 'filesystem__search_files',
31
+ 'playwright__browser_navigate',
32
+ 'playwright__browser_console_messages',
33
+ 'playwright__browser_take_screenshot',
34
+ 'playwright__browser_evaluate',
35
+ ];
36
+ constructor(orchestrator, mcpManager) {
37
+ this.orchestrator = orchestrator;
38
+ this.mcpManager = mcpManager;
39
+ this.mcpClient = mcpManager.getClient();
40
+ }
41
+ /**
42
+ * Use LLM to analyze the task and determine involvement level + requirements.
43
+ * Replaces keyword-based determineInvolvementLevel() and parseRequirements()
44
+ * with semantic understanding that avoids false positives.
45
+ */
46
+ async analyzeTaskRequirements(userMessage, subtasks, workerResult) {
47
+ const workerContext = workerResult
48
+ ? `\nWorker Result (first 500 chars): ${workerResult.result.substring(0, 500)}\nWorker Success: ${workerResult.success}\nTools Used: ${workerResult.toolsUsed.join(', ') || 'none'} (${workerResult.toolsUsed.length} total)`
49
+ : '';
50
+ const analysisPrompt = `You are a task analyst for a software agent system. Analyze the user's request to determine:
51
+ 1. How deeply to validate the Worker's output (involvement level)
52
+ 2. What specific requirements the task implies
53
+
54
+ USER MESSAGE: ${userMessage}
55
+
56
+ SUBTASKS: ${JSON.stringify(subtasks)}
57
+ ${workerContext}
58
+
59
+ PREVIOUS FAILURE COUNT: ${this.failureCount}
60
+
61
+ Respond ONLY with valid JSON in this exact format (no other text):
62
+ {
63
+ "involvementLevel": "<MINIMAL | STANDARD | THOROUGH>",
64
+ "involvementReasoning": "<brief explanation of why this level>",
65
+ "requirements": [
66
+ {
67
+ "type": "<file_creation | file_modification | testing | verification | information | other>",
68
+ "description": "<what this requirement entails>",
69
+ "filePath": null,
70
+ "mustUseTools": null
71
+ }
72
+ ]
73
+ }
74
+
75
+ CRITICAL RULES for involvementLevel:
76
+ - THOROUGH: ONLY when the user EXPLICITLY asks to test or verify something in a browser/test environment, OR after previous failures (failureCount > 0), OR for complex multi-file operations (>3 subtasks)
77
+ - MINIMAL: Information-only requests (listing files, explaining code, describing something, answering questions) where no files are created or modified
78
+ - STANDARD: Default for creation, modification, or action tasks
79
+
80
+ CRITICAL RULES for requirements:
81
+ - "testing" type with mustUseTools ["playwright__"] should ONLY be set when the user wants browser-based testing or verification of a web page/HTML/UI
82
+ - Words like "check", "find", "verify" in the context of system administration (disk space, processes, configurations) are NOT browser testing — they are "information" type WITHOUT playwright tools
83
+ - Examples:
84
+ - "check how much space my caches use" = type "information", mustUseTools null
85
+ - "find the biggest files in Downloads" = type "information", mustUseTools null
86
+ - "test the login page in the browser" = type "testing", mustUseTools ["playwright__"]
87
+ - "create index.html and verify it works" = type "file_creation" + type "testing" with playwright
88
+ - "make sure the server is running" = type "verification", mustUseTools null
89
+ - If no specific tools are required, set mustUseTools to null
90
+ - Always include at least one requirement entry`;
91
+ try {
92
+ const response = await this.orchestrator.chat({
93
+ messages: [
94
+ { role: 'system', content: 'You are a strict task analyst. Respond only with valid JSON.' },
95
+ { role: 'user', content: analysisPrompt },
96
+ ],
97
+ temperature: 0.1,
98
+ });
99
+ const jsonMatch = response.content.match(/\{[\s\S]*\}/);
100
+ if (jsonMatch) {
101
+ const analysis = JSON.parse(jsonMatch[0]);
102
+ // Map string to enum
103
+ let level;
104
+ switch (analysis.involvementLevel?.toUpperCase()) {
105
+ case 'THOROUGH':
106
+ level = InvolvementLevel.THOROUGH;
107
+ break;
108
+ case 'MINIMAL':
109
+ level = InvolvementLevel.MINIMAL;
110
+ break;
111
+ default:
112
+ level = InvolvementLevel.STANDARD;
113
+ }
114
+ // Hard override: escalate to THOROUGH after failures
115
+ if (this.failureCount > 0 && level !== InvolvementLevel.THOROUGH) {
116
+ logger.debug(`[Client] Escalating to THOROUGH due to ${this.failureCount} previous failures`);
117
+ level = InvolvementLevel.THOROUGH;
118
+ }
119
+ const requirements = (analysis.requirements || []).map((req) => ({
120
+ type: req.type || 'other',
121
+ description: req.description || 'General task completion',
122
+ filePath: req.filePath || undefined,
123
+ mustUseTools: req.mustUseTools || undefined,
124
+ }));
125
+ // Ensure at least one requirement
126
+ if (requirements.length === 0) {
127
+ requirements.push({ type: 'other', description: 'General task completion' });
128
+ }
129
+ logger.info(`[Client] LLM task analysis: ${level.toUpperCase()} involvement — ${analysis.involvementReasoning || 'no reasoning provided'}`);
130
+ logger.debug(`[Client] Requirements: ${JSON.stringify(requirements.map(r => ({ type: r.type, desc: r.description })))}`);
131
+ return { level, requirements };
132
+ }
133
+ }
134
+ catch (error) {
135
+ logger.warn(`[Client] LLM task analysis failed: ${error}, falling back to STANDARD`);
136
+ }
137
+ // Fallback: STANDARD with generic requirement
138
+ return {
139
+ level: this.failureCount > 0 ? InvolvementLevel.THOROUGH : InvolvementLevel.STANDARD,
140
+ requirements: [{ type: 'other', description: 'General task completion' }],
141
+ };
142
+ }
143
+ /**
144
+ * @deprecated Use analyzeTaskRequirements() instead. Kept for reference.
145
+ * Determine involvement level based on user request complexity
146
+ */
147
+ determineInvolvementLevel(userMessage, subtasks) {
148
+ const messageLower = userMessage.toLowerCase();
149
+ const subtasksLower = subtasks.join(' ').toLowerCase();
150
+ // THOROUGH: User explicitly requests testing/verification
151
+ const testKeywords = ['test', 'verify', 'check', 'make sure', 'ensure', 'validate'];
152
+ if (testKeywords.some(kw => messageLower.includes(kw))) {
153
+ logger.debug('[Client] THOROUGH mode: Testing/verification requested');
154
+ return InvolvementLevel.THOROUGH;
155
+ }
156
+ // THOROUGH: After failures (user frustrated)
157
+ if (this.failureCount > 0) {
158
+ logger.debug(`[Client] THOROUGH mode: ${this.failureCount} previous failures detected`);
159
+ return InvolvementLevel.THOROUGH;
160
+ }
161
+ // THOROUGH: Complex multi-file operations
162
+ if (subtasks.length > 3 || (messageLower.includes('component') && messageLower.includes('index.html'))) {
163
+ logger.debug('[Client] THOROUGH mode: Complex multi-file operation');
164
+ return InvolvementLevel.THOROUGH;
165
+ }
166
+ // MINIMAL: Information-only requests
167
+ const infoKeywords = ['what', 'list', 'show', 'explain', 'describe', 'how', 'tell me'];
168
+ const creationKeywords = ['create', 'build', 'write', 'generate', 'make', 'add'];
169
+ const hasInfoKeyword = infoKeywords.some(kw => messageLower.includes(kw));
170
+ const hasCreationKeyword = creationKeywords.some(kw => messageLower.includes(kw));
171
+ if (hasInfoKeyword && !hasCreationKeyword) {
172
+ logger.debug('[Client] MINIMAL mode: Information request');
173
+ return InvolvementLevel.MINIMAL;
174
+ }
175
+ // STANDARD: Default for creation/modification tasks
176
+ logger.debug('[Client] STANDARD mode: Regular creation task');
177
+ return InvolvementLevel.STANDARD;
178
+ }
179
+ /**
180
+ * @deprecated Use analyzeTaskRequirements() instead. Kept for reference.
181
+ * Parse requirements from user message
182
+ */
183
+ parseRequirements(userMessage, subtasks) {
184
+ const requirements = [];
185
+ const messageLower = userMessage.toLowerCase();
186
+ const combined = (messageLower + ' ' + subtasks.join(' ').toLowerCase());
187
+ // Detect file creation requirements
188
+ const fileMatches = userMessage.match(/(?:create|build|generate|write|save as)\s+([a-zA-Z0-9._/-]+\.(html|js|css|md|json|txt|py|ts|tsx|jsx))/gi);
189
+ if (fileMatches) {
190
+ fileMatches.forEach(match => {
191
+ const filename = match.split(/\s+/).pop();
192
+ if (filename) {
193
+ requirements.push({
194
+ type: 'file_creation',
195
+ description: `Create file: ${filename}`,
196
+ filePath: filename,
197
+ });
198
+ }
199
+ });
200
+ }
201
+ // Detect testing requirements (explicit verification requests)
202
+ const testKeywords = ['test', 'verify', 'check', 'make sure', 'ensure'];
203
+ if (testKeywords.some(kw => combined.includes(kw))) {
204
+ requirements.push({
205
+ type: 'testing',
206
+ description: 'Verify functionality through testing',
207
+ mustUseTools: ['playwright__'],
208
+ });
209
+ }
210
+ // Detect browser verification requirements - ONLY for file verification, not general browsing
211
+ // This should trigger for "open index.html in browser" but NOT for "open linkedin.com"
212
+ const isLocalFileOpen = (combined.includes('open') && combined.includes('.html')) ||
213
+ (combined.includes('browser') && combined.includes('.html'));
214
+ const isExternalUrl = combined.match(/open\s+(?:https?:\/\/)?(?:www\.)?[a-z0-9-]+\.[a-z]{2,}/i);
215
+ if (isLocalFileOpen && !isExternalUrl) {
216
+ requirements.push({
217
+ type: 'verification',
218
+ description: 'Browser testing required for local HTML file',
219
+ mustUseTools: ['playwright__browser_navigate', 'playwright__browser_console_messages'],
220
+ });
221
+ }
222
+ // For external URLs, don't require specific tools - just opening the page is enough
223
+ if (isExternalUrl) {
224
+ requirements.push({
225
+ type: 'verification',
226
+ description: 'Open external URL',
227
+ // No mustUseTools - Worker just needs to navigate, Client shouldn't demand specific validation tools
228
+ });
229
+ }
230
+ // Default: at least verify Worker did some work
231
+ if (requirements.length === 0) {
232
+ requirements.push({
233
+ type: 'other',
234
+ description: 'General task completion',
235
+ });
236
+ }
237
+ return requirements;
238
+ }
239
+ /**
240
+ * Validate Worker's work at appropriate involvement level
241
+ */
242
+ async validate(userMessage, subtasks, workerResult, involvementLevel) {
243
+ // Use LLM-based analysis instead of keyword matching
244
+ const { level: analyzedLevel, requirements } = await this.analyzeTaskRequirements(userMessage, subtasks, workerResult);
245
+ let level = involvementLevel || analyzedLevel;
246
+ // CRITICAL: Use LLM to check for unjustified failure claims BEFORE other validation
247
+ // Even in MINIMAL mode, we must catch agents giving up without trying
248
+ const failureAnalysis = await this.analyzeForUnjustifiedFailure(userMessage, workerResult);
249
+ if (failureAnalysis.claimsFailure && !failureAnalysis.hasEvidence) {
250
+ logger.info(`[Client] Detected unjustified failure claim - escalating from ${level} to STANDARD`);
251
+ logger.info(`[Client] LLM reasoning: ${failureAnalysis.reasoning}`);
252
+ logger.info(`[Client] Tools attempted: ${workerResult.toolsUsed.length}`);
253
+ // Escalate to at least STANDARD to properly validate
254
+ if (level === InvolvementLevel.MINIMAL) {
255
+ level = InvolvementLevel.STANDARD;
256
+ }
257
+ }
258
+ logger.info(`[Client] Validating with ${level.toUpperCase()} involvement`);
259
+ // Log the analysis for orchestration tracing
260
+ orchestrationLogger.logClientAnalysis(level, requirements.length, `Requirements: ${requirements.map(r => r.type).join(', ')}`);
261
+ const result = {
262
+ approved: false,
263
+ requirementsMet: false,
264
+ issues: [],
265
+ involvementLevel: level,
266
+ };
267
+ // Layer 0: Unjustified Failure Detection (always done first)
268
+ if (failureAnalysis.claimsFailure && !failureAnalysis.hasEvidence) {
269
+ const failureIssue = failureAnalysis.suggestedAction ||
270
+ `REJECTED: Worker claims failure without sufficient evidence. ${failureAnalysis.reasoning}`;
271
+ result.issues.push(failureIssue);
272
+ }
273
+ // Layer 0.5: Result-vs-Evidence Coherence Check (always done, catches hallucinated accomplishments)
274
+ // This detects when the Worker claims to have done things its tool usage doesn't support
275
+ const coherenceAnalysis = await this.analyzeResultCoherence(userMessage, workerResult);
276
+ orchestrationLogger.logClientCoherenceCheck(coherenceAnalysis.isCoherent, coherenceAnalysis.unsupportedClaims, coherenceAnalysis.reasoning);
277
+ if (!coherenceAnalysis.isCoherent) {
278
+ logger.info(`[Client] Detected incoherent result — Worker claims not supported by tool usage`);
279
+ logger.info(`[Client] Unsupported claims: ${coherenceAnalysis.unsupportedClaims.join('; ')}`);
280
+ logger.info(`[Client] Coherence reasoning: ${coherenceAnalysis.reasoning}`);
281
+ const coherenceIssue = coherenceAnalysis.suggestedAction ||
282
+ `REJECTED: Worker's result contains claims not supported by its actual tool usage. ${coherenceAnalysis.reasoning}`;
283
+ result.issues.push(coherenceIssue);
284
+ // Escalate involvement level — the Worker is hallucinating, we need stricter validation
285
+ if (level === InvolvementLevel.MINIMAL) {
286
+ level = InvolvementLevel.STANDARD;
287
+ result.involvementLevel = level;
288
+ logger.info(`[Client] Escalating to STANDARD due to incoherent result`);
289
+ }
290
+ }
291
+ // Layer 1: Process Validation (always done, no tools needed)
292
+ const processValidation = this.validateProcess(requirements, workerResult, level);
293
+ if (processValidation.issues.length > 0) {
294
+ result.issues.push(...processValidation.issues);
295
+ }
296
+ // Layer 2: Outcome Validation (only for STANDARD and THOROUGH)
297
+ if (level === InvolvementLevel.STANDARD || level === InvolvementLevel.THOROUGH) {
298
+ const outcomeValidation = await this.validateOutcome(requirements, workerResult, level);
299
+ if (outcomeValidation.issues.length > 0) {
300
+ result.issues.push(...outcomeValidation.issues);
301
+ }
302
+ }
303
+ // Determine approval
304
+ result.requirementsMet = result.issues.length === 0;
305
+ result.approved = result.requirementsMet;
306
+ if (!result.approved && result.issues.length > 0) {
307
+ // Generate an actionable correction instruction via LLM instead of echoing raw validation issues
308
+ result.nextAction = await this.generateCorrectionInstruction(userMessage, subtasks.join('; '), result.issues, workerResult);
309
+ this.failureCount++;
310
+ }
311
+ else {
312
+ this.failureCount = 0; // Reset on success
313
+ }
314
+ // Log the validation outcome
315
+ orchestrationLogger.logClientValidation(result.approved, result.issues, result.nextAction);
316
+ return result;
317
+ }
318
+ /**
319
+ * Use LLM to analyze worker result for unjustified failure claims
320
+ * This is language-agnostic and captures semantic meaning
321
+ */
322
+ async analyzeForUnjustifiedFailure(userMessage, workerResult) {
323
+ const toolCount = workerResult.toolsUsed.length;
324
+ const toolList = workerResult.toolsUsed.join(', ') || 'none';
325
+ const analysisPrompt = `You are a quality control agent. Analyze the following Worker response to determine if it's claiming failure and whether that failure is justified.
326
+
327
+ USER REQUEST: ${userMessage}
328
+
329
+ WORKER RESPONSE:
330
+ ${workerResult.result}
331
+
332
+ WORKER REASONING: ${workerResult.reasoning}
333
+
334
+ TOOLS USED: ${toolList} (${toolCount} total)
335
+ WORKER SUCCESS FLAG: ${workerResult.success}
336
+
337
+ Analyze and respond in this EXACT JSON format:
338
+ {
339
+ "claimsFailure": <true if the response indicates the task cannot/could not be done, or refuses to do it>,
340
+ "hasEvidence": <true if there is concrete evidence justifying the failure (actual error messages, specific technical blockers, permission issues, etc.)>,
341
+ "reasoning": "<brief explanation of your analysis>",
342
+ "suggestedAction": "<if claimsFailure is true and hasEvidence is false, provide a specific instruction for the Worker to actually attempt the task>"
343
+ }
344
+
345
+ IMPORTANT CRITERIA:
346
+ - If the Worker used 0 tools and claims failure, hasEvidence should be false (they didn't even try)
347
+ - If the Worker claims something is "impossible" or "cannot be done" without showing actual error messages, hasEvidence should be false
348
+ - Legitimate evidence includes: actual error output, specific file/permission errors, concrete technical limitations
349
+ - Vague reasons like "I don't have the ability" or "this is outside my scope" are NOT evidence
350
+
351
+ Respond ONLY with the JSON, no other text.`;
352
+ try {
353
+ const response = await this.orchestrator.chat({
354
+ messages: [
355
+ { role: 'system', content: 'You are a strict quality control validator. Respond only with valid JSON.' },
356
+ { role: 'user', content: analysisPrompt },
357
+ ],
358
+ temperature: 0.1, // Low temperature for consistent analysis
359
+ });
360
+ // Parse the JSON response
361
+ const jsonMatch = response.content.match(/\{[\s\S]*\}/);
362
+ if (jsonMatch) {
363
+ const analysis = JSON.parse(jsonMatch[0]);
364
+ return analysis;
365
+ }
366
+ }
367
+ catch (error) {
368
+ logger.debug(`[Client] Failed to analyze failure claim: ${error}`);
369
+ }
370
+ // Default: assume no failure claim if analysis fails
371
+ return {
372
+ claimsFailure: false,
373
+ hasEvidence: true,
374
+ reasoning: 'Analysis could not be performed',
375
+ };
376
+ }
377
+ /**
378
+ * Use LLM to cross-check the Worker's result claims against its actual tool usage.
379
+ * Catches hallucinated accomplishments — e.g., Worker claims "I inspected all source files
380
+ * and found no bugs" but only used list_directory and never read a single file.
381
+ * This runs at ALL involvement levels including MINIMAL.
382
+ */
383
+ async analyzeResultCoherence(userMessage, workerResult) {
384
+ // Skip coherence check if Worker used no tools (caught by zero-tools guard)
385
+ // or if Worker explicitly failed (caught by failure analysis)
386
+ if (workerResult.toolsUsed.length === 0 || !workerResult.success) {
387
+ return { isCoherent: true, reasoning: 'Skipped — handled by other checks', unsupportedClaims: [] };
388
+ }
389
+ const toolList = workerResult.toolsUsed.join(', ');
390
+ const uniqueTools = [...new Set(workerResult.toolsUsed)].join(', ');
391
+ const coherencePrompt = `You are a strict quality auditor. Your job is to determine whether a Worker agent's result is SUPPORTED by the tools it actually used, or whether it fabricated/hallucinated claims.
392
+
393
+ USER REQUEST: ${userMessage}
394
+
395
+ WORKER RESULT:
396
+ ${workerResult.result.substring(0, 1000)}
397
+
398
+ TOOLS ACTUALLY USED (in order): ${toolList}
399
+ UNIQUE TOOLS USED: ${uniqueTools}
400
+ TOTAL TOOL CALLS: ${workerResult.toolsUsed.length}
401
+
402
+ CRITICAL: Analyze whether the claims in the Worker's result are supported by the tools it used.
403
+
404
+ Key tool semantics:
405
+ - filesystem__list_directory / filesystem__directory_tree / filesystem__search_files = only shows file/folder NAMES and structure, does NOT read file contents
406
+ - filesystem__read_text_file / filesystem__read_file = actually reads file content
407
+ - mcp-shell-server__shell_exec = runs a shell command (check what command was likely run based on context)
408
+ - playwright__* = browser automation tools
409
+
410
+ Common hallucination patterns to detect:
411
+ 1. Worker claims to have "inspected", "reviewed", "analyzed", or "scanned" source code but never used read_text_file — it only listed directories
412
+ 2. Worker claims "no bugs found" or "code is correct" without reading any source files
413
+ 3. Worker claims to have run tests or builds but no shell_exec tool was used (or the result doesn't reference actual test output)
414
+ 4. Worker provides specific code details (line numbers, variable names, function logic) without having read the files containing them
415
+ 5. Worker makes definitive statements about code quality, correctness, or behavior without having read the relevant code
416
+
417
+ Respond ONLY with valid JSON:
418
+ {
419
+ "isCoherent": <true if ALL claims in the result are supported by actual tool usage, false if any claims are fabricated>,
420
+ "reasoning": "<brief explanation of what's supported vs what's fabricated>",
421
+ "unsupportedClaims": ["<list each specific claim that is NOT supported by tool usage>"],
422
+ "suggestedAction": "<if not coherent, what the Worker should actually do — e.g., 'Read the source files using filesystem__read_text_file before claiming to have analyzed them'>"
423
+ }`;
424
+ try {
425
+ const response = await this.orchestrator.chat({
426
+ messages: [
427
+ { role: 'system', content: 'You are a strict quality auditor. Respond only with valid JSON.' },
428
+ { role: 'user', content: coherencePrompt },
429
+ ],
430
+ temperature: 0.1,
431
+ });
432
+ const jsonMatch = response.content.match(/\{[\s\S]*\}/);
433
+ if (jsonMatch) {
434
+ const analysis = JSON.parse(jsonMatch[0]);
435
+ return analysis;
436
+ }
437
+ }
438
+ catch (error) {
439
+ logger.debug(`[Client] Failed to analyze result coherence: ${error}`);
440
+ }
441
+ // Default: assume coherent if analysis fails
442
+ return {
443
+ isCoherent: true,
444
+ reasoning: 'Coherence analysis could not be performed',
445
+ unsupportedClaims: [],
446
+ };
447
+ }
448
+ /**
449
+ * Layer 1: Process Validation (metadata only, no tools)
450
+ */
451
+ validateProcess(requirements, workerResult, involvementLevel) {
452
+ const issues = [];
453
+ // Zero-tools guard: if Worker used no tools at all and this is not a purely
454
+ // informational/conversational task, reject immediately
455
+ if (workerResult.toolsUsed.length === 0 && involvementLevel !== InvolvementLevel.MINIMAL) {
456
+ const isConversational = requirements.every(r => r.type === 'information' || r.type === 'other');
457
+ if (!isConversational) {
458
+ issues.push('Worker completed the task without using any tools. ' +
459
+ 'The task requires actual tool usage (file operations, shell commands, browser actions, etc.) — ' +
460
+ 'not just generating a text response. Use the available tools to actually perform the task.');
461
+ }
462
+ }
463
+ // Check if Worker used appropriate tools for requirements
464
+ for (const req of requirements) {
465
+ if (req.mustUseTools && req.mustUseTools.length > 0) {
466
+ const usedRequiredTool = req.mustUseTools.some(requiredTool => {
467
+ return workerResult.toolsUsed.some(actualTool => actualTool === requiredTool || actualTool.startsWith(requiredTool) || requiredTool.startsWith(actualTool.split('__')[0] + '__'));
468
+ });
469
+ if (!usedRequiredTool) {
470
+ issues.push(`${req.description} requires using ${req.mustUseTools.join(' or ')} but Worker did not use these tools.`);
471
+ }
472
+ }
473
+ }
474
+ // Check if Worker succeeded
475
+ if (!workerResult.success) {
476
+ issues.push('Worker did not complete the task successfully. The task needs to be retried with appropriate tool usage.');
477
+ }
478
+ return { issues };
479
+ }
480
+ /**
481
+ * Layer 2: Outcome Validation (uses read-only tools)
482
+ */
483
+ async validateOutcome(requirements, workerResult, level) {
484
+ const issues = [];
485
+ // File existence validation
486
+ for (const req of requirements) {
487
+ if (req.type === 'file_creation' && req.filePath) {
488
+ try {
489
+ const exists = await this.fileExists(req.filePath);
490
+ if (!exists) {
491
+ issues.push(`Required file not created: ${req.filePath}`);
492
+ }
493
+ else if (level === InvolvementLevel.THOROUGH) {
494
+ // For THOROUGH, also validate file contents
495
+ const validation = await this.validateFileContents(req.filePath);
496
+ if (!validation.valid) {
497
+ issues.push(`File ${req.filePath} created but has issues: ${validation.issue}`);
498
+ }
499
+ }
500
+ }
501
+ catch (error) {
502
+ logger.debug(`[Client] Error validating file ${req.filePath}: ${error}`);
503
+ }
504
+ }
505
+ }
506
+ // Browser testing validation (only for THOROUGH)
507
+ if (level === InvolvementLevel.THOROUGH) {
508
+ for (const req of requirements) {
509
+ if ((req.type === 'testing' || req.type === 'verification') &&
510
+ workerResult.result.includes('.html')) {
511
+ // Extract HTML filename from result
512
+ const htmlMatch = workerResult.result.match(/([a-zA-Z0-9._-]+\.html)/);
513
+ if (htmlMatch) {
514
+ const htmlFile = htmlMatch[1];
515
+ const browserValidation = await this.validateInBrowser(htmlFile);
516
+ if (!browserValidation.valid) {
517
+ issues.push(browserValidation.issue);
518
+ }
519
+ }
520
+ }
521
+ }
522
+ }
523
+ return { issues };
524
+ }
525
+ /**
526
+ * Check if file exists using read tool
527
+ */
528
+ async fileExists(filePath) {
529
+ try {
530
+ await this.mcpClient.executeTool('filesystem__read_text_file', {
531
+ path: filePath,
532
+ head: 1,
533
+ });
534
+ return true;
535
+ }
536
+ catch (error) {
537
+ return false;
538
+ }
539
+ }
540
+ /**
541
+ * Use LLM to generate an actionable correction instruction from raw validation issues.
542
+ * Instead of echoing "requires using playwright__ but Worker did not use these tools",
543
+ * produces something like "Use the filesystem tools to list ~/Library/Caches and report sizes".
544
+ */
545
+ async generateCorrectionInstruction(userMessage, subtask, issues, workerResult) {
546
+ const correctionPrompt = `You are generating a correction instruction for a Worker agent that failed to complete a task properly.
547
+
548
+ ORIGINAL USER REQUEST: ${userMessage}
549
+
550
+ SUBTASK THAT WAS ATTEMPTED: ${subtask}
551
+
552
+ VALIDATION ISSUES FOUND:
553
+ ${issues.map((issue, i) => `${i + 1}. ${issue}`).join('\n')}
554
+
555
+ WORKER'S RESULT (first 300 chars): ${workerResult.result.substring(0, 300)}
556
+ TOOLS WORKER USED: ${workerResult.toolsUsed.join(', ') || 'none'}
557
+
558
+ Generate a CLEAR, ACTIONABLE instruction that tells the Worker exactly what to do to fix the issues.
559
+ The instruction should:
560
+ - Be a direct command (e.g., "Use the run_process tool to execute 'du -sh ~/Library/Caches' and report the output")
561
+ - Reference specific tools or actions the Worker should take
562
+ - Be concise (1-2 sentences)
563
+ - NOT include validation jargon like "mustUseTools", "requirements", or "involvement level"
564
+ - NOT be a generic statement like "retry the task" — be specific about WHAT to do
565
+
566
+ Respond ONLY with the correction instruction text, nothing else.`;
567
+ try {
568
+ const response = await this.orchestrator.chat({
569
+ messages: [
570
+ { role: 'system', content: 'You generate concise, actionable correction instructions for a Worker agent. Respond with only the instruction text.' },
571
+ { role: 'user', content: correctionPrompt },
572
+ ],
573
+ temperature: 0.1,
574
+ });
575
+ const instruction = response.content.trim();
576
+ if (instruction.length > 10 && instruction.length < 500) {
577
+ return instruction;
578
+ }
579
+ }
580
+ catch (error) {
581
+ logger.warn(`[Client] Failed to generate correction instruction: ${error}`);
582
+ }
583
+ // Fallback: use first issue with a prefix
584
+ return `Fix the following issue and retry: ${issues[0]}`;
585
+ }
586
+ /**
587
+ * Validate file contents for common issues
588
+ */
589
+ async validateFileContents(filePath) {
590
+ try {
591
+ const content = await this.mcpClient.executeTool('filesystem__read_text_file', {
592
+ path: filePath,
593
+ head: 200,
594
+ });
595
+ const contentStr = typeof content === 'string' ? content : JSON.stringify(content);
596
+ // Check for HTML path mismatches
597
+ if (filePath.endsWith('.html')) {
598
+ const hrefMatches = contentStr.match(/href="([^"]+)"/g) || [];
599
+ const srcMatches = contentStr.match(/src="([^"]+)"/g) || [];
600
+ for (const match of [...hrefMatches, ...srcMatches]) {
601
+ const pathMatch = match.match(/(?:href|src)="([^"]+)"/);
602
+ if (pathMatch) {
603
+ const referencedPath = pathMatch[1];
604
+ if (!referencedPath.startsWith('http') && !referencedPath.startsWith('data:')) {
605
+ const exists = await this.fileExists(referencedPath);
606
+ if (!exists) {
607
+ return {
608
+ valid: false,
609
+ issue: `HTML references non-existent file: ${referencedPath}. Fix file paths or create missing files.`
610
+ };
611
+ }
612
+ }
613
+ }
614
+ }
615
+ }
616
+ return { valid: true };
617
+ }
618
+ catch (error) {
619
+ return { valid: false, issue: `Could not read file: ${error}` };
620
+ }
621
+ }
622
+ /**
623
+ * Validate HTML file in browser
624
+ */
625
+ async validateInBrowser(htmlFile) {
626
+ try {
627
+ // Navigate to file
628
+ const workspaceDir = process.cwd();
629
+ const fileUrl = `file://${workspaceDir}/${htmlFile}`;
630
+ await this.mcpClient.executeTool('playwright__browser_navigate', {
631
+ url: fileUrl,
632
+ });
633
+ // Check for console errors
634
+ const errors = await this.mcpClient.executeTool('playwright__browser_console_messages', {
635
+ level: 'error',
636
+ });
637
+ const errorStr = typeof errors === 'string' ? errors : JSON.stringify(errors);
638
+ if (errorStr && errorStr.length > 0 && !errorStr.includes('[]')) {
639
+ return {
640
+ valid: false,
641
+ issue: `Browser errors detected in ${htmlFile}: ${errorStr}. Fix these errors before delivery.`
642
+ };
643
+ }
644
+ return { valid: true };
645
+ }
646
+ catch (error) {
647
+ logger.debug(`[Client] Browser validation error: ${error}`);
648
+ // Don't fail validation if browser test fails - might not have browser available
649
+ return { valid: true };
650
+ }
651
+ }
652
+ /**
653
+ * Reset failure tracking (for new conversation/session)
654
+ */
655
+ resetFailureTracking() {
656
+ this.failureCount = 0;
657
+ }
658
+ }
659
+ //# sourceMappingURL=client-agent.js.map