halo-agent 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/vision.js ADDED
@@ -0,0 +1,398 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Claude computer-use vision driver for HALO agent.
5
+ *
6
+ * Main entry points (a third export, visionFillSkipped, fills only the fields the DOM filler left empty):
7
+ *
8
+ * 1. visionNavigateAndSubmit(page, aep, apiKey)
9
+ * Targeted use: DOM filling already ran, but submit button not found.
10
+ * Claude looks at the current page state and completes the navigation + submission.
11
+ * Used as a fallback after fillFields() + pagination have run.
12
+ *
13
+ * 2. visionFill(page, aep, apiKey)
14
+ * Full fallback: fills all visible fields from scratch via vision.
15
+ * Used for ATS types that completely defeat DOM automation (Workday canvas, Taleo).
16
+ *
17
+ * Both use a proper action loop:
18
+ * screenshot -> Claude decides action -> execute in Playwright -> screenshot -> repeat
19
+ * until Claude signals done or max iterations reached.
20
+ */
21
+
22
+ const CLAUDE_MODEL = 'claude-opus-4-6'; // Model id sent with every Messages API request. NOTE(review): confirm this model accepts the computer_20241022 tool type requested in runActionLoop.
23
+ const MAX_ITERATIONS = 40; // Default cap on request/response rounds in runActionLoop (callers may pass a lower limit).
24
+ const SCREENSHOT_QUALITY = 75; // JPEG quality passed to every page.screenshot() call in this module.
25
+
/**
 * Aliases from the xdotool-style key names the computer-use tool typically
 * emits ("ctrl", "Return", "Page_Down") to the names Playwright's keyboard API
 * accepts. Lookups are lower-cased; names not listed are passed through as-is.
 */
const VISION_KEY_ALIASES = new Map([
  ['ctrl', 'Control'], ['control', 'Control'], ['control_l', 'Control'], ['control_r', 'Control'],
  ['alt', 'Alt'], ['alt_l', 'Alt'], ['alt_r', 'Alt'], ['option', 'Alt'],
  ['shift', 'Shift'], ['shift_l', 'Shift'], ['shift_r', 'Shift'],
  ['super', 'Meta'], ['super_l', 'Meta'], ['super_r', 'Meta'],
  ['cmd', 'Meta'], ['command', 'Meta'], ['meta', 'Meta'], ['win', 'Meta'], ['windows', 'Meta'],
  ['return', 'Enter'], ['enter', 'Enter'],
  ['esc', 'Escape'], ['escape', 'Escape'],
  ['tab', 'Tab'], ['space', 'Space'], ['backspace', 'Backspace'],
  ['delete', 'Delete'], ['del', 'Delete'], ['insert', 'Insert'],
  ['home', 'Home'], ['end', 'End'],
  ['pageup', 'PageUp'], ['page_up', 'PageUp'], ['prior', 'PageUp'],
  ['pagedown', 'PageDown'], ['page_down', 'PageDown'], ['next', 'PageDown'],
  ['up', 'ArrowUp'], ['down', 'ArrowDown'], ['left', 'ArrowLeft'], ['right', 'ArrowRight'],
  ['arrowup', 'ArrowUp'], ['arrowdown', 'ArrowDown'], ['arrowleft', 'ArrowLeft'], ['arrowright', 'ArrowRight'],
]);

/** Translate one key name into the spelling Playwright expects. */
function normalizeVisionKey(name) {
  const lowered = name.toLowerCase();
  if (VISION_KEY_ALIASES.has(lowered)) {
    return VISION_KEY_ALIASES.get(lowered);
  }
  // Function keys must be upper-case for Playwright ("f5" -> "F5").
  if (/^f\d{1,2}$/.test(lowered)) {
    return lowered.toUpperCase();
  }
  return name;
}

/** Validate an optional [x, y] pair; returns undefined when none was supplied. */
function toVisionPoint(value, label) {
  if (value == null) {
    return undefined;
  }
  const isPair = Array.isArray(value)
    && value.length >= 2
    && Number.isFinite(value[0])
    && Number.isFinite(value[1]);
  if (!isPair) {
    throw new Error(`Invalid ${label}: ${JSON.stringify(value)}`);
  }
  return [value[0], value[1]];
}

/**
 * Click at `point`, or at the current pointer position when no point is given
 * (some tool versions position the pointer with a preceding mouse_move and
 * then send the click without coordinates).
 */
async function clickVisionPoint(page, point, { button = 'left', clickCount = 1 } = {}) {
  if (point) {
    await page.mouse.click(point[0], point[1], { button, clickCount });
    return;
  }
  // mouse.down()/up() act at the pointer's current position.
  for (let n = 1; n <= clickCount; n++) {
    await page.mouse.down({ button, clickCount: n });
    await page.mouse.up({ button, clickCount: n });
  }
}

/**
 * Press one or more whitespace-separated key chords such as "ctrl+a" or
 * "Tab Tab". Modifiers are always released, even if the main key press throws.
 */
async function pressVisionKeys(page, spec) {
  // A spec consisting only of whitespace (e.g. " ") is treated as a literal key.
  const chords = spec.trim() === '' ? [spec] : spec.trim().split(/\s+/);
  for (const chord of chords) {
    const rawNames = chord.length > 1 ? chord.split('+').filter(Boolean) : [chord];
    if (rawNames.length === 0) {
      throw new Error(`Invalid key combination: ${JSON.stringify(chord)}`);
    }
    const names = rawNames.map(normalizeVisionKey);
    const mainKey = names.pop();
    const held = [];
    try {
      for (const modifier of names) {
        await page.keyboard.down(modifier);
        held.push(modifier);
      }
      await page.keyboard.press(mainKey);
    } finally {
      for (const modifier of held.reverse()) {
        await page.keyboard.up(modifier);
      }
    }
  }
}

/**
 * Execute a single computer-use action returned by Claude.
 *
 * Accepted input fields: `action` (type), `coordinate`, `text`, plus the
 * optional `key`, `start_coordinate`, `end_coordinate`, `scroll_direction`,
 * `scroll_amount` / `scroll_distance`. Unknown action types are logged and
 * otherwise ignored.
 *
 * @param {object} page - Playwright page (uses page.mouse, page.keyboard,
 *   waitForTimeout, waitForLoadState and screenshot).
 * @param {object} action - The `input` object of a `computer` tool_use block.
 * @returns {Promise<Buffer>} A fresh JPEG screenshot taken after the action.
 * @throws {Error} When a required field (text, key, coordinate) is missing or
 *   malformed, or when Playwright rejects the interaction.
 */
async function executeAction(page, action) {
  const { action: type, coordinate, text, key } = action;
  const point = toVisionPoint(coordinate, 'coordinate');

  switch (type) {
    case 'screenshot':
      // Claude is checking its work: no interaction, just return a fresh screenshot.
      break;

    case 'left_click':
      await clickVisionPoint(page, point);
      await page.waitForTimeout(400);
      break;

    case 'double_click':
      await clickVisionPoint(page, point, { clickCount: 2 });
      await page.waitForTimeout(400);
      break;

    case 'triple_click':
      await clickVisionPoint(page, point, { clickCount: 3 });
      await page.waitForTimeout(400);
      break;

    case 'right_click':
      await clickVisionPoint(page, point, { button: 'right' });
      await page.waitForTimeout(300);
      break;

    case 'middle_click':
      await clickVisionPoint(page, point, { button: 'middle' });
      await page.waitForTimeout(300);
      break;

    case 'mouse_move':
      if (!point) {
        throw new Error('mouse_move requires a coordinate');
      }
      await page.mouse.move(point[0], point[1]);
      break;

    case 'left_click_drag': {
      const explicitEnd = toVisionPoint(action.end_coordinate, 'end_coordinate');
      const explicitStart = toVisionPoint(action.start_coordinate, 'start_coordinate');
      // With `end_coordinate`, `coordinate` is the start (this module's original
      // convention). Otherwise `coordinate` is the drop target and the drag
      // begins at `start_coordinate` or, failing that, the current pointer position.
      const from = explicitStart ?? (explicitEnd ? point : undefined);
      const to = explicitEnd ?? point;
      if (!to) {
        throw new Error('left_click_drag requires a coordinate');
      }
      if (from) {
        await page.mouse.move(from[0], from[1]);
      }
      await page.mouse.down();
      try {
        await page.mouse.move(to[0], to[1]);
      } finally {
        // Never leave the button held if the move fails.
        await page.mouse.up();
      }
      break;
    }

    case 'type':
      if (typeof text !== 'string') {
        throw new Error('type action requires a text string');
      }
      await page.keyboard.type(text, { delay: 25 });
      break;

    case 'key': {
      // The tool schema carries the key name in `text`; `key` is kept as a fallback.
      const spec = key ?? text;
      if (typeof spec !== 'string' || spec === '') {
        throw new Error('key action requires a key name');
      }
      await pressVisionKeys(page, spec);
      await page.waitForTimeout(200);
      break;
    }

    case 'scroll': {
      if (point) {
        await page.mouse.move(point[0], point[1]);
      }
      const requested = action.scroll_amount ?? action.scroll_distance;
      const notches = Number.isFinite(requested) && requested > 0 ? requested : 3;
      const delta = notches * 100;
      const direction = action.scroll_direction;
      if (direction === 'left' || direction === 'right') {
        await page.mouse.wheel(direction === 'left' ? -delta : delta, 0);
      } else {
        // Anything other than "up" scrolls down, as before.
        await page.mouse.wheel(0, direction === 'up' ? -delta : delta);
      }
      await page.waitForTimeout(300);
      break;
    }

    default:
      console.warn(`[vision] Unknown action type: ${type}`);
  }

  // Best effort: most actions trigger no navigation, so a timeout here is not an error.
  await page.waitForLoadState('domcontentloaded', { timeout: 5000 }).catch(() => {});

  return page.screenshot({ type: 'jpeg', quality: SCREENSHOT_QUALITY });
}
102
+
/**
 * Core action loop: screenshot -> Claude -> execute tool calls -> screenshot,
 * repeated until Claude stops issuing `computer` tool calls, the iteration
 * limit is reached, or the API becomes unavailable.
 *
 * @param {object} page - Playwright page being driven.
 * @param {string} systemPrompt - System prompt for the Messages API request.
 * @param {string} userPrompt - Text sent alongside the initial screenshot.
 * @param {string} apiKey - Value of the `x-api-key` header.
 * @param {number} [maxIter=MAX_ITERATIONS] - Maximum number of API round trips.
 * @returns {Promise<{actionsExecuted: number, iterations: number, completed: boolean, error: (string|null)}>}
 *   `completed` is true when Claude ended the conversation on its own;
 *   `error` holds the failure description when the loop was cut short by an
 *   API or network failure after at least one action had run.
 * @throws {Error} When the API call fails before any action was executed, so
 *   callers do not report success for a run that never started.
 */
async function runActionLoop(page, systemPrompt, userPrompt, apiKey, maxIter = MAX_ITERATIONS) {
  // Upper bound for a single API round trip, so a stalled connection cannot hang the run.
  const requestTimeoutMs = 120000;

  // Claude needs the viewport size to know the coordinate space.
  const viewport = page.viewportSize() || { width: 1280, height: 800 };
  const jpegOptions = { type: 'jpeg', quality: SCREENSHOT_QUALITY };
  const toImageBlock = (buffer) => ({
    type: 'image',
    source: { type: 'base64', media_type: 'image/jpeg', data: buffer.toString('base64') },
  });

  const screenshot = await page.screenshot(jpegOptions);
  const messages = [
    {
      role: 'user',
      content: [toImageBlock(screenshot), { type: 'text', text: userPrompt }],
    },
  ];

  let iteration = 0;
  let actionsExecuted = 0;
  let completed = false;
  let error = null;

  while (iteration < maxIter) {
    iteration++;

    let response;
    try {
      const res = await fetch('https://api.anthropic.com/v1/messages', {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'x-api-key': apiKey,
          'anthropic-version': '2023-06-01',
          // NOTE(review): beta header and tool type below are the Oct-2024
          // computer-use revision; confirm they are accepted for CLAUDE_MODEL.
          'anthropic-beta': 'computer-use-2024-10-22',
        },
        body: JSON.stringify({
          model: CLAUDE_MODEL,
          max_tokens: 2048,
          system: systemPrompt,
          tools: [{
            type: 'computer_20241022',
            name: 'computer',
            display_width_px: viewport.width,
            display_height_px: viewport.height,
          }],
          messages,
        }),
        signal: AbortSignal.timeout(requestTimeoutMs),
      });

      if (!res.ok) {
        const errText = await res.text();
        console.error('[vision] Claude API error:', res.status, errText.slice(0, 200));
        error = `Claude API error ${res.status}: ${errText.slice(0, 200)}`;
        break;
      }

      response = await res.json();
    } catch (e) {
      console.error('[vision] Fetch error:', e.message);
      error = `Claude API request failed: ${e.message}`;
      break;
    }

    if (response.stop_reason === 'end_turn') {
      console.log(`[vision] Claude signaled completion after ${actionsExecuted} actions`);
      completed = true;
      break;
    }

    const toolUseBlocks = (response.content || []).filter(
      (b) => b.type === 'tool_use' && b.name === 'computer',
    );
    if (toolUseBlocks.length === 0) {
      console.log(`[vision] No more tool calls — done after ${actionsExecuted} actions`);
      completed = true;
      break;
    }

    // The assistant turn must be in the history before its tool results.
    messages.push({ role: 'assistant', content: response.content });

    const toolResults = [];
    for (const block of toolUseBlocks) {
      const action = block.input ?? {};
      const where = action.coordinate ? ` at [${action.coordinate}]` : '';
      const typed = typeof action.text === 'string' && action.text ? ` "${action.text.slice(0, 40)}"` : '';
      console.log(`[vision] Action: ${action.action}${where}${typed}`);

      let newScreenshot;
      let failure = null;
      try {
        newScreenshot = await executeAction(page, action);
        actionsExecuted++;
      } catch (e) {
        failure = e;
        console.warn(`[vision] Action execution failed: ${e.message}`);
        // Fall back to the initial screenshot if the page cannot be captured at all.
        newScreenshot = await page.screenshot(jpegOptions).catch(() => screenshot);
      }

      if (failure) {
        // Tell Claude the action failed instead of returning only an unchanged screenshot.
        toolResults.push({
          type: 'tool_result',
          tool_use_id: block.id,
          is_error: true,
          content: [
            { type: 'text', text: `Action failed: ${failure.message}` },
            toImageBlock(newScreenshot),
          ],
        });
      } else {
        toolResults.push({
          type: 'tool_result',
          tool_use_id: block.id,
          content: [toImageBlock(newScreenshot)],
        });
      }
    }

    // Feed the outcomes back so Claude sees the result of its actions.
    messages.push({ role: 'user', content: toolResults });
  }

  if (!completed && !error) {
    console.warn(`[vision] Stopped after reaching the ${maxIter}-iteration limit`);
  }

  // Nothing was done and the API is unusable: surface it so callers'
  // catch blocks report failure rather than an empty "success".
  if (error && actionsExecuted === 0) {
    throw new Error(error);
  }

  return { actionsExecuted, iterations: iteration, completed, error };
}
220
+
/**
 * Precision fill: asks Claude to fill only the fields that are still empty
 * after an earlier automated pass, without advancing or submitting the form.
 * (Per the original note, intended for runs where the DOM filler skipped or
 * failed on some fields.)
 *
 * @param {object} page - Playwright page showing the form.
 * @param {object} aep - Application data; read through buildAnswerSummary().
 * @param {string} apiKey - Anthropic API key; without it nothing is attempted.
 * @param {Map<string, *>} [alreadyFilledMap] - Map keyed by the labels of
 *   fields that are already filled; null/undefined is treated as empty.
 * @returns {Promise<{success: boolean, filled: number, reason?: string}>}
 *   `filled` is an estimate: half the executed actions, on the assumption of
 *   roughly one click plus one type per field.
 */
async function visionFillSkipped(page, aep, apiKey, alreadyFilledMap = new Map()) {
  if (!apiKey) {
    // Same warning the other vision entry points emit.
    console.warn('[vision] No API key — vision fallback skipped');
    return { success: false, filled: 0, reason: 'No API key' };
  }

  // A null map must not throw before the try block below can report a failure.
  const filledLabels = alreadyFilledMap ? [...alreadyFilledMap.keys()] : [];
  const alreadyFilledLabels = filledLabels.join(', ') || 'none';
  console.log('[vision] Precision fill for skipped fields...');

  const systemPrompt = `You are controlling a web browser to fill a job application form.
Some fields were already filled by an automated DOM filler. Your job is to fill only the REMAINING empty fields.

Rules:
- DO NOT re-fill fields that are already filled — they are listed below as "already filled"
- Fill only the visible fields that appear empty
- Click a field before typing, use clear+type pattern
- For dropdowns: click to open, wait for options, click the correct option
- For checkboxes/radio buttons: click to select the correct option
- Do NOT click Next, Continue, or Submit
- Work top to bottom, left to right
- If all fields appear filled, take a screenshot and stop

Already filled fields (DO NOT touch these): ${alreadyFilledLabels}`;

  const answers = buildAnswerSummary(aep);
  const userPrompt = `Fill the remaining empty fields with this candidate data:

${answers}

Skip any fields that are already filled. Focus only on empty fields.`;

  try {
    const result = await runActionLoop(page, systemPrompt, userPrompt, apiKey, 20);
    return { success: true, filled: Math.floor(result.actionsExecuted / 2) };
  } catch (e) {
    console.error('[vision] Precision fill error:', e.message);
    return { success: false, filled: 0, reason: e.message };
  }
}
264
+
/**
 * Targeted fallback for a form that has already been filled: Claude clicks
 * through Next/Continue pages to reach the review/submit page and clicks
 * Submit only when `autoSubmit` is enabled. (Per the original note, intended
 * for cases where the DOM-based submit-button lookup found nothing.)
 *
 * @param {object} page - Playwright page showing the form.
 * @param {object} aep - Application data; reads `profile_fill.first_name`,
 *   `profile_fill.last_name` and `job.company` for the prompt.
 * @param {string} apiKey - Anthropic API key; without it nothing is attempted.
 * @param {object} [options]
 * @param {boolean} [options.autoSubmit=false] - Whether Claude may click Submit.
 * @param {Map<string, *>} [options.alreadyFilled] - Map keyed by labels of
 *   fields that must not be re-filled; null/undefined is treated as empty.
 * @returns {Promise<{success: boolean, submitted?: boolean, actionsExecuted?: number, reason?: string}>}
 *   `submitted` is a heuristic based on confirmation wording in the final
 *   page text or URL.
 */
async function visionNavigateAndSubmit(page, aep, apiKey, { autoSubmit = false, alreadyFilled = new Map() } = {}) {
  if (!apiKey) {
    console.warn('[vision] No API key — vision fallback skipped');
    return { success: false, reason: 'No API key' };
  }

  console.log('[vision] Starting targeted navigation loop to find and submit...');

  // A null map must not throw before the try block below can report a failure.
  const filledLabels = alreadyFilled ? [...alreadyFilled.keys()] : [];
  const alreadyFilledLabels = filledLabels.join(', ') || 'none';

  const systemPrompt = `You are controlling a web browser to complete a job application form that has already been partially filled.
Your goal is to navigate to the final submission page and, if auto-submit is enabled, click the submit button.

Rules:
- Click Next/Continue buttons to advance through multi-page forms
- If you see a review page with all filled answers, that is the target state — take a screenshot and stop
- Only click Submit if autoSubmit is true (it is currently: ${autoSubmit})
- If the page requires login or shows an error, stop and describe the issue
- Do NOT re-fill fields that are already filled (already filled: ${alreadyFilledLabels})
- Do NOT close the browser window or tab`;

  const profile = aep?.profile_fill || {};
  // Avoid rendering "undefined undefined" when the profile has no name.
  const candidateName = [profile.first_name, profile.last_name].filter(Boolean).join(' ') || 'the candidate';

  // The previous wording interpolated the flag into the sentence
  // ("autoSubmit=false, click it" / "false is false"), which could read as an
  // instruction to submit when submission is disabled. State the rule outright.
  const submitStep = autoSubmit
    ? '3. If you see a Submit button, click it (autoSubmit is enabled)'
    : '3. If you see a Submit button, do NOT click it (autoSubmit is disabled) — just confirm you can see it and stop';

  const userPrompt = `The form is already filled with data for ${candidateName} applying to ${aep?.job?.company || 'this company'}.

Navigate to the final submit/review page:
1. If you see a Next or Continue button, click it
2. If you see a review page summarizing the application, stop (this is the target state)
${submitStep}

Current page: ${page.url()}`;

  try {
    const result = await runActionLoop(page, systemPrompt, userPrompt, apiKey, 20);
    console.log(`[vision] Navigation complete. Actions: ${result.actionsExecuted}`);

    // Heuristic check for a confirmation/thank-you page.
    const finalUrl = page.url();
    const pageText = (await page.textContent('body').catch(() => '')) ?? '';
    const isDone = /thank you|application submitted|application received|we.ve received|confirmation/i.test(pageText)
      || /thank|confirm|success|applied|submitted/i.test(finalUrl);

    return { success: true, submitted: isDone, actionsExecuted: result.actionsExecuted };
  } catch (e) {
    console.error('[vision] Navigation loop error:', e.message);
    return { success: false, reason: e.message };
  }
}
318
+
/**
 * Full vision fill: Claude fills every visible field on the current page from
 * the candidate data, without advancing or submitting the form.
 *
 * @param {object} page - Playwright page showing the form.
 * @param {object} aep - Application data; read through buildAnswerSummary().
 * @param {string} apiKey - Anthropic API key; without it nothing is attempted.
 * @param {object} [options]
 * @param {Map<string, *>} [options.alreadyFilled] - Map keyed by labels of
 *   fields to skip; null/undefined is treated as empty.
 * @returns {Promise<{success: boolean, filled: number, reason?: string}>}
 *   `filled` is the number of actions executed, not a count of fields.
 */
async function visionFill(page, aep, apiKey, { alreadyFilled = new Map() } = {}) {
  if (!apiKey) {
    console.warn('[vision] No API key — vision fallback skipped');
    // Include a reason, as the failure path below does.
    return { success: false, filled: 0, reason: 'No API key' };
  }

  console.log('[vision] Starting full vision fill...');

  // A null map must not throw before the try block below can report a failure.
  const filledLabels = alreadyFilled ? [...alreadyFilled.keys()] : [];
  const alreadyFilledLabels = filledLabels.join(', ') || 'none';

  const systemPrompt = `You are controlling a web browser to fill out a job application form.
Fill every visible input field, textarea, and dropdown using the candidate data provided.

Rules:
- Click a field before typing into it
- For dropdowns: click to open, wait for options, click the correct option
- For checkboxes/radio buttons: click to select the correct option
- Do NOT click Next, Continue, or Submit — only fill visible fields
- Skip fields that are already correctly filled: ${alreadyFilledLabels}
- If a field has no matching data, leave it empty
- Work systematically top to bottom, left to right`;

  const answers = buildAnswerSummary(aep);
  const userPrompt = `Fill all visible form fields with this candidate data:

${answers}

Fill each visible input field on the current page. Do not advance to the next page.`;

  try {
    const result = await runActionLoop(page, systemPrompt, userPrompt, apiKey, MAX_ITERATIONS);
    return { success: true, filled: result.actionsExecuted };
  } catch (e) {
    console.error('[vision] Full fill error:', e.message);
    return { success: false, filled: 0, reason: e.message };
  }
}
360
+
/**
 * Render the candidate data as the plain-text block embedded in the prompts.
 *
 * Reads `aep.profile_fill` (name, contact, links, education, eligibility and
 * demographic fields), `aep.cover_letter` (first 600 characters) and up to the
 * first 10 entries of `aep.field_answers` (`{ label, value }`, value capped at
 * 300 characters). Anything missing or empty is left out entirely, so the
 * model is never handed a literal "undefined" to type.
 *
 * @param {object} [aep] - Application data; may be null/undefined.
 * @returns {string} Newline-separated summary; empty string when there is no data.
 */
function buildAnswerSummary(aep) {
  const p = aep?.profile_fill || {};
  const fullName = [p.first_name, p.last_name].filter(Boolean).join(' ');

  const lines = [
    fullName && `Full name: ${fullName}`,
    p.email && `Email: ${p.email}`,
    p.phone && `Phone: ${p.phone}`,
    p.linkedin && `LinkedIn: ${p.linkedin}`,
    p.github && `GitHub: ${p.github}`,
    p.portfolio && `Portfolio/Website: ${p.portfolio}`,
    p.school && `Education: ${p.school}`,
    p.gpa && `GPA: ${p.gpa}`,
    p.sponsorship_text && `Visa/Sponsorship: ${p.sponsorship_text}`,
    p.relocation_text && `Relocation: ${p.relocation_text}`,
    p.salary && `Salary expectation: ${p.salary}`,
    p.gender && `Gender: ${p.gender}`,
    p.race && `Race/Ethnicity: ${p.race}`,
    p.veteran && `Veteran status: ${p.veteran}`,
    p.disability && `Disability: ${p.disability}`,
  ].filter(Boolean);

  const coverLetterLimit = 600;
  const coverLetter = typeof aep?.cover_letter === 'string' ? aep.cover_letter : '';
  if (coverLetter) {
    // Mark truncation only when text was actually cut off.
    const excerpt = coverLetter.length > coverLetterLimit
      ? `${coverLetter.slice(0, coverLetterLimit)}...`
      : coverLetter;
    lines.push(`\nCover Letter:\n${excerpt}`);
  }

  const rawAnswers = Array.isArray(aep?.field_answers) ? aep.field_answers : [];
  const answerLines = rawAnswers
    .slice(0, 10)
    // Keep 0/false answers; drop entries without a label or with no value.
    .filter((fa) => fa?.label && fa.value != null && fa.value !== '')
    // String() so numeric or boolean values do not throw on slice().
    .map((fa) => ` "${fa.label}": ${String(fa.value).slice(0, 300)}`);
  if (answerLines.length > 0) {
    lines.push('\nApplication-specific answers:', ...answerLines);
  }

  return lines.join('\n');
}
397
+
398
+ module.exports = { visionFill, visionNavigateAndSubmit, visionFillSkipped }; // Public API: the three vision entry points; executeAction, runActionLoop and buildAnswerSummary are not exported.