halo-agent 1.3.0 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/captcha.js CHANGED
@@ -17,49 +17,56 @@ const CAPSOLVER_API = 'https://api.capsolver.com';
17
17
  * Returns { detected, type, sitekey, pageUrl }
18
18
  */
19
19
  async function detectCaptcha(page) {
20
+ // The pageUrl CapSolver wants is the top-level URL the user sees, not an
21
+ // inner iframe URL. Anchor it here so every branch returns the same thing.
22
+ const pageUrl = page.url();
23
+
20
24
  // First try the top-level frame via evaluate
21
25
  const topResult = await page.evaluate(() => {
22
- // reCAPTCHA v2 via iframe src
26
+ // reCAPTCHA v2 via iframe src — also check size=invisible in the URL
23
27
  const rcFrame = document.querySelector('iframe[src*="recaptcha/api2"], iframe[src*="google.com/recaptcha"]');
24
28
  if (rcFrame) {
25
- const match = (rcFrame.src || '').match(/[?&]k=([^&]+)/);
26
- return { detected: true, type: 'recaptcha_v2', sitekey: match?.[1] || null, pageUrl: location.href };
29
+ const src = rcFrame.src || '';
30
+ const match = src.match(/[?&]k=([^&]+)/);
31
+ const isInvisible = /[?&]size=invisible/.test(src);
32
+ return { detected: true, type: 'recaptcha_v2', sitekey: match?.[1] || null, isInvisible };
27
33
  }
28
- // reCAPTCHA via data-sitekey
34
+ // reCAPTCHA via data-sitekey — check data-size="invisible" on the element
29
35
  const rcEl = document.querySelector('.g-recaptcha[data-sitekey], [data-sitekey]:not(.h-captcha)');
30
36
  if (rcEl) {
31
- return { detected: true, type: 'recaptcha_v2', sitekey: rcEl.getAttribute('data-sitekey'), pageUrl: location.href };
37
+ const isInvisible = rcEl.getAttribute('data-size') === 'invisible';
38
+ return { detected: true, type: 'recaptcha_v2', sitekey: rcEl.getAttribute('data-sitekey'), isInvisible };
32
39
  }
33
40
  // hCAPTCHA
34
41
  const hcEl = document.querySelector('.h-captcha[data-sitekey]');
35
42
  if (hcEl) {
36
- return { detected: true, type: 'hcaptcha', sitekey: hcEl.getAttribute('data-sitekey'), pageUrl: location.href };
43
+ return { detected: true, type: 'hcaptcha', sitekey: hcEl.getAttribute('data-sitekey'), isInvisible: false };
37
44
  }
38
45
  const hcFrame = document.querySelector('iframe[src*="hcaptcha.com"]');
39
46
  if (hcFrame) {
40
47
  const match = (hcFrame.src || '').match(/[?&]sitekey=([^&]+)/);
41
- return { detected: true, type: 'hcaptcha', sitekey: match?.[1] || null, pageUrl: location.href };
48
+ return { detected: true, type: 'hcaptcha', sitekey: match?.[1] || null, isInvisible: false };
42
49
  }
43
50
  // Cloudflare
44
51
  if (document.getElementById('cf-challenge-running') || document.querySelector('.cf-browser-verification')) {
45
- return { detected: true, type: 'cloudflare', sitekey: null, pageUrl: location.href };
52
+ return { detected: true, type: 'cloudflare', sitekey: null, isInvisible: false };
46
53
  }
47
54
  return null;
48
55
  });
49
56
 
50
- if (topResult) return topResult;
57
+ if (topResult) return { ...topResult, pageUrl };
51
58
 
52
- // Search all frames for reCAPTCHA (Ashby loads it inside a sandboxed iframe)
53
- const pageUrl = page.url();
59
+ // Search all frames for reCAPTCHA (Ashby/Greenhouse load it inside a sandboxed iframe)
54
60
  for (const frame of page.frames()) {
55
61
  if (frame === page.mainFrame()) continue;
56
62
  const frameSrc = frame.url();
57
63
 
58
- // If the frame itself is a reCAPTCHA anchor frame, extract sitekey from its URL
64
+ // If the frame itself is a reCAPTCHA anchor frame, extract sitekey + invisible from URL
59
65
  if (frameSrc.includes('recaptcha/api2/anchor') || frameSrc.includes('recaptcha/enterprise/anchor')) {
60
66
  const match = frameSrc.match(/[?&]k=([^&]+)/);
67
+ const isInvisible = /[?&]size=invisible/.test(frameSrc);
61
68
  if (match) {
62
- return { detected: true, type: 'recaptcha_v2', sitekey: match[1], pageUrl };
69
+ return { detected: true, type: 'recaptcha_v2', sitekey: match[1], pageUrl, isInvisible };
63
70
  }
64
71
  }
65
72
 
@@ -67,22 +74,29 @@ async function detectCaptcha(page) {
67
74
  try {
68
75
  const frameResult = await frame.evaluate(() => {
69
76
  const rcEl = document.querySelector('.g-recaptcha[data-sitekey], [data-sitekey]:not(.h-captcha)');
70
- if (rcEl) return { sitekey: rcEl.getAttribute('data-sitekey'), type: 'recaptcha_v2' };
77
+ if (rcEl) {
78
+ return {
79
+ sitekey: rcEl.getAttribute('data-sitekey'),
80
+ type: 'recaptcha_v2',
81
+ isInvisible: rcEl.getAttribute('data-size') === 'invisible',
82
+ };
83
+ }
71
84
  const rcFrame = document.querySelector('iframe[src*="recaptcha"]');
72
85
  if (rcFrame) {
73
- const match = (rcFrame.src || '').match(/[?&]k=([^&]+)/);
74
- return match ? { sitekey: match[1], type: 'recaptcha_v2' } : null;
86
+ const src = rcFrame.src || '';
87
+ const match = src.match(/[?&]k=([^&]+)/);
88
+ return match ? { sitekey: match[1], type: 'recaptcha_v2', isInvisible: /[?&]size=invisible/.test(src) } : null;
75
89
  }
76
90
  return null;
77
91
  }).catch(() => null);
78
92
 
79
93
  if (frameResult?.sitekey) {
80
- return { detected: true, type: frameResult.type, sitekey: frameResult.sitekey, pageUrl };
94
+ return { detected: true, type: frameResult.type, sitekey: frameResult.sitekey, pageUrl, isInvisible: !!frameResult.isInvisible };
81
95
  }
82
96
  } catch {}
83
97
  }
84
98
 
85
- return { detected: false, type: null, sitekey: null, pageUrl };
99
+ return { detected: false, type: null, sitekey: null, pageUrl, isInvisible: false };
86
100
  }
87
101
 
88
102
  /**
@@ -101,21 +115,26 @@ async function solveCaptcha(captchaInfo, apiKey) {
101
115
  ? 'HCaptchaTaskProxyless'
102
116
  : 'ReCaptchaV2TaskProxyless';
103
117
 
104
- console.log(`[captcha] Submitting ${taskType} task to CapSolver (sitekey: ${captchaInfo.sitekey.slice(0, 12)}...)`);
118
+ // CapSolver requires `isInvisible: true` for invisible reCAPTCHA — sending
119
+ // a normal v2 task for an invisible sitekey returns
120
+ // "Invalid input, please check captcha type or pageUrl and invisible".
121
+ const task = {
122
+ type: taskType,
123
+ websiteURL: captchaInfo.pageUrl,
124
+ websiteKey: captchaInfo.sitekey,
125
+ };
126
+ if (captchaInfo.type === 'recaptcha_v2' && captchaInfo.isInvisible) {
127
+ task.isInvisible = true;
128
+ }
129
+
130
+ console.log(`[captcha] Submitting ${taskType}${task.isInvisible ? ' (invisible)' : ''} task to CapSolver (sitekey: ${captchaInfo.sitekey.slice(0, 12)}..., url: ${captchaInfo.pageUrl})`);
105
131
 
106
132
  let taskId;
107
133
  try {
108
134
  const createRes = await fetch(`${CAPSOLVER_API}/createTask`, {
109
135
  method: 'POST',
110
136
  headers: { 'Content-Type': 'application/json' },
111
- body: JSON.stringify({
112
- clientKey: apiKey,
113
- task: {
114
- type: taskType,
115
- websiteURL: captchaInfo.pageUrl,
116
- websiteKey: captchaInfo.sitekey,
117
- },
118
- }),
137
+ body: JSON.stringify({ clientKey: apiKey, task }),
119
138
  });
120
139
  const createData = await createRes.json();
121
140
  if (createData.errorId !== 0) {
package/orchestrator.js CHANGED
@@ -381,13 +381,51 @@ async function runJob(queueItem, chromeConn, config, reportStatus) {
381
381
  const confirmScreenshot = await page.screenshot({ type: 'jpeg', quality: 70 });
382
382
  const confirmKey = await uploadScreenshot(config, confirmScreenshot, `confirm_${queueId}.jpg`);
383
383
 
384
+ // Verify-then-DONE: trusting waitForURL alone was wrong (the Chalk bug —
385
+ // Ashby rendered "Missing entry for required field: Name, Email, ..."
386
+ // inline without a URL change, and we marked DONE on a failed submit).
387
+ // The backend re-fetches the page through Firecrawl + extract and tells
388
+ // us if the submit actually went through. On Firecrawl failure or no
389
+ // key, the endpoint returns submitted:true so we don't deadlock — we're
390
+ // strictly more correct than before, not less.
391
+ const verdictUrl = page.url();
392
+ console.log(`[orchestrator] Verifying submission at ${verdictUrl}...`);
393
+ let verdict = { submitted: true, error_message: null, confirmation_text: null, source: 'unavailable' };
394
+ try {
395
+ const vRes = await fetch(`${config.apiUrl}/agent/verify-submit`, {
396
+ method: 'POST',
397
+ headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${config.token}` },
398
+ body: JSON.stringify({ queue_id: queueId, page_url: verdictUrl }),
399
+ });
400
+ if (vRes.ok) verdict = await vRes.json();
401
+ } catch (e) {
402
+ console.warn(`[orchestrator] verify-submit unavailable: ${e.message}`);
403
+ }
404
+
405
+ if (!verdict.submitted) {
406
+ const reason = verdict.error_message || 'Submission did not confirm — form may still have errors';
407
+ console.warn(`[orchestrator] Submission NOT verified. Reason: ${reason}`);
408
+ await reportStatus('NEEDS_ATTENTION', {
409
+ review_screenshot_r2_key: confirmKey || null,
410
+ needs_attention_reason: `Submit clicked but not confirmed: ${reason}`,
411
+ intervention_type: 'submit_failed',
412
+ step: 'VERIFY',
413
+ step_detail: reason.slice(0, 200),
414
+ fields_filled: cumulativeFilled,
415
+ });
416
+ // Do NOT clearCheckpoint — user may dismiss + re-queue, and a stale
417
+ // checkpoint would resume into the same failed state. The dismiss flow
418
+ // clears it (DELETE /apply-queue/:id sets form_checkpoint_json = NULL).
419
+ throw new Error(`Submission failed verification: ${reason}`);
420
+ }
421
+
384
422
  await reportStatus('DONE', {
385
423
  confirmation_screenshot_r2_key: confirmKey || null,
386
424
  fields_filled: cumulativeFilled,
387
425
  });
388
426
  await clearCheckpoint(config, queueId);
389
427
 
390
- console.log(`[orchestrator] Done: ${queueItem.company} - ${queueItem.title}`);
428
+ console.log(`[orchestrator] Done (verified): ${queueItem.company} - ${queueItem.title}${verdict.source === 'firecrawl' ? ' · firecrawl-verified' : ' · unverified'}`);
391
429
 
392
430
  // Post fill session data to backend for learning loop
393
431
  await postFillSession(config, {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "halo-agent",
3
- "version": "1.3.0",
3
+ "version": "1.3.2",
4
4
  "description": "HALO local apply agent — auto-fills job applications using your real Chrome session",
5
5
  "main": "index.js",
6
6
  "bin": {
package/scanPage.js CHANGED
@@ -207,8 +207,27 @@ const CONSENT_PATTERNS = [
207
207
  /privacy\s*policy/i,
208
208
  ];
209
209
 
210
+ // Normalize a label so trailing decorations don't break strict regex matches.
211
+ // Ashby/Greenhouse often render labels like "Name *", "Email (required)",
212
+ // "LinkedIn Profile — Required", "Phone *Required". Without this, the
213
+ // PROFILE_PATTERNS (which use ^anchored regexes for short fields like name)
214
+ // silently miss and the field falls through to 'custom', which means the
215
+ // agent skips filling it with profile data.
216
/**
 * Normalize a form-field label so trailing decorations don't defeat the
 * anchored label regexes used during classification.
 *
 * Lower-cases the text, then strips required/optional decorations
 * ("*", "(required)", "— Required", a bare "required"), trailing ":" / "?"
 * punctuation, and redundant whitespace.
 *
 * @param {string|null|undefined} raw - Label text as scraped from the page.
 * @returns {string} The normalized label; '' when `raw` is empty or not a string.
 */
function normalizeLabel(raw) {
  // Non-string labels (null, undefined, numbers) have nothing to normalize;
  // returning '' avoids a TypeError on .toLowerCase().
  if (typeof raw !== 'string' || raw === '') return '';

  return raw
    .toLowerCase()
    .replace(/[*†‡]/g, ' ') // markers
    .replace(/\((?:required|optional|mandatory)\)/g, ' ') // "(required)"
    // The dash-prefixed form must be removed BEFORE the bare-word rule below;
    // otherwise the word is stripped first and the separator is left dangling
    // ("linkedin profile —").
    .replace(/[—–-]+\s*(?:required|optional|mandatory)\s*$/, ' ') // "— required"
    .replace(/\b(?:required|optional|mandatory)\b/g, ' ') // "required"
    .replace(/\s+/g, ' ')
    .trim()
    // Strip trailing punctuation last, once decorations and padding are gone,
    // so "Name: *" ends up as "name" rather than "name:".
    .replace(/[\s:?]+$/, '');
}
228
+
210
229
  function classifyField(field) {
211
- const label = field.label.toLowerCase().trim();
230
+ const label = normalizeLabel(field.label);
212
231
 
213
232
  // Consent checkboxes
214
233
  if (field.inputType === 'checkbox' && CONSENT_PATTERNS.some(r => r.test(label))) {
@@ -222,7 +241,7 @@ function classifyField(field) {
222
241
  if (regex.test(label)) return 'profile:' + name;
223
242
  }
224
243
 
225
- // Ashby system fields by name attribute
244
+ // Ashby system fields by name attribute (legacy form schema)
226
245
  if (field.name && field.name.startsWith('_systemfield_')) {
227
246
  const sfName = field.name.replace('_systemfield_', '');
228
247
  if (['name', 'email', 'phone', 'resume', 'linkedin', 'website'].includes(sfName)) {
@@ -323,11 +342,12 @@ async function scanAshby(page) {
323
342
  if (sfField === 'resume' || f.inputType === 'file') return { ...f, category: 'file:resume' };
324
343
  if (profileMap[sfField] !== undefined) return { ...f, category: 'profile:' + profileMap[sfField] };
325
344
  }
326
- // UUID-named fields are custom questions
327
- if (f.name && /^[0-9a-f-]{36}$/.test(f.name)) {
328
- return { ...f, category: 'custom' };
329
- }
330
345
  if (f.inputType === 'file') return { ...f, category: 'file:resume' };
346
+ // Modern Ashby uses UUID names for ALL fields (Name, Email, Phone, custom).
347
+ // Classify by LABEL first — that's the only signal that distinguishes
348
+ // a profile field from a custom question when the name attr is opaque.
349
+ // Don't shortcut UUID-named fields to 'custom' — that's exactly the
350
+ // Chalk bug where Name/Email/Phone/LinkedIn all fell through unfilled.
331
351
  return { ...f, category: classifyField(f) };
332
352
  }).filter(f => f.category !== 'ignore');
333
353
  }