halo-agent 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/scanPage.js ADDED
@@ -0,0 +1,606 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * scanPage.js — Per-ATS page scanner.
5
+ *
6
+ * Returns a normalized field map for every visible fillable field on the page:
7
+ * [ { selector, label, tag, inputType, name, id, currentValue, category } ]
8
+ *
9
+ * category is one of:
10
+ * 'profile:{field}' — standard identity field (e.g. 'profile:email', 'profile:first_name')
11
+ * 'custom' — company-specific question (answered via HALO memory or AI)
12
+ * 'eeo' — Equal Employment Opportunity / demographic
13
+ * 'consent' — checkbox consent, marketing opt-in
14
+ * 'ignore' — captcha, hidden, already filled, utility inputs
15
+ *
16
+ * Each ATS has its own scanner because they embed fields differently:
17
+ * greenhouse — id= attrs are stable across ALL companies
18
+ * lever — name= attrs stable, full name in single field
19
+ * ashby — _systemfield_* system names, UUID names for custom fields
20
+ * workday — data-automation-id for nav/buttons, form fields in iframe (vision)
21
+ * icims — everything inside a named iframe
22
+ * generic — label-text walk for any other ATS
23
+ *
24
+ * IMPORTANT: this scanner runs on the live page via Playwright, not on saved HTML.
25
+ * It always returns what is actually visible right now.
26
+ */
27
+
28
+ // ─── Shared helpers ───────────────────────────────────────────────────────────
29
+
30
/**
 * Run the in-browser DOM scan and return the raw field list before enrichment.
 *
 * The callback handed to `page.evaluate` executes inside the page, so it cannot
 * reference anything from this module; every helper it needs is declared inline.
 *
 * Each returned entry has the shape
 *   { tag, inputType, role, id, name, automationId, testId, label, selector,
 *     currentValue, isContentEditable }
 *
 * Entries are de-duplicated on the first available identifier
 * (id, name, data-automation-id, data-testid), so inputs that share a `name`
 * (for example a radio group) produce a single entry.
 *
 * @param {import('playwright').Page} page - anything exposing `evaluate(fn)`
 * @returns {Promise<object[]>} raw fields, in document order
 */
async function domScan(page) {
  return page.evaluate(() => {
    // Collapse runs of whitespace so multi-line label markup reads as one line.
    const squash = (text) => text.replace(/\s+/g, ' ').trim();

    // Escape a value for use inside a double-quoted CSS attribute selector.
    // Ordinary values (e.g. `urls[LinkedIn]`) pass through unchanged.
    const quoteAttr = (value) => String(value).replace(/["\\]/g, '\\$&');

    // Escape a value for use as a CSS identifier after `#`. CSS.escape also
    // covers leading digits and whitespace, which the regex fallback does not.
    const escapeIdent = (value) => (
      typeof CSS !== 'undefined' && typeof CSS.escape === 'function'
        ? CSS.escape(value)
        : value.replace(/([!"#$%&'()*+,.\/:;<=>?@\[\\\]^`{|}~])/g, '\\$1')
    );

    function getLabel(el) {
      // 1. Native label[for]
      if (el.id) {
        const lbl = document.querySelector('label[for="' + quoteAttr(el.id) + '"]');
        if (lbl) return squash(lbl.textContent);
      }
      if (el.labels && el.labels[0]) return squash(el.labels[0].textContent);

      // 2. aria-label / aria-labelledby
      const ariaLabel = el.getAttribute('aria-label');
      if (ariaLabel) return ariaLabel.trim();
      const ariaLabelledBy = el.getAttribute('aria-labelledby');
      if (ariaLabelledBy) {
        const text = ariaLabelledBy
          .split(/\s+/)
          .map((refId) => {
            const ref = document.getElementById(refId);
            return ref ? ref.textContent.trim() : '';
          })
          .join(' ')
          .trim();
        if (text) return text;
      }

      // 3. data-label / data-title
      if (el.dataset.label) return el.dataset.label.trim();
      if (el.dataset.title) return el.dataset.title.trim();

      // 4. Preceding siblings — prefer headings, then short generic containers.
      //    Ashby/Lever put <h3>Question text</h3> before the <textarea>, not a <label>.
      let prev = el.previousElementSibling;
      while (prev) {
        const ptag = prev.tagName;
        if (/^(LABEL|LEGEND)$/.test(ptag)) {
          const t = squash(prev.textContent);
          if (t) return t;
        }
        if (/^H[1-6]$/.test(ptag)) {
          const t = squash(prev.textContent);
          if (t && t.length < 200) return t;
        }
        if (/^(SPAN|DIV|P|STRONG|B)$/.test(ptag)) {
          const t = squash(prev.textContent);
          if (t && t.length > 0 && t.length < 100) return t;
        }
        prev = prev.previousElementSibling;
      }

      // 5. Walk up DOM ancestors — look for heading or label inside each level.
      //    Key: Ashby uses <div><h3>Question</h3><p>Description</p><textarea/></div>
      let parent = el.parentElement;
      let depth = 0;
      while (parent && depth < 12) {
        const labelEl = parent.querySelector('label, legend');
        if (labelEl && !labelEl.contains(el)) {
          const t = squash(labelEl.textContent);
          if (t && t.length < 200) return t;
        }
        const headingEl = parent.querySelector('h1, h2, h3, h4, h5, h6, strong, b');
        if (headingEl && !headingEl.contains(el)) {
          const t = squash(headingEl.textContent);
          if (t && t.length > 3 && t.length < 300) return t;
        }
        const classEl = parent.querySelector('[class*="label"], [class*="title"], [class*="question"], [class*="heading"], [class*="prompt"]');
        if (classEl && !classEl.contains(el)) {
          const t = squash(classEl.textContent);
          if (t && t.length > 0 && t.length < 200) return t;
        }
        parent = parent.parentElement;
        depth++;
      }

      // 6. Fallback: placeholder > name > id
      return el.placeholder || el.name || el.id || '';
    }

    const results = [];
    const seen = new Set();

    document.querySelectorAll('input, textarea, select, [role="combobox"], [contenteditable="true"]').forEach((el) => {
      const type = (el.type || '').toLowerCase();
      // Skip utility types
      if (['hidden', 'submit', 'button', 'image', 'reset'].includes(type)) return;
      // Skip invisible (but allow radio/checkbox/file which may be styled-hidden)
      if (type !== 'radio' && type !== 'checkbox' && type !== 'file') {
        const rect = el.getBoundingClientRect();
        if (rect.width === 0 && rect.height === 0) return;
        if (el.offsetParent === null && !el.closest('[role="dialog"]')) return;
      }

      // Strip leading/trailing required-field asterisks from the label.
      const label = getLabel(el).replace(/^\*+|\*+$/g, '').trim();
      const id = el.id || '';
      const name = el.name || '';
      const automationId = el.getAttribute('data-automation-id') || '';
      const testId = el.getAttribute('data-testid') || '';
      const role = el.getAttribute('role') || '';
      const tag = el.tagName.toLowerCase();
      const currentValue = el.value || '';

      // Dedup on the same identifier the selector is built from. Only when no
      // identifier exists fall back to label+type, and only when there is no
      // label either fall back to the markup itself — `label + ':' + type` is
      // never empty, so it must not be used as an unconditional fallback.
      const dedupKey = id || name || automationId || testId
        || (label ? label + ':' + type : el.outerHTML.slice(0, 60));
      if (seen.has(dedupKey)) return;
      seen.add(dedupKey);

      // Build the most stable selector
      let selector = '';
      if (id) selector = `#${escapeIdent(id)}`;
      else if (name) selector = `[name="${quoteAttr(name)}"]`;
      else if (automationId) selector = `[data-automation-id="${quoteAttr(automationId)}"]`;
      else if (testId) selector = `[data-testid="${quoteAttr(testId)}"]`;
      else if (label) {
        const escaped = label.replace(/['"\\]/g, '');
        selector = `${tag}[aria-label="${escaped}"]`;
      }

      results.push({
        tag,
        inputType: type,
        role,
        id,
        name,
        automationId,
        testId,
        label,
        selector,
        currentValue,
        isContentEditable: el.isContentEditable || false,
      });
    });

    return results;
  });
}
159
+
160
+ // ─── Profile field classifier ─────────────────────────────────────────────────
161
+
162
// Map from normalized label keywords to profile field name.
// Order matters — the first matching entry wins, so a narrower pattern must be
// listed before any broader pattern that would match the same label.
const PROFILE_PATTERNS = [
  // Lever-style: single full name field
  { field: 'full_name', regex: /^(full\s*name|your\s*name|name)$/i },
  // "Preferred first name" also contains "first name", so this entry has to
  // come before first_name or it can never win for that label.
  { field: 'preferred_name', regex: /preferred[\s_-]?(first\s*)?name|goes\s*by/i },
  // Standard first/last
  { field: 'first_name', regex: /first[\s_-]?name|given[\s_-]?name|forename/i },
  { field: 'last_name', regex: /last[\s_-]?name|family[\s_-]?name|surname/i },
  { field: 'email', regex: /e[\s-]?mail/i },
  { field: 'phone', regex: /phone|mobile|telephone|cell/i },
  { field: 'linkedin', regex: /linkedin/i },
  { field: 'github', regex: /github/i },
  { field: 'twitter', regex: /twitter|x\.com/i },
  { field: 'portfolio', regex: /portfolio|personal\s*(site|url|website)|website/i },
  // A bare "city" label is its own field; `location` no longer claims it.
  { field: 'city', regex: /^city$/i },
  { field: 'location', regex: /^(location|current\s*location)$/i },
  { field: 'address', regex: /address\s*(line\s*1)?|street/i },
  { field: 'state', regex: /^(state|province|region)$/i },
  { field: 'zip', regex: /zip|postal\s*code/i },
  { field: 'country', regex: /^country$/i },
  { field: 'salary', regex: /salary|compensation|expected\s*pay/i },
  { field: 'start_date', regex: /start\s*date|available|earliest.*start/i },
  { field: 'school', regex: /school|university|college|institution/i },
  { field: 'degree', regex: /degree|qualification/i },
  { field: 'gpa', regex: /gpa|grade\s*point/i },
  { field: 'org', regex: /^(company|employer|organization|current\s*(company|employer))$/i },
];

// EEO / demographic patterns
const EEO_PATTERNS = [
  /gender|sex$/i,
  /race|ethnicity|hispanic/i,
  /veteran/i,
  /disability|disabled/i,
  /pronoun/i,
  /equal\s*opportunity/i,
  /voluntary\s*self/i,
  /demographic/i,
];

// Consent / noise patterns
const CONSENT_PATTERNS = [
  /consent|agree|accept|opt.?in|marketing|newsletter/i,
  /terms\s*(and|&)\s*(conditions|service)/i,
  /privacy\s*policy/i,
];

/**
 * Classify a scanned field from its label (and, for Ashby, its name attribute).
 *
 * Checks run in this order and the first hit wins:
 *   1. checkbox whose label matches CONSENT_PATTERNS   -> 'consent'
 *   2. label matches EEO_PATTERNS                      -> 'eeo'
 *   3. label matches a PROFILE_PATTERNS entry          -> 'profile:{field}'
 *   4. name is an Ashby `_systemfield_*` system field  -> 'file:resume' or 'profile:{field}'
 *   5. anything else                                   -> 'custom'
 *
 * @param {{label?: string, inputType?: string, name?: string}} field
 * @returns {string} category string as listed above
 */
function classifyField(field) {
  // A missing label is treated as empty rather than throwing.
  const label = String(field.label || '').toLowerCase().trim();

  // Consent checkboxes
  if (field.inputType === 'checkbox' && CONSENT_PATTERNS.some((r) => r.test(label))) {
    return 'consent';
  }

  // EEO
  if (EEO_PATTERNS.some((r) => r.test(label))) return 'eeo';

  // Profile
  for (const { field: name, regex } of PROFILE_PATTERNS) {
    if (regex.test(label)) return 'profile:' + name;
  }

  // Ashby system fields by name attribute
  if (field.name && field.name.startsWith('_systemfield_')) {
    const sfName = field.name.replace('_systemfield_', '');
    // Resume is a file upload, matching the 'file:resume' category used by
    // the ATS-specific scanners, not a text profile value.
    if (sfName === 'resume') return 'file:resume';
    if (sfName === 'name') return 'profile:full_name';
    if (['email', 'phone', 'linkedin', 'website'].includes(sfName)) {
      return 'profile:' + sfName;
    }
  }

  // Custom (company-specific question)
  return 'custom';
}
236
+
237
+ // ─── ATS-specific scanners ────────────────────────────────────────────────────
238
+
239
/**
 * Greenhouse scanner.
 *
 * System fields are recognised by their id= attribute (first_name, last_name,
 * preferred_name, email, phone, resume, cover_letter). Custom questions have
 * ids of the form `question_<digits>`. Anything not recognised by id falls
 * back to label-based classification via classifyField.
 *
 * @param {import('playwright').Page} page
 * @returns {Promise<object[]>} raw fields from domScan, each with `category` added
 */
async function scanGreenhouse(page) {
  // id -> category for the fixed system fields. A Map is used so that ids
  // which happen to equal Object.prototype keys ("constructor", ...) do not
  // match, and it is built once rather than once per field.
  const systemCategories = new Map([
    ['first_name', 'profile:first_name'],
    ['last_name', 'profile:last_name'],
    ['preferred_name', 'profile:preferred_name'],
    ['email', 'profile:email'],
    ['phone', 'profile:phone'],
    ['resume', 'file:resume'],
    ['cover_letter', 'file:cover_letter'],
  ]);
  const customQuestionId = /^question_\d+$/;
  const eeoId = /^(gender|hispanic_ethnicity|veteran_status|disability_status)$/;
  const educationId = /^(degree|school|major)/;

  const raw = await domScan(page);
  return raw
    .map((f) => {
      const id = f.id || '';
      if (systemCategories.has(id)) {
        return { ...f, category: systemCategories.get(id) };
      }
      // Custom question fields: id like question_12345678
      if (customQuestionId.test(id)) {
        return { ...f, category: 'custom' };
      }
      // EEO/demographic dropdowns
      if (eeoId.test(id)) {
        return { ...f, category: 'eeo' };
      }
      // Degree/school fields: keep the part of the id before any `--` suffix
      if (educationId.test(id)) {
        return { ...f, category: 'profile:' + id.split('--')[0] };
      }
      return { ...f, category: classifyField(f) };
    })
    .filter((f) => f.category !== 'ignore' && f.inputType !== 'hidden');
}
267
+
268
/**
 * Lever scanner.
 *
 * Fields are recognised by their name= attribute. Full name is a SINGLE field
 * (name="name"), not first + last. Custom questions use names beginning with
 * "cards[", EEO selects "eeo[", consent checkboxes "consent[". Anything else
 * falls back to label-based classification via classifyField.
 *
 * @param {import('playwright').Page} page
 * @returns {Promise<object[]>} raw fields from domScan, each with `category` added
 */
async function scanLever(page) {
  // name -> category for Lever's fixed system fields. Built once per scan, and
  // a Map so that a field named like an Object.prototype key ("constructor",
  // "toString", ...) is not mistaken for a system field.
  const systemCategories = new Map([
    ['name', 'profile:full_name'],
    ['email', 'profile:email'],
    ['phone', 'profile:phone'],
    ['location', 'profile:location'],
    ['org', 'profile:org'],
    ['urls[LinkedIn]', 'profile:linkedin'],
    ['urls[GitHub]', 'profile:github'],
    ['urls[Twitter]', 'profile:twitter'],
    ['urls[Portfolio]', 'profile:portfolio'],
    ['urls[Other]', 'profile:website'],
    ['comments', 'cover_letter'],
  ]);

  const raw = await domScan(page);
  return raw
    .map((f) => {
      const name = f.name || '';
      if (systemCategories.has(name)) {
        return { ...f, category: systemCategories.get(name) };
      }
      if (name === 'resume' || f.inputType === 'file') {
        return { ...f, category: 'file:resume' };
      }
      // EEO selects
      if (name.startsWith('eeo[')) {
        return { ...f, category: 'eeo' };
      }
      // Consent checkboxes
      if (name.startsWith('consent[')) {
        return { ...f, category: 'consent' };
      }
      // Custom questions: name="cards[{uuid}][field{n}]"
      if (name.startsWith('cards[')) {
        return { ...f, category: 'custom' };
      }
      return { ...f, category: classifyField(f) };
    })
    .filter((f) => f.category !== 'ignore');
}
311
+
312
/**
 * Ashby scanner.
 *
 * System fields use name="_systemfield_{name}"; custom questions use a
 * 36-character UUID as the name and are identified by label text downstream.
 * Any file input is treated as the resume upload. Everything else falls back
 * to label-based classification via classifyField.
 *
 * @param {import('playwright').Page} page
 * @returns {Promise<object[]>} raw fields from domScan, each with `category` added
 */
async function scanAshby(page) {
  const SYSTEM_PREFIX = '_systemfield_';
  // system field name -> profile key. A Map so that suffixes equal to
  // Object.prototype keys ("constructor", ...) are not treated as known fields.
  const profileKeys = new Map([
    ['name', 'full_name'],
    ['email', 'email'],
    ['phone', 'phone'],
    ['linkedin', 'linkedin'],
    ['website', 'portfolio'],
  ]);
  const uuidName = /^[0-9a-f-]{36}$/;

  const raw = await domScan(page);
  return raw
    .map((f) => {
      const name = f.name || '';
      const isFile = f.inputType === 'file';

      if (name.startsWith(SYSTEM_PREFIX)) {
        const sfField = name.replace(SYSTEM_PREFIX, '');
        if (sfField === 'resume' || isFile) return { ...f, category: 'file:resume' };
        if (profileKeys.has(sfField)) {
          return { ...f, category: 'profile:' + profileKeys.get(sfField) };
        }
        // Unknown system field: fall through to the generic checks below.
      }
      // UUID-named fields are custom questions
      if (uuidName.test(name)) {
        return { ...f, category: 'custom' };
      }
      if (isFile) return { ...f, category: 'file:resume' };
      return { ...f, category: classifyField(f) };
    })
    .filter((f) => f.category !== 'ignore');
}
334
+
335
/**
 * Workday scanner.
 *
 * Tries every non-main frame that has a real URL (not blank, not a
 * googleapis frame), in `page.frames()` order, and returns the fields of the
 * first frame that yields any. Each returned field carries `frameUrl` so the
 * caller can tell which frame it lives in. If no frame yields fields, falls
 * back to scanning the outer document with domScan.
 *
 * Navigation buttons are not included: button-type inputs are skipped.
 *
 * @param {import('playwright').Page} page
 * @returns {Promise<object[]>} fields with `category` (and `frameUrl` when found in a frame)
 */
async function scanWorkday(page) {
  let candidateFrames = [];
  try {
    const mainFrame = page.mainFrame();
    candidateFrames = page.frames().filter((frame) => {
      const url = frame.url();
      return Boolean(url)
        && url !== 'about:blank'
        && !url.includes('googleapis')
        && frame !== mainFrame;
    });
  } catch (err) {
    console.warn(`[scanPage] workday: could not enumerate frames: ${err.message}`);
  }

  // Try each candidate in turn: the first child frame is not necessarily the
  // application form, so an empty or failing frame must not end the search.
  for (const frame of candidateFrames) {
    let frameFields;
    try {
      frameFields = await frame.evaluate(() => {
        // Inline DOM walk — module-level helpers are not available in the frame.
        const quoteAttr = (value) => String(value).replace(/["\\]/g, '\\$&');
        const escapeIdent = (value) => (
          typeof CSS !== 'undefined' && typeof CSS.escape === 'function' ? CSS.escape(value) : value
        );
        const results = [];
        document.querySelectorAll('input, textarea, select').forEach((el) => {
          const type = (el.type || '').toLowerCase();
          if (['hidden', 'submit', 'button', 'image', 'reset'].includes(type)) return;
          const rect = el.getBoundingClientRect();
          if (rect.width === 0 && rect.height === 0) return;
          const automationId = el.getAttribute('data-automation-id') || '';
          const label = el.getAttribute('aria-label') || el.placeholder || automationId || el.name || el.id || '';

          // Prefer data-automation-id, then id, then name. With none of them
          // there is nothing to select on, so leave the selector empty
          // instead of emitting `[name=""]`.
          let selector = '';
          if (automationId) selector = `[data-automation-id="${quoteAttr(automationId)}"]`;
          else if (el.id) selector = `#${escapeIdent(el.id)}`;
          else if (el.name) selector = `[name="${quoteAttr(el.name)}"]`;

          results.push({
            tag: el.tagName.toLowerCase(),
            inputType: type,
            id: el.id || '',
            name: el.name || '',
            automationId,
            label,
            selector,
            currentValue: el.value || '',
          });
        });
        return results;
      });
    } catch (err) {
      // Cross-origin or detached frames are expected here; move on.
      console.warn(`[scanPage] workday: frame scan failed: ${err.message}`);
      continue;
    }

    if (Array.isArray(frameFields) && frameFields.length > 0) {
      const frameUrl = frame.url();
      return frameFields.map((f) => ({ ...f, frameUrl, category: classifyField(f) }));
    }
  }

  // Outer DOM fallback (job listing page / sign-in page)
  const raw = await domScan(page);
  return raw.map((f) => ({ ...f, category: classifyField(f) }));
}
389
+
390
/**
 * iCIMS scanner.
 *
 * Tries a list of iframe selectors in priority order, from iCIMS-specific
 * title/src/id/name matches down to a plain `iframe`. For the first selector
 * whose first matching iframe yields form fields, returns those fields with
 * `iframeSelector` set so the caller can re-enter the same frame. If no
 * iframe yields fields, falls back to scanning the outer document.
 *
 * @param {import('playwright').Page} page
 * @returns {Promise<object[]>} fields with `category` (and `iframeSelector` when found in a frame)
 */
async function scanICIMS(page) {
  // Try named iframe selectors in priority order
  const iframeSelectors = [
    'iframe[title*="iCIMS"]',
    'iframe[src*="icims.com"]',
    'iframe[id*="icims"]',
    'iframe[name*="icims"]',
    'iframe', // fallback: first iframe
  ];

  for (const iframeSel of iframeSelectors) {
    let innerFields;
    try {
      const frameCount = await page.locator(iframeSel).count();
      if (frameCount === 0) continue;

      // Frame locators are strict: without .first() any selector matching
      // more than one iframe (the bare `iframe` fallback usually does) throws.
      const frame = page.frameLocator(iframeSel).first();
      innerFields = await frame.locator('input, textarea, select').evaluateAll((elements) => {
        return elements
          .map((el) => {
            const type = (el.type || '').toLowerCase();
            if (['hidden', 'submit', 'button', 'image', 'reset'].includes(type)) return null;
            let label = '';
            if (el.id) {
              // Escape quotes/backslashes so an unusual id cannot break the selector.
              const safeId = el.id.replace(/["\\]/g, '\\$&');
              const lbl = document.querySelector('label[for="' + safeId + '"]');
              if (lbl) label = lbl.textContent.trim();
            }
            if (!label) label = el.getAttribute('aria-label') || el.placeholder || el.name || el.id || '';
            return {
              tag: el.tagName.toLowerCase(),
              inputType: type,
              id: el.id || '',
              name: el.name || '',
              label: label.trim(),
              selector: el.id ? `#${el.id}` : (el.name ? `[name="${el.name}"]` : null),
              currentValue: el.value || '',
            };
          })
          .filter(Boolean);
      });
    } catch (err) {
      // Best effort: a frame that cannot be read just means trying the next selector.
      console.warn(`[scanPage] icims: scan via "${iframeSel}" failed: ${err.message}`);
      continue;
    }

    if (Array.isArray(innerFields) && innerFields.length > 0) {
      return innerFields.map((f) => ({
        ...f,
        iframeSelector: iframeSel,
        category: classifyField(f),
      }));
    }
  }

  // Fallback to outer DOM
  const raw = await domScan(page);
  return raw.map((f) => ({ ...f, category: classifyField(f) }));
}
447
+
448
/**
 * Generic scanner, used for any ATS without a dedicated scanner.
 * Runs the full DOM walk and tags every field by its label via classifyField.
 *
 * @param {import('playwright').Page} page
 * @returns {Promise<object[]>} raw fields from domScan, each with `category` added
 */
async function scanGeneric(page) {
  const scanned = await domScan(page);
  return scanned.map((entry) => {
    const category = classifyField(entry);
    return { ...entry, category };
  });
}
456
+
457
+ // ─── Main export ──────────────────────────────────────────────────────────────
458
+
459
/**
 * Scan the current page and return a structured field map.
 *
 * Dispatches to the scanner for the given ATS (case-insensitive); unknown or
 * missing values use the generic scanner. If the chosen scanner throws, a
 * warning is logged and the generic scanner is tried instead; if that also
 * fails the result is an empty list.
 *
 * The result is then stripped of fields with neither label nor selector,
 * hidden inputs, and fields whose name contains "g-recaptcha". Fields that
 * already hold a value are NOT removed here.
 *
 * @param {import('playwright').Page} page
 * @param {string} ats - 'greenhouse' | 'lever' | 'ashby' | 'workday' | 'icims' | anything else -> generic
 * @returns {Promise<ScannedField[]>}
 *
 * ScannedField shape:
 *   {
 *     selector:        string,  CSS selector to locate this element
 *     label:           string,  Human-readable label text
 *     tag:             string,  'input' | 'textarea' | 'select'
 *     inputType:       string,  'text' | 'email' | 'radio' | 'checkbox' | 'file' | ...
 *     name:            string,  name= attribute
 *     id:              string,  id= attribute
 *     automationId:    string,  data-automation-id= attribute (Workday)
 *     currentValue:    string,  current field value (empty if unfilled)
 *     category:        string,  'profile:{field}' | 'custom' | 'eeo' | 'consent' | 'file:{field}' | 'cover_letter'
 *     iframeSelector?: string,  set if field is inside an iframe (iCIMS)
 *   }
 */
async function scanPage(page, ats) {
  const platform = (ats || 'generic').toLowerCase();

  // Platform name -> scanner. A Map keeps unknown names (including ones that
  // collide with Object.prototype keys) on the generic path.
  const scanners = new Map([
    ['greenhouse', scanGreenhouse],
    ['lever', scanLever],
    ['ashby', scanAshby],
    ['workday', scanWorkday],
    ['icims', scanICIMS],
  ]);
  const scanner = scanners.has(platform) ? scanners.get(platform) : scanGeneric;

  let fields;
  try {
    fields = await scanner(page);
  } catch (e) {
    console.warn(`[scanPage] Scanner failed for ${platform}: ${e.message} — falling back to generic`);
    fields = await scanGeneric(page).catch(() => []);
  }

  // Drop utility noise: unaddressable fields, hidden inputs, recaptcha textarea.
  const out = [];
  for (const f of fields) {
    const unaddressable = !f.label && !f.selector;
    const isHidden = f.inputType === 'hidden';
    const isRecaptcha = Boolean(f.name) && f.name.includes('g-recaptcha');
    if (unaddressable || isHidden || isRecaptcha) continue;
    out.push(f);
  }

  const profileCount = out.filter((f) => f.category.startsWith('profile')).length;
  const customCount = out.filter((f) => f.category === 'custom').length;
  const eeoCount = out.filter((f) => f.category === 'eeo').length;
  console.log(`[scanPage] ${platform}: found ${out.length} fields (${profileCount} profile, ${customCount} custom, ${eeoCount} eeo)`);
  return out;
}
510
+
511
/**
 * Given the scanned fields and an AEP, decide what value to fill for each field.
 *
 * Resolution by category:
 *   - 'file:*'         -> value null, source 'file_upload' (uploaded elsewhere)
 *   - 'eeo'/'consent'  -> value null, source 'skip'
 *   - 'profile:{key}'  -> an AEP answer whose label equals the field label wins
 *                         (source = that answer's source); otherwise the profile
 *                         value (source 'profile'); otherwise null ('missing')
 *   - 'cover_letter'   -> aep.cover_letter or null
 *   - 'custom'         -> exact label match in the AEP answers, then substring
 *                         match in either direction (source suffixed ':fuzzy'),
 *                         otherwise null with source 'needs_ai'
 *   - anything else    -> value null, source 'unknown'
 *
 * @param {ScannedField[]} fields
 * @param {object} aep - Agent Execution Packet (profile_fill, field_answers, cover_letter);
 *                       a missing packet is treated as empty
 * @returns {{ field: ScannedField, value: string|null, source: string }[]}
 */
function resolveFieldValues(fields, aep) {
  const packet = aep || {};
  const profile = packet.profile_fill || {};
  const fieldAnswers = packet.field_answers || [];

  const normalize = (text) => String(text || '').toLowerCase().trim();
  // Shortest string allowed to take part in substring ("fuzzy") matching.
  const MIN_FUZZY_LENGTH = 6;

  // Build label -> answer map from pre-generated AEP answers (case-insensitive).
  // Insertion order matters: fuzzy matching returns the first hit.
  const answerMap = new Map();
  for (const fa of fieldAnswers) {
    const answer = { value: fa.value, source: fa.source || 'ai' };
    if (fa.label) answerMap.set(normalize(fa.label), answer);
    if (fa.field_id) answerMap.set(normalize(fa.field_id), answer);
  }

  // Profile field resolver
  const profileValues = {
    full_name: [profile.first_name, profile.last_name].filter(Boolean).join(' '),
    first_name: profile.first_name || '',
    last_name: profile.last_name || '',
    preferred_name: profile.preferred_name || profile.first_name || '',
    email: profile.email || '',
    phone: profile.phone || '',
    linkedin: profile.linkedin || '',
    github: profile.github || '',
    twitter: profile.twitter || '',
    portfolio: profile.portfolio || profile.website || '',
    website: profile.portfolio || profile.website || '',
    location: profile.city || profile.location || '',
    address: profile.address || '',
    city: profile.city || '',
    state: profile.state || '',
    zip: profile.zip || profile.postal || '',
    country: profile.country || 'United States',
    org: profile.current_company || profile.org || '',
    school: profile.school || '',
    degree: profile.degree || '',
    gpa: profile.gpa || '',
    salary: profile.desired_salary || '',
    start_date: profile.start_date || 'Immediately',
  };
  const hasProfileKey = (key) => Object.prototype.hasOwnProperty.call(profileValues, key);

  return fields.map((field) => {
    // A field without a category resolves to 'unknown' instead of throwing.
    const cat = field.category || '';
    const labelKey = normalize(field.label);

    // File upload fields — handled separately by the uploader
    if (cat.startsWith('file:')) {
      return { field, value: null, source: 'file_upload' };
    }

    // EEO/consent — skip (handled by vision or user)
    if (cat === 'eeo' || cat === 'consent') {
      return { field, value: null, source: 'skip' };
    }

    // Profile fields
    if (cat.startsWith('profile:')) {
      const pfKey = cat.slice('profile:'.length);
      // Own-key check so an unexpected key such as "constructor" is just missing.
      const val = hasProfileKey(pfKey) ? profileValues[pfKey] : '';
      // An AEP answer for this exact label overrides the profile value; report
      // the answer's own source so the caller knows where the value came from.
      const override = labelKey ? answerMap.get(labelKey) : undefined;
      if (override && override.value) {
        return { field, value: override.value, source: override.source };
      }
      return { field, value: val || null, source: val ? 'profile' : 'missing' };
    }

    // Cover letter special case
    if (cat === 'cover_letter') {
      return { field, value: packet.cover_letter || null, source: 'cover_letter' };
    }

    // Custom question — look up in answer map by label (exact then fuzzy)
    if (cat === 'custom') {
      const exact = answerMap.get(labelKey);
      if (exact) return { field, value: exact.value, source: exact.source };

      // Fuzzy: one string contains the other. Both sides must be long enough
      // to be meaningful — in particular an empty label is a substring of
      // every key and must never match.
      if (labelKey.length >= MIN_FUZZY_LENGTH) {
        for (const [key, ans] of answerMap) {
          if (key.length >= MIN_FUZZY_LENGTH && (labelKey.includes(key) || key.includes(labelKey))) {
            return { field, value: ans.value, source: ans.source + ':fuzzy' };
          }
        }
      }
      // No answer found — needs AI
      return { field, value: null, source: 'needs_ai' };
    }

    return { field, value: null, source: 'unknown' };
  });
}
605
+
606
+ module.exports = { scanPage, resolveFieldValues, classifyField };