calibrcv 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,125 @@
1
+ import { callOllama } from '../providers/ollama.js';
2
+ import { callGroq } from '../providers/groq.js';
3
+ import { callGemini } from '../providers/gemini.js';
4
+ import { callOpenRouter } from '../providers/openrouter.js';
5
+
6
/**
 * Error raised once every provider in the waterfall has been tried without
 * success.
 *
 * `errors` holds whatever per-provider failure list the thrower supplies;
 * it is stored as-is, without copying or validation.
 */
export class AllProvidersFailedError extends Error {
  name = 'AllProvidersFailedError';

  /**
   * @param {Array<{ provider: string, error: string }>} errors - Failures collected per provider.
   */
  constructor(errors) {
    super('All AI providers exhausted');
    this.errors = errors;
  }
}
13
+
14
/**
 * Error raised when an AI response cannot be interpreted as JSON.
 *
 * The unparseable input is kept on `rawText` so callers can log or inspect it.
 */
export class ParseError extends Error {
  name = 'ParseError';

  /**
   * @param {*} rawText - The value that failed to parse (stored unchanged).
   */
  constructor(rawText) {
    super('Failed to parse AI response as JSON');
    this.rawText = rawText;
  }
}
21
+
22
/**
 * Ordered provider waterfall used by the router.
 * Stays `null` until `configureProviders` has run at least once.
 * @type {Array<{ name: string, fn: Function }> | null}
 */
let activeProviders = null;

/**
 * Configure which providers are active and in what order.
 *
 * When `options.provider` is set, only that provider is used. Otherwise the
 * waterfall is built as: ollama (always included), then groq, gemini and
 * openrouter, each only when its `*_API_KEY` environment variable is set to
 * a non-empty value.
 *
 * @param {{ provider?: string }} options - CLI options
 * @throws {Error} If `options.provider` is not one of the known provider names.
 */
export function configureProviders(options = {}) {
  const forced = options.provider;

  if (forced) {
    // A Map is used instead of an object literal so that inherited keys such
    // as "toString" or "constructor" are not mistaken for provider names.
    const known = new Map([
      ['ollama', callOllama],
      ['groq', callGroq],
      ['gemini', callGemini],
      ['openrouter', callOpenRouter],
    ]);
    const fn = known.get(forced);
    if (!fn) {
      throw new Error(`Unknown provider: ${forced}. Use: ollama, groq, gemini, openrouter`);
    }
    activeProviders = [{ name: forced, fn }];
    return;
  }

  // Auto-detect: ollama needs no key, the hosted providers are gated on theirs.
  const keyed = [
    { name: 'groq', envVar: 'GROQ_API_KEY', fn: callGroq },
    { name: 'gemini', envVar: 'GEMINI_API_KEY', fn: callGemini },
    { name: 'openrouter', envVar: 'OPENROUTER_API_KEY', fn: callOpenRouter },
  ];

  activeProviders = [
    { name: 'ollama', fn: callOllama },
    ...keyed
      .filter(({ envVar }) => process.env[envVar])
      .map(({ name, fn }) => ({ name, fn })),
  ];
}
57
+
58
/**
 * Route an AI call through the provider waterfall.
 *
 * Providers are tried one at a time in configured order; the first one that
 * resolves wins. If no configuration exists yet, `configureProviders()` is
 * invoked with defaults first.
 *
 * @param {string} systemPrompt
 * @param {string} userMessage
 * @param {object} options - { responseFormat: 'json'|'text', taskName: string };
 *   passed through unchanged to each provider function.
 * @returns {Promise<string>} Result of the first provider that succeeds.
 * @throws {AllProvidersFailedError} When every provider rejects; `errors`
 *   lists `{ provider, error }` for each attempt, in order.
 */
export async function callAI(systemPrompt, userMessage, options = {}) {
  if (!activeProviders) {
    configureProviders();
  }

  const errors = [];

  for (const { name, fn } of activeProviders) {
    try {
      return await fn(systemPrompt, userMessage, options);
    } catch (err) {
      // A provider may reject with a non-Error value (null, a string, ...).
      // Reading `.message` unguarded would throw from inside this handler and
      // abort the waterfall instead of moving on to the next provider.
      errors.push({ provider: name, error: err?.message ?? String(err) });
    }
  }

  throw new AllProvidersFailedError(errors);
}
82
+
83
/**
 * Parse text as JSON with multiple fallback strategies.
 *
 * Strategies, in order:
 *   1. Parse the text as-is.
 *   2. Parse after stripping markdown code fences.
 *   3. Extract the outermost `{...}` / `[...]` block and parse it, first
 *      verbatim and then with trailing commas removed. When an array block
 *      encloses the object block, the array is tried first (so a
 *      prose-wrapped `[{...}]` yields the array, not its only element);
 *      otherwise the object is tried first.
 *
 * @param {string} text
 * @returns {*} The parsed JSON value (typically an object or array).
 * @throws {ParseError} If `text` is empty, not a string, or no strategy succeeds.
 */
export function parseJSONSafely(text) {
  if (!text || typeof text !== 'string') {
    throw new ParseError(text);
  }

  // Sentinel rather than null/undefined: `null` is itself a valid JSON result.
  const FAILED = Symbol('parse failed');
  const attempt = (candidate) => {
    try {
      return JSON.parse(candidate);
    } catch {
      return FAILED;
    }
  };

  // Attempt 1: direct parse.
  const direct = attempt(text);
  if (direct !== FAILED) return direct;

  // Attempt 2: strip markdown code fences.
  const stripped = text
    .replace(/^```(?:json)?\n?/m, '')
    .replace(/\n?```$/m, '')
    .trim();
  const unfenced = attempt(stripped);
  if (unfenced !== FAILED) return unfenced;

  // Attempt 3: extract a bracketed block, with and without trailing commas.
  const OBJECT_BLOCK = /\{[\s\S]*\}/;
  const ARRAY_BLOCK = /\[[\s\S]*\]/;

  const objMatch = stripped.match(OBJECT_BLOCK);
  const arrMatch = stripped.match(ARRAY_BLOCK);
  const arrayIsOuter = Boolean(objMatch && arrMatch)
    && arrMatch.index < objMatch.index
    && arrMatch.index + arrMatch[0].length > objMatch.index + objMatch[0].length;
  const shapes = arrayIsOuter ? [ARRAY_BLOCK, OBJECT_BLOCK] : [OBJECT_BLOCK, ARRAY_BLOCK];

  // Last-resort textual repair; applied to both objects and arrays.
  const noTrailingCommas = stripped
    .replace(/,\s*}/g, '}')
    .replace(/,\s*]/g, ']');

  // The outer shape gets both the verbatim and the repaired try before the
  // inner shape is considered, so a repairable outer value is preferred over
  // a fragment nested inside it.
  for (const shape of shapes) {
    for (const source of [stripped, noTrailingCommas]) {
      const block = source.match(shape);
      if (block) {
        const parsed = attempt(block[0]);
        if (parsed !== FAILED) return parsed;
      }
    }
  }

  throw new ParseError(text);
}
@@ -0,0 +1,479 @@
1
+ import natural from 'natural';
2
+ import { HBS_VERBS } from '../constants/hbs-verbs.js';
3
+
4
// Detects a quantified achievement inside a bullet: percentages, currency
// amounts, multipliers (3x), durations in ms, "N+" counts and K/M/B suffixes.
// Note that the first alternative already matches any run of digits, so any
// bullet containing a digit satisfies this pattern.
//
// Deliberately NOT global: the pattern is only used for yes/no `.test()`
// checks, and a `g` flag would make those calls stateful through `lastIndex`
// (a second `.test()` on the same string could spuriously return false).
const METRIC_REGEX = /\d+[\%\$]?|\$\d+|\d+[xX]|\d+ms|\d+\+|\d+[KkMmBb]|\€\d+/;
5
+
6
/** Return `value` when it is an array, otherwise an empty array. */
const asArray = (value) => (Array.isArray(value) ? value : []);

/** Coerce any value to a string; `null` and `undefined` become ''. */
const asText = (value) => (value == null ? '' : String(value));

/** First whitespace-delimited word of a bullet, lower-cased, with everything except a-z removed. */
const leadingWord = (bullet) => bullet.trim().split(/\s+/)[0].toLowerCase().replace(/[^a-z]/g, '');

/** All experience bullets followed by all project bullets, each coerced to a string. */
const collectBullets = (resume) => [
  ...asArray(resume.experience).flatMap((entry) => asArray(entry?.bullets)),
  ...asArray(resume.projects).flatMap((entry) => asArray(entry?.bullets)),
].map(asText);

/**
 * CalibrCV's proprietary ATS Scoring Engine.
 * Pure algorithmic scoring — no AI calls, no external APIs.
 * 5 categories, 100 points total (20 + 30 + 25 + 15 + 10).
 *
 * Every method tolerates a missing resume object, missing sections, and
 * non-string field values (e.g. a numeric phone number or a null bullet).
 */
class ATSScorer {
  /**
   * Score a resume against CalibrCV's quality standards.
   *
   * @param {string} resumeText - Raw text of the resume.
   * @param {object} resumeJSON - Structured JSON resume object.
   * @param {string|null} [jobDescriptionText] - Optional job description for keyword matching.
   * @returns {ScoreReport} `{ total, letter_grade, categories, recommendations, calibrcv_law_violations }`
   */
  score(resumeText, resumeJSON, jobDescriptionText = null) {
    const resume = resumeJSON ?? {};
    const categories = {
      structural: this.scoreStructural(resume),
      keywords: this.scoreKeywords(resume, jobDescriptionText),
      content_quality: this.scoreContentQuality(resume),
      parsability: this.scoreParsability(resumeText),
      completeness: this.scoreCompleteness(resume),
    };

    const total = Object.values(categories).reduce((sum, category) => sum + category.score, 0);
    const violations = this.detectLawViolations(resume);

    return {
      total,
      letter_grade: this.getLetterGrade(total),
      categories,
      recommendations: this.generateRecommendations(categories, violations),
      calibrcv_law_violations: violations,
    };
  }

  /**
   * STRUCTURAL INTEGRITY: 0-20 points
   * Checks presence and completeness of required resume sections.
   * Four checks worth 5 points each: education, experience, skills, dates.
   */
  scoreStructural(resumeJSON) {
    const resume = resumeJSON ?? {};
    const education = asArray(resume.education);
    const experience = asArray(resume.experience);
    let score = 0;
    const issues = [];

    // +5: has education array with ≥1 entry
    if (education.length >= 1) {
      score += 5;
    } else {
      issues.push('Missing education section');
    }

    // +5: has experience array with ≥1 entry
    if (experience.length >= 1) {
      score += 5;
    } else {
      issues.push('Missing experience section');
    }

    // +5: has skills object with both fields
    if (resume.skills?.quantitative_stack && resume.skills?.analytic_domain) {
      score += 5;
    } else {
      issues.push('Missing or incomplete skills section');
    }

    // +5: all experience and education entries have non-empty dates.
    // With no entries at all there is nothing to be dated, so no points are
    // awarded (`[].every()` is vacuously true); the missing sections are
    // already reported above.
    const datedEntries = [...experience, ...education];
    if (datedEntries.length > 0) {
      const allDated = datedEntries.every((entry) => asText(entry?.dates).trim().length > 0);
      if (allDated) {
        score += 5;
      } else {
        issues.push('Some entries are missing dates');
      }
    }

    return { score, max: 20, label: 'Structural Integrity', issues };
  }

  /**
   * KEYWORD DENSITY: 0-30 points
   * With JD (more than 50 non-blank characters): TF-IDF keyword matching.
   * Without JD: Lexical richness (type/token ratio).
   */
  scoreKeywords(resumeJSON, jobDescriptionText) {
    const allResumeText = this.extractAllText(resumeJSON);
    const hasJobDescription = typeof jobDescriptionText === 'string'
      && jobDescriptionText.trim().length > 50;

    return hasJobDescription
      ? this.scoreKeywordsWithJD(allResumeText, jobDescriptionText)
      : this.scoreKeywordsWithoutJD(allResumeText);
  }

  /**
   * Score by the share of the job description's top terms found in the resume.
   * Terms come from `natural.TfIdf` for the JD document; up to 30 terms longer
   * than two characters are used, and matching is a substring check against
   * the lower-cased resume text.
   */
  scoreKeywordsWithJD(resumeText, jdText) {
    const resumeLower = resumeText.toLowerCase();
    const tfidf = new natural.TfIdf();
    tfidf.addDocument(jdText.toLowerCase());
    tfidf.addDocument(resumeLower);

    // Top 30 JD terms (document 0), skipping very short tokens.
    const jdTerms = tfidf.listTerms(0)
      .map((item) => item.term)
      .filter((term) => term.length > 2)
      .slice(0, 30);

    const matchedKeywords = jdTerms.filter((term) => resumeLower.includes(term));
    const missingKeywords = jdTerms.filter((term) => !resumeLower.includes(term));

    const matchRate = jdTerms.length > 0 ? matchedKeywords.length / jdTerms.length : 0;

    return {
      score: Math.min(Math.round(matchRate * 30), 30),
      max: 30,
      label: 'Keyword Density',
      matched_keywords: matchedKeywords,
      missing_keywords: missingKeywords.slice(0, 10),
      match_rate: Math.round(matchRate * 100),
    };
  }

  /**
   * Score by lexical richness (unique tokens / total tokens) when no job
   * description is available.
   */
  scoreKeywordsWithoutJD(resumeText) {
    const tokenizer = new natural.WordTokenizer();
    const tokens = tokenizer.tokenize(resumeText.toLowerCase()) || [];

    if (tokens.length === 0) {
      return { score: 0, max: 30, label: 'Keyword Density', match_rate: 0 };
    }

    const uniqueTokens = new Set(tokens);
    const richness = uniqueTokens.size / tokens.length;

    // Richness typically ranges 0.3-0.7 for resumes
    // Normalize: 0.3 = 0 points, 0.6+ = 30 points
    const normalized = Math.min(Math.max((richness - 0.3) / 0.3, 0), 1);

    return {
      score: Math.min(Math.round(normalized * 30), 30),
      max: 30,
      label: 'Keyword Density',
      lexical_richness: Math.round(richness * 100),
      unique_terms: uniqueTokens.size,
      total_terms: tokens.length,
    };
  }

  /**
   * CONTENT QUALITY: 0-25 points
   * HBS verb compliance (0-10), metric presence (0-10), bullet length sweet spot (0-5).
   */
  scoreContentQuality(resumeJSON) {
    const allBullets = collectBullets(resumeJSON ?? {});
    const total = allBullets.length;

    if (total === 0) {
      return { score: 0, max: 25, label: 'Content Quality', verb_compliance_rate: 0, metric_rate: 0, avg_bullet_length: 0 };
    }

    // HBS Verb compliance (0-10)
    const compliantCount = allBullets
      .filter((bullet) => HBS_VERBS.includes(leadingWord(bullet)))
      .length;

    // Metric presence (0-10)
    let bulletsWithMetrics = 0;
    for (const bullet of allBullets) {
      if (METRIC_REGEX.test(bullet)) {
        bulletsWithMetrics++;
      }
      // Keeps the check stateless even if the shared pattern carries a `g` flag.
      METRIC_REGEX.lastIndex = 0;
    }

    // Length sweet spot (0-5): 55-100 chars is ideal, 45-120 is acceptable.
    const totalLength = allBullets.reduce((sum, bullet) => sum + bullet.length, 0);
    const avgLength = Math.round(totalLength / total);
    let lengthScore = 0;
    if (avgLength >= 55 && avgLength <= 100) {
      lengthScore = 5;
    } else if (avgLength >= 45 && avgLength <= 120) {
      lengthScore = 3;
    }

    const verbScore = Math.round((compliantCount / total) * 10);
    const metricScore = Math.round((bulletsWithMetrics / total) * 10);

    return {
      score: verbScore + metricScore + lengthScore,
      max: 25,
      label: 'Content Quality',
      verb_compliance_rate: Math.round((compliantCount / total) * 100),
      metric_rate: Math.round((bulletsWithMetrics / total) * 100),
      avg_bullet_length: avgLength,
    };
  }

  /**
   * PARSABILITY: 0-15 points
   * Starts at 15 and deducts for characters that break ATS parsers.
   */
  scoreParsability(resumeText) {
    const checks = [
      // -5: box-drawing characters
      { pattern: /[│┤├─┼╌╎]/, penalty: 5, issue: 'Box-drawing characters found (breaks ATS parsers)' },
      // -5: em dashes
      { pattern: /—/, penalty: 5, issue: 'Em dashes found (use hyphens or semicolons instead)' },
      // -3: smart/curly quotes
      { pattern: /[\u2018\u2019\u201C\u201D]/, penalty: 3, issue: 'Smart/curly quotes found (use straight quotes)' },
      // -2: null bytes or control characters
      { pattern: /[\x00-\x08\x0B\x0C\x0E-\x1F]/, penalty: 2, issue: 'Control characters found (indicates corrupted text)' },
    ];

    let score = 15;
    const issuesFound = [];
    for (const { pattern, penalty, issue } of checks) {
      if (pattern.test(resumeText)) {
        score -= penalty;
        issuesFound.push(issue);
      }
    }

    return {
      score: Math.max(0, score),
      max: 15,
      label: 'Parsability',
      issues_found: issuesFound,
    };
  }

  /**
   * COMPLETENESS: 0-10 points
   * +2 each for email, phone, linkedin and location, and +2 for skills
   * breadth (≥3 entries in `skills.quantitative_stack`, given either as a
   * comma-separated string or as an array).
   */
  scoreCompleteness(resumeJSON) {
    const resume = resumeJSON ?? {};
    const contact = resume.contact ?? {};
    let score = 0;
    const missingFields = [];

    // Values are coerced to text first so that e.g. a numeric phone number
    // counts as present instead of throwing on `.trim()`.
    for (const field of ['email', 'phone', 'linkedin', 'location']) {
      if (asText(contact[field]).trim()) {
        score += 2;
      } else {
        missingFields.push(field);
      }
    }

    // +2: skills breadth (≥3 values in quantitative_stack)
    const rawStack = resume.skills?.quantitative_stack;
    const stackSkills = (Array.isArray(rawStack) ? rawStack : asText(rawStack).split(','))
      .filter((skill) => asText(skill).trim().length > 0);
    if (stackSkills.length >= 3) {
      score += 2;
    } else {
      missingFields.push('quantitative_stack (needs ≥3 skills)');
    }

    return {
      score,
      max: 10,
      label: 'Completeness',
      missing_fields: missingFields,
    };
  }

  /**
   * Detect violations of CalibrCV's 8 Laws.
   * Only Laws 1, 2, 3, 5, 6 and 7 are checked here.
   * @returns {string[]} Human-readable violation messages.
   */
  detectLawViolations(resumeJSON) {
    const resume = resumeJSON ?? {};
    const violations = [];

    collectBullets(resume).forEach((bullet, i) => {
      // LAW 1: 100-character limit
      // NOTE(review): `>=` also flags a bullet of exactly 100 characters —
      // confirm whether the limit is meant to be inclusive.
      if (bullet.length >= 100) {
        violations.push(`LAW 1: Bullet ${i + 1} is ${bullet.length} chars: "${bullet.slice(0, 40)}..."`);
      }

      // LAW 5: No em dashes
      if (/—/.test(bullet)) {
        violations.push(`LAW 5: Em dash in bullet ${i + 1}: "${bullet.slice(0, 40)}..."`);
      }

      // LAW 2: HBS verb start (bullets with no alphabetic first word are skipped)
      const firstWord = leadingWord(bullet);
      if (firstWord && !HBS_VERBS.includes(firstWord)) {
        violations.push(`LAW 2: Bullet ${i + 1} starts with non-HBS verb "${firstWord}"`);
      }
    });

    if (resume.summary) {
      const summary = asText(resume.summary);

      // LAW 3: Summary pronouns ("I" is matched case-sensitively, the rest are not)
      const pronounPatterns = [
        { regex: /\bI\b/, label: 'I' },
        { regex: /\bmy\b/i, label: 'my' },
        { regex: /\bme\b/i, label: 'me' },
        { regex: /\bour\b/i, label: 'our' },
        { regex: /\bwe\b/i, label: 'we' },
      ];
      for (const { regex, label } of pronounPatterns) {
        if (regex.test(summary)) {
          violations.push(`LAW 3: Pronoun "${label}" found in summary`);
        }
      }

      // LAW 5: Em dash in summary
      if (/—/.test(summary)) {
        violations.push('LAW 5: Em dash in summary');
      }
    }

    // LAW 6: Skills section — exactly two rows
    if (resume.skills) {
      const rowCount = Object.keys(resume.skills).length;
      if (rowCount !== 2) {
        violations.push(`LAW 6: Skills section has ${rowCount} rows instead of exactly 2`);
      }
    }

    // LAW 7: Bullet counts — 2-3 per experience entry, exactly 2 per project
    for (const exp of asArray(resume.experience)) {
      const count = exp?.bullets?.length || 0;
      if (count < 2 || count > 3) {
        violations.push(`LAW 7: Experience "${exp?.title}" has ${count} bullets (should be 2-3)`);
      }
    }
    for (const proj of asArray(resume.projects)) {
      const count = proj?.bullets?.length || 0;
      if (count !== 2) {
        violations.push(`LAW 7: Project "${proj?.name}" has ${count} bullets (should be exactly 2)`);
      }
    }

    return violations;
  }

  /**
   * Generate top 5 actionable recommendations.
   * Law violations come first; categories then follow in order of largest
   * point deficit (max - score).
   */
  generateRecommendations(categories, violations) {
    const recommendations = [];

    // If there are law violations, that's priority #1
    if (violations.length > 0) {
      recommendations.push(`Fix ${violations.length} CalibrCV Law violation(s) first — these directly hurt your ATS score.`);
    }

    // Sort categories by deficit (max - score) descending
    const sorted = Object.entries(categories)
      .map(([key, cat]) => ({ key, ...cat, deficit: cat.max - cat.score }))
      .sort((a, b) => b.deficit - a.deficit);

    for (const cat of sorted) {
      if (recommendations.length >= 5) break;

      switch (cat.key) {
        case 'structural':
          if (cat.deficit > 0 && cat.issues?.length > 0) {
            recommendations.push(`Improve structure: ${cat.issues[0]}.`);
          }
          break;

        case 'keywords':
          if (cat.deficit > 5) {
            if (cat.missing_keywords?.length > 0) {
              recommendations.push(`Add missing keywords: ${cat.missing_keywords.slice(0, 3).join(', ')}. These appear in the job description but not your resume.`);
            } else {
              recommendations.push('Increase keyword diversity — use more varied, industry-specific terminology.');
            }
          }
          break;

        case 'content_quality':
          if (cat.deficit > 5) {
            if (cat.verb_compliance_rate < 70) {
              recommendations.push('Start more bullets with strong HBS action verbs (e.g., Architected, Engineered, Deployed).');
            }
            if (cat.metric_rate < 50) {
              recommendations.push('Quantify more achievements — add specific numbers, percentages, or dollar amounts to bullets.');
            }
          }
          break;

        case 'parsability':
          if (cat.deficit > 0) {
            recommendations.push(`Fix parsability issues: ${cat.issues_found?.join('; ') || 'Remove special characters that break ATS parsers'}.`);
          }
          break;

        case 'completeness':
          if (cat.deficit > 0 && cat.missing_fields?.length > 0) {
            recommendations.push(`Add missing contact info: ${cat.missing_fields.join(', ')}.`);
          }
          break;

        default:
          break;
      }
    }

    // A single category can add two items, so cap after the loop as well.
    return recommendations.slice(0, 5);
  }

  /**
   * Map numeric score to letter grade.
   * @returns {{ grade: string, label: string }}
   */
  getLetterGrade(score) {
    if (score >= 90) return { grade: 'A+', label: 'Exceptional' };
    if (score >= 80) return { grade: 'A', label: 'Strong' };
    if (score >= 70) return { grade: 'B', label: 'Good' };
    if (score >= 60) return { grade: 'C', label: 'Average' };
    return { grade: 'D', label: 'Needs Work' };
  }

  /**
   * Extract all meaningful text from a resume JSON for analysis:
   * summary, experience titles and bullets, project names and bullets,
   * education institution/degree/bullets, and both skills rows — joined
   * with single spaces.
   */
  extractAllText(resumeJSON) {
    const resume = resumeJSON ?? {};
    return [
      resume.summary || '',
      ...asArray(resume.experience).flatMap((e) => [e?.title || '', ...asArray(e?.bullets)]),
      ...asArray(resume.projects).flatMap((p) => [p?.name || '', ...asArray(p?.bullets)]),
      ...asArray(resume.education).flatMap((e) => [e?.institution || '', e?.degree || '', ...asArray(e?.bullets)]),
      resume.skills?.quantitative_stack || '',
      resume.skills?.analytic_domain || '',
    ].join(' ');
  }
}

export const atsScorer = new ATSScorer();
export { ATSScorer };
@@ -0,0 +1,105 @@
1
+ import * as cheerio from 'cheerio';
2
+
3
/**
 * Scrape a job description from a URL. Cheerio-only, no browser automation.
 * Supports LinkedIn public API and generic HTML pages.
 *
 * LinkedIn job URLs are recognised both in the bare form
 * (`/jobs/view/<id>`) and in the slugged form
 * (`/jobs/view/<title-slug>-<id>`). If the LinkedIn guest endpoint fails for
 * any reason, the URL is scraped as a generic page instead.
 *
 * @param {string|URL} url - Job posting URL.
 * @returns {Promise<{ title: string, company: string, description: string }>}
 * @throws {TypeError} If `url` is not a non-empty string or a `URL` instance.
 */
export async function scrapeJobUrl(url) {
  const href = url instanceof URL ? url.href : url;
  if (typeof href !== 'string' || href.trim() === '') {
    throw new TypeError('scrapeJobUrl expects a non-empty URL string');
  }

  // LinkedIn: try public guest API first. The optional `(?:[^/?#]*-)?` group
  // skips a title slug so the trailing numeric job ID is captured.
  const linkedInMatch = href.match(/linkedin\.com\/jobs\/view\/(?:[^/?#]*-)?(\d+)/);
  if (linkedInMatch) {
    try {
      return await scrapeLinkedInPublic(linkedInMatch[1]);
    } catch {
      // Deliberate best-effort: fall through to the generic scraper.
    }
  }

  return await scrapeGeneric(href);
}
23
+
24
/**
 * Fetch a job posting from LinkedIn's guest job-posting endpoint and extract
 * title, company and description from the returned HTML.
 *
 * @param {string} jobId - Numeric LinkedIn job ID.
 * @returns {Promise<{ title: string, company: string, description: string }>}
 *   The description is truncated to 8000 characters.
 * @throws {Error} If the endpoint responds with a non-OK status, or if no
 *   description text could be extracted. Throwing in the latter case lets the
 *   caller fall back to another scraper instead of accepting an empty result.
 */
async function scrapeLinkedInPublic(jobId) {
  const apiUrl = `https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/${jobId}`;
  const resp = await fetch(apiUrl, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
      'Accept': 'text/html',
    },
  });

  if (!resp.ok) {
    throw new Error(`LinkedIn API returned ${resp.status}`);
  }

  const html = await resp.text();
  const $ = cheerio.load(html);

  // Each field tries its selectors in order and takes the first non-empty text.
  const title = $('.top-card-layout__title').text().trim()
    || $('h1').first().text().trim()
    || 'Unknown Title';

  const company = $('.topcard__org-name-link').text().trim()
    || $('.top-card-layout__second-subline span').first().text().trim()
    || 'Unknown Company';

  const description = $('.description__text').text().trim()
    || $('.show-more-less-html__markup').text().trim()
    || $('article').text().trim()
    || '';

  if (!description) {
    // A 200 response without any description text is of no use downstream.
    throw new Error('LinkedIn response contained no job description');
  }

  return {
    title,
    company,
    description: description.slice(0, 8000),
  };
}
59
+
60
/**
 * Scrape an arbitrary job page: fetch the HTML, drop non-content elements,
 * then pull out a title, a company name and the description text.
 *
 * The description is the text of the first selector in a fixed list that
 * yields more than 100 characters; if none does, the whole `<body>` text is
 * used. The result is truncated to 8000 characters.
 *
 * @param {string} url - Page to fetch.
 * @returns {Promise<{ title: string, company: string, description: string }>}
 * @throws {Error} If the response status is not OK.
 */
async function scrapeGeneric(url) {
  const requestInit = {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
      'Accept': 'text/html',
    },
    redirect: 'follow',
  };
  const resp = await fetch(url, requestInit);
  if (!resp.ok) {
    throw new Error(`Failed to fetch ${url}: HTTP ${resp.status}`);
  }

  const $ = cheerio.load(await resp.text());

  // Strip elements that never carry the posting itself.
  $('script, style, nav, footer, header, iframe').remove();

  const headingText = $('h1').first().text().trim();
  const title = headingText || $('title').text().trim() || 'Unknown Title';
  const company = $('meta[property="og:site_name"]').attr('content') || '';

  // Common job-description containers, most specific first.
  const candidateSelectors = [
    '.job-description', '#job-description', '[data-testid="job-description"]',
    '.description', '.posting-details', '.job-details', 'article', 'main',
  ];

  let description = '';
  candidateSelectors.some((selector) => {
    const candidate = $(selector).text().trim();
    if (candidate.length <= 100) {
      return false;
    }
    description = candidate;
    return true; // stop at the first sufficiently long match
  });

  if (description === '') {
    description = $('body').text().trim();
  }

  return { title, company, description: description.slice(0, 8000) };
}