@pranavraut033/ats-checker 1.3.0 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  'use strict';
2
2
 
3
3
  // src/pdf/index.ts
4
- async function extractTextFromPDF(data) {
4
+ async function extractTextFromPDF(data, options) {
5
5
  let pdfjsLib;
6
6
  try {
7
7
  pdfjsLib = await import('pdfjs-dist');
@@ -49,7 +49,16 @@ async function extractTextFromPDF(data) {
49
49
  const columnTexts = columns.map((col) => renderColumn(col));
50
50
  pages.push(columnTexts.filter(Boolean).join("\n"));
51
51
  }
52
- return pages.join("\n");
52
+ const text = pages.join("\n");
53
+ const minTextLength = options?.minTextLength ?? 100;
54
+ if (options?.ocrFallback && text.trim().length < minTextLength) {
55
+ try {
56
+ const ocrText = await options.ocrFallback(bytes);
57
+ if (ocrText.trim().length > text.trim().length) return ocrText;
58
+ } catch {
59
+ }
60
+ }
61
+ return text;
53
62
  }
54
63
  function renderColumn(items) {
55
64
  const Y_TOLERANCE = 2;
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/pdf/index.ts"],"names":[],"mappings":";;;AASA,eAAsB,mBACpB,IAAA,EACiB;AAEjB,EAAA,IAAI,QAAA;AACJ,EAAA,IAAI;AACF,IAAA,QAAA,GAAW,MAAM,OAAO,YAAY,CAAA;AAAA,EACtC,CAAA,CAAA,MAAQ;AACN,IAAA,MAAM,IAAI,KAAA;AAAA,MACR;AAAA,KACF;AAAA,EACF;AAEA,EAAA,MAAM,QACJ,IAAA,YAAgB,WAAA,GAAc,IAAI,UAAA,CAAW,IAAI,CAAA,GAAI,IAAA;AAEvD,EAAA,MAAM,GAAA,GAAM,MAAM,QAAA,CAAS,WAAA,CAAY,EAAE,IAAA,EAAM,KAAA,EAAO,CAAA,CAAE,OAAA;AACxD,EAAA,MAAM,QAAkB,EAAC;AAEzB,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,IAAK,GAAA,CAAI,UAAU,CAAA,EAAA,EAAK;AACtC,IAAA,MAAM,IAAA,GAAO,MAAM,GAAA,CAAI,OAAA,CAAQ,CAAC,CAAA;AAChC,IAAA,MAAM,OAAA,GAAU,MAAM,IAAA,CAAK,cAAA,EAAe;AAG1C,IAAA,MAAM,QAAmB,EAAC;AAE1B,IAAA,KAAA,MAAW,IAAA,IAAQ,QAAQ,KAAA,EAAO;AAChC,MAAA,IAAI,EAAE,KAAA,IAAS,IAAA,CAAA,IAAS,CAAC,IAAA,CAAK,GAAA,CAAI,MAAK,EAAG;AAC1C,MAAA,MAAM,YAAkC,KAAA,CAAM,OAAA;AAAA,QAC3C,IAAA,CAAkC;AAAA,OACrC,GACK,KAAiC,SAAA,GAClC,MAAA;AAEJ,MAAA,IAAI,CAAC,SAAA,EAAW;AAEd,QAAA,KAAA,CAAM,IAAA,CAAK,EAAE,CAAA,EAAG,CAAA,EAAG,GAAG,CAAA,EAAG,GAAA,EAAK,IAAA,CAAK,GAAA,EAAK,CAAA;AAAA,MAC1C,CAAA,MAAO;AACL,QAAA,KAAA,CAAM,IAAA,CAAK,EAAE,CAAA,EAAG,SAAA,CAAU,CAAC,CAAA,EAAG,CAAA,EAAG,SAAA,CAAU,CAAC,CAAA,EAAG,GAAA,EAAK,IAAA,CAAK,KAAK,CAAA;AAAA,MAChE;AAAA,IACF;AAWA,IAAA,MAAM,oBAAA,GAAuB,EAAA;AAC7B,IAAA,MAAM,aAAa,CAAC,GAAG,IAAI,GAAA,CAAI,MAAM,GAAA,CAAI,CAAC,EAAA,KAAO,IAAA,CAAK,MAAM,EAAA,CAAG,CAAC,CAAC,CAAC,CAAC,CAAA,CAAE,IAAA;AAAA,MACnE,CAAC,CAAA,EAAG,CAAA,KAAM,CAAA,GAAI;AAAA,KAChB;AAEA,IAAA,IAAI,cAAA,GAAgC,IAAA;AACpC,IAAA,IAAI,MAAA,GAAS,CAAA;AACb,IAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,UAAA,CAAW,QAAQ,CAAA,EAAA,EAAK;AAC1C,MAAA,MAAM,MAAM,UAAA,CAAW,CAAC,CAAA,GAAI,UAAA,CAAW,IAAI,CAAC,CAAA;AAC5C,MAAA,IAAI,MAAM,MAAA,EAAQ;AAChB,QAAA,MAAA,GAAS,GAAA;AACT,QAAA,cAAA,GAAA,CAAkB,WAAW,CAAA,GAAI,CAAC,CAAA,GAAI,UAAA,CAAW,CAAC,CAAA,IAAK,CAAA;AAAA,MACzD;AAAA,IACF;AACA,IAAA,IAAI,MAAA,GAAS,sBAAsB,cAAA,GAAiB,IAAA;AAEpD,IAAA,MAAM,OAAA,GACJ,mBAAmB,IAAA,GACf;AAAA,MACE,MAAM,MAAA,CAAO,CAAC,EAAA,KAAO,EAAA,CAAG,IAAI,cAAe,CAAA;AAAA,MAC3C,MAAM,MAAA,CAAO,CAAC,EAAA,KAAO,EAAA,CAAG,KAAK,cAAe;AAAA,KAC9C,GACA,CAAC,KAAK,CAAA;AAEZ,IAAA,MAAM,cAAc,OAAA,CAAQ,GAAA,CAAI,CAAC,GAAA,KAAQ,YAAA,CAAa,GAAG,CAAC,CAAA;AAC1D,IAAA,KAAA,CAAM,KAAK,WAAA,CAAY,MAAA,CAAO,OAAO,CAAA,CAAE,IAAA,CAAK,IAAI,CAAC,CAAA;AAAA,EACnD;AAEA,EAAA,OAAO,KAAA,CAAM,KAAK,IAAI,CAAA;AACxB;AAEA,SAAS,aAAa,KAAA,EAA6D;AACjF,EAAA,MAAM,WAAA,GAAc,CAAA;AACpB,EAAA,MAAM,OAAA,uBAA8D,GAAA,EAAI;AACxE,EAAA,MAAM,YAAsB,EAAC;AAE7B,EAAA,KAAA,MAAW,EAAE,CAAA,EAAG,CAAA,EAAG,GAAA,MAAS,KAAA,EAAO;AACjC,IAAA,IAAI,SAAA;AACJ,IAAA,KAAA,MAAW,OAAO,SAAA,EAAW;AAC3B,MAAA,IAAI,IAAA,CAAK,GAAA,CAAI,GAAA,GAAM,CAAC,KAAK,WAAA,EAAa;AACpC,QAAA,SAAA,GAAY,GAAA;AACZ,QAAA;AAAA,MACF;AAAA,IACF;AACA,IAAA,IAAI,cAAc,MAAA,EAAW;AAC3B,MAAA,SAAA,GAAY,CAAA;AACZ,MAAA,SAAA,CAAU,KAAK,CAAC,CAAA;AAChB,MAAA,OAAA,CAAQ,GAAA,CAAI,CAAA,EAAG,EAAE,CAAA;AAAA,IACnB;AACA,IAAA,OAAA,CAAQ,IAAI,SAAS,CAAA,CAAG,KAAK,EAAE,CAAA,EAAG,KAAK,CAAA;AAAA,EACzC;AAGA,EAAA,SAAA,CAAU,IAAA,CAAK,CAAC,CAAA,EAAG,CAAA,KAAM,IAAI,CAAC,CAAA;AAE9B,EAAA,OAAO,SAAA,CACJ,GAAA;AAAA,IAAI,CAAC,GAAA,KAAA,CACH,OAAA,CAAQ,GAAA,CAAI,GAAG,CAAA,IAAK,EAAC,EACnB,IAAA,CAAK,CAAC,CAAA,EAAG,CAAA,KAAM,EAAE,CAAA,GAAI,CAAA,CAAE,CAAC,CAAA,CACxB,GAAA,CAAI,CAAC,EAAA,KAAO,EAAA,CAAG,GAAG,CAAA,CAClB,IAAA,CAAK,GAAG,CAAA,CACR,OAAA,CAAQ,WAAA,EAAa,GAAG,EACxB,IAAA;AAAK,GACV,CACC,MAAA,CAAO,OAAO,CAAA,CACd,KAAK,IAAI,CAAA;AACd","file":"index.cjs","sourcesContent":["/**\n * Extract plain text from a PDF buffer.\n *\n * Requires `pdfjs-dist` to be installed (optional peerDependency):\n * npm install pdfjs-dist\n *\n * @param data - Raw PDF bytes as Uint8Array or ArrayBuffer\n * @returns Extracted text, ready to pass as `resumeText` to analyzeResume\n */\nexport async function extractTextFromPDF(\n data: Uint8Array | ArrayBuffer\n): Promise<string> {\n // ponytail: lazy import keeps core zero-dep; missing peer throws with clear message\n let pdfjsLib: typeof import(\"pdfjs-dist\");\n try {\n pdfjsLib = await import(\"pdfjs-dist\");\n } catch {\n throw new Error(\n \"pdfjs-dist is required for PDF extraction. Install it: npm install pdfjs-dist\"\n );\n }\n\n const bytes =\n data instanceof ArrayBuffer ? new Uint8Array(data) : data;\n\n const doc = await pdfjsLib.getDocument({ data: bytes }).promise;\n const pages: string[] = [];\n\n for (let i = 1; i <= doc.numPages; i++) {\n const page = await doc.getPage(i);\n const content = await page.getTextContent();\n\n type RawItem = { x: number; y: number; str: string };\n const items: RawItem[] = [];\n\n for (const item of content.items) {\n if (!(\"str\" in item) || !item.str.trim()) continue;\n const transform: number[] | undefined = Array.isArray(\n (item as { transform?: number[] }).transform\n )\n ? (item as { transform: number[] }).transform\n : undefined;\n\n if (!transform) {\n // No positional info (unit-test mocks) — treat as single-column item\n items.push({ x: 0, y: 0, str: item.str });\n } else {\n items.push({ x: transform[4], y: transform[5], str: item.str });\n }\n }\n\n // Detect column boundary: find the largest x-gap among item start positions.\n // If it exceeds COLUMN_GAP_THRESHOLD, split into left / right columns and\n // process each independently so headers in different columns don't merge.\n // ponytail: single largest-gap heuristic handles the common 2-column resume;\n // n-column needs k-means on x-distribution — upgrade if this proves insufficient.\n // Column boundary heuristic: the largest gap in item x-positions.\n // Real PDF column gutters show as a gap >>80px; normal word spacing is <50px.\n // ponytail: magic number calibrated to PranavRaut2026.pdf (104px gap); raise\n // if single-column PDFs with wide indentation start getting falsely split.\n const COLUMN_GAP_THRESHOLD = 80;\n const xPositions = [...new Set(items.map((it) => Math.round(it.x)))].sort(\n (a, b) => a - b\n );\n\n let columnBoundary: number | null = null;\n let maxGap = 0;\n for (let j = 1; j < xPositions.length; j++) {\n const gap = xPositions[j] - xPositions[j - 1];\n if (gap > maxGap) {\n maxGap = gap;\n columnBoundary = (xPositions[j - 1] + xPositions[j]) / 2;\n }\n }\n if (maxGap < COLUMN_GAP_THRESHOLD) columnBoundary = null;\n\n const columns =\n columnBoundary !== null\n ? [\n items.filter((it) => it.x < columnBoundary!),\n items.filter((it) => it.x >= columnBoundary!),\n ]\n : [items];\n\n const columnTexts = columns.map((col) => renderColumn(col));\n pages.push(columnTexts.filter(Boolean).join(\"\\n\"));\n }\n\n return pages.join(\"\\n\");\n}\n\nfunction renderColumn(items: Array<{ x: number; y: number; str: string }>): string {\n const Y_TOLERANCE = 2;\n const lineMap: Map<number, Array<{ x: number; str: string }>> = new Map();\n const lineOrder: number[] = [];\n\n for (const { x, y, str } of items) {\n let bucketKey: number | undefined;\n for (const key of lineOrder) {\n if (Math.abs(key - y) <= Y_TOLERANCE) {\n bucketKey = key;\n break;\n }\n }\n if (bucketKey === undefined) {\n bucketKey = y;\n lineOrder.push(y);\n lineMap.set(y, []);\n }\n lineMap.get(bucketKey)!.push({ x, str });\n }\n\n // pdfjs y=0 is bottom of page — sort descending so top comes first\n lineOrder.sort((a, b) => b - a);\n\n return lineOrder\n .map((key) =>\n (lineMap.get(key) ?? [])\n .sort((a, b) => a.x - b.x)\n .map((it) => it.str)\n .join(\" \")\n .replace(/[^\\S\\n]+/g, \" \")\n .trim()\n )\n .filter(Boolean)\n .join(\"\\n\");\n}\n"]}
1
+ {"version":3,"sources":["../../src/pdf/index.ts"],"names":[],"mappings":";;;AAwBA,eAAsB,kBAAA,CACpB,MACA,OAAA,EACiB;AAEjB,EAAA,IAAI,QAAA;AACJ,EAAA,IAAI;AACF,IAAA,QAAA,GAAW,MAAM,OAAO,YAAY,CAAA;AAAA,EACtC,CAAA,CAAA,MAAQ;AACN,IAAA,MAAM,IAAI,KAAA;AAAA,MACR;AAAA,KACF;AAAA,EACF;AAEA,EAAA,MAAM,QACJ,IAAA,YAAgB,WAAA,GAAc,IAAI,UAAA,CAAW,IAAI,CAAA,GAAI,IAAA;AAEvD,EAAA,MAAM,GAAA,GAAM,MAAM,QAAA,CAAS,WAAA,CAAY,EAAE,IAAA,EAAM,KAAA,EAAO,CAAA,CAAE,OAAA;AACxD,EAAA,MAAM,QAAkB,EAAC;AAEzB,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,IAAK,GAAA,CAAI,UAAU,CAAA,EAAA,EAAK;AACtC,IAAA,MAAM,IAAA,GAAO,MAAM,GAAA,CAAI,OAAA,CAAQ,CAAC,CAAA;AAChC,IAAA,MAAM,OAAA,GAAU,MAAM,IAAA,CAAK,cAAA,EAAe;AAG1C,IAAA,MAAM,QAAmB,EAAC;AAE1B,IAAA,KAAA,MAAW,IAAA,IAAQ,QAAQ,KAAA,EAAO;AAChC,MAAA,IAAI,EAAE,KAAA,IAAS,IAAA,CAAA,IAAS,CAAC,IAAA,CAAK,GAAA,CAAI,MAAK,EAAG;AAC1C,MAAA,MAAM,YAAkC,KAAA,CAAM,OAAA;AAAA,QAC3C,IAAA,CAAkC;AAAA,OACrC,GACK,KAAiC,SAAA,GAClC,MAAA;AAEJ,MAAA,IAAI,CAAC,SAAA,EAAW;AAEd,QAAA,KAAA,CAAM,IAAA,CAAK,EAAE,CAAA,EAAG,CAAA,EAAG,GAAG,CAAA,EAAG,GAAA,EAAK,IAAA,CAAK,GAAA,EAAK,CAAA;AAAA,MAC1C,CAAA,MAAO;AACL,QAAA,KAAA,CAAM,IAAA,CAAK,EAAE,CAAA,EAAG,SAAA,CAAU,CAAC,CAAA,EAAG,CAAA,EAAG,SAAA,CAAU,CAAC,CAAA,EAAG,GAAA,EAAK,IAAA,CAAK,KAAK,CAAA;AAAA,MAChE;AAAA,IACF;AAWA,IAAA,MAAM,oBAAA,GAAuB,EAAA;AAC7B,IAAA,MAAM,aAAa,CAAC,GAAG,IAAI,GAAA,CAAI,MAAM,GAAA,CAAI,CAAC,EAAA,KAAO,IAAA,CAAK,MAAM,EAAA,CAAG,CAAC,CAAC,CAAC,CAAC,CAAA,CAAE,IAAA;AAAA,MACnE,CAAC,CAAA,EAAG,CAAA,KAAM,CAAA,GAAI;AAAA,KAChB;AAEA,IAAA,IAAI,cAAA,GAAgC,IAAA;AACpC,IAAA,IAAI,MAAA,GAAS,CAAA;AACb,IAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,UAAA,CAAW,QAAQ,CAAA,EAAA,EAAK;AAC1C,MAAA,MAAM,MAAM,UAAA,CAAW,CAAC,CAAA,GAAI,UAAA,CAAW,IAAI,CAAC,CAAA;AAC5C,MAAA,IAAI,MAAM,MAAA,EAAQ;AAChB,QAAA,MAAA,GAAS,GAAA;AACT,QAAA,cAAA,GAAA,CAAkB,WAAW,CAAA,GAAI,CAAC,CAAA,GAAI,UAAA,CAAW,CAAC,CAAA,IAAK,CAAA;AAAA,MACzD;AAAA,IACF;AACA,IAAA,IAAI,MAAA,GAAS,sBAAsB,cAAA,GAAiB,IAAA;AAEpD,IAAA,MAAM,OAAA,GACJ,mBAAmB,IAAA,GACf;AAAA,MACE,MAAM,MAAA,CAAO,CAAC,EAAA,KAAO,EAAA,CAAG,IAAI,cAAe,CAAA;AAAA,MAC3C,MAAM,MAAA,CAAO,CAAC,EAAA,KAAO,EAAA,CAAG,KAAK,cAAe;AAAA,KAC9C,GACA,CAAC,KAAK,CAAA;AAEZ,IAAA,MAAM,cAAc,OAAA,CAAQ,GAAA,CAAI,CAAC,GAAA,KAAQ,YAAA,CAAa,GAAG,CAAC,CAAA;AAC1D,IAAA,KAAA,CAAM,KAAK,WAAA,CAAY,MAAA,CAAO,OAAO,CAAA,CAAE,IAAA,CAAK,IAAI,CAAC,CAAA;AAAA,EACnD;AAEA,EAAA,MAAM,IAAA,GAAO,KAAA,CAAM,IAAA,CAAK,IAAI,CAAA;AAI5B,EAAA,MAAM,aAAA,GAAgB,SAAS,aAAA,IAAiB,GAAA;AAChD,EAAA,IAAI,SAAS,WAAA,IAAe,IAAA,CAAK,IAAA,EAAK,CAAE,SAAS,aAAA,EAAe;AAC9D,IAAA,IAAI;AACF,MAAA,MAAM,OAAA,GAAU,MAAM,OAAA,CAAQ,WAAA,CAAY,KAAK,CAAA;AAC/C,MAAA,IAAI,OAAA,CAAQ,MAAK,CAAE,MAAA,GAAS,KAAK,IAAA,EAAK,CAAE,QAAQ,OAAO,OAAA;AAAA,IACzD,CAAA,CAAA,MAAQ;AAAA,IAER;AAAA,EACF;AAEA,EAAA,OAAO,IAAA;AACT;AAEA,SAAS,aAAa,KAAA,EAA6D;AACjF,EAAA,MAAM,WAAA,GAAc,CAAA;AACpB,EAAA,MAAM,OAAA,uBAA8D,GAAA,EAAI;AACxE,EAAA,MAAM,YAAsB,EAAC;AAE7B,EAAA,KAAA,MAAW,EAAE,CAAA,EAAG,CAAA,EAAG,GAAA,MAAS,KAAA,EAAO;AACjC,IAAA,IAAI,SAAA;AACJ,IAAA,KAAA,MAAW,OAAO,SAAA,EAAW;AAC3B,MAAA,IAAI,IAAA,CAAK,GAAA,CAAI,GAAA,GAAM,CAAC,KAAK,WAAA,EAAa;AACpC,QAAA,SAAA,GAAY,GAAA;AACZ,QAAA;AAAA,MACF;AAAA,IACF;AACA,IAAA,IAAI,cAAc,MAAA,EAAW;AAC3B,MAAA,SAAA,GAAY,CAAA;AACZ,MAAA,SAAA,CAAU,KAAK,CAAC,CAAA;AAChB,MAAA,OAAA,CAAQ,GAAA,CAAI,CAAA,EAAG,EAAE,CAAA;AAAA,IACnB;AACA,IAAA,OAAA,CAAQ,IAAI,SAAS,CAAA,CAAG,KAAK,EAAE,CAAA,EAAG,KAAK,CAAA;AAAA,EACzC;AAGA,EAAA,SAAA,CAAU,IAAA,CAAK,CAAC,CAAA,EAAG,CAAA,KAAM,IAAI,CAAC,CAAA;AAE9B,EAAA,OAAO,SAAA,CACJ,GAAA;AAAA,IAAI,CAAC,GAAA,KAAA,CACH,OAAA,CAAQ,GAAA,CAAI,GAAG,CAAA,IAAK,EAAC,EACnB,IAAA,CAAK,CAAC,CAAA,EAAG,CAAA,KAAM,EAAE,CAAA,GAAI,CAAA,CAAE,CAAC,CAAA,CACxB,GAAA,CAAI,CAAC,EAAA,KAAO,EAAA,CAAG,GAAG,CAAA,CAClB,IAAA,CAAK,GAAG,CAAA,CACR,OAAA,CAAQ,WAAA,EAAa,GAAG,EACxB,IAAA;AAAK,GACV,CACC,MAAA,CAAO,OAAO,CAAA,CACd,KAAK,IAAI,CAAA;AACd","file":"index.cjs","sourcesContent":["/**\n * Caller-supplied OCR implementation, invoked only when the PDF's text layer\n * comes back too short to be useful (scanned/image PDFs). The caller owns\n * the actual OCR engine and dependency — this library never bundles one.\n */\nexport type OCRClient = (data: Uint8Array) => Promise<string>;\n\nexport interface ExtractTextOptions {\n /** Called with the raw PDF bytes when text-layer extraction is too short. */\n ocrFallback?: OCRClient;\n /** Threshold (trimmed char count) below which ocrFallback is tried. Default 100, matching resume.parser.ts's scanned-PDF warning. */\n minTextLength?: number;\n}\n\n/**\n * Extract plain text from a PDF buffer.\n *\n * Requires `pdfjs-dist` to be installed (optional peerDependency):\n * npm install pdfjs-dist\n *\n * @param data - Raw PDF bytes as Uint8Array or ArrayBuffer\n * @param options - Optional OCR fallback for scanned/image PDFs\n * @returns Extracted text, ready to pass as `resumeText` to analyzeResume\n */\nexport async function extractTextFromPDF(\n data: Uint8Array | ArrayBuffer,\n options?: ExtractTextOptions\n): Promise<string> {\n // ponytail: lazy import keeps core zero-dep; missing peer throws with clear message\n let pdfjsLib: typeof import(\"pdfjs-dist\");\n try {\n pdfjsLib = await import(\"pdfjs-dist\");\n } catch {\n throw new Error(\n \"pdfjs-dist is required for PDF extraction. Install it: npm install pdfjs-dist\"\n );\n }\n\n const bytes =\n data instanceof ArrayBuffer ? new Uint8Array(data) : data;\n\n const doc = await pdfjsLib.getDocument({ data: bytes }).promise;\n const pages: string[] = [];\n\n for (let i = 1; i <= doc.numPages; i++) {\n const page = await doc.getPage(i);\n const content = await page.getTextContent();\n\n type RawItem = { x: number; y: number; str: string };\n const items: RawItem[] = [];\n\n for (const item of content.items) {\n if (!(\"str\" in item) || !item.str.trim()) continue;\n const transform: number[] | undefined = Array.isArray(\n (item as { transform?: number[] }).transform\n )\n ? (item as { transform: number[] }).transform\n : undefined;\n\n if (!transform) {\n // No positional info (unit-test mocks) — treat as single-column item\n items.push({ x: 0, y: 0, str: item.str });\n } else {\n items.push({ x: transform[4], y: transform[5], str: item.str });\n }\n }\n\n // Detect column boundary: find the largest x-gap among item start positions.\n // If it exceeds COLUMN_GAP_THRESHOLD, split into left / right columns and\n // process each independently so headers in different columns don't merge.\n // ponytail: single largest-gap heuristic handles the common 2-column resume;\n // n-column needs k-means on x-distribution — upgrade if this proves insufficient.\n // Column boundary heuristic: the largest gap in item x-positions.\n // Real PDF column gutters show as a gap >>80px; normal word spacing is <50px.\n // ponytail: magic number calibrated to PranavRaut2026.pdf (104px gap); raise\n // if single-column PDFs with wide indentation start getting falsely split.\n const COLUMN_GAP_THRESHOLD = 80;\n const xPositions = [...new Set(items.map((it) => Math.round(it.x)))].sort(\n (a, b) => a - b\n );\n\n let columnBoundary: number | null = null;\n let maxGap = 0;\n for (let j = 1; j < xPositions.length; j++) {\n const gap = xPositions[j] - xPositions[j - 1];\n if (gap > maxGap) {\n maxGap = gap;\n columnBoundary = (xPositions[j - 1] + xPositions[j]) / 2;\n }\n }\n if (maxGap < COLUMN_GAP_THRESHOLD) columnBoundary = null;\n\n const columns =\n columnBoundary !== null\n ? [\n items.filter((it) => it.x < columnBoundary!),\n items.filter((it) => it.x >= columnBoundary!),\n ]\n : [items];\n\n const columnTexts = columns.map((col) => renderColumn(col));\n pages.push(columnTexts.filter(Boolean).join(\"\\n\"));\n }\n\n const text = pages.join(\"\\n\");\n\n // ponytail: OCR is the caller's engine/dependency — we only decide *when*\n // to ask for it (text layer too short) and pick the better of the two results.\n const minTextLength = options?.minTextLength ?? 100;\n if (options?.ocrFallback && text.trim().length < minTextLength) {\n try {\n const ocrText = await options.ocrFallback(bytes);\n if (ocrText.trim().length > text.trim().length) return ocrText;\n } catch {\n // OCR failure falls back to the text-layer result, never throws.\n }\n }\n\n return text;\n}\n\nfunction renderColumn(items: Array<{ x: number; y: number; str: string }>): string {\n const Y_TOLERANCE = 2;\n const lineMap: Map<number, Array<{ x: number; str: string }>> = new Map();\n const lineOrder: number[] = [];\n\n for (const { x, y, str } of items) {\n let bucketKey: number | undefined;\n for (const key of lineOrder) {\n if (Math.abs(key - y) <= Y_TOLERANCE) {\n bucketKey = key;\n break;\n }\n }\n if (bucketKey === undefined) {\n bucketKey = y;\n lineOrder.push(y);\n lineMap.set(y, []);\n }\n lineMap.get(bucketKey)!.push({ x, str });\n }\n\n // pdfjs y=0 is bottom of page — sort descending so top comes first\n lineOrder.sort((a, b) => b - a);\n\n return lineOrder\n .map((key) =>\n (lineMap.get(key) ?? [])\n .sort((a, b) => a.x - b.x)\n .map((it) => it.str)\n .join(\" \")\n .replace(/[^\\S\\n]+/g, \" \")\n .trim()\n )\n .filter(Boolean)\n .join(\"\\n\");\n}\n"]}
@@ -1,3 +1,15 @@
1
+ /**
2
+ * Caller-supplied OCR implementation, invoked only when the PDF's text layer
3
+ * comes back too short to be useful (scanned/image PDFs). The caller owns
4
+ * the actual OCR engine and dependency — this library never bundles one.
5
+ */
6
+ type OCRClient = (data: Uint8Array) => Promise<string>;
7
+ interface ExtractTextOptions {
8
+ /** Called with the raw PDF bytes when text-layer extraction is too short. */
9
+ ocrFallback?: OCRClient;
10
+ /** Threshold (trimmed char count) below which ocrFallback is tried. Default 100, matching resume.parser.ts's scanned-PDF warning. */
11
+ minTextLength?: number;
12
+ }
1
13
  /**
2
14
  * Extract plain text from a PDF buffer.
3
15
  *
@@ -5,8 +17,9 @@
5
17
  * npm install pdfjs-dist
6
18
  *
7
19
  * @param data - Raw PDF bytes as Uint8Array or ArrayBuffer
20
+ * @param options - Optional OCR fallback for scanned/image PDFs
8
21
  * @returns Extracted text, ready to pass as `resumeText` to analyzeResume
9
22
  */
10
- declare function extractTextFromPDF(data: Uint8Array | ArrayBuffer): Promise<string>;
23
+ declare function extractTextFromPDF(data: Uint8Array | ArrayBuffer, options?: ExtractTextOptions): Promise<string>;
11
24
 
12
- export { extractTextFromPDF };
25
+ export { type ExtractTextOptions, type OCRClient, extractTextFromPDF };
@@ -1,3 +1,15 @@
1
+ /**
2
+ * Caller-supplied OCR implementation, invoked only when the PDF's text layer
3
+ * comes back too short to be useful (scanned/image PDFs). The caller owns
4
+ * the actual OCR engine and dependency — this library never bundles one.
5
+ */
6
+ type OCRClient = (data: Uint8Array) => Promise<string>;
7
+ interface ExtractTextOptions {
8
+ /** Called with the raw PDF bytes when text-layer extraction is too short. */
9
+ ocrFallback?: OCRClient;
10
+ /** Threshold (trimmed char count) below which ocrFallback is tried. Default 100, matching resume.parser.ts's scanned-PDF warning. */
11
+ minTextLength?: number;
12
+ }
1
13
  /**
2
14
  * Extract plain text from a PDF buffer.
3
15
  *
@@ -5,8 +17,9 @@
5
17
  * npm install pdfjs-dist
6
18
  *
7
19
  * @param data - Raw PDF bytes as Uint8Array or ArrayBuffer
20
+ * @param options - Optional OCR fallback for scanned/image PDFs
8
21
  * @returns Extracted text, ready to pass as `resumeText` to analyzeResume
9
22
  */
10
- declare function extractTextFromPDF(data: Uint8Array | ArrayBuffer): Promise<string>;
23
+ declare function extractTextFromPDF(data: Uint8Array | ArrayBuffer, options?: ExtractTextOptions): Promise<string>;
11
24
 
12
- export { extractTextFromPDF };
25
+ export { type ExtractTextOptions, type OCRClient, extractTextFromPDF };
@@ -1,5 +1,5 @@
1
1
  // src/pdf/index.ts
2
- async function extractTextFromPDF(data) {
2
+ async function extractTextFromPDF(data, options) {
3
3
  let pdfjsLib;
4
4
  try {
5
5
  pdfjsLib = await import('pdfjs-dist');
@@ -47,7 +47,16 @@ async function extractTextFromPDF(data) {
47
47
  const columnTexts = columns.map((col) => renderColumn(col));
48
48
  pages.push(columnTexts.filter(Boolean).join("\n"));
49
49
  }
50
- return pages.join("\n");
50
+ const text = pages.join("\n");
51
+ const minTextLength = options?.minTextLength ?? 100;
52
+ if (options?.ocrFallback && text.trim().length < minTextLength) {
53
+ try {
54
+ const ocrText = await options.ocrFallback(bytes);
55
+ if (ocrText.trim().length > text.trim().length) return ocrText;
56
+ } catch {
57
+ }
58
+ }
59
+ return text;
51
60
  }
52
61
  function renderColumn(items) {
53
62
  const Y_TOLERANCE = 2;
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/pdf/index.ts"],"names":[],"mappings":";AASA,eAAsB,mBACpB,IAAA,EACiB;AAEjB,EAAA,IAAI,QAAA;AACJ,EAAA,IAAI;AACF,IAAA,QAAA,GAAW,MAAM,OAAO,YAAY,CAAA;AAAA,EACtC,CAAA,CAAA,MAAQ;AACN,IAAA,MAAM,IAAI,KAAA;AAAA,MACR;AAAA,KACF;AAAA,EACF;AAEA,EAAA,MAAM,QACJ,IAAA,YAAgB,WAAA,GAAc,IAAI,UAAA,CAAW,IAAI,CAAA,GAAI,IAAA;AAEvD,EAAA,MAAM,GAAA,GAAM,MAAM,QAAA,CAAS,WAAA,CAAY,EAAE,IAAA,EAAM,KAAA,EAAO,CAAA,CAAE,OAAA;AACxD,EAAA,MAAM,QAAkB,EAAC;AAEzB,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,IAAK,GAAA,CAAI,UAAU,CAAA,EAAA,EAAK;AACtC,IAAA,MAAM,IAAA,GAAO,MAAM,GAAA,CAAI,OAAA,CAAQ,CAAC,CAAA;AAChC,IAAA,MAAM,OAAA,GAAU,MAAM,IAAA,CAAK,cAAA,EAAe;AAG1C,IAAA,MAAM,QAAmB,EAAC;AAE1B,IAAA,KAAA,MAAW,IAAA,IAAQ,QAAQ,KAAA,EAAO;AAChC,MAAA,IAAI,EAAE,KAAA,IAAS,IAAA,CAAA,IAAS,CAAC,IAAA,CAAK,GAAA,CAAI,MAAK,EAAG;AAC1C,MAAA,MAAM,YAAkC,KAAA,CAAM,OAAA;AAAA,QAC3C,IAAA,CAAkC;AAAA,OACrC,GACK,KAAiC,SAAA,GAClC,MAAA;AAEJ,MAAA,IAAI,CAAC,SAAA,EAAW;AAEd,QAAA,KAAA,CAAM,IAAA,CAAK,EAAE,CAAA,EAAG,CAAA,EAAG,GAAG,CAAA,EAAG,GAAA,EAAK,IAAA,CAAK,GAAA,EAAK,CAAA;AAAA,MAC1C,CAAA,MAAO;AACL,QAAA,KAAA,CAAM,IAAA,CAAK,EAAE,CAAA,EAAG,SAAA,CAAU,CAAC,CAAA,EAAG,CAAA,EAAG,SAAA,CAAU,CAAC,CAAA,EAAG,GAAA,EAAK,IAAA,CAAK,KAAK,CAAA;AAAA,MAChE;AAAA,IACF;AAWA,IAAA,MAAM,oBAAA,GAAuB,EAAA;AAC7B,IAAA,MAAM,aAAa,CAAC,GAAG,IAAI,GAAA,CAAI,MAAM,GAAA,CAAI,CAAC,EAAA,KAAO,IAAA,CAAK,MAAM,EAAA,CAAG,CAAC,CAAC,CAAC,CAAC,CAAA,CAAE,IAAA;AAAA,MACnE,CAAC,CAAA,EAAG,CAAA,KAAM,CAAA,GAAI;AAAA,KAChB;AAEA,IAAA,IAAI,cAAA,GAAgC,IAAA;AACpC,IAAA,IAAI,MAAA,GAAS,CAAA;AACb,IAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,UAAA,CAAW,QAAQ,CAAA,EAAA,EAAK;AAC1C,MAAA,MAAM,MAAM,UAAA,CAAW,CAAC,CAAA,GAAI,UAAA,CAAW,IAAI,CAAC,CAAA;AAC5C,MAAA,IAAI,MAAM,MAAA,EAAQ;AAChB,QAAA,MAAA,GAAS,GAAA;AACT,QAAA,cAAA,GAAA,CAAkB,WAAW,CAAA,GAAI,CAAC,CAAA,GAAI,UAAA,CAAW,CAAC,CAAA,IAAK,CAAA;AAAA,MACzD;AAAA,IACF;AACA,IAAA,IAAI,MAAA,GAAS,sBAAsB,cAAA,GAAiB,IAAA;AAEpD,IAAA,MAAM,OAAA,GACJ,mBAAmB,IAAA,GACf;AAAA,MACE,MAAM,MAAA,CAAO,CAAC,EAAA,KAAO,EAAA,CAAG,IAAI,cAAe,CAAA;AAAA,MAC3C,MAAM,MAAA,CAAO,CAAC,EAAA,KAAO,EAAA,CAAG,KAAK,cAAe;AAAA,KAC9C,GACA,CAAC,KAAK,CAAA;AAEZ,IAAA,MAAM,cAAc,OAAA,CAAQ,GAAA,CAAI,CAAC,GAAA,KAAQ,YAAA,CAAa,GAAG,CAAC,CAAA;AAC1D,IAAA,KAAA,CAAM,KAAK,WAAA,CAAY,MAAA,CAAO,OAAO,CAAA,CAAE,IAAA,CAAK,IAAI,CAAC,CAAA;AAAA,EACnD;AAEA,EAAA,OAAO,KAAA,CAAM,KAAK,IAAI,CAAA;AACxB;AAEA,SAAS,aAAa,KAAA,EAA6D;AACjF,EAAA,MAAM,WAAA,GAAc,CAAA;AACpB,EAAA,MAAM,OAAA,uBAA8D,GAAA,EAAI;AACxE,EAAA,MAAM,YAAsB,EAAC;AAE7B,EAAA,KAAA,MAAW,EAAE,CAAA,EAAG,CAAA,EAAG,GAAA,MAAS,KAAA,EAAO;AACjC,IAAA,IAAI,SAAA;AACJ,IAAA,KAAA,MAAW,OAAO,SAAA,EAAW;AAC3B,MAAA,IAAI,IAAA,CAAK,GAAA,CAAI,GAAA,GAAM,CAAC,KAAK,WAAA,EAAa;AACpC,QAAA,SAAA,GAAY,GAAA;AACZ,QAAA;AAAA,MACF;AAAA,IACF;AACA,IAAA,IAAI,cAAc,MAAA,EAAW;AAC3B,MAAA,SAAA,GAAY,CAAA;AACZ,MAAA,SAAA,CAAU,KAAK,CAAC,CAAA;AAChB,MAAA,OAAA,CAAQ,GAAA,CAAI,CAAA,EAAG,EAAE,CAAA;AAAA,IACnB;AACA,IAAA,OAAA,CAAQ,IAAI,SAAS,CAAA,CAAG,KAAK,EAAE,CAAA,EAAG,KAAK,CAAA;AAAA,EACzC;AAGA,EAAA,SAAA,CAAU,IAAA,CAAK,CAAC,CAAA,EAAG,CAAA,KAAM,IAAI,CAAC,CAAA;AAE9B,EAAA,OAAO,SAAA,CACJ,GAAA;AAAA,IAAI,CAAC,GAAA,KAAA,CACH,OAAA,CAAQ,GAAA,CAAI,GAAG,CAAA,IAAK,EAAC,EACnB,IAAA,CAAK,CAAC,CAAA,EAAG,CAAA,KAAM,EAAE,CAAA,GAAI,CAAA,CAAE,CAAC,CAAA,CACxB,GAAA,CAAI,CAAC,EAAA,KAAO,EAAA,CAAG,GAAG,CAAA,CAClB,IAAA,CAAK,GAAG,CAAA,CACR,OAAA,CAAQ,WAAA,EAAa,GAAG,EACxB,IAAA;AAAK,GACV,CACC,MAAA,CAAO,OAAO,CAAA,CACd,KAAK,IAAI,CAAA;AACd","file":"index.mjs","sourcesContent":["/**\n * Extract plain text from a PDF buffer.\n *\n * Requires `pdfjs-dist` to be installed (optional peerDependency):\n * npm install pdfjs-dist\n *\n * @param data - Raw PDF bytes as Uint8Array or ArrayBuffer\n * @returns Extracted text, ready to pass as `resumeText` to analyzeResume\n */\nexport async function extractTextFromPDF(\n data: Uint8Array | ArrayBuffer\n): Promise<string> {\n // ponytail: lazy import keeps core zero-dep; missing peer throws with clear message\n let pdfjsLib: typeof import(\"pdfjs-dist\");\n try {\n pdfjsLib = await import(\"pdfjs-dist\");\n } catch {\n throw new Error(\n \"pdfjs-dist is required for PDF extraction. Install it: npm install pdfjs-dist\"\n );\n }\n\n const bytes =\n data instanceof ArrayBuffer ? new Uint8Array(data) : data;\n\n const doc = await pdfjsLib.getDocument({ data: bytes }).promise;\n const pages: string[] = [];\n\n for (let i = 1; i <= doc.numPages; i++) {\n const page = await doc.getPage(i);\n const content = await page.getTextContent();\n\n type RawItem = { x: number; y: number; str: string };\n const items: RawItem[] = [];\n\n for (const item of content.items) {\n if (!(\"str\" in item) || !item.str.trim()) continue;\n const transform: number[] | undefined = Array.isArray(\n (item as { transform?: number[] }).transform\n )\n ? (item as { transform: number[] }).transform\n : undefined;\n\n if (!transform) {\n // No positional info (unit-test mocks) — treat as single-column item\n items.push({ x: 0, y: 0, str: item.str });\n } else {\n items.push({ x: transform[4], y: transform[5], str: item.str });\n }\n }\n\n // Detect column boundary: find the largest x-gap among item start positions.\n // If it exceeds COLUMN_GAP_THRESHOLD, split into left / right columns and\n // process each independently so headers in different columns don't merge.\n // ponytail: single largest-gap heuristic handles the common 2-column resume;\n // n-column needs k-means on x-distribution — upgrade if this proves insufficient.\n // Column boundary heuristic: the largest gap in item x-positions.\n // Real PDF column gutters show as a gap >>80px; normal word spacing is <50px.\n // ponytail: magic number calibrated to PranavRaut2026.pdf (104px gap); raise\n // if single-column PDFs with wide indentation start getting falsely split.\n const COLUMN_GAP_THRESHOLD = 80;\n const xPositions = [...new Set(items.map((it) => Math.round(it.x)))].sort(\n (a, b) => a - b\n );\n\n let columnBoundary: number | null = null;\n let maxGap = 0;\n for (let j = 1; j < xPositions.length; j++) {\n const gap = xPositions[j] - xPositions[j - 1];\n if (gap > maxGap) {\n maxGap = gap;\n columnBoundary = (xPositions[j - 1] + xPositions[j]) / 2;\n }\n }\n if (maxGap < COLUMN_GAP_THRESHOLD) columnBoundary = null;\n\n const columns =\n columnBoundary !== null\n ? [\n items.filter((it) => it.x < columnBoundary!),\n items.filter((it) => it.x >= columnBoundary!),\n ]\n : [items];\n\n const columnTexts = columns.map((col) => renderColumn(col));\n pages.push(columnTexts.filter(Boolean).join(\"\\n\"));\n }\n\n return pages.join(\"\\n\");\n}\n\nfunction renderColumn(items: Array<{ x: number; y: number; str: string }>): string {\n const Y_TOLERANCE = 2;\n const lineMap: Map<number, Array<{ x: number; str: string }>> = new Map();\n const lineOrder: number[] = [];\n\n for (const { x, y, str } of items) {\n let bucketKey: number | undefined;\n for (const key of lineOrder) {\n if (Math.abs(key - y) <= Y_TOLERANCE) {\n bucketKey = key;\n break;\n }\n }\n if (bucketKey === undefined) {\n bucketKey = y;\n lineOrder.push(y);\n lineMap.set(y, []);\n }\n lineMap.get(bucketKey)!.push({ x, str });\n }\n\n // pdfjs y=0 is bottom of page — sort descending so top comes first\n lineOrder.sort((a, b) => b - a);\n\n return lineOrder\n .map((key) =>\n (lineMap.get(key) ?? [])\n .sort((a, b) => a.x - b.x)\n .map((it) => it.str)\n .join(\" \")\n .replace(/[^\\S\\n]+/g, \" \")\n .trim()\n )\n .filter(Boolean)\n .join(\"\\n\");\n}\n"]}
1
+ {"version":3,"sources":["../../src/pdf/index.ts"],"names":[],"mappings":";AAwBA,eAAsB,kBAAA,CACpB,MACA,OAAA,EACiB;AAEjB,EAAA,IAAI,QAAA;AACJ,EAAA,IAAI;AACF,IAAA,QAAA,GAAW,MAAM,OAAO,YAAY,CAAA;AAAA,EACtC,CAAA,CAAA,MAAQ;AACN,IAAA,MAAM,IAAI,KAAA;AAAA,MACR;AAAA,KACF;AAAA,EACF;AAEA,EAAA,MAAM,QACJ,IAAA,YAAgB,WAAA,GAAc,IAAI,UAAA,CAAW,IAAI,CAAA,GAAI,IAAA;AAEvD,EAAA,MAAM,GAAA,GAAM,MAAM,QAAA,CAAS,WAAA,CAAY,EAAE,IAAA,EAAM,KAAA,EAAO,CAAA,CAAE,OAAA;AACxD,EAAA,MAAM,QAAkB,EAAC;AAEzB,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,IAAK,GAAA,CAAI,UAAU,CAAA,EAAA,EAAK;AACtC,IAAA,MAAM,IAAA,GAAO,MAAM,GAAA,CAAI,OAAA,CAAQ,CAAC,CAAA;AAChC,IAAA,MAAM,OAAA,GAAU,MAAM,IAAA,CAAK,cAAA,EAAe;AAG1C,IAAA,MAAM,QAAmB,EAAC;AAE1B,IAAA,KAAA,MAAW,IAAA,IAAQ,QAAQ,KAAA,EAAO;AAChC,MAAA,IAAI,EAAE,KAAA,IAAS,IAAA,CAAA,IAAS,CAAC,IAAA,CAAK,GAAA,CAAI,MAAK,EAAG;AAC1C,MAAA,MAAM,YAAkC,KAAA,CAAM,OAAA;AAAA,QAC3C,IAAA,CAAkC;AAAA,OACrC,GACK,KAAiC,SAAA,GAClC,MAAA;AAEJ,MAAA,IAAI,CAAC,SAAA,EAAW;AAEd,QAAA,KAAA,CAAM,IAAA,CAAK,EAAE,CAAA,EAAG,CAAA,EAAG,GAAG,CAAA,EAAG,GAAA,EAAK,IAAA,CAAK,GAAA,EAAK,CAAA;AAAA,MAC1C,CAAA,MAAO;AACL,QAAA,KAAA,CAAM,IAAA,CAAK,EAAE,CAAA,EAAG,SAAA,CAAU,CAAC,CAAA,EAAG,CAAA,EAAG,SAAA,CAAU,CAAC,CAAA,EAAG,GAAA,EAAK,IAAA,CAAK,KAAK,CAAA;AAAA,MAChE;AAAA,IACF;AAWA,IAAA,MAAM,oBAAA,GAAuB,EAAA;AAC7B,IAAA,MAAM,aAAa,CAAC,GAAG,IAAI,GAAA,CAAI,MAAM,GAAA,CAAI,CAAC,EAAA,KAAO,IAAA,CAAK,MAAM,EAAA,CAAG,CAAC,CAAC,CAAC,CAAC,CAAA,CAAE,IAAA;AAAA,MACnE,CAAC,CAAA,EAAG,CAAA,KAAM,CAAA,GAAI;AAAA,KAChB;AAEA,IAAA,IAAI,cAAA,GAAgC,IAAA;AACpC,IAAA,IAAI,MAAA,GAAS,CAAA;AACb,IAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,UAAA,CAAW,QAAQ,CAAA,EAAA,EAAK;AAC1C,MAAA,MAAM,MAAM,UAAA,CAAW,CAAC,CAAA,GAAI,UAAA,CAAW,IAAI,CAAC,CAAA;AAC5C,MAAA,IAAI,MAAM,MAAA,EAAQ;AAChB,QAAA,MAAA,GAAS,GAAA;AACT,QAAA,cAAA,GAAA,CAAkB,WAAW,CAAA,GAAI,CAAC,CAAA,GAAI,UAAA,CAAW,CAAC,CAAA,IAAK,CAAA;AAAA,MACzD;AAAA,IACF;AACA,IAAA,IAAI,MAAA,GAAS,sBAAsB,cAAA,GAAiB,IAAA;AAEpD,IAAA,MAAM,OAAA,GACJ,mBAAmB,IAAA,GACf;AAAA,MACE,MAAM,MAAA,CAAO,CAAC,EAAA,KAAO,EAAA,CAAG,IAAI,cAAe,CAAA;AAAA,MAC3C,MAAM,MAAA,CAAO,CAAC,EAAA,KAAO,EAAA,CAAG,KAAK,cAAe;AAAA,KAC9C,GACA,CAAC,KAAK,CAAA;AAEZ,IAAA,MAAM,cAAc,OAAA,CAAQ,GAAA,CAAI,CAAC,GAAA,KAAQ,YAAA,CAAa,GAAG,CAAC,CAAA;AAC1D,IAAA,KAAA,CAAM,KAAK,WAAA,CAAY,MAAA,CAAO,OAAO,CAAA,CAAE,IAAA,CAAK,IAAI,CAAC,CAAA;AAAA,EACnD;AAEA,EAAA,MAAM,IAAA,GAAO,KAAA,CAAM,IAAA,CAAK,IAAI,CAAA;AAI5B,EAAA,MAAM,aAAA,GAAgB,SAAS,aAAA,IAAiB,GAAA;AAChD,EAAA,IAAI,SAAS,WAAA,IAAe,IAAA,CAAK,IAAA,EAAK,CAAE,SAAS,aAAA,EAAe;AAC9D,IAAA,IAAI;AACF,MAAA,MAAM,OAAA,GAAU,MAAM,OAAA,CAAQ,WAAA,CAAY,KAAK,CAAA;AAC/C,MAAA,IAAI,OAAA,CAAQ,MAAK,CAAE,MAAA,GAAS,KAAK,IAAA,EAAK,CAAE,QAAQ,OAAO,OAAA;AAAA,IACzD,CAAA,CAAA,MAAQ;AAAA,IAER;AAAA,EACF;AAEA,EAAA,OAAO,IAAA;AACT;AAEA,SAAS,aAAa,KAAA,EAA6D;AACjF,EAAA,MAAM,WAAA,GAAc,CAAA;AACpB,EAAA,MAAM,OAAA,uBAA8D,GAAA,EAAI;AACxE,EAAA,MAAM,YAAsB,EAAC;AAE7B,EAAA,KAAA,MAAW,EAAE,CAAA,EAAG,CAAA,EAAG,GAAA,MAAS,KAAA,EAAO;AACjC,IAAA,IAAI,SAAA;AACJ,IAAA,KAAA,MAAW,OAAO,SAAA,EAAW;AAC3B,MAAA,IAAI,IAAA,CAAK,GAAA,CAAI,GAAA,GAAM,CAAC,KAAK,WAAA,EAAa;AACpC,QAAA,SAAA,GAAY,GAAA;AACZ,QAAA;AAAA,MACF;AAAA,IACF;AACA,IAAA,IAAI,cAAc,MAAA,EAAW;AAC3B,MAAA,SAAA,GAAY,CAAA;AACZ,MAAA,SAAA,CAAU,KAAK,CAAC,CAAA;AAChB,MAAA,OAAA,CAAQ,GAAA,CAAI,CAAA,EAAG,EAAE,CAAA;AAAA,IACnB;AACA,IAAA,OAAA,CAAQ,IAAI,SAAS,CAAA,CAAG,KAAK,EAAE,CAAA,EAAG,KAAK,CAAA;AAAA,EACzC;AAGA,EAAA,SAAA,CAAU,IAAA,CAAK,CAAC,CAAA,EAAG,CAAA,KAAM,IAAI,CAAC,CAAA;AAE9B,EAAA,OAAO,SAAA,CACJ,GAAA;AAAA,IAAI,CAAC,GAAA,KAAA,CACH,OAAA,CAAQ,GAAA,CAAI,GAAG,CAAA,IAAK,EAAC,EACnB,IAAA,CAAK,CAAC,CAAA,EAAG,CAAA,KAAM,EAAE,CAAA,GAAI,CAAA,CAAE,CAAC,CAAA,CACxB,GAAA,CAAI,CAAC,EAAA,KAAO,EAAA,CAAG,GAAG,CAAA,CAClB,IAAA,CAAK,GAAG,CAAA,CACR,OAAA,CAAQ,WAAA,EAAa,GAAG,EACxB,IAAA;AAAK,GACV,CACC,MAAA,CAAO,OAAO,CAAA,CACd,KAAK,IAAI,CAAA;AACd","file":"index.mjs","sourcesContent":["/**\n * Caller-supplied OCR implementation, invoked only when the PDF's text layer\n * comes back too short to be useful (scanned/image PDFs). The caller owns\n * the actual OCR engine and dependency — this library never bundles one.\n */\nexport type OCRClient = (data: Uint8Array) => Promise<string>;\n\nexport interface ExtractTextOptions {\n /** Called with the raw PDF bytes when text-layer extraction is too short. */\n ocrFallback?: OCRClient;\n /** Threshold (trimmed char count) below which ocrFallback is tried. Default 100, matching resume.parser.ts's scanned-PDF warning. */\n minTextLength?: number;\n}\n\n/**\n * Extract plain text from a PDF buffer.\n *\n * Requires `pdfjs-dist` to be installed (optional peerDependency):\n * npm install pdfjs-dist\n *\n * @param data - Raw PDF bytes as Uint8Array or ArrayBuffer\n * @param options - Optional OCR fallback for scanned/image PDFs\n * @returns Extracted text, ready to pass as `resumeText` to analyzeResume\n */\nexport async function extractTextFromPDF(\n data: Uint8Array | ArrayBuffer,\n options?: ExtractTextOptions\n): Promise<string> {\n // ponytail: lazy import keeps core zero-dep; missing peer throws with clear message\n let pdfjsLib: typeof import(\"pdfjs-dist\");\n try {\n pdfjsLib = await import(\"pdfjs-dist\");\n } catch {\n throw new Error(\n \"pdfjs-dist is required for PDF extraction. Install it: npm install pdfjs-dist\"\n );\n }\n\n const bytes =\n data instanceof ArrayBuffer ? new Uint8Array(data) : data;\n\n const doc = await pdfjsLib.getDocument({ data: bytes }).promise;\n const pages: string[] = [];\n\n for (let i = 1; i <= doc.numPages; i++) {\n const page = await doc.getPage(i);\n const content = await page.getTextContent();\n\n type RawItem = { x: number; y: number; str: string };\n const items: RawItem[] = [];\n\n for (const item of content.items) {\n if (!(\"str\" in item) || !item.str.trim()) continue;\n const transform: number[] | undefined = Array.isArray(\n (item as { transform?: number[] }).transform\n )\n ? (item as { transform: number[] }).transform\n : undefined;\n\n if (!transform) {\n // No positional info (unit-test mocks) — treat as single-column item\n items.push({ x: 0, y: 0, str: item.str });\n } else {\n items.push({ x: transform[4], y: transform[5], str: item.str });\n }\n }\n\n // Detect column boundary: find the largest x-gap among item start positions.\n // If it exceeds COLUMN_GAP_THRESHOLD, split into left / right columns and\n // process each independently so headers in different columns don't merge.\n // ponytail: single largest-gap heuristic handles the common 2-column resume;\n // n-column needs k-means on x-distribution — upgrade if this proves insufficient.\n // Column boundary heuristic: the largest gap in item x-positions.\n // Real PDF column gutters show as a gap >>80px; normal word spacing is <50px.\n // ponytail: magic number calibrated to PranavRaut2026.pdf (104px gap); raise\n // if single-column PDFs with wide indentation start getting falsely split.\n const COLUMN_GAP_THRESHOLD = 80;\n const xPositions = [...new Set(items.map((it) => Math.round(it.x)))].sort(\n (a, b) => a - b\n );\n\n let columnBoundary: number | null = null;\n let maxGap = 0;\n for (let j = 1; j < xPositions.length; j++) {\n const gap = xPositions[j] - xPositions[j - 1];\n if (gap > maxGap) {\n maxGap = gap;\n columnBoundary = (xPositions[j - 1] + xPositions[j]) / 2;\n }\n }\n if (maxGap < COLUMN_GAP_THRESHOLD) columnBoundary = null;\n\n const columns =\n columnBoundary !== null\n ? [\n items.filter((it) => it.x < columnBoundary!),\n items.filter((it) => it.x >= columnBoundary!),\n ]\n : [items];\n\n const columnTexts = columns.map((col) => renderColumn(col));\n pages.push(columnTexts.filter(Boolean).join(\"\\n\"));\n }\n\n const text = pages.join(\"\\n\");\n\n // ponytail: OCR is the caller's engine/dependency — we only decide *when*\n // to ask for it (text layer too short) and pick the better of the two results.\n const minTextLength = options?.minTextLength ?? 100;\n if (options?.ocrFallback && text.trim().length < minTextLength) {\n try {\n const ocrText = await options.ocrFallback(bytes);\n if (ocrText.trim().length > text.trim().length) return ocrText;\n } catch {\n // OCR failure falls back to the text-layer result, never throws.\n }\n }\n\n return text;\n}\n\nfunction renderColumn(items: Array<{ x: number; y: number; str: string }>): string {\n const Y_TOLERANCE = 2;\n const lineMap: Map<number, Array<{ x: number; str: string }>> = new Map();\n const lineOrder: number[] = [];\n\n for (const { x, y, str } of items) {\n let bucketKey: number | undefined;\n for (const key of lineOrder) {\n if (Math.abs(key - y) <= Y_TOLERANCE) {\n bucketKey = key;\n break;\n }\n }\n if (bucketKey === undefined) {\n bucketKey = y;\n lineOrder.push(y);\n lineMap.set(y, []);\n }\n lineMap.get(bucketKey)!.push({ x, str });\n }\n\n // pdfjs y=0 is bottom of page — sort descending so top comes first\n lineOrder.sort((a, b) => b - a);\n\n return lineOrder\n .map((key) =>\n (lineMap.get(key) ?? [])\n .sort((a, b) => a.x - b.x)\n .map((it) => it.str)\n .join(\" \")\n .replace(/[^\\S\\n]+/g, \" \")\n .trim()\n )\n .filter(Boolean)\n .join(\"\\n\");\n}\n"]}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pranavraut033/ats-checker",
3
- "version": "1.3.0",
3
+ "version": "1.3.2",
4
4
  "description": "Deterministic, configurable ATS (Applicant Tracking System) compatibility checker with no external dependencies. Analyze resumes, generate scores, and get actionable suggestions.",
5
5
  "license": "MIT",
6
6
  "author": {