npm - @llamaindex/liteparse - Versions diffs - 1.4.2 → 1.4.4 - Mend

@llamaindex/liteparse 1.4.2 → 1.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

package/dist/src/core/parser.d.ts.map +1 -1
package/dist/src/core/parser.js +2 -2
package/dist/src/core/parser.js.map +1 -1
package/dist/src/engines/pdf/interface.d.ts +7 -2
package/dist/src/engines/pdf/interface.d.ts.map +1 -1
package/dist/src/engines/pdf/pdfjs.d.ts +3 -3
package/dist/src/engines/pdf/pdfjs.d.ts.map +1 -1
package/dist/src/engines/pdf/pdfjs.js +432 -220
package/dist/src/engines/pdf/pdfjs.js.map +1 -1
package/dist/src/processing/gridProjection.d.ts.map +1 -1
package/dist/src/processing/gridProjection.js +18 -3
package/dist/src/processing/gridProjection.js.map +1 -1
package/dist/src/processing/gridProjection.test.js +30 -0
package/dist/src/processing/gridProjection.test.js.map +1 -1
package/dist/src/vendor/pdfjs/pdf.worker.mjs +2 -1
package/package.json +1 -1
package/src/vendor/pdfjs/pdf.worker.mjs +2 -1

package/dist/src/engines/pdf/pdfjs.js CHANGED

@@ -36,198 +36,423 @@ function applyTransformation(point, transform) {
     };
 }
 // Pre-compiled regex patterns for string decoding
-const BUGGY_FONT_MARKER_REGEX = /:->\|>_(\d+)_\d+_<\|<-:/g;
 const BUGGY_FONT_MARKER_CHECK = ":->|>";
 const PIPE_PATTERN_REGEX = /\s*\|([^|])\|\s*/g;
 /**
- * Common tabular figures font encoding mappings.
- * Many fonts with "Differences" arrays use similar patterns for tabular digits.
- * These mappings are derived from common font encoding conventions.
+ * Adobe Glyph List subset: maps standard PostScript glyph names to Unicode characters.
  *
- * Note: The same PDF can use multiple fonts with DIFFERENT glyph-to-character mappings
- * for the same glyph IDs. We try all mappings and pick the best match.
+ * When PDF.js detects a "buggy" font (one whose ToUnicode/encoding maps glyphs to
+ * control characters or PUA code points), it emits markers containing the glyph's
+ * original char code AND the glyph name from the font's /Differences or /Encoding
+ * dictionary. This map resolves those glyph names to correct Unicode characters.
  *
- * Special glyphs:
- * - 42: '*' (asterisk for significance markers)
- * - 150: '-' (minus sign/dash)
+ * This is a ~200-entry subset of the full Adobe Glyph List (~4,300 entries).
+ * The full canonical source is: https://github.com/adobe-type-tools/agl-aglfn
+ * (see glyphlist.txt). Our subset covers basic Latin, digits, ligatures, punctuation,
+ * typographic characters, Greek, math symbols, and common accented Latin. Glyph names
+ * not in this subset fall through to the uniXXXX convention and ASCII-range fallbacks
+ * in resolveGlyphName(). Add entries here if a PDF's buggy font uses a standard glyph
+ * name that isn't covered and doesn't match those fallbacks.
  */
-const TABULAR_FIGURES_MAPPINGS = [
-    // Mapping 1: Bold/header style (e.g., census PDF header row)
-    // Characters: 0123456789.,
-    {
-        17: "4",
-        18: "6",
-        19: "8",
-        20: "5",
-        21: "9",
-        22: "7",
-        23: "1",
-        24: " ",
-        25: ",",
-        26: "+",
-        27: "-",
-        28: "3",
-        29: "0",
-        30: "2",
-        31: ".",
-        42: "*",
-        150: "-",
-    },
-    // Mapping 2: Book/body style (e.g., census PDF detail rows)
-    // Note: Same glyph IDs but different character assignments!
-    {
-        17: "+",
-        18: "7",
-        19: "-",
-        20: "9",
-        21: "6",
-        22: "3",
-        23: "1",
-        24: " ",
-        25: "8",
-        26: "5",
-        27: "4",
-        28: "0",
-        29: "2",
-        30: ".",
-        31: ",",
-        42: "*",
-        150: "-",
-    },
-];
+const ADOBE_GLYPH_MAP = {
+    // Basic Latin letters
+    A: "A",
+    B: "B",
+    C: "C",
+    D: "D",
+    E: "E",
+    F: "F",
+    G: "G",
+    H: "H",
+    I: "I",
+    J: "J",
+    K: "K",
+    L: "L",
+    M: "M",
+    N: "N",
+    O: "O",
+    P: "P",
+    Q: "Q",
+    R: "R",
+    S: "S",
+    T: "T",
+    U: "U",
+    V: "V",
+    W: "W",
+    X: "X",
+    Y: "Y",
+    Z: "Z",
+    a: "a",
+    b: "b",
+    c: "c",
+    d: "d",
+    e: "e",
+    f: "f",
+    g: "g",
+    h: "h",
+    i: "i",
+    j: "j",
+    k: "k",
+    l: "l",
+    m: "m",
+    n: "n",
+    o: "o",
+    p: "p",
+    q: "q",
+    r: "r",
+    s: "s",
+    t: "t",
+    u: "u",
+    v: "v",
+    w: "w",
+    x: "x",
+    y: "y",
+    z: "z",
+    // Digits
+    zero: "0",
+    one: "1",
+    two: "2",
+    three: "3",
+    four: "4",
+    five: "5",
+    six: "6",
+    seven: "7",
+    eight: "8",
+    nine: "9",
+    // Ligatures (Unicode presentation forms — decomposed later by stripControlChars)
+    fi: "\uFB01",
+    fl: "\uFB02",
+    ff: "\uFB00",
+    ffi: "\uFB03",
+    ffl: "\uFB04",
+    // Punctuation and symbols
+    space: " ",
+    period: ".",
+    comma: ",",
+    colon: ":",
+    semicolon: ";",
+    hyphen: "-",
+    minus: "\u2212",
+    slash: "/",
+    question: "?",
+    dollar: "$",
+    parenleft: "(",
+    parenright: ")",
+    asterisk: "*",
+    plus: "+",
+    equal: "=",
+    numbersign: "#",
+    percent: "%",
+    ampersand: "&",
+    at: "@",
+    exclam: "!",
+    bracketleft: "[",
+    bracketright: "]",
+    braceleft: "{",
+    braceright: "}",
+    underscore: "_",
+    quotedbl: '"',
+    quotesingle: "'",
+    backslash: "\\",
+    bar: "|",
+    asciitilde: "~",
+    asciicircum: "^",
+    grave: "`",
+    less: "<",
+    greater: ">",
+    // Typographic
+    quoteright: "\u2019",
+    quoteleft: "\u2018",
+    quotedblleft: "\u201C",
+    quotedblright: "\u201D",
+    quotesinglbase: "\u201A",
+    quotedblbase: "\u201E",
+    endash: "\u2013",
+    emdash: "\u2014",
+    bullet: "\u2022",
+    ellipsis: "\u2026",
+    dagger: "\u2020",
+    daggerdbl: "\u2021",
+    guilsinglleft: "\u2039",
+    guilsinglright: "\u203A",
+    guillemotleft: "\u00AB",
+    guillemotright: "\u00BB",
+    trademark: "\u2122",
+    registered: "\u00AE",
+    copyright: "\u00A9",
+    // Greek
+    Alpha: "\u0391",
+    Beta: "\u0392",
+    Gamma: "\u0393",
+    Delta: "\u2206",
+    Epsilon: "\u0395",
+    Zeta: "\u0396",
+    Eta: "\u0397",
+    Theta: "\u0398",
+    Iota: "\u0399",
+    Kappa: "\u039A",
+    Lambda: "\u039B",
+    Mu: "\u039C",
+    Nu: "\u039D",
+    Xi: "\u039E",
+    Omicron: "\u039F",
+    Pi: "\u03A0",
+    Rho: "\u03A1",
+    Sigma: "\u03A3",
+    Tau: "\u03A4",
+    Upsilon: "\u03A5",
+    Phi: "\u03A6",
+    Chi: "\u03A7",
+    Psi: "\u03A8",
+    Omega: "\u2126",
+    alpha: "\u03B1",
+    beta: "\u03B2",
+    gamma: "\u03B3",
+    delta: "\u03B4",
+    epsilon: "\u03B5",
+    zeta: "\u03B6",
+    eta: "\u03B7",
+    theta: "\u03B8",
+    iota: "\u03B9",
+    kappa: "\u03BA",
+    lambda: "\u03BB",
+    mu: "\u00B5",
+    nu: "\u03BD",
+    xi: "\u03BE",
+    omicron: "\u03BF",
+    pi: "\u03C0",
+    rho: "\u03C1",
+    sigma: "\u03C3",
+    tau: "\u03C4",
+    upsilon: "\u03C5",
+    phi: "\u03C6",
+    chi: "\u03C7",
+    psi: "\u03C8",
+    omega: "\u03C9",
+    // Math symbols
+    greaterequal: "\u2265",
+    lessequal: "\u2264",
+    notequal: "\u2260",
+    plusminus: "\u00B1",
+    multiply: "\u00D7",
+    divide: "\u00F7",
+    infinity: "\u221E",
+    summation: "\u2211",
+    integral: "\u222B",
+    partialdiff: "\u2202",
+    radical: "\u221A",
+    approxequal: "\u2248",
+    degree: "\u00B0",
+    // Accented Latin (common)
+    Aacute: "\u00C1",
+    Agrave: "\u00C0",
+    Acircumflex: "\u00C2",
+    Atilde: "\u00C3",
+    Adieresis: "\u00C4",
+    Aring: "\u00C5",
+    Eacute: "\u00C9",
+    Egrave: "\u00C8",
+    Ecircumflex: "\u00CA",
+    Edieresis: "\u00CB",
+    Iacute: "\u00CD",
+    Igrave: "\u00CC",
+    Icircumflex: "\u00CE",
+    Idieresis: "\u00CF",
+    Oacute: "\u00D3",
+    Ograve: "\u00D2",
+    Ocircumflex: "\u00D4",
+    Otilde: "\u00D5",
+    Odieresis: "\u00D6",
+    Uacute: "\u00DA",
+    Ugrave: "\u00D9",
+    Ucircumflex: "\u00DB",
+    Udieresis: "\u00DC",
+    Ntilde: "\u00D1",
+    Ccedilla: "\u00C7",
+    Scaron: "\u0160",
+    Zcaron: "\u017D",
+    aacute: "\u00E1",
+    agrave: "\u00E0",
+    acircumflex: "\u00E2",
+    atilde: "\u00E3",
+    adieresis: "\u00E4",
+    aring: "\u00E5",
+    eacute: "\u00E9",
+    egrave: "\u00E8",
+    ecircumflex: "\u00EA",
+    edieresis: "\u00EB",
+    iacute: "\u00ED",
+    igrave: "\u00EC",
+    icircumflex: "\u00EE",
+    idieresis: "\u00EF",
+    oacute: "\u00F3",
+    ograve: "\u00F2",
+    ocircumflex: "\u00F4",
+    otilde: "\u00F5",
+    odieresis: "\u00F6",
+    uacute: "\u00FA",
+    ugrave: "\u00F9",
+    ucircumflex: "\u00FB",
+    udieresis: "\u00FC",
+    ntilde: "\u00F1",
+    ccedilla: "\u00E7",
+    scaron: "\u0161",
+    zcaron: "\u017E",
+    ydieresis: "\u00FF",
+    // Miscellaneous
+    AE: "\u00C6",
+    ae: "\u00E6",
+    OE: "\u0152",
+    oe: "\u0153",
+    Eth: "\u00D0",
+    eth: "\u00F0",
+    Thorn: "\u00DE",
+    thorn: "\u00FE",
+    germandbls: "\u00DF",
+    dotlessi: "\u0131",
+    section: "\u00A7",
+    paragraph: "\u00B6",
+    currency: "\u00A4",
+    cent: "\u00A2",
+    sterling: "\u00A3",
+    yen: "\u00A5",
+    Euro: "\u20AC",
+    logicalnot: "\u00AC",
+    nbspace: "\u00A0",
+};
 /**
- * Check if all glyphs in the range would produce printable ASCII via direct char code.
- * Returns true if using String.fromCharCode on these glyphs would produce valid text.
+ * Resolve a glyph name to its Unicode character using the Adobe Glyph List.
+ * Handles standard names, the "uniXXXX" convention, and underscore-separated
+ * composite names (e.g., "f_i" → resolve "f" + "i" = "fi").
  */
-function canDecodeAsAscii(glyphs) {
-    // Check if ALL glyphs would produce valid printable ASCII or common whitespace
-    for (const g of glyphs) {
-        // Printable ASCII range (space through tilde), plus tab/newline
-        if (!((g >= 32 && g <= 126) || g === 9 || g === 10 || g === 13)) {
-            return false;
+function resolveGlyphName(glyphName) {
+    if (glyphName in ADOBE_GLYPH_MAP)
+        return ADOBE_GLYPH_MAP[glyphName];
+    // Handle "uniXXXX" convention (e.g., "uni00A0" → U+00A0)
+    if (glyphName.startsWith("uni") && glyphName.length === 7) {
+        const code = parseInt(glyphName.slice(3), 16);
+        if (!isNaN(code) && code > 0)
+            return String.fromCharCode(code);
+    }
+    // Handle underscore-separated composite names (e.g., "f_i" → "fi", "f_f_i" → "ffi")
+    // Some fonts use this convention instead of standard ligature names
+    if (glyphName.includes("_")) {
+        const parts = glyphName.split("_");
+        const resolved = parts.map((p) => resolveGlyphName(p));
+        if (resolved.every((r) => r !== null)) {
+            return resolved.join("");
         }
     }
-    return true;
+    return null;
 }
 /**
- * Score a decoded string for how "number-like" it appears.
- * Higher scores indicate better number formatting.
+ * Decode buggy font markers emitted by patched PDF.js.
+ *
+ * Marker format: :->|>_<glyphId>_<fontCharCode>@<glyphName>@<|<-:
+ * The glyph name is delimited by @ instead of _ because some fonts use
+ * non-standard glyph names containing underscores (e.g., "f_i" for "fi").
+ *
+ * Resolution strategy:
+ * 1. Use glyph name from font's /Differences or /Encoding dictionary
+ * 2. Fall back to glyphId if it's in printable ASCII range (32-126)
+ * 3. Drop the character if neither works (better than guessing)
  */
-function scoreNumberFormat(decoded) {
-    let score = 0;
-    // Count digits - primary indicator of a number
-    const digitCount = (decoded.match(/[0-9]/g) || []).length;
-    score += digitCount * 2;
-    // Bonus for matching common number patterns
-    // Pattern: digits with optional commas for thousands
-    if (/^\d{1,3}(,\d{3})*$/.test(decoded)) {
-        score += 5; // e.g., "248,800"
-    }
-    // Pattern: decimal number
-    if (/^\d+\.\d+$/.test(decoded)) {
-        score += 5; // e.g., "10.5"
-    }
-    // Pattern: negative number
-    if (/^[*-]?\d/.test(decoded)) {
-        score += 2; // e.g., "-1,132" or "*-0.4"
-    }
-    // Pattern: percentage or simple number
-    if (/^\d+$/.test(decoded)) {
-        score += 3; // e.g., "897"
-    }
-    // Penalize bad patterns
-    // Consecutive punctuation marks (not valid in numbers)
-    if (/[.,]{2,}/.test(decoded)) {
-        score -= 10;
-    }
-    // Punctuation at start (except minus/asterisk) or end
-    if (/^[.,+]|[.,+]$/.test(decoded)) {
-        score -= 5;
-    }
-    // Comma followed by anything other than 3 digits then boundary
-    if (/,(?!\d{3}(?:[,.]|$))/.test(decoded)) {
-        score -= 3;
-    }
-    // Period not followed by digits (except at end)
-    if (/\.(?![0-9])/.test(decoded) && !decoded.endsWith(".")) {
-        score -= 3;
-    }
-    return score;
+const BUGGY_FONT_MARKER_RE = /:->\|>_(\d+)_\d+@([^@]*)@<\|<-:/g;
+function decodeBuggyFontMarkers(str) {
+    return str.replace(BUGGY_FONT_MARKER_RE, (_match, glyphIdStr, glyphName) => {
+        // Priority 1: Resolve via glyph name from font metadata
+        if (glyphName) {
+            const resolved = resolveGlyphName(glyphName);
+            if (resolved)
+                return resolved;
+        }
+        // Priority 2: If glyphId is in printable ASCII range, use it directly
+        const glyphId = parseInt(glyphIdStr);
+        if (glyphId >= 32 && glyphId <= 126) {
+            return String.fromCharCode(glyphId);
+        }
+        // Priority 3: Drop unresolvable characters
+        return "";
+    });
 }
 /**
- * Try to decode buggy font markers using known tabular figures mappings.
- * Returns the decoded string if a mapping produces valid-looking text,
- * otherwise returns null to fall back to charCode decoding.
+ * Windows-1252 to Unicode mapping for the C1 control range (0x80-0x9F).
  *
- * Strategy:
- * 1. If glyphs are in ASCII range (32-126), let the fallback handle it
- * 2. If glyphs are in tabular range (17-31, plus special chars), try mappings
- * 3. Score each result for how "number-like" it appears
- * 4. Return the best result if it looks like a valid number
+ * Many PDFs encode smart quotes, em-dashes, and other typographic characters
+ * using Windows-1252 byte values. When PDF.js decodes these without a proper
+ * ToUnicode map, the raw byte values end up in the 0x80-0x9F range — which is
+ * technically the C1 control character block in Unicode. Rather than stripping
+ * them (which loses apostrophes, quotes, dashes, etc.), we map them to their
+ * correct Unicode equivalents.
  */
-function tryDecodeTabularFigures(str) {
-    if (!str.includes(BUGGY_FONT_MARKER_CHECK))
-        return null;
-    // Extract all glyph IDs from the markers
-    const glyphs = [];
-    let match;
-    const regex = /:->\|>_(\d+)_\d+_<\|<-:/g;
-    while ((match = regex.exec(str)) !== null) {
-        glyphs.push(parseInt(match[1]));
-    }
-    if (glyphs.length === 0)
-        return null;
-    // If these glyphs would decode fine as ASCII, don't use tabular mapping
-    if (canDecodeAsAscii(glyphs)) {
-        return null;
-    }
-    // Check if glyphs are in the tabular figures range
-    // Tabular figures typically use glyphs 17-31, plus special chars like 42, 150
-    const tabularRange = glyphs.every((g) => (g >= 17 && g <= 31) || // Core tabular figures
-        g === 42 || // Asterisk
-        g === 150 || // Minus
-        g === 8 ||
-        g === 9 ||
-        g === 10 // Some special chars
-    );
-    if (!tabularRange) {
-        // Mixed content - not pure tabular figures
-        return null;
-    }
-    // Try each mapping and pick the best result
-    let bestResult = null;
-    let bestScore = -Infinity;
-    for (const mapping of TABULAR_FIGURES_MAPPINGS) {
-        const decoded = glyphs.map((g) => mapping[g] || "").join("");
-        // Skip if there are unmapped glyphs
-        const unmapped = glyphs.filter((g) => !mapping[g]).length;
-        if (unmapped > 0)
-            continue;
-        // Score based on how "number-like" the result looks
-        const score = scoreNumberFormat(decoded);
-        if (score > bestScore) {
-            bestScore = score;
-            bestResult = decoded;
-        }
-    }
-    // Only return if we got a reasonable score (at least some digits, proper format)
-    if (bestResult && bestScore > 0) {
-        return bestResult;
-    }
-    return null;
-}
+const WINDOWS_1252_TO_UNICODE = {
+    0x80: "\u20AC", // €
+    0x82: "\u201A", // ‚
+    0x83: "\u0192", // ƒ
+    0x84: "\u201E", // „
+    0x85: "\u2026", // …
+    0x86: "\u2020", // †
+    0x87: "\u2021", // ‡
+    0x88: "\u02C6", // ˆ
+    0x89: "\u2030", // ‰
+    0x8a: "\u0160", // Š
+    0x8b: "\u2039", // ‹
+    0x8c: "\u0152", // Œ
+    0x8e: "\u017D", // Ž
+    0x91: "\u2018", // '
+    0x92: "\u2019", // ' (right single quote / apostrophe)
+    0x93: "\u201C", // "
+    0x94: "\u201D", // "
+    0x95: "\u2022", // •
+    0x96: "\u2013", // –
+    0x97: "\u2014", // —
+    0x98: "\u02DC", // ˜
+    0x99: "\u2122", // ™
+    0x9a: "\u0161", // š
+    0x9b: "\u203A", // ›
+    0x9c: "\u0153", // œ
+    0x9e: "\u017E", // ž
+    0x9f: "\u0178", // Ÿ
+};
+/**
+ * Unicode ligature decomposition map.
+ * PDF fonts often use ligature glyphs; decomposing them to plain ASCII
+ * ensures the text is searchable and NLP-friendly.
+ */
+const LIGATURE_MAP = {
+    "\uFB00": "ff",
+    "\uFB01": "fi",
+    "\uFB02": "fl",
+    "\uFB03": "ffi",
+    "\uFB04": "ffl",
+    "\uFB05": "st",
+    "\uFB06": "st",
+};
 /**
- * Strip C0/C1 control characters from text (except common whitespace).
- * These can appear in PDF text due to font encoding issues but the
- * surrounding text may still be valid.
+ * Strip C0 control characters from text (except common whitespace),
+ * map C1 control range (0x80-0x9F) to proper Unicode via Windows-1252,
+ * and decompose Unicode ligatures to plain text.
  */
 function stripControlChars(str) {
     let result = "";
     for (const char of str) {
         const code = char.charCodeAt(0);
-        // Skip C0 controls (except tab, newline, carriage return) and C1 controls
-        if ((code >= 0x00 && code <= 0x1f && code !== 0x09 && code !== 0x0a && code !== 0x0d) ||
-            (code >= 0x80 && code <= 0x9f)) {
+        // Decompose Unicode ligatures (fi, fl, ff, ffi, ffl, st)
+        if (LIGATURE_MAP[char]) {
+            result += LIGATURE_MAP[char];
+            continue;
+        }
+        // Map Windows-1252 C1 range to proper Unicode (smart quotes, em-dashes, etc.)
+        if (code >= 0x80 && code <= 0x9f) {
+            const mapped = WINDOWS_1252_TO_UNICODE[code];
+            if (mapped) {
+                result += mapped;
+            }
+            // Undefined C1 positions (0x81, 0x8D, 0x8F, 0x90) are dropped
+            continue;
+        }
+        // Skip C0 controls (except tab, newline, carriage return)
+        if (code >= 0x00 && code <= 0x1f && code !== 0x09 && code !== 0x0a && code !== 0x0d) {
             continue;
         }
         result += char;
@@ -260,11 +485,19 @@ function isGarbledFontOutput(str) {
     for (const char of str) {
         const code = char.charCodeAt(0);
         // C0 control characters (0x00-0x1F) except common whitespace (tab, newline, carriage return)
-        // C1 control characters (0x80-0x9F)
-        if ((code >= 0x00 && code <= 0x1f && code !== 0x09 && code !== 0x0a && code !== 0x0d) ||
-            (code >= 0x80 && code <= 0x9f)) {
+        if (code >= 0x00 && code <= 0x1f && code !== 0x09 && code !== 0x0a && code !== 0x0d) {
             controlCharCount++;
         }
+        // C1 range (0x80-0x9F): only count as control chars if NOT a valid Windows-1252 character.
+        // Many PDFs use Windows-1252 encoding for smart quotes, em-dashes, etc.
+        else if (code >= 0x80 && code <= 0x9f) {
+            if (WINDOWS_1252_TO_UNICODE[code]) {
+                normalCharCount++; // Valid Windows-1252 char (smart quote, dash, etc.)
+            }
+            else {
+                controlCharCount++; // Undefined C1 position — likely garbled
+            }
+        }
         // Private Use Area (U+E000-U+F8FF) - almost always garbled
         else if (code >= 0xe000 && code <= 0xf8ff) {
             privateUseCount++;
@@ -385,7 +618,7 @@ export class PdfJsEngine {
             _pdfDocument: pdfDocument,
         };
     }
-    async extractPage(doc, pageNum) {
+    async extractPage(doc, pageNum, options) {
         const pdfDocument = doc._pdfDocument;
         const page = await pdfDocument.getPage(pageNum);
         // Get viewport
@@ -406,18 +639,9 @@ export class PdfJsEngine {
             const cm = multiplyMatrices(viewportTransform, item.transform);
             // Get lower-left corner (text space origin)
             const ll = applyTransformation({ x: 0, y: 0 }, cm);
-            // Extract scale factors directly from matrix components (not SVD).
-            // For matrix [a, b, c, d, tx, ty]:
-            // - Horizontal scale = sqrt(a² + b²)
-            // - Vertical scale = sqrt(c² + d²)
-            // This correctly preserves axis association unlike SVD which returns
-            // singular values sorted by magnitude (causing x/y swap for some fonts).
             const scaleX = Math.sqrt(item.transform[0] ** 2 + item.transform[1] ** 2);
             const scaleY = Math.sqrt(item.transform[2] ** 2 + item.transform[3] ** 2);
-            // Get upper-right corner by first converting width/height to text space
-            // (dividing by the scale factors), then transforming to viewport space
             const ur = applyTransformation({ x: item.width / scaleX, y: item.height / scaleY }, cm);
-            // Calculate final bounding box in viewport space
             const left = Math.min(ll.x, ur.x);
             const right = Math.max(ll.x, ur.x);
             const top = Math.min(ll.y, ur.y);
@@ -427,44 +651,30 @@ export class PdfJsEngine {
                 continue;
             const width = right - left;
             const height = bottom - top;
-            // Calculate rotation from combined transformation matrix
+            // Get rotation angle from the transformation matrix
             let rotation = getRotation(cm);
-            // Normalize to 0-360 range
-            if (rotation < 0) {
+            if (rotation < 0)
                 rotation += 360;
-            }
-            // Decode buggy font markers from PDF.js (only if marker is present)
-            // Format: :->|>_<charCode>_<fontChar>_<|<-:
+            // Decode buggy font markers using glyph names from font metadata
             let decodedStr = item.str;
             if (decodedStr.includes(BUGGY_FONT_MARKER_CHECK)) {
-                // Try tabular figures decoding first (common in government/census PDFs)
-                const tabularDecoded = tryDecodeTabularFigures(decodedStr);
-                if (tabularDecoded) {
-                    decodedStr = tabularDecoded;
-                }
-                else {
-                    // Fall back to original approach: use glyph ID as character code
-                    BUGGY_FONT_MARKER_REGEX.lastIndex = 0; // Reset regex state
-                    decodedStr = decodedStr.replace(BUGGY_FONT_MARKER_REGEX, (_, charCode) => String.fromCharCode(parseInt(charCode)));
-                }
+                BUGGY_FONT_MARKER_RE.lastIndex = 0;
+                decodedStr = decodeBuggyFontMarkers(decodedStr);
             }
             // Handle pipe-separated characters: " |a|  |r|  |X| " -> "arX"
-            // Some PDFs encode text with characters separated by pipes and spaces
             if (decodedStr.includes("|")) {
-                PIPE_PATTERN_REGEX.lastIndex = 0; // Reset regex state
+                PIPE_PATTERN_REGEX.lastIndex = 0;
                 const matches = [...decodedStr.matchAll(PIPE_PATTERN_REGEX)];
                 if (matches.length > 0) {
                     decodedStr = matches.map((m) => m[1]).join("");
                 }
             }
             // Skip garbled text from fonts with corrupted ToUnicode mappings
-            // Save the bounding box so OCR can fill in these specific regions
             if (isGarbledFontOutput(decodedStr)) {
                 garbledTextRegions.push({ x: left, y: top, width, height });
                 continue;
             }
-            // Strip any remaining control characters from valid text
-            // (e.g., form feed chars that sneak into ligatures like "fi")
+            // Strip remaining control characters, map Windows-1252, decompose ligatures
             decodedStr = stripControlChars(decodedStr);
             textItems.push({
                 str: decodedStr,
@@ -481,22 +691,24 @@ export class PdfJsEngine {
             });
         }
         let images = [];
-        try {
-            const pdfInput = this.currentPdfPath || this.currentPdfData || doc.data;
-            if (!this.pdfiumRenderer) {
-                this.pdfiumRenderer = new PdfiumRenderer();
-                await this.pdfiumRenderer.loadDocument(pdfInput);
+        if (options?.extractImages !== false) {
+            try {
+                const pdfInput = this.currentPdfPath || this.currentPdfData || doc.data;
+                if (!this.pdfiumRenderer) {
+                    this.pdfiumRenderer = new PdfiumRenderer();
+                    await this.pdfiumRenderer.loadDocument(pdfInput);
+                }
+                const imageBounds = await this.pdfiumRenderer.extractImageBounds(pdfInput, pageNum);
+                images = imageBounds.map((bounds) => ({
+                    x: bounds.x,
+                    y: bounds.y,
+                    width: bounds.width,
+                    height: bounds.height,
+                }));
+            }
+            catch {
+                // Image extraction is best-effort
             }
-            const imageBounds = await this.pdfiumRenderer.extractImageBounds(pdfInput, pageNum);
-            images = imageBounds.map((bounds) => ({
-                x: bounds.x,
-                y: bounds.y,
-                width: bounds.width,
-                height: bounds.height,
-            }));
-        }
-        catch {
-            // Image extraction is best-effort
         }
         // Skip annotation extraction - not currently used in processing pipeline
         // Can be re-enabled if needed for link extraction, etc.
@@ -512,7 +724,7 @@ export class PdfJsEngine {
             garbledTextRegions: garbledTextRegions.length > 0 ? garbledTextRegions : undefined,
         };
     }
-    async extractAllPages(doc, maxPages, targetPages) {
+    async extractAllPages(doc, maxPages, targetPages, options) {
         const numPages = Math.min(doc.numPages, maxPages || doc.numPages);
         const pages = [];
         // Parse target pages if specified
@@ -527,7 +739,7 @@ export class PdfJsEngine {
             if (maxPages && pages.length >= maxPages) {
                 break;
             }
-            const pageData = await this.extractPage(doc, pageNum);
+            const pageData = await this.extractPage(doc, pageNum, options);
             pages.push(pageData);
         }
         return pages;