npm - @heripo/pdf-parser - Versions diffs - 0.1.8 → 0.1.10 - Mend

@heripo/pdf-parser 0.1.8 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/dist/index.cjs CHANGED Viewed

@@ -81,11 +81,17 @@ var DOCLING_ENVIRONMENT = {
    */
   STARTUP_DELAY_MS: 2e3
 };
+var PAGE_RENDERING = {
+  /** Default rendering DPI for VLM text recognition quality */
+  DEFAULT_DPI: 200,
+  /** Low-resolution DPI for OCR strategy sampling */
+  SAMPLE_DPI: 150
+};
 var IMAGE_PDF_CONVERTER = {
   /**
    * ImageMagick density option (DPI) for PDF to image conversion
    */
-  DENSITY: 300,
+  DENSITY: PAGE_RENDERING.DEFAULT_DPI,
   /**
    * ImageMagick quality option (1-100)
    */
@@ -869,10 +875,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
 // src/core/pdf-converter.ts
 var import_es_toolkit = require("es-toolkit");
-var import_node_fs7 = require("fs");
-var import_promises = require("fs/promises");
+var import_node_fs8 = require("fs");
+var import_promises4 = require("fs/promises");
 var import_node_path7 = require("path");
-var import_promises2 = require("stream/promises");
+var import_promises5 = require("stream/promises");
 // src/errors/image-pdf-fallback-error.ts
 var ImagePdfFallbackError = class extends Error {
@@ -887,12 +893,17 @@ var ImagePdfFallbackError = class extends Error {
 };
 // src/processors/image-extractor.ts
-var import_node_fs = require("fs");
+var import_node_fs2 = require("fs");
 var import_node_path2 = require("path");
+var import_node_stream = require("stream");
+var import_promises3 = require("stream/promises");
 var yauzl = __toESM(require("yauzl"), 1);
 // src/utils/jq.ts
 var import_node_child_process2 = require("child_process");
+var import_node_fs = require("fs");
+var import_promises = require("fs/promises");
+var import_promises2 = require("stream/promises");
 function getJqPath() {
   const p = process.env.JQ_PATH?.trim();
   return p && p.length > 0 ? p : "jq";
@@ -944,25 +955,139 @@ function runJqFileJson(program, filePath) {
     });
   });
 }
-function jqExtractBase64PngStrings(filePath) {
-  const program = `
-      [
-        .. |
-        select(type == "string" and startswith("data:image/png;base64"))
-      ]
-    `;
-  return runJqFileJson(program, filePath);
+function runJqFileToFile(program, inputPath, outputPath) {
+  return new Promise((resolve, reject) => {
+    const jqPath = getJqPath();
+    const args = [program, inputPath];
+    const child = (0, import_node_child_process2.spawn)(jqPath, args, {
+      stdio: ["ignore", "pipe", "pipe"],
+      env: process.env
+    });
+    let stderr = "";
+    let exitCode = null;
+    let pipelineDone = false;
+    let settled = false;
+    child.stderr.setEncoding("utf-8");
+    child.stderr.on("data", (chunk) => {
+      stderr += chunk;
+    });
+    const ws = (0, import_node_fs.createWriteStream)(outputPath);
+    function trySettle() {
+      if (settled) return;
+      if (!pipelineDone || exitCode === null) return;
+      settled = true;
+      if (exitCode !== 0) {
+        reject(
+          new Error(
+            `jq exited with code ${exitCode}. ${stderr ? "Stderr: " + stderr : ""}`
+          )
+        );
+      } else {
+        resolve();
+      }
+    }
+    child.on("error", (err) => {
+      if (settled) return;
+      settled = true;
+      ws.destroy();
+      reject(err);
+    });
+    (0, import_promises2.pipeline)(child.stdout, ws).then(() => {
+      pipelineDone = true;
+      trySettle();
+    }).catch((err) => {
+      if (settled) return;
+      settled = true;
+      reject(err);
+    });
+    child.on("close", (code) => {
+      exitCode = code ?? 1;
+      trySettle();
+    });
+  });
+}
+function runJqFileLines(program, filePath, onLine) {
+  return new Promise((resolve, reject) => {
+    const jqPath = getJqPath();
+    const args = ["-r", program, filePath];
+    const child = (0, import_node_child_process2.spawn)(jqPath, args, {
+      stdio: ["ignore", "pipe", "pipe"],
+      env: process.env
+    });
+    let stderr = "";
+    let buffer = "";
+    let callbackError = false;
+    child.stdout.setEncoding("utf-8");
+    child.stderr.setEncoding("utf-8");
+    function safeOnLine(line) {
+      if (callbackError) return;
+      try {
+        onLine(line);
+      } catch (err) {
+        callbackError = true;
+        child.kill();
+        reject(err);
+      }
+    }
+    child.stdout.on("data", (chunk) => {
+      buffer += chunk;
+      let newlineIdx;
+      while ((newlineIdx = buffer.indexOf("\n")) !== -1) {
+        const line = buffer.slice(0, newlineIdx);
+        buffer = buffer.slice(newlineIdx + 1);
+        if (line.length > 0) {
+          safeOnLine(line);
+        }
+      }
+    });
+    child.stderr.on("data", (chunk) => {
+      stderr += chunk;
+    });
+    child.on("error", (err) => {
+      if (!callbackError) reject(err);
+    });
+    child.on("close", (code) => {
+      if (callbackError) return;
+      if (buffer.length > 0) {
+        safeOnLine(buffer);
+      }
+      if (callbackError) return;
+      if (code !== 0) {
+        reject(
+          new Error(
+            `jq exited with code ${code}. ${stderr ? "Stderr: " + stderr : ""}`
+          )
+        );
+      } else {
+        resolve();
+      }
+    });
+  });
 }
-function jqReplaceBase64WithPaths(filePath, dirName, prefix) {
+async function jqExtractBase64PngStringsStreaming(filePath, onImage) {
+  let index = 0;
+  await runJqFileLines(
+    '.. | select(type == "string" and startswith("data:image/png;base64"))',
+    filePath,
+    (line) => {
+      onImage(line, index);
+      index++;
+    }
+  );
+  return index;
+}
+async function jqReplaceBase64WithPathsToFile(inputPath, outputPath, dirName, prefix) {
   const program = `
       reduce paths(type == "string" and startswith("data:image/png;base64")) as $p (
         {data: ., counter: 0};
         .counter as $idx |
         .data |= setpath($p; "${dirName}/${prefix}_\\($idx).png") |
         .counter += 1
-      ) | {data: .data, count: .counter}
+      ) | .data
     `;
-  return runJqFileJson(program, filePath);
+  const tmpPath = outputPath + ".tmp";
+  await runJqFileToFile(program, inputPath, tmpPath);
+  await (0, import_promises.rename)(tmpPath, outputPath);
 }
 // src/processors/image-extractor.ts
@@ -981,7 +1106,7 @@ var ImageExtractor = class _ImageExtractor {
         zipfile.on("entry", (entry) => {
           const entryPath = (0, import_node_path2.join)(targetDir, entry.fileName);
           if (/\/$/.test(entry.fileName)) {
-            (0, import_node_fs.mkdirSync)(entryPath, { recursive: true });
+            (0, import_node_fs2.mkdirSync)(entryPath, { recursive: true });
             zipfile.readEntry();
           } else {
             zipfile.openReadStream(entry, (err2, readStream) => {
@@ -989,8 +1114,8 @@ var ImageExtractor = class _ImageExtractor {
                 reject(err2 || new Error("Failed to open read stream"));
                 return;
               }
-              (0, import_node_fs.mkdirSync)((0, import_node_path2.join)(entryPath, ".."), { recursive: true });
-              const writeStream = (0, import_node_fs.createWriteStream)(entryPath);
+              (0, import_node_fs2.mkdirSync)((0, import_node_path2.join)(entryPath, ".."), { recursive: true });
+              const writeStream = (0, import_node_fs2.createWriteStream)(entryPath);
               readStream.pipe(writeStream);
               writeStream.on("finish", () => {
                 zipfile.readEntry();
@@ -1006,26 +1131,6 @@ var ImageExtractor = class _ImageExtractor {
       });
     });
   }
-  /**
-   * Extract base64 images from JSON file using jq (for large files)
-   * Returns array of base64 data strings
-   */
-  static async extractBase64ImagesFromJsonWithJq(jsonPath) {
-    return jqExtractBase64PngStrings(jsonPath);
-  }
-  /**
-   * Replace base64 images with file paths in JSON using jq (for large files)
-   * Uses reduce to maintain counter state while walking the JSON
-   */
-  static async replaceBase64ImagesInJsonWithJq(jsonPath, outputPath, dirName, prefix) {
-    const { data, count } = await jqReplaceBase64WithPaths(
-      jsonPath,
-      dirName,
-      prefix
-    );
-    (0, import_node_fs.writeFileSync)(outputPath, JSON.stringify(data, null, 2), "utf-8");
-    return count;
-  }
   /**
    * Extract a base64-encoded image to a file and return the relative path
    */
@@ -1035,12 +1140,70 @@ var ImageExtractor = class _ImageExtractor {
     const filename = `${prefix}_${index}.png`;
     const filepath = (0, import_node_path2.join)(imagesDir, filename);
     const buffer = Buffer.from(base64Content, "base64");
-    (0, import_node_fs.writeFileSync)(filepath, buffer);
+    (0, import_node_fs2.writeFileSync)(filepath, buffer);
     return `${dirName}/${filename}`;
   }
   /**
-   * Save JSON and HTML documents with base64 images extracted to separate files
-   * Uses jq for JSON processing to handle large files
+   * Extract base64 images from HTML using streaming.
+   * Reads HTML file as a stream, extracts base64 images from src attributes,
+   * saves them as PNG files, and replaces with file paths in the output HTML.
+   * Returns the number of images extracted.
+   */
+  static async extractImagesFromHtmlStream(htmlInputPath, htmlOutputPath, imagesDir) {
+    let imageIndex = 0;
+    let pending = "";
+    const MARKER = 'src="data:image/png;base64,';
+    const transform = new import_node_stream.Transform({
+      decodeStrings: false,
+      encoding: "utf-8",
+      transform(chunk, _encoding, callback) {
+        pending += chunk;
+        let result = "";
+        while (true) {
+          const markerIdx = pending.indexOf(MARKER);
+          if (markerIdx === -1) {
+            const safeEnd = Math.max(0, pending.length - MARKER.length);
+            result += pending.slice(0, safeEnd);
+            pending = pending.slice(safeEnd);
+            break;
+          }
+          result += pending.slice(0, markerIdx);
+          const dataStart = markerIdx + MARKER.length;
+          const quoteIdx = pending.indexOf('"', dataStart);
+          if (quoteIdx === -1) {
+            pending = pending.slice(markerIdx);
+            break;
+          }
+          const base64Content = pending.slice(dataStart, quoteIdx);
+          const filename = `image_${imageIndex}.png`;
+          const filepath = (0, import_node_path2.join)(imagesDir, filename);
+          const buf = Buffer.from(base64Content, "base64");
+          (0, import_node_fs2.writeFileSync)(filepath, buf);
+          const relativePath = `images/${filename}`;
+          result += `src="${relativePath}"`;
+          imageIndex++;
+          pending = pending.slice(quoteIdx + 1);
+        }
+        if (result.length > 0) {
+          this.push(result);
+        }
+        callback();
+      },
+      flush(callback) {
+        if (pending.length > 0) {
+          this.push(pending);
+        }
+        callback();
+      }
+    });
+    const rs = (0, import_node_fs2.createReadStream)(htmlInputPath, { encoding: "utf-8" });
+    const ws = (0, import_node_fs2.createWriteStream)(htmlOutputPath, { encoding: "utf-8" });
+    await (0, import_promises3.pipeline)(rs, transform, ws);
+    return imageIndex;
+  }
+  /**
+   * Save JSON and HTML documents with base64 images extracted to separate files.
+   * Uses jq for JSON processing and streaming for HTML to handle large files.
    *
    * This method:
    * 1. Extracts base64-encoded images from JSON and HTML content
@@ -1048,43 +1211,45 @@ var ImageExtractor = class _ImageExtractor {
    * 3. Replaces base64 data with relative file paths
    * 4. Saves the transformed documents to the output directory
    */
-  static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath, htmlContent) {
+  static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath, htmlSourcePath) {
     try {
-      if ((0, import_node_fs.existsSync)(outputDir)) {
-        (0, import_node_fs.rmSync)(outputDir, { recursive: true, force: true });
+      if ((0, import_node_fs2.existsSync)(outputDir)) {
+        (0, import_node_fs2.rmSync)(outputDir, { recursive: true, force: true });
       }
     } catch (e) {
       logger.warn("[PDFConverter] Failed to clear output directory:", e);
     }
-    (0, import_node_fs.mkdirSync)(outputDir, { recursive: true });
+    (0, import_node_fs2.mkdirSync)(outputDir, { recursive: true });
     const baseName = filename.replace((0, import_node_path2.extname)(filename), "");
     const jsonPath = (0, import_node_path2.join)(outputDir, `${baseName}.json`);
     try {
-      const pagesDir = (0, import_node_path2.join)(outputDir, "pages");
-      if (!(0, import_node_fs.existsSync)(pagesDir)) {
-        (0, import_node_fs.mkdirSync)(pagesDir, { recursive: true });
-      }
-      const base64Images = await _ImageExtractor.extractBase64ImagesFromJsonWithJq(jsonSourcePath);
-      base64Images.forEach((base64Data, index) => {
-        _ImageExtractor.extractBase64ImageToFile(
-          base64Data,
-          pagesDir,
-          index,
-          "page",
-          "pages"
-        );
-      });
+      const imagesDir = (0, import_node_path2.join)(outputDir, "images");
+      if (!(0, import_node_fs2.existsSync)(imagesDir)) {
+        (0, import_node_fs2.mkdirSync)(imagesDir, { recursive: true });
+      }
+      const imageCount = await jqExtractBase64PngStringsStreaming(
+        jsonSourcePath,
+        (base64Data, index) => {
+          _ImageExtractor.extractBase64ImageToFile(
+            base64Data,
+            imagesDir,
+            index,
+            "pic",
+            "images"
+          );
+        }
+      );
       logger.info(
-        `[PDFConverter] Extracted ${base64Images.length} images from JSON to ${pagesDir}`
+        `[PDFConverter] Extracted ${imageCount} picture images from JSON to ${imagesDir}`
       );
-      const replacedCount = await _ImageExtractor.replaceBase64ImagesInJsonWithJq(
+      await jqReplaceBase64WithPathsToFile(
         jsonSourcePath,
         jsonPath,
-        "pages",
-        "page"
+        "images",
+        "pic"
       );
       logger.info(
-        `[PDFConverter] Replaced ${replacedCount} base64 images with file paths`
+        `[PDFConverter] Replaced ${imageCount} base64 images with file paths`
       );
     } catch (e) {
       logger.warn(
@@ -1097,51 +1262,45 @@ var ImageExtractor = class _ImageExtractor {
     const htmlPath = (0, import_node_path2.join)(outputDir, `${baseName}.html`);
     try {
       const imagesDir = (0, import_node_path2.join)(outputDir, "images");
-      if (!(0, import_node_fs.existsSync)(imagesDir)) {
-        (0, import_node_fs.mkdirSync)(imagesDir, { recursive: true });
-      }
-      let imageIndex = 0;
-      const transformedHtml = htmlContent.replace(
-        /src="data:image\/png;base64,([^"]+)"/g,
-        (_, base64Content) => {
-          const filename2 = `image_${imageIndex}.png`;
-          const filepath = (0, import_node_path2.join)(imagesDir, filename2);
-          const buffer = Buffer.from(base64Content, "base64");
-          (0, import_node_fs.writeFileSync)(filepath, buffer);
-          const relativePath = `images/${filename2}`;
-          imageIndex += 1;
-          return `src="${relativePath}"`;
-        }
+      if (!(0, import_node_fs2.existsSync)(imagesDir)) {
+        (0, import_node_fs2.mkdirSync)(imagesDir, { recursive: true });
+      }
+      const htmlImageCount = await _ImageExtractor.extractImagesFromHtmlStream(
+        htmlSourcePath,
+        htmlPath,
+        imagesDir
       );
       logger.info(
-        `[PDFConverter] Extracted ${imageIndex} images from HTML to ${imagesDir}`
+        `[PDFConverter] Extracted ${htmlImageCount} images from HTML to ${imagesDir}`
       );
-      (0, import_node_fs.writeFileSync)(htmlPath, transformedHtml, "utf-8");
     } catch (e) {
       logger.warn(
-        "[PDFConverter] Failed to extract images from HTML, writing original. Error:",
+        "[PDFConverter] Failed to extract images from HTML, copying original. Error:",
         e
       );
-      (0, import_node_fs.writeFileSync)(htmlPath, htmlContent, "utf-8");
+      const rs = (0, import_node_fs2.createReadStream)(htmlSourcePath);
+      const ws = (0, import_node_fs2.createWriteStream)(htmlPath);
+      await (0, import_promises3.pipeline)(rs, ws);
     }
     logger.info("[PDFConverter] Saved HTML:", htmlPath);
   }
   /**
    * Extract documents from ZIP and save with extracted images
-   * Uses jq for JSON processing to handle large files without loading into Node.js memory
+   * Uses jq for JSON processing and streaming for HTML to handle large files
+   * without loading into Node.js memory
    *
    * Complete workflow:
    * 1. Extract ZIP file to temporary directory
    * 2. Find JSON and HTML files from extracted files
-   * 3. Use jq to extract base64 images from JSON and save as separate files
-   * 4. Use jq to replace base64 with file paths in JSON
-   * 5. Process HTML with regex to extract and replace images
+   * 3. Use jq to stream-extract base64 images from JSON and save as separate files
+   * 4. Use jq to replace base64 with file paths in JSON (piped to file)
+   * 5. Process HTML with streaming Transform to extract and replace images
    * 6. Save transformed documents to output directory (as result.json and result.html)
    */
   static async extractAndSaveDocumentsFromZip(logger, zipPath, extractDir, outputDir) {
     logger.info("[PDFConverter] Extracting ZIP file...");
     await _ImageExtractor.extractZip(zipPath, extractDir);
-    const files = (0, import_node_fs.readdirSync)(extractDir);
+    const files = (0, import_node_fs2.readdirSync)(extractDir);
     const jsonFile = files.find((f) => (0, import_node_path2.extname)(f).toLowerCase() === ".json");
     const htmlFile = files.find((f) => (0, import_node_path2.extname)(f).toLowerCase() === ".html");
     if (!jsonFile || !htmlFile) {
@@ -1151,23 +1310,22 @@ var ImageExtractor = class _ImageExtractor {
     }
     const jsonPath = (0, import_node_path2.join)(extractDir, jsonFile);
     const htmlPath = (0, import_node_path2.join)(extractDir, htmlFile);
-    const htmlContent = (0, import_node_fs.readFileSync)(htmlPath, "utf-8");
     logger.info("[PDFConverter] Saving converted files to output...");
     await _ImageExtractor.saveDocumentsWithExtractedImages(
       logger,
       outputDir,
       "result",
       jsonPath,
-      htmlContent
+      htmlPath
     );
     logger.info("[PDFConverter] Files saved to:", outputDir);
   }
 };
 // src/processors/page-renderer.ts
-var import_node_fs2 = require("fs");
+var import_node_fs3 = require("fs");
 var import_node_path3 = require("path");
-var DEFAULT_DPI = 300;
+var PROGRESS_POLL_INTERVAL_MS = 2e3;
 var PageRenderer = class {
   constructor(logger) {
     this.logger = logger;
@@ -1181,31 +1339,62 @@ var PageRenderer = class {
    * @returns Render result with page count and file paths
    */
   async renderPages(pdfPath, outputDir, options) {
-    const dpi = options?.dpi ?? DEFAULT_DPI;
+    const dpi = options?.dpi ?? PAGE_RENDERING.DEFAULT_DPI;
     const pagesDir = (0, import_node_path3.join)(outputDir, "pages");
-    if (!(0, import_node_fs2.existsSync)(pagesDir)) {
-      (0, import_node_fs2.mkdirSync)(pagesDir, { recursive: true });
+    if (!(0, import_node_fs3.existsSync)(pagesDir)) {
+      (0, import_node_fs3.mkdirSync)(pagesDir, { recursive: true });
     }
-    this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
-    const outputPattern = (0, import_node_path3.join)(pagesDir, "page_%d.png");
-    const result = await spawnAsync("magick", [
-      "-density",
-      dpi.toString(),
-      pdfPath,
-      "-background",
-      "white",
-      "-alpha",
-      "remove",
-      "-alpha",
-      "off",
-      outputPattern
-    ]);
-    if (result.code !== 0) {
-      throw new Error(
-        `[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
+    const totalPages = await this.getPageCount(pdfPath);
+    if (totalPages > 0) {
+      this.logger.info(
+        `[PageRenderer] Rendering ${totalPages} pages at ${dpi} DPI...`
       );
+    } else {
+      this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
+    }
+    const outputPattern = (0, import_node_path3.join)(pagesDir, "page_%d.png");
+    let progressInterval = null;
+    if (totalPages > 0) {
+      let lastLoggedCount = 0;
+      progressInterval = setInterval(() => {
+        try {
+          const rendered = (0, import_node_fs3.readdirSync)(pagesDir).filter(
+            (f) => f.startsWith("page_") && f.endsWith(".png")
+          ).length;
+          if (rendered > 0 && rendered !== lastLoggedCount) {
+            lastLoggedCount = rendered;
+            this.logger.info(
+              `[PageRenderer] Rendering pages: ${rendered}/${totalPages}`
+            );
+          }
+        } catch {
+        }
+      }, PROGRESS_POLL_INTERVAL_MS);
+    }
+    try {
+      const result = await spawnAsync("magick", [
+        "-density",
+        dpi.toString(),
+        pdfPath,
+        "-background",
+        "white",
+        "-alpha",
+        "remove",
+        "-alpha",
+        "off",
+        outputPattern
+      ]);
+      if (result.code !== 0) {
+        throw new Error(
+          `[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
+        );
+      }
+    } finally {
+      if (progressInterval) {
+        clearInterval(progressInterval);
+      }
     }
-    const pageFiles = (0, import_node_fs2.readdirSync)(pagesDir).filter((f) => f.startsWith("page_") && f.endsWith(".png")).sort((a, b) => {
+    const pageFiles = (0, import_node_fs3.readdirSync)(pagesDir).filter((f) => f.startsWith("page_") && f.endsWith(".png")).sort((a, b) => {
       const numA = parseInt(a.replace("page_", "").replace(".png", ""), 10);
       const numB = parseInt(b.replace("page_", "").replace(".png", ""), 10);
       return numA - numB;
@@ -1219,6 +1408,20 @@ var PageRenderer = class {
       pageFiles
     };
   }
+  /**
+   * Get total page count using pdfinfo.
+   * Returns 0 on failure (progress logging will be skipped).
+   */
+  async getPageCount(pdfPath) {
+    try {
+      const result = await spawnAsync("pdfinfo", [pdfPath]);
+      if (result.code !== 0) return 0;
+      const match = result.stdout.match(/^Pages:\s+(\d+)/m);
+      return match ? parseInt(match[1], 10) : 0;
+    } catch {
+      return 0;
+    }
+  }
 };
 // src/processors/pdf-text-extractor.ts
@@ -1304,7 +1507,7 @@ var PdfTextExtractor = class {
 };
 // src/processors/vlm-text-corrector.ts
-var import_node_fs3 = require("fs");
+var import_node_fs4 = require("fs");
 var import_node_path4 = require("path");
 // src/types/vlm-text-correction-schema.ts
@@ -1436,7 +1639,7 @@ var VlmTextCorrector = class {
   async correctAndSave(outputDir, model, options) {
     this.logger.info("[VlmTextCorrector] Starting text correction...");
     const resultPath = (0, import_node_path4.join)(outputDir, "result.json");
-    const doc = JSON.parse((0, import_node_fs3.readFileSync)(resultPath, "utf-8"));
+    const doc = JSON.parse((0, import_node_fs4.readFileSync)(resultPath, "utf-8"));
     let pageNumbers = this.getPageNumbers(doc);
     if (pageNumbers.length === 0) {
       this.logger.info("[VlmTextCorrector] No pages to process");
@@ -1487,7 +1690,7 @@ var VlmTextCorrector = class {
       if (corrections === null) continue;
       this.applyCorrections(doc, pageNumbers[i], corrections);
     }
-    (0, import_node_fs3.writeFileSync)(resultPath, JSON.stringify(doc, null, 2));
+    (0, import_node_fs4.writeFileSync)(resultPath, JSON.stringify(doc, null, 2));
     this.logger.info(
       `[VlmTextCorrector] Correction complete: ${totalTextCorrections} text, ${totalCellCorrections} cell corrections across ${pageNumbers.length} pages (${pagesFailed} failed)`
     );
@@ -1763,7 +1966,7 @@ var VlmTextCorrector = class {
    */
   readPageImage(outputDir, pageNo) {
     const imagePath = (0, import_node_path4.join)(outputDir, "pages", `page_${pageNo - 1}.png`);
-    return (0, import_node_fs3.readFileSync)(imagePath).toString("base64");
+    return (0, import_node_fs4.readFileSync)(imagePath).toString("base64");
   }
   /**
    * Apply VLM corrections to the DoclingDocument.
@@ -1817,9 +2020,9 @@ var VlmTextCorrector = class {
 };
 // src/samplers/ocr-strategy-sampler.ts
-var import_node_fs4 = require("fs");
+var import_model = require("@heripo/model");
+var import_node_fs5 = require("fs");
 var import_v42 = require("zod/v4");
-var SAMPLE_DPI = 150;
 var EDGE_TRIM_RATIO = 0.1;
 var DEFAULT_MAX_SAMPLE_PAGES = 15;
 var DEFAULT_MAX_RETRIES2 = 3;
@@ -1870,7 +2073,7 @@ var OcrStrategySampler = class {
     const renderResult = await this.pageRenderer.renderPages(
       pdfPath,
       outputDir,
-      { dpi: SAMPLE_DPI }
+      { dpi: PAGE_RENDERING.SAMPLE_DPI }
     );
     if (renderResult.pageCount === 0) {
       this.logger.info("[OcrStrategySampler] No pages found in PDF");
@@ -1889,7 +2092,7 @@ var OcrStrategySampler = class {
       `[OcrStrategySampler] Sampling ${sampleIndices.length} of ${renderResult.pageCount} pages: [${sampleIndices.map((i) => i + 1).join(", ")}]`
     );
     let sampledCount = 0;
-    let detectedLanguages;
+    const languageFrequency = /* @__PURE__ */ new Map();
     for (const idx of sampleIndices) {
       sampledCount++;
       const pageFile = renderResult.pageFiles[idx];
@@ -1899,14 +2102,17 @@ var OcrStrategySampler = class {
         model,
         options
       );
-      detectedLanguages = pageAnalysis.detectedLanguages;
+      for (const lang of pageAnalysis.detectedLanguages) {
+        languageFrequency.set(lang, (languageFrequency.get(lang) ?? 0) + 1);
+      }
       if (pageAnalysis.hasKoreanHanjaMix) {
         this.logger.info(
           `[OcrStrategySampler] Korean-Hanja mix detected on page ${idx + 1} \u2192 VLM strategy`
         );
+        const detectedLanguages2 = this.aggregateLanguages(languageFrequency);
         return {
           method: "vlm",
-          detectedLanguages,
+          detectedLanguages: detectedLanguages2,
           reason: `Korean-Hanja mix detected on page ${idx + 1}`,
           sampledPages: sampledCount,
           totalPages: renderResult.pageCount
@@ -1916,6 +2122,7 @@ var OcrStrategySampler = class {
     this.logger.info(
       "[OcrStrategySampler] No Korean-Hanja mix detected \u2192 ocrmac strategy"
     );
+    const detectedLanguages = this.aggregateLanguages(languageFrequency);
     return {
       method: "ocrmac",
       detectedLanguages,
@@ -2020,14 +2227,15 @@ var OcrStrategySampler = class {
   }
   /**
    * Analyze a single sample page for Korean-Hanja mixed script and primary language.
+   * Normalizes raw VLM language responses to valid BCP 47 tags, filtering out invalid ones.
    *
-   * @returns Object with Korean-Hanja detection result and detected languages
+   * @returns Object with Korean-Hanja detection result and normalized detected languages
    */
   async analyzeSamplePage(pageFile, pageNo, model, options) {
     this.logger.debug(
       `[OcrStrategySampler] Analyzing page ${pageNo} for Korean-Hanja mix and language...`
     );
-    const base64Image = (0, import_node_fs4.readFileSync)(pageFile).toString("base64");
+    const base64Image = (0, import_node_fs5.readFileSync)(pageFile).toString("base64");
     const messages = [
       {
         role: "user",
@@ -2055,18 +2263,27 @@ var OcrStrategySampler = class {
       options.aggregator.track(result.usage);
     }
     const output = result.output;
+    const normalizedLanguages = output.detectedLanguages.map(import_model.normalizeToBcp47).filter((tag) => tag !== null);
     this.logger.debug(
-      `[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${output.detectedLanguages.join(",")}`
+      `[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${normalizedLanguages.join(",")}`
     );
     return {
       hasKoreanHanjaMix: output.hasKoreanHanjaMix,
-      detectedLanguages: output.detectedLanguages
+      detectedLanguages: normalizedLanguages
     };
   }
+  /**
+   * Aggregate language frequency map into a sorted array.
+   * Returns languages sorted by frequency (descending), or undefined if empty.
+   */
+  aggregateLanguages(frequencyMap) {
+    if (frequencyMap.size === 0) return void 0;
+    return [...frequencyMap.entries()].sort((a, b) => b[1] - a[1]).map(([lang]) => lang);
+  }
 };
 // src/utils/local-file-server.ts
-var import_node_fs5 = require("fs");
+var import_node_fs6 = require("fs");
 var import_node_http = require("http");
 var import_node_path5 = require("path");
 var LocalFileServer = class {
@@ -2080,7 +2297,7 @@ var LocalFileServer = class {
    */
   async start(filePath) {
     const filename = (0, import_node_path5.basename)(filePath);
-    const stat = (0, import_node_fs5.statSync)(filePath);
+    const stat = (0, import_node_fs6.statSync)(filePath);
     return new Promise((resolve, reject) => {
       this.server = (0, import_node_http.createServer)((req, res) => {
         if (req.url === `/${filename}`) {
@@ -2088,7 +2305,7 @@ var LocalFileServer = class {
             "Content-Type": "application/pdf",
             "Content-Length": stat.size
           });
-          (0, import_node_fs5.createReadStream)(filePath).pipe(res);
+          (0, import_node_fs6.createReadStream)(filePath).pipe(res);
         } else {
           res.writeHead(404);
           res.end("Not Found");
@@ -2125,7 +2342,7 @@ var LocalFileServer = class {
 };
 // src/core/image-pdf-converter.ts
-var import_node_fs6 = require("fs");
+var import_node_fs7 = require("fs");
 var import_node_os = require("os");
 var import_node_path6 = require("path");
 var ImagePdfConverter = class {
@@ -2153,8 +2370,8 @@ var ImagePdfConverter = class {
       this.logger.info("[ImagePdfConverter] Image PDF created:", outputPath);
       return outputPath;
     } finally {
-      if ((0, import_node_fs6.existsSync)(inputPath)) {
-        (0, import_node_fs6.rmSync)(inputPath, { force: true });
+      if ((0, import_node_fs7.existsSync)(inputPath)) {
+        (0, import_node_fs7.rmSync)(inputPath, { force: true });
       }
     }
   }
@@ -2201,12 +2418,12 @@ var ImagePdfConverter = class {
    * Cleanup the temporary image PDF file
    */
   cleanup(imagePdfPath) {
-    if ((0, import_node_fs6.existsSync)(imagePdfPath)) {
+    if ((0, import_node_fs7.existsSync)(imagePdfPath)) {
       this.logger.info(
         "[ImagePdfConverter] Cleaning up temp file:",
         imagePdfPath
       );
-      (0, import_node_fs6.rmSync)(imagePdfPath, { force: true });
+      (0, import_node_fs7.rmSync)(imagePdfPath, { force: true });
     }
   }
 };
@@ -2350,8 +2567,8 @@ var PDFConverter = class {
       }
       return strategy;
     } finally {
-      if ((0, import_node_fs7.existsSync)(samplingDir)) {
-        (0, import_node_fs7.rmSync)(samplingDir, { recursive: true, force: true });
+      if ((0, import_node_fs8.existsSync)(samplingDir)) {
+        (0, import_node_fs8.rmSync)(samplingDir, { recursive: true, force: true });
       }
     }
   }
@@ -2373,8 +2590,10 @@ var PDFConverter = class {
       let pageTexts;
       try {
         const resultPath2 = (0, import_node_path7.join)(outputDir, "result.json");
-        const doc = JSON.parse((0, import_node_fs7.readFileSync)(resultPath2, "utf-8"));
-        const totalPages = Object.keys(doc.pages).length;
+        const totalPages = await runJqFileJson(
+          ".pages | length",
+          resultPath2
+        );
         const textExtractor = new PdfTextExtractor(this.logger);
         pageTexts = await textExtractor.extractText(pdfPath, totalPages);
       } catch {
@@ -2384,7 +2603,7 @@ var PDFConverter = class {
       }
       const resultPath = (0, import_node_path7.join)(outputDir, "result.json");
       const ocrOriginPath = (0, import_node_path7.join)(outputDir, "result_ocr_origin.json");
-      (0, import_node_fs7.copyFileSync)(resultPath, ocrOriginPath);
+      (0, import_node_fs8.copyFileSync)(resultPath, ocrOriginPath);
       const corrector = new VlmTextCorrector(this.logger);
       await corrector.correctAndSave(outputDir, options.vlmProcessorModel, {
         concurrency: options.vlmConcurrency,
@@ -2531,6 +2750,7 @@ var PDFConverter = class {
     const outputDir = (0, import_node_path7.join)(cwd, "output", reportId);
     try {
       await this.processConvertedFiles(zipPath, extractDir, outputDir);
+      await this.renderPageImages(url, outputDir);
       if (abortSignal?.aborted) {
         this.logger.info("[PDFConverter] Conversion aborted before callback");
         const error = new Error("PDF conversion was aborted");
@@ -2544,19 +2764,19 @@ var PDFConverter = class {
       this.logger.info("[PDFConverter] Total time:", duration, "ms");
     } finally {
       this.logger.info("[PDFConverter] Cleaning up temporary files...");
-      if ((0, import_node_fs7.existsSync)(zipPath)) {
-        (0, import_node_fs7.rmSync)(zipPath, { force: true });
+      if ((0, import_node_fs8.existsSync)(zipPath)) {
+        (0, import_node_fs8.rmSync)(zipPath, { force: true });
       }
-      if ((0, import_node_fs7.existsSync)(extractDir)) {
-        (0, import_node_fs7.rmSync)(extractDir, { recursive: true, force: true });
+      if ((0, import_node_fs8.existsSync)(extractDir)) {
+        (0, import_node_fs8.rmSync)(extractDir, { recursive: true, force: true });
       }
       if (cleanupAfterCallback) {
         this.logger.info(
           "[PDFConverter] Cleaning up output directory:",
           outputDir
         );
-        if ((0, import_node_fs7.existsSync)(outputDir)) {
-          (0, import_node_fs7.rmSync)(outputDir, { recursive: true, force: true });
+        if ((0, import_node_fs8.existsSync)(outputDir)) {
+          (0, import_node_fs8.rmSync)(outputDir, { recursive: true, force: true });
         }
       } else {
         this.logger.info("[PDFConverter] Output preserved at:", outputDir);
@@ -2586,6 +2806,8 @@ var PDFConverter = class {
         framework: "livetext"
       },
       generate_picture_images: true,
+      generate_page_images: false,
+      // Page images are rendered by PageRenderer (ImageMagick) after conversion
       images_scale: 2,
       /**
        * While disabling this option yields the most accurate text extraction for readable PDFs,
@@ -2703,12 +2925,12 @@ var PDFConverter = class {
     const zipPath = (0, import_node_path7.join)(process.cwd(), "result.zip");
     this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
     if (zipResult.fileStream) {
-      const writeStream = (0, import_node_fs7.createWriteStream)(zipPath);
-      await (0, import_promises2.pipeline)(zipResult.fileStream, writeStream);
+      const writeStream = (0, import_node_fs8.createWriteStream)(zipPath);
+      await (0, import_promises5.pipeline)(zipResult.fileStream, writeStream);
       return;
     }
     if (zipResult.data) {
-      await (0, import_promises.writeFile)(zipPath, zipResult.data);
+      await (0, import_promises4.writeFile)(zipPath, zipResult.data);
       return;
     }
     this.logger.warn(
@@ -2724,7 +2946,7 @@ var PDFConverter = class {
       );
     }
     const buffer = new Uint8Array(await response.arrayBuffer());
-    await (0, import_promises.writeFile)(zipPath, buffer);
+    await (0, import_promises4.writeFile)(zipPath, buffer);
   }
   async processConvertedFiles(zipPath, extractDir, outputDir) {
     await ImageExtractor.extractAndSaveDocumentsFromZip(
@@ -2734,6 +2956,42 @@ var PDFConverter = class {
       outputDir
     );
   }
+  /**
+   * Render page images from the source PDF using ImageMagick and update result.json.
+   * Uses jq to update the JSON file without loading it into Node.js memory.
+   * Replaces Docling's generate_page_images which fails on large PDFs
+   * due to memory limits when embedding all page images as base64.
+   *
+   * The rewritten JSON is first written to `result.json.tmp` and then renamed
+   * over `result.json`, so the original file is only replaced once jq has
+   * finished. If either step fails, the temporary file is removed and the
+   * original error is rethrown.
+   *
+   * @param {string} url - Source PDF URL. Only `file://` URLs are rendered;
+   *   anything else logs a warning and returns without touching result.json.
+   * @param {string} outputDir - Directory that holds `result.json` and is
+   *   passed to the page renderer as its output location.
+   * @returns {Promise<void>}
+   */
+  async renderPageImages(url, outputDir) {
+    if (!url.startsWith("file://")) {
+      this.logger.warn(
+        "[PDFConverter] Page image rendering skipped: only supported for local files (file:// URLs)"
+      );
+      return;
+    }
+    // NOTE(review): assumes the URL is "file://" followed by a raw absolute
+    // path (no host part, no percent-encoding) — verify against callers.
+    const pdfPath = url.slice(7);
+    this.logger.info(
+      "[PDFConverter] Rendering page images with ImageMagick..."
+    );
+    const renderer = new PageRenderer(this.logger);
+    const renderResult = await renderer.renderPages(pdfPath, outputDir);
+    const resultPath = (0, import_node_path7.join)(outputDir, "result.json");
+    const tmpPath = resultPath + ".tmp";
+    // For every entry of `.pages` whose zero-based index (page_no - 1) is
+    // within the rendered page count, point `.image` at the rendered PNG.
+    const jqProgram = `
+      .pages |= with_entries(
+        if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
+          .value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
+          .value.image.mimetype = "image/png" |
+          .value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
+        else . end
+      )
+    `;
+    try {
+      await runJqFileToFile(jqProgram, resultPath, tmpPath);
+      await (0, import_promises4.rename)(tmpPath, resultPath);
+    } catch (error) {
+      // Best-effort removal of the partial temp file; a cleanup failure must
+      // not mask the error that actually aborted the update.
+      await (0, import_promises4.rm)(tmpPath, { force: true }).catch(() => {
+      });
+      throw error;
+    }
+    this.logger.info(
+      `[PDFConverter] Rendered ${renderResult.pageCount} page images`
+    );
+  }
 };
 // src/core/pdf-parser.ts
@@ -2772,6 +3030,7 @@ var PDFParser = class {
     this.logger.info("[PDFParser] Initializing...");
     this.checkOperatingSystem();
     this.checkJqInstalled();
+    this.checkPopplerInstalled();
     this.checkMacOSVersion();
     if (this.enableImagePdfFallback && !this.baseUrl) {
       this.checkImageMagickInstalled();
@@ -2828,6 +3087,15 @@ var PDFParser = class {
       );
     }
   }
+  /**
+   * Verify that the `pdftotext` executable can be located via `which`.
+   *
+   * @throws {Error} With Homebrew install instructions for poppler when the
+   *   lookup command fails.
+   */
+  checkPopplerInstalled() {
+    const probeCommand = "which pdftotext";
+    const installHint = "poppler is not installed. Please install poppler using: brew install poppler";
+    let isAvailable = true;
+    try {
+      // Output is discarded; only success or failure of the command matters.
+      (0, import_node_child_process3.execSync)(probeCommand, { stdio: "ignore" });
+    } catch {
+      isAvailable = false;
+    }
+    if (!isAvailable) {
+      throw new Error(installHint);
+    }
+  }
   checkMacOSVersion() {
     try {
       const versionOutput = (0, import_node_child_process3.execSync)("sw_vers -productVersion", {